From 33d0facf4e2dadb1322435c9da14878502e200bc Mon Sep 17 00:00:00 2001 From: Mohammad Mahfuzur Rahman Mamun Date: Wed, 13 Jul 2011 17:16:03 +0600 Subject: [PATCH] selection of text by character done ... need some testing and debug --- core/textpage.cpp | 211 ++++++++++++++++++++++++---------------------- core/textpage_p.h | 7 +- 2 files changed, 112 insertions(+), 106 deletions(-) diff --git a/core/textpage.cpp b/core/textpage.cpp index ded26966d..999e501d0 100644 --- a/core/textpage.cpp +++ b/core/textpage.cpp @@ -143,6 +143,8 @@ class TinyTextEntity class RegionText{ public: + RegionText(){}; + RegionText(TextList &list,QRect &area) : m_region_text(list) ,m_area(area) { @@ -959,6 +961,14 @@ bool compareTinyTextEntityY(TinyTextEntity* first, TinyTextEntity* second){ return firstArea.top() < secondArea.top(); } +bool compareRegionTextY(RegionText first, RegionText second){ + return first.area().top() < second.area().top(); +} + +bool compareRegionTextX(RegionText first, RegionText second){ + return first.area().left() < second.area().left(); +} + void TextPagePrivate::printTextList(int i, TextList list){ @@ -1093,11 +1103,14 @@ void TextPage::makeWord(){ int newLeft,newRight,newTop,newBottom; int pageWidth = d->m_page->m_page->width(), pageHeight = d->m_page->m_page->height(); int index = 0; - QString spaceString(" "); - //For RegionTextList + // It will contain a list of RegionText, where each RegionText contains a word, which comprises of + // TextList which is a list of TinyTextEntity which contains characters info and a QRect which contains + // the area of the region. RegionTextList regionWordList; + //WordTocharacterList d->m_word_char_map + //for every non-space texts(characters/words) in the textList for( ; it != itEnd ; it++){ @@ -1109,15 +1122,10 @@ void TextPage::makeWord(){ tmpIt = it; -// cout << "first : "; -// printRect(lineArea) ; - int space = 0; while(space <= 1){ -// if(textString == spaceString) break; - // we must have to put this line before the if condition of it==itEnd // otherwise the last character can be missed if(textString.length()){ @@ -1189,15 +1197,23 @@ void TextPage::makeWord(){ if(newString.length()){ NormalizedRect newRect(lineArea,pageWidth,pageHeight); + TinyTextEntity *ent = new TinyTextEntity(newString.normalized + (QString::NormalizationForm_KC), newRect ); + newList.append(ent); - newList.append( new TinyTextEntity(newString.normalized - (QString::NormalizationForm_KC), newRect )); -// cout << "newString: " << newString.toAscii().data() << endl; QRect rect = newRect.geometry(pageWidth,pageHeight); RegionText regionWord(word,rect); regionWordList.append(regionWord); + + + int keyRect = rect.left() * rect.top() + + rect.right() * rect.bottom(); + + // if there are more than one element in the same key + d->m_word_chars_map.insertMulti(keyRect,regionWord); + index++; } @@ -1209,23 +1225,21 @@ void TextPage::makeWord(){ d->m_region_words = regionWordList; cout << "words: " << index << endl; -// cout << " ............................................................ " << endl; d->copy(newList); // for(int i = 0 ; i < d->m_words.length() ; i++){ + // TinyTextEntity *ent = d->m_words.at(i); -// cout << ent->text().toAscii().data() << endl; -// printRect(ent->area.roundedGeometry(pageWidth,pageHeight)); -// } +// QRect entArea = ent->area.geometry(pageWidth,pageHeight); +// int key = entArea.top() * entArea.left() + entArea.right() * entArea.bottom(); -// cout << endl; +// RegionText text_list = d->m_word_chars_map.value(key); +// TextList list = text_list.text(); -// for(int i = 0 ; i < d->m_region_words.length() ; i++){ -// RegionText word = d->m_region_words.at(i); -// TextList text = word.text(); -// for( int j = 0 ; j < text.length() ; j++){ -// TinyTextEntity* ent = text.at(j); +// cout << "key: " << key << " text: "; +// for( int l = 0 ; l < list.length() ; l++){ +// ent = list.at(l); // cout << ent->text().toAscii().data(); // } // cout << endl; @@ -1253,7 +1267,6 @@ void TextPage::makeAndSortLines(){ TextList tmpList = d->m_words; qSort(tmpList.begin(),tmpList.end(),compareTinyTextEntityY); -// d->printTextList(0,tmpList); // Step 2: ....................................... @@ -1347,9 +1360,6 @@ void TextPage::makeAndSortLines(){ qSort(list.begin(),list.end(),compareTinyTextEntityX); d->m_lines.replace(i,list); -// d->printTextList(i,list); -// printRect(d->m_line_rects.at(i)); - } } @@ -1429,14 +1439,12 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){ if (proj_on_yaxis[j] > maxY) maxY = proj_on_yaxis[j]; // cout << "index: " << j << " value: " << proj_on_yaxis[j] << endl; } -// cout << endl; // cout << "projection on x axis " << endl << endl; for( j = 0 ; j < size_proj_x ; j++ ){ if(proj_on_xaxis[j] > maxX) maxX = proj_on_xaxis[j]; // cout << "index: " << j << " value: " << proj_on_xaxis[j] << endl; } -// cout << endl; /** 2. Cleanup Boundary White Spaces and removal of noise ..................... **/ @@ -1458,7 +1466,6 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){ yend--; } -// printRect(regionRect); //update the regionRect int old_left = regionRect.left(), old_top = regionRect.top(); @@ -1574,39 +1581,19 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){ regionRect.height()); - // horizontal split (top rect, bottom rect) - cout << "main: "; - printRect(regionRect); - if(gap_y >= gap_x && gap_y > tcy){ -// cout << "toprect: "; -// printRect(topRect); -// cout << "bottomrect: "; -// printRect(bottomRect); cut_hor = true; } //vertical cut (left rect, right rect) else if(gap_y >= gap_x && gap_y <= tcy && gap_x > tcx){ -// cout << "leftrect: "; -// printRect(leftRect); -// cout << "rightrect: "; -// printRect(rightRect); cut_ver = true; } //vertical cut else if(gap_x >= gap_y && gap_x > tcx){ -// cout << "leftrect: "; -// printRect(leftRect); -// cout << "rightrect: "; -// printRect(rightRect); cut_ver = true; } //horizontal cut else if(gap_x >= gap_y && gap_x <= tcx && gap_y > tcy){ -// cout << "toprect: "; -// printRect(topRect); -// cout << "bottomrect: "; -// printRect(bottomRect); cut_hor = true; } //no cut possible @@ -1692,8 +1679,6 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){ //correct the textOrder, all layout recognition works here void TextPage::correctTextOrder(){ - // create words from characters (crashes) - removeSpace(); makeWord(); @@ -1771,19 +1756,15 @@ void TextPage::correctTextOrder(){ QRect max_area1,max_area2; QString before_max, after_max; -// d->printTextList(i,list); // for every line for( ; it != itEnd ; it++ ){ -// cout << (*it)->text().toAscii().data() << endl; QRect area1 = (*it)->area.roundedGeometry(pageWidth,pageHeight); if( it+1 == itEnd ) break; -// printRect(area1); QRect area2 = (*(it+1))->area.roundedGeometry(pageWidth,pageHeight); int space = area2.left() - area1.right(); -// printRect(area2); if(space > maxSpace){ max_area1 = area1; @@ -1795,9 +1776,6 @@ void TextPage::correctTextOrder(){ after_max = (*(it+1))->text(); } -// cout << (*it)->text().toAscii().data() << " " << (*(it+1))->text().toAscii().data(); -// cout << " space: " << space << endl; - if(space < minSpace && space != 0) minSpace = space; //if we found a real space, whose length is not zero and also less than the pageWidth @@ -1819,14 +1797,10 @@ void TextPage::correctTextOrder(){ QRect rect(left,top,right-left,bottom-top); line_space_rects.append(rect); -// cout << space << " "; } -// cout << "space: " << space << " " << area1.right() << " " << area2.left() << endl; } -// cout << endl << "maxSpace " << maxSpace << " ----------------------------------------------- " << endl << endl; - space_rects.append(line_space_rects); if(hor_space_stat.contains(maxSpace)){ @@ -2073,12 +2047,13 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){ // we will use the concept of line and line sorting here once again /** - 1. we will first add spaces regionWise - 2. Then we will sort all the texts in the region by Y - 3. After that, we will create a line containing all overlapping Y - 4. Now, we will sort texts in every line by X - 5. And, finally we will extract all the space separated texts from each region and + 1. We will sort all the texts in the region by Y + 2. After that, we will create a line containing all overlapping Y + 3. Now, we will sort texts in every line by X + 4. We will now add spaces between two words in a line + 5. And, then we will extract all the space separated texts from each region and make m_words nice again. + 6. Then we will merge all the texts from every region to make one TextList and assign it to m_words **/ // m_spaces;m_words; @@ -2089,38 +2064,21 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){ // we will only change the texts under RegionTexts, not the area for(j = 0 ; j < tree.length() ; j++){ RegionText tmp = tree.at(j); - QRect area = tmp.area(); TextList tmpList = tmp.text(); - // 1. adding space -// TextList::Iterator it1 = m_tmp_words.begin(), itEnd1 = m_tmp_words.end(); -// for( ; it1 != itEnd1 ; it1++){ - -// QRect entArea = (*it1)->area.geometry(pageWidth,pageHeight); -// QPoint center = entArea.center(); -// QString text = (*it1)->text(); - -// // if some space is in the region, add its TinyTextEntity to the tmpList -// if(area.contains(center) && text == spaceStr){ -// tmpList.append((*it1)); -// } -// } - - // now we have to keep tmpList in order and then set tmp with the tmpList - - // 2. sorting by Y + // 1. sorting by Y qSort(tmpList.begin(),tmpList.end(),compareTinyTextEntityY); //print the tmpList - cout << "printing the tmpList " << " ..................................... " << endl; - for( i = 0 ; i < tmpList.length() ; i++){ - TinyTextEntity* ent = tmpList.at(i); - cout << ent->text().toAscii().data(); - } - cout << endl << endl; +// cout << "printing the tmpList " << " ..................................... " << endl; +// for( i = 0 ; i < tmpList.length() ; i++){ +// TinyTextEntity* ent = tmpList.at(i); +// cout << ent->text().toAscii().data(); +// } +// cout << endl << endl; - // 3. create line by Y overlap + // 2. create line by Y overlap TextList::Iterator it = tmpList.begin(), itEnd = tmpList.end(); int newLeft,newRight,newTop,newBottom; @@ -2180,7 +2138,6 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){ } // // when we have found a new line - // // create a new TextList containing only one element and append it to the m_lines if(!found){ TextList tmp; tmp.append((*it)); @@ -2189,7 +2146,7 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){ } } - // 4. sort texts in each line by X + // 3. sort texts in each line by X for(i = 0 ; i < m_lines.length() ; i++){ TextList list = m_lines.at(i); @@ -2197,10 +2154,10 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){ qSort(list.begin(),list.end(),compareTinyTextEntityX); m_lines.replace(i,list); - printTextList(i,list); +// printTextList(i,list); } - // Bonus ;): Now, we add space in between texts in a region + // 4. Now, we add space in between texts in a region for(i = 0 ; i < m_lines.length() ; i++){ TextList list = m_lines.at(i); @@ -2258,26 +2215,74 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){ tree.replace(j,tmp); } - TextList tmp; - int count = 0; + // Merge all the texts from each region + TextList tmp; for(i = 0 ; i < tree.length() ; i++){ - TextList list = tree.at(i).text(); - cout << "node: " << i << endl << endl; - for(j = 0 ; j < list.length() ; j++){ TinyTextEntity *ent = list.at(j); - cout << ent->text().toAscii().data(); - if(ent->text() == spaceStr) - count++; tmp.append(ent); } - cout << endl << endl; + } + copy(tmp); + + + + // break the words into characters/smallest part that was primarily + while(tmp.length()) tmp.pop_back(); + int count = 0; + for(int i = 0 ; i < m_words.length() ; i++){ + + TinyTextEntity *ent = m_words.at(i); + QRect rect = ent->area.geometry(pageWidth,pageHeight); + + // the spaces contains only one character, so we can skip them + if(ent->text() == spaceStr){ + tmp.append(ent); + } + else{ + + int key = rect.left() * rect.top() + + rect.right() * rect.bottom(); + + RegionText word_text = m_word_chars_map.value(key); + TextList list = word_text.text(); + count = m_word_chars_map.count(key); + + if(count > 1){ + cout << "count : " << count << endl; + + QMap::iterator it = m_word_chars_map.find(key); + while( it != m_word_chars_map.end() && it.key() == key ){ + + word_text = it.value(); + it++; + + list = word_text.text(); + QRect regionRect = word_text.area(); + + if(regionRect.left() == rect.left() && regionRect.top() == rect.top()) + break; + } + + } + + tmp.append(list); + } } copy(tmp); + + // print the final text + for( i = 0 ; i < m_words.length() ; i++){ + + TinyTextEntity* ent = m_words.at(i); + cout << ent->text().toAscii().data(); + + } + } diff --git a/core/textpage_p.h b/core/textpage_p.h index be2785769..3512beaf2 100644 --- a/core/textpage_p.h +++ b/core/textpage_p.h @@ -25,9 +25,6 @@ namespace Okular class PagePrivate; typedef QList< TinyTextEntity* > TextList; -/** list of RegionText -- keeps a bunch of TextList with their bounding rectangles **/ -typedef QList RegionTextList; - typedef bool ( *TextComparisonFunction )( const QStringRef & from, const QStringRef & to, int *fromLength, int *toLength ); @@ -39,6 +36,8 @@ We will make a line of TextList and also store the bounding rectangle of line typedef QList SortedTextList; typedef QList LineRect; +/** list of RegionText -- keeps a bunch of TextList with their bounding rectangles **/ +typedef QList RegionTextList; class TextPagePrivate { @@ -68,6 +67,8 @@ class TextPagePrivate **/ void addNecessarySpace(RegionTextList tree); + + QMap m_word_chars_map; RegionTextList m_region_words; TextList m_spaces; TextList m_words;