From 27d0f2f8c6fb284a8efd2f0942b4c402183b6114 Mon Sep 17 00:00:00 2001 From: Mohammad Mahfuzur Rahman Mamun Date: Wed, 29 Jun 2011 18:17:15 +0600 Subject: [PATCH] create and sort lines by y overlap first and then according to x ordering --- core/document.cpp | 2 +- core/textpage.cpp | 158 +++++++++++++++++++++++++++++++++++++++++++++- core/textpage.h | 6 ++ core/textpage_p.h | 11 ++++ 4 files changed, 175 insertions(+), 2 deletions(-) diff --git a/core/document.cpp b/core/document.cpp index 21bf50fa6..ac2a2a999 100644 --- a/core/document.cpp +++ b/core/document.cpp @@ -2323,7 +2323,7 @@ void Document::requestTextPage( uint page ) TextPage *tmpPage = d->m_pagesVector[page]->d->m_text; - tmpPage->removeSpace(); +// tmpPage->removeSpace(); tmpPage->correctTextOrder(); tmpPage->addNecessarySpace(); } diff --git a/core/textpage.cpp b/core/textpage.cpp index e2fa20664..e878c40b2 100644 --- a/core/textpage.cpp +++ b/core/textpage.cpp @@ -19,6 +19,7 @@ #include "page_p.h" #include +#include //On Debugging Purpose #include @@ -852,7 +853,7 @@ void TextPage::printTextPageContent(){ } -//remove unEvenSpace, currently removes necessary spaces also :( +//remove all the spaces between texts, it will keep all the generators same, whether they save spaces or not void TextPage::removeSpace(){ TextList::Iterator it = d->m_words.begin(), itEnd = d->m_words.end(), tmpIt = it; @@ -867,11 +868,166 @@ void TextPage::removeSpace(){ } + +bool compareTinyTextEntityX(TinyTextEntity* first, TinyTextEntity* second){ + QRect firstArea = first->area.geometry(1000,1000); + QRect secondArea = second->area.geometry(1000,1000); + + return firstArea.left() < secondArea.left(); +} + +bool compareTinyTextEntityY(TinyTextEntity* first, TinyTextEntity* second){ + QRect firstArea = first->area.geometry(1000,1000); + QRect secondArea = second->area.geometry(1000,1000); + + return firstArea.top() < secondArea.bottom(); +} + + //correct the textOrder, all layout recognition works here void TextPage::correctTextOrder(){ + +/** + + we cannot assume that the generator will give us texts in the right order. We can only assume + that we will get texts in the page and their bounding rectangle. The texts can be character, word, + half-word anything. So, we need to: + + 1. Sort rectangles/boxes containing texts by y0(top) + 2. Create textline where there is y overlap between TinyTextEntity 's + 3. Within each line sort the TinyTextEntity 's by x0(left) + + 4. Make character analysis to differentiate between word spacing and column spacing + 5. Break the lines if there is some column spacing somewhere in the line and also calculate + the column spacing rectangle + +**/ + + + // Step:1 ....................................... + + TextList tmpList = d->m_words; + qSort(tmpList.begin(),tmpList.end(),compareTinyTextEntityY); + + // Step 2: ....................................... + + TextList::Iterator it = tmpList.begin(), itEnd = tmpList.end(), tmpIt = it; + int i =0, index,j = 0; + int newLeft,newRight,newTop,newBottom,newWidth,newHeight; + + //for every non-space texts(characters/words) in the textList + for( ; it != itEnd ; it++){ + + //the textEntity area + QRect elementArea = (*it)->area.geometry(d->m_page->m_page->width(),d->m_page->m_page->height()); + + //d->m_lines in a QList of TextList and TextList is a QList of TinyTextEntity* + // see, whether the new text should be inserted to an existing line + index = i; + bool found = false; + for( i = 0 ; i < d->m_lines.length() ; i++){ + + + //the line area + QRect lineArea = d->m_line_rects.at(i); + + int text_y1 = elementArea.top() ,text_y2 = elementArea.bottom(), text_x1 = elementArea.left(), + text_x2 = elementArea.right(); + int line_y1 = lineArea.top() ,line_y2 = lineArea.bottom(), + line_x1 = lineArea.left(), line_x2 = lineArea.right(); + + // if the new text and line has y overlapping parts of more than 50%, the text will go to this line + if(text_y2 > line_y1 && line_y2 > text_y1){ + + TextList tmp = d->m_lines.at(i); + tmp.append((*it)); + + d->m_lines.replace(i,tmp); + + newLeft = lineArea.left(); + if(text_x1 < newLeft) newLeft = text_x1; + newRight = text_x2; + if(lineArea.right() > text_x2) newRight = lineArea.right(); + + newTop = text_y1 > line_y1 ? line_y1 : text_y1; + newBottom = text_y2 > line_y2 ? text_y2 : line_y2; + newWidth = newRight - newLeft; + newHeight = newBottom - newTop; + + d->m_line_rects.replace( i, QRect(newLeft,newTop,newWidth,newHeight) ); + found = true; + } + + } + + // when we have found a new line + // create a new TextList containing only one element and append it to the m_lines + if(!found){ + //(*it) is a TinyTextEntity* + TextList tmp; + tmp.append((*it)); + d->m_lines.append(tmp); + d->m_line_rects.append(elementArea); + } + } + + cout << "m_lines length: " << d->m_lines.length() << endl; + + // print every line +// for(i = 0 ; i < d->m_lines.length() ; i++){ +// // list is a line +// TextList list = d->m_lines.at(i); + +// if(!i){ + +// QRect rect = d->m_line_rects.at(i); +// cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl; + +// cout << "Line " << i << ": "; + +// for(j = 0 ; j < list.length() ; j++){ +// TinyTextEntity* ent = list.at(j); +// cout << ent->text().toAscii().data(); +// } +// cout << endl; +// } + +// } + + + + // Step 3: ....................................... + for(i = 0 ; i < d->m_lines.length() ; i++){ + TextList list = d->m_lines.at(i); + + qSort(list.begin(),list.end(),compareTinyTextEntityX); + + //print lines after sorting + if(1){ + + QRect rect = d->m_line_rects.at(i); + cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl; + + cout << "Line " << i << ": "; + + for(j = 0 ; j < list.length() ; j++){ + TinyTextEntity* ent = list.at(j); + cout << ent->text().toAscii().data(); + } + cout << endl; + } + } + + + // Step 4: ........................................... + for(i = 0 ; i < d->m_lines.length() ; i++){ + TextList list = d->m_lines.at(i); + } + } + //add necessary spaces in the text - mainly for copy purpose void TextPage::addNecessarySpace(){ diff --git a/core/textpage.h b/core/textpage.h index 1d5aced3a..80fb11e21 100644 --- a/core/textpage.h +++ b/core/textpage.h @@ -190,6 +190,12 @@ class OKULAR_EXPORT TextPage **/ void addNecessarySpace(); +// //comparison function which compares two TinyTextEntity by left position +// bool compareTinyTextEntityX(TinyTextEntity first, TinyTextEntity second); + +// // by top +// bool compareTinyTextEntityX(TinyTextEntity first, TinyTextEntity second); + private: TextPagePrivate* const d; diff --git a/core/textpage_p.h b/core/textpage_p.h index bf2abafcb..a247d1301 100644 --- a/core/textpage_p.h +++ b/core/textpage_p.h @@ -27,6 +27,15 @@ typedef QList< TinyTextEntity* > TextList; typedef bool ( *TextComparisonFunction )( const QStringRef & from, const QStringRef & to, int *fromLength, int *toLength ); +//mamun.nightcrawler@gmail.com + +/** +We will make a line of TextList and also store the bounding rectangle of line +**/ +typedef QList SortedTextList; +typedef QList LineRect; + + class TextPagePrivate { public: @@ -47,6 +56,8 @@ class TextPagePrivate TextList m_words; QMap< int, SearchPoint* > m_searchPoints; PagePrivate *m_page; + SortedTextList m_lines; + LineRect m_line_rects; }; }