diff --git a/core/textpage.cpp b/core/textpage.cpp index 537b10642..650f9f93e 100644 --- a/core/textpage.cpp +++ b/core/textpage.cpp @@ -971,19 +971,7 @@ void TextPage::makeAndSortLines(){ // for(i = 0 ; i < d->m_lines.length() ; i++){ // // list is a line // TextList list = d->m_lines.at(i); - -// if(!i){ - -// QRect rect = d->m_line_rects.at(i); -// cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl; - -// cout << "Line " << i << ": "; - -// for(j = 0 ; j < list.length() ; j++){ -// TinyTextEntity* ent = list.at(j); -// cout << ent->text().toAscii().data(); -// } -// cout << endl; +// d->printTextList(i,list); // } // } @@ -999,24 +987,73 @@ void TextPage::makeAndSortLines(){ d->m_lines.replace(i,list); //print lines after sorting -// if(1){ + //d->printTextList(i,list); + + } +} -// QRect rect = d->m_line_rects.at(i); -// cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl; -// cout << "Line " << i << ": "; +// if the horizontal arm of one rectangle fully contains the other (example below) +// -------- ---- ----- first +// ---- -------- ----- second -// for(j = 0 ; j < list.length() ; j++){ -// TinyTextEntity* ent = list.at(j); -// cout << ent->text().toAscii().data(); -// } -// cout << endl; -// } +// or we can make it overlap of spaces by 80% + +bool doesConsumeX(QRect first, QRect second){ + +// int threshold = 2; + + // if one consumes another fully + if(first.left() >= second.left() && first.right() >= second.right()) return true; + if(first.left() <= second.left() && first.right() <= second.right()) return true; + + //or if there is overlap of space by more than 80% + // there is overlap + + int overlap; + if(second.right() >= first.left() && first.right() >= second.left()){ + + int percentage; + if(second.right() > first.right()) overlap = first.right() - second.left(); + else overlap = second.right() - first.left(); + + //we will divide by the smaller rectangle to calculate the overlap + if( first.width() < second.width()){ + + percentage = overlap * 100 / (first.width()); + + } + else{ + percentage = overlap * 100 / (second.width()); + } + + if(percentage > 80) return true; + + } + + return false; +} + +void printRect(QRect rect){ + cout << "l: " << rect.left() << " r: " << rect.right() << " t: " << rect.top() << " b: " << rect.bottom() << endl; +} + +void TextPagePrivate::printTextList(int i, TextList list){ + QRect rect = m_line_rects.at(i); + cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl; + cout << "Line " << i << ": "; + + for(int j = 0 ; j < list.length() ; j++){ + TinyTextEntity* ent = list.at(j); + cout << ent->text().toAscii().data(); } + cout << endl; + } + //correct the textOrder, all layout recognition works here void TextPage::correctTextOrder(){ @@ -1042,8 +1079,11 @@ void TextPage::correctTextOrder(){ //we would like to use QMap instead of QHash as it will keep the keys sorted QMap hor_space_stat; //this is to find word spacing QMap col_space_stat; //this is to find column spacing + QList< QList > space_rects; // to save all the word spacing or column spacing rects + QList max_hor_space_rects; + int i,j; for(i = 0 ; i < d->m_lines.length() ; i++){ @@ -1051,23 +1091,13 @@ void TextPage::correctTextOrder(){ TextList list = d->m_lines.at(i); QList line_space_rects; -// if(1){ -// QRect rect = d->m_line_rects.at(i); -// cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl; -// cout << "Line " << i << ": "; -// for(j = 0 ; j < list.length() ; j++){ -// TinyTextEntity* ent = list.at(j); -// cout << ent->text().toAscii().data(); -// } -// cout << endl; -// } - int maxSpace = 0, minSpace = d->m_page->m_page->width(); // for every TinyTextEntity element in the line TextList::Iterator it = list.begin(), itEnd = list.end(); + QRect max_area1,max_area2; + -// cout << "Line " << i << ":"; for( ; it != itEnd ; it++ ){ // cout << (*it)->text().toAscii().data(); @@ -1076,7 +1106,13 @@ void TextPage::correctTextOrder(){ QRect area2 = (*(it+1))->area.geometry(d->m_page->m_page->width(),d->m_page->m_page->height()); int space = area2.left() - area1.right(); - if(space > maxSpace) maxSpace = space; + + if(space > maxSpace){ + max_area1 = area1; + max_area2 = area2; + maxSpace = space; + } + if(space < minSpace && space != 0) minSpace = space; //if we found a real space, whose length is not zero and also less than the pageWidth @@ -1089,9 +1125,6 @@ void TextPage::correctTextOrder(){ left = area1.right(); right = area2.left(); -// cout << "left: " << left << ", right: " << right << endl; - - area1.top() > area2.top() ? top = area2.top() : top = area1.top(); area1.bottom() < area2.bottom() ? bottom = area2.bottom() : bottom = area1.bottom(); @@ -1116,13 +1149,27 @@ void TextPage::correctTextOrder(){ if (col_space_stat.contains(maxSpace)) col_space_stat[maxSpace] = col_space_stat[maxSpace]++; else col_space_stat[maxSpace] = 1; + + //store the max rect of each line + int left,right,top,bottom; + left = max_area1.right(); + right = max_area2.left(); + + max_area1.top() > max_area2.top() ? top = max_area2.top() : top = max_area1.top(); + max_area1.bottom() < max_area2.bottom() ? bottom = max_area2.bottom() : bottom = max_area1.bottom(); + + QRect rect(QPoint(left,top),QPoint(right,bottom)); + max_hor_space_rects.append(rect); + } // cout << endl; // cout << minSpace << " "<< maxSpace << endl; } - // All the space counts are in hor_space_stat + // All the between word space counts are in hor_space_stat + + /*int word_spacing; cout << "Word Spacing: " << endl; QMapIterator iterate(hor_space_stat); while (iterate.hasNext()) { @@ -1137,28 +1184,172 @@ void TextPage::correctTextOrder(){ iterate_col.next(); cout << iterate_col.key() << ": " << iterate_col.value() << endl; if(iterate_col.value() > col_spacing) col_spacing = iterate_col.value(); - } + }*/ - cout << "Column Spacing is: " << col_spacing << endl; + //show all space rects (between words, word spacing or column spacing) +// for( i = 0 ; i < space_rects.length() ; i++){ - //print some space rects - for( i = 0 ; i < space_rects.length() ; i++){ +// QList rectList = space_rects.at(i); - QList rectList = space_rects.at(i); +// for( j = 0 ; j < rectList.length() ; j++){ - for( j = 0 ; j < rectList.length() ; j++){ +// QRect rect = rectList.at(j); +// cout << "rect:(left,right,top,bottom) : " << rect.left() << "," << rect.right() << "," +// << rect.top() << "," << rect.bottom() << endl; - QRect rect = rectList.at(j); - cout << "rect:(left,right,top,bottom) : " << rect.left() << "," << rect.right() << "," - << rect.top() << "," << rect.bottom() << endl; +// } +// } + +// cout << "max_hor_space_rectangle:" << endl; +// for(j = 0 ; j < max_hor_space_rects.length() ; j++){ + +// QRect rect = max_hor_space_rects.at(j); +// cout << "rect:(left,right,top,bottom) : " << rect.left() << "," << rect.right() << "," +// << rect.top() << "," << rect.bottom() << endl; +// } - } - } /** Step 2: ........................................................................ **/ + /** + We will start with the max whitespace rectangle within the first line, if any. Then, we will get the max whitespace + rectangle of the second line. + + If both of them are at the same position, we can say, they creates a column. + Else we will check + if there is any whitespace rectangle under the previous line's maximum whitespace rectangle. In this cae, we can say + its a noisy line, which do not preserve the column separation. if we find 3(col_threshold) lines of this type consecutively, + we can break the column separation, and say that these 3 lines fully are in the same column. + + else, the line is a single line in a column. We do not need to separate this. + + **/ + +// int col_threshold = 3; +// int col_spacing_threshold = 2; + int length_line_list = d->m_lines.length(); + bool consume12 = false, consume23 = false, consume13 = false; + + for(i = 0 ; i < length_line_list ; i++){ + + int index1 = i % length_line_list, index2 = (i + 1) % length_line_list, + index3 = (i + 2) % length_line_list; + + // We will take 3 lines at a time, so that one noisy data do not give wrong idea. + // We will see whether they creates a column or not + TextList line1 = d->m_lines.at(index1); + TextList line2 = d->m_lines.at(index2); +// TextList line3 = d->m_lines.at(index3); + + // the estimated column space rectangles of those lines + QRect columnRect1 = max_hor_space_rects.at(index1); + QRect columnRect2 = max_hor_space_rects.at(index2); +// QRect columnRect3 = max_hor_space_rects.at(index3); + + QRect rect1,rect2,rect3; + + cout << "rect1: "; + printRect(columnRect1); + + cout << "rect2: "; + printRect(columnRect2); + + + //if the maxRectangle of line1 and line2 are at the same place, they may create a column + if(doesConsumeX(columnRect1,columnRect2)){ + consume12 = true; + rect1 = columnRect1; + rect2 = columnRect2; +// d->printTextList(index1,line1); +// cout << "rect1: " << columnRect1.left() << " , " << columnRect1.right() << endl; + +// d->printTextList(index2,line2); +// cout << "rect2: " << columnRect2.left() << " , " << columnRect2.right() << endl; + + } + //else if one of the lines is noisy and do not maintain column spacing correctly, + //so that, maxSpacing is not column spacing but, some other word spacing, so we search + //if some rectangle smaller than some word spacing rectangle remains which is consumed by + //the other lines maxSpacing rectangle. + else{ + //1. see whether maxSpacing of line1 consumes any space rectangle in line2 + rect1 = columnRect1; + QList line2_space_rect = space_rects.at(index2); + + for(j = 0 ; j < line2_space_rect.length() ; j++){ + rect2 = line2_space_rect.at(j); + if(doesConsumeX(rect1,rect2)){ + consume12 = true; + break; + } + } + + if(consume12) break; + + //2. see whether maxSpacing of line2 consumes any space rectangle in line1 + rect2 = columnRect2; + QList line1_space_rect = space_rects.at(index1); + + for(j = 0 ; j < line1_space_rect.length() ; j++){ + rect1 = line1_space_rect.at(j); + if(doesConsumeX(rect1,rect2)){ + consume12 = true; + break; + } + } + + } + +// if consume12 is still false, then we do not get some column spacing, the spacing are random, +// so, possibly line1 and line2 are not column separated lines. so, we don't need to split them. + + // possibly we have got a column separator, so, we break the lines in two parts, and + // 1. edit previous lines(delete the part after column) + // 2. add two new lines and append them to the last of the list + if(consume12){ + //the separating rectangles are rect1 and rect2 + QRect linerect1 = d->m_line_rects.at(i),linerect2 = linerect1; + TextList tmp; + TinyTextEntity* tmp_entity; + + for(j = line1.length() - 1 ; j >= 0 ; j --){ + + tmp_entity = line1.at(j); + QRect area = tmp_entity->area.geometry(d->m_page->m_page->width(),d->m_page->m_page->height()); + + // we have got maxSpace rect + if(area.left() == rect1.right()){ + linerect1.setRight(rect1.left()); + linerect2.setLeft(rect1.right()); + + tmp.push_front(tmp_entity); + line1.pop_back(); + + break; + } + + //push in front in the new line and pop from the back of the old line + tmp.push_front(tmp_entity); + line1.pop_back(); + + } + + d->m_lines.replace(i,line1); + d->m_line_rects.replace(i,linerect1); + + d->m_lines.append(tmp); + d->m_line_rects.append(linerect2); + + d->printTextList(i,d->m_lines.at(i)); + d->printTextList(length_line_list + i,d->m_lines.at(i+length_line_list)); + } + +// break; + } + + /** diff --git a/core/textpage_p.h b/core/textpage_p.h index a247d1301..2ad9aff63 100644 --- a/core/textpage_p.h +++ b/core/textpage_p.h @@ -52,6 +52,8 @@ class TextPagePrivate TextComparisonFunction comparer, const TextList::ConstIterator &start, const TextList::ConstIterator &end ); + /** prints a line **/ + void printTextList(int i, TextList list); TextList m_words; QMap< int, SearchPoint* > m_searchPoints;