Find word spacing and column spacing

remotes/origin/textfind-and-transparency
Mohammad Mahfuzur Rahman Mamun 15 years ago
parent 27d0f2f8c6
commit 4c5d844e28
  1. 7
      core/document.cpp
  2. 187
      core/textpage.cpp
  3. 5
      core/textpage.h

@ -82,6 +82,9 @@
#include <config-okular.h>
#include <iostream>
using namespace std;
using namespace Okular;
struct AllocatedPixmap
@ -2313,6 +2316,8 @@ void Document::requestPixmaps( const QLinkedList< PixmapRequest * > & requests,
void Document::requestTextPage( uint page )
{
cout << "textPage requested: " << endl;
Page * kp = d->m_pagesVector[ page ];
if ( !d->m_generator || !kp )
return;
@ -2323,7 +2328,7 @@ void Document::requestTextPage( uint page )
TextPage *tmpPage = d->m_pagesVector[page]->d->m_text;
// tmpPage->removeSpace();
tmpPage->removeSpace();
tmpPage->correctTextOrder();
tmpPage->addNecessarySpace();
}

@ -884,12 +884,11 @@ bool compareTinyTextEntityY(TinyTextEntity* first, TinyTextEntity* second){
}
//correct the textOrder, all layout recognition works here
void TextPage::correctTextOrder(){
/** mamun_nightcrawler@gmail.com **/
void TextPage::makeAndSortLines(){
/**
/**
We cannot assume that the generator will give us the texts in the right order. We can only assume
that we will get the texts on the page and their bounding rectangles. The texts can be characters, words,
half-words, anything. So, we need to:
@ -897,13 +896,7 @@ void TextPage::correctTextOrder(){
1. Sort rectangles/boxes containing texts by y0(top)
2. Create textline where there is y overlap between TinyTextEntity 's
3. Within each line sort the TinyTextEntity 's by x0(left)
4. Make character analysis to differentiate between word spacing and column spacing
5. Break the lines if there is some column spacing somewhere in the line and also calculate
the column spacing rectangle
**/
**/
// Step:1 .......................................
@ -1003,24 +996,178 @@ void TextPage::correctTextOrder(){
qSort(list.begin(),list.end(),compareTinyTextEntityX);
d->m_lines.replace(i,list);
//print lines after sorting
if(1){
// if(1){
// QRect rect = d->m_line_rects.at(i);
// cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl;
// cout << "Line " << i << ": ";
// for(j = 0 ; j < list.length() ; j++){
// TinyTextEntity* ent = list.at(j);
// cout << ent->text().toAscii().data();
// }
// cout << endl;
// }
}
}
//correct the textOrder, all layout recognition works here
void TextPage::correctTextOrder(){
makeAndSortLines();
/**
First Part: Create Text Lines
1. Make character analysis to differentiate between word spacing and column spacing.
2. Break the lines if there is some column spacing somewhere in the line and also calculate
the column spacing rectangle if necessary.
3. Find if some line contains more than one line (it can happen if in the left column there is some
big text like a heading and in the right column there is normal text, so several normal lines from
the right can be erroneously inserted into the same line in merged position).
For those lines, first sort them again using y-overlap and then x ordering.
**/
/** Step 1: ........................................................................ **/
//we would like to use QMap instead of QHash as it will keep the keys sorted
QMap<int,int> hor_space_stat; //this is to find word spacing
QMap<int,int> col_space_stat; //this is to find column spacing
QList< QList<QRect> > space_rects; // to save all the word spacing or column spacing rects
int i,j;
for(i = 0 ; i < d->m_lines.length() ; i++){
// list contains a line
TextList list = d->m_lines.at(i);
QList<QRect> line_space_rects;
// if(1){
// QRect rect = d->m_line_rects.at(i);
// cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl;
// cout << "Line " << i << ": ";
// for(j = 0 ; j < list.length() ; j++){
// TinyTextEntity* ent = list.at(j);
// cout << ent->text().toAscii().data();
// }
// cout << endl;
// }
int maxSpace = 0, minSpace = d->m_page->m_page->width();
// for every TinyTextEntity element in the line
TextList::Iterator it = list.begin(), itEnd = list.end();
QRect rect = d->m_line_rects.at(i);
cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl;
// cout << "Line " << i << ":";
for( ; it != itEnd ; it++ ){
// cout << (*it)->text().toAscii().data();
cout << "Line " << i << ": ";
QRect area1 = (*it)->area.geometry(d->m_page->m_page->width(),d->m_page->m_page->height());
if( it+1 == itEnd ) break;
for(j = 0 ; j < list.length() ; j++){
TinyTextEntity* ent = list.at(j);
cout << ent->text().toAscii().data();
QRect area2 = (*(it+1))->area.geometry(d->m_page->m_page->width(),d->m_page->m_page->height());
int space = area2.left() - area1.right();
if(space > maxSpace) maxSpace = space;
if(space < minSpace && space != 0) minSpace = space;
//if we found a real space, whose length is not zero and also less than the pageWidth
if(space != 0 && space != d->m_page->m_page->width()){
if(hor_space_stat.contains(space)) hor_space_stat[space] = hor_space_stat[space]++;
else hor_space_stat[space] = 1;
//if we have found a space, put it in a list of rectangles
int left,right,top,bottom;
left = area1.right();
right = area2.left();
// cout << "left: " << left << ", right: " << right << endl;
area1.top() > area2.top() ? top = area2.top() : top = area1.top();
area1.bottom() < area2.bottom() ? bottom = area2.bottom() : bottom = area1.bottom();
QRect rect(left,top,right-left,bottom-top);
line_space_rects.append(rect);
// cout << space << " ";
}
cout << endl;
// cout << "space: " << space << " " << area1.right() << " " << area2.left() << endl;
}
space_rects.append(line_space_rects);
if(hor_space_stat.contains(maxSpace)){
if(hor_space_stat[maxSpace] != 1)
hor_space_stat[maxSpace] = hor_space_stat[maxSpace]--;
else hor_space_stat.remove(maxSpace);
}
if(maxSpace != 0){
if (col_space_stat.contains(maxSpace))
col_space_stat[maxSpace] = col_space_stat[maxSpace]++;
else col_space_stat[maxSpace] = 1;
}
// cout << endl;
// cout << minSpace << " "<< maxSpace << endl;
}
// All the space counts are in hor_space_stat
cout << "Word Spacing: " << endl;
QMapIterator<int, int> iterate(hor_space_stat);
while (iterate.hasNext()) {
iterate.next();
cout << iterate.key() << ": " << iterate.value() << endl;
}
int col_spacing = 0;
cout << "Column Spacing: " << endl;
QMapIterator<int, int> iterate_col(col_space_stat);
while (iterate_col.hasNext()) {
iterate_col.next();
cout << iterate_col.key() << ": " << iterate_col.value() << endl;
if(iterate_col.value() > col_spacing) col_spacing = iterate_col.value();
}
cout << "Column Spacing is: " << col_spacing << endl;
//print some space rects
for( i = 0 ; i < space_rects.length() ; i++){
QList<QRect> rectList = space_rects.at(i);
for( j = 0 ; j < rectList.length() ; j++){
QRect rect = rectList.at(j);
cout << "rect:(left,right,top,bottom) : " << rect.left() << "," << rect.right() << ","
<< rect.top() << "," << rect.bottom() << endl;
}
}
// Step 4: ...........................................
/** Step 2: ........................................................................ **/
/**
Second Part: Now we have Text Lines in our hand, we have to find their reading order. We will need to consider both
the horizontal spacing and vertical spacing here. We need the concept of line spacing here.
**/
//Find Line spacing/ Vertical spacing for row separators
//It will be necessary for reading order detection
for(i = 0 ; i < d->m_lines.length() ; i++){
TextList list = d->m_lines.at(i);
}

@ -184,6 +184,11 @@ class OKULAR_EXPORT TextPage
**/
void correctTextOrder();
/**
Create lines from TextList and sort them according to their position
**/
void makeAndSortLines();
/**
add additional spaces between words, if necessary, which can make the words valuable
while copying after selection

Loading…
Cancel
Save