Selection of text by character is done; it still needs some testing and debugging.

remotes/origin/textfind-and-transparency
Mohammad Mahfuzur Rahman Mamun 15 years ago
parent 3d0564fa40
commit 33d0facf4e
  1. 211
      core/textpage.cpp
  2. 7
      core/textpage_p.h

@ -143,6 +143,8 @@ class TinyTextEntity
class RegionText{ class RegionText{
public: public:
RegionText(){};
RegionText(TextList &list,QRect &area) RegionText(TextList &list,QRect &area)
: m_region_text(list) ,m_area(area) : m_region_text(list) ,m_area(area)
{ {
@ -959,6 +961,14 @@ bool compareTinyTextEntityY(TinyTextEntity* first, TinyTextEntity* second){
return firstArea.top() < secondArea.top(); return firstArea.top() < secondArea.top();
} }
// Ordering predicate for RegionText by vertical position: returns true when
// the top edge of first's area is strictly above the top edge of second's.
bool compareRegionTextY(RegionText first, RegionText second){
    const QRect firstArea = first.area();
    const QRect secondArea = second.area();
    return firstArea.top() < secondArea.top();
}
// Ordering predicate for RegionText by horizontal position: returns true when
// the left edge of first's area is strictly left of the left edge of second's.
bool compareRegionTextX(RegionText first, RegionText second){
    const QRect firstArea = first.area();
    const QRect secondArea = second.area();
    return firstArea.left() < secondArea.left();
}
void TextPagePrivate::printTextList(int i, TextList list){ void TextPagePrivate::printTextList(int i, TextList list){
@ -1093,11 +1103,14 @@ void TextPage::makeWord(){
int newLeft,newRight,newTop,newBottom; int newLeft,newRight,newTop,newBottom;
int pageWidth = d->m_page->m_page->width(), pageHeight = d->m_page->m_page->height(); int pageWidth = d->m_page->m_page->width(), pageHeight = d->m_page->m_page->height();
int index = 0; int index = 0;
QString spaceString(" ");
//For RegionTextList // It will contain a list of RegionText, where each RegionText contains a word, which comprises of
// TextList which is a list of TinyTextEntity which contains characters info and a QRect which contains
// the area of the region.
RegionTextList regionWordList; RegionTextList regionWordList;
//WordTocharacterList d->m_word_char_map
//for every non-space texts(characters/words) in the textList //for every non-space texts(characters/words) in the textList
for( ; it != itEnd ; it++){ for( ; it != itEnd ; it++){
@ -1109,15 +1122,10 @@ void TextPage::makeWord(){
tmpIt = it; tmpIt = it;
// cout << "first : ";
// printRect(lineArea) ;
int space = 0; int space = 0;
while(space <= 1){ while(space <= 1){
// if(textString == spaceString) break;
// we must have to put this line before the if condition of it==itEnd // we must have to put this line before the if condition of it==itEnd
// otherwise the last character can be missed // otherwise the last character can be missed
if(textString.length()){ if(textString.length()){
@ -1189,15 +1197,23 @@ void TextPage::makeWord(){
if(newString.length()){ if(newString.length()){
NormalizedRect newRect(lineArea,pageWidth,pageHeight); NormalizedRect newRect(lineArea,pageWidth,pageHeight);
TinyTextEntity *ent = new TinyTextEntity(newString.normalized
(QString::NormalizationForm_KC), newRect );
newList.append(ent);
newList.append( new TinyTextEntity(newString.normalized
(QString::NormalizationForm_KC), newRect ));
// cout << "newString: " << newString.toAscii().data() << endl;
QRect rect = newRect.geometry(pageWidth,pageHeight); QRect rect = newRect.geometry(pageWidth,pageHeight);
RegionText regionWord(word,rect); RegionText regionWord(word,rect);
regionWordList.append(regionWord); regionWordList.append(regionWord);
int keyRect = rect.left() * rect.top()
+ rect.right() * rect.bottom();
// if there are more than one element in the same key
d->m_word_chars_map.insertMulti(keyRect,regionWord);
index++; index++;
} }
@ -1209,23 +1225,21 @@ void TextPage::makeWord(){
d->m_region_words = regionWordList; d->m_region_words = regionWordList;
cout << "words: " << index << endl; cout << "words: " << index << endl;
// cout << " ............................................................ " << endl;
d->copy(newList); d->copy(newList);
// for(int i = 0 ; i < d->m_words.length() ; i++){ // for(int i = 0 ; i < d->m_words.length() ; i++){
// TinyTextEntity *ent = d->m_words.at(i); // TinyTextEntity *ent = d->m_words.at(i);
// cout << ent->text().toAscii().data() << endl; // QRect entArea = ent->area.geometry(pageWidth,pageHeight);
// printRect(ent->area.roundedGeometry(pageWidth,pageHeight)); // int key = entArea.top() * entArea.left() + entArea.right() * entArea.bottom();
// }
// cout << endl; // RegionText text_list = d->m_word_chars_map.value(key);
// TextList list = text_list.text();
// for(int i = 0 ; i < d->m_region_words.length() ; i++){ // cout << "key: " << key << " text: ";
// RegionText word = d->m_region_words.at(i); // for( int l = 0 ; l < list.length() ; l++){
// TextList text = word.text(); // ent = list.at(l);
// for( int j = 0 ; j < text.length() ; j++){
// TinyTextEntity* ent = text.at(j);
// cout << ent->text().toAscii().data(); // cout << ent->text().toAscii().data();
// } // }
// cout << endl; // cout << endl;
@ -1253,7 +1267,6 @@ void TextPage::makeAndSortLines(){
TextList tmpList = d->m_words; TextList tmpList = d->m_words;
qSort(tmpList.begin(),tmpList.end(),compareTinyTextEntityY); qSort(tmpList.begin(),tmpList.end(),compareTinyTextEntityY);
// d->printTextList(0,tmpList);
// Step 2: ....................................... // Step 2: .......................................
@ -1347,9 +1360,6 @@ void TextPage::makeAndSortLines(){
qSort(list.begin(),list.end(),compareTinyTextEntityX); qSort(list.begin(),list.end(),compareTinyTextEntityX);
d->m_lines.replace(i,list); d->m_lines.replace(i,list);
// d->printTextList(i,list);
// printRect(d->m_line_rects.at(i));
} }
} }
@ -1429,14 +1439,12 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){
if (proj_on_yaxis[j] > maxY) maxY = proj_on_yaxis[j]; if (proj_on_yaxis[j] > maxY) maxY = proj_on_yaxis[j];
// cout << "index: " << j << " value: " << proj_on_yaxis[j] << endl; // cout << "index: " << j << " value: " << proj_on_yaxis[j] << endl;
} }
// cout << endl;
// cout << "projection on x axis " << endl << endl; // cout << "projection on x axis " << endl << endl;
for( j = 0 ; j < size_proj_x ; j++ ){ for( j = 0 ; j < size_proj_x ; j++ ){
if(proj_on_xaxis[j] > maxX) maxX = proj_on_xaxis[j]; if(proj_on_xaxis[j] > maxX) maxX = proj_on_xaxis[j];
// cout << "index: " << j << " value: " << proj_on_xaxis[j] << endl; // cout << "index: " << j << " value: " << proj_on_xaxis[j] << endl;
} }
// cout << endl;
/** 2. Cleanup Boundary White Spaces and removal of noise ..................... **/ /** 2. Cleanup Boundary White Spaces and removal of noise ..................... **/
@ -1458,7 +1466,6 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){
yend--; yend--;
} }
// printRect(regionRect);
//update the regionRect //update the regionRect
int old_left = regionRect.left(), old_top = regionRect.top(); int old_left = regionRect.left(), old_top = regionRect.top();
@ -1574,39 +1581,19 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){
regionRect.height()); regionRect.height());
// horizontal split (top rect, bottom rect)
cout << "main: ";
printRect(regionRect);
if(gap_y >= gap_x && gap_y > tcy){ if(gap_y >= gap_x && gap_y > tcy){
// cout << "toprect: ";
// printRect(topRect);
// cout << "bottomrect: ";
// printRect(bottomRect);
cut_hor = true; cut_hor = true;
} }
//vertical cut (left rect, right rect) //vertical cut (left rect, right rect)
else if(gap_y >= gap_x && gap_y <= tcy && gap_x > tcx){ else if(gap_y >= gap_x && gap_y <= tcy && gap_x > tcx){
// cout << "leftrect: ";
// printRect(leftRect);
// cout << "rightrect: ";
// printRect(rightRect);
cut_ver = true; cut_ver = true;
} }
//vertical cut //vertical cut
else if(gap_x >= gap_y && gap_x > tcx){ else if(gap_x >= gap_y && gap_x > tcx){
// cout << "leftrect: ";
// printRect(leftRect);
// cout << "rightrect: ";
// printRect(rightRect);
cut_ver = true; cut_ver = true;
} }
//horizontal cut //horizontal cut
else if(gap_x >= gap_y && gap_x <= tcx && gap_y > tcy){ else if(gap_x >= gap_y && gap_x <= tcx && gap_y > tcy){
// cout << "toprect: ";
// printRect(topRect);
// cout << "bottomrect: ";
// printRect(bottomRect);
cut_hor = true; cut_hor = true;
} }
//no cut possible //no cut possible
@ -1692,8 +1679,6 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){
//correct the textOrder, all layout recognition works here //correct the textOrder, all layout recognition works here
void TextPage::correctTextOrder(){ void TextPage::correctTextOrder(){
// create words from characters (crashes)
removeSpace(); removeSpace();
makeWord(); makeWord();
@ -1771,19 +1756,15 @@ void TextPage::correctTextOrder(){
QRect max_area1,max_area2; QRect max_area1,max_area2;
QString before_max, after_max; QString before_max, after_max;
// d->printTextList(i,list);
// for every line // for every line
for( ; it != itEnd ; it++ ){ for( ; it != itEnd ; it++ ){
// cout << (*it)->text().toAscii().data() << endl;
QRect area1 = (*it)->area.roundedGeometry(pageWidth,pageHeight); QRect area1 = (*it)->area.roundedGeometry(pageWidth,pageHeight);
if( it+1 == itEnd ) break; if( it+1 == itEnd ) break;
// printRect(area1);
QRect area2 = (*(it+1))->area.roundedGeometry(pageWidth,pageHeight); QRect area2 = (*(it+1))->area.roundedGeometry(pageWidth,pageHeight);
int space = area2.left() - area1.right(); int space = area2.left() - area1.right();
// printRect(area2);
if(space > maxSpace){ if(space > maxSpace){
max_area1 = area1; max_area1 = area1;
@ -1795,9 +1776,6 @@ void TextPage::correctTextOrder(){
after_max = (*(it+1))->text(); after_max = (*(it+1))->text();
} }
// cout << (*it)->text().toAscii().data() << " " << (*(it+1))->text().toAscii().data();
// cout << " space: " << space << endl;
if(space < minSpace && space != 0) minSpace = space; if(space < minSpace && space != 0) minSpace = space;
//if we found a real space, whose length is not zero and also less than the pageWidth //if we found a real space, whose length is not zero and also less than the pageWidth
@ -1819,14 +1797,10 @@ void TextPage::correctTextOrder(){
QRect rect(left,top,right-left,bottom-top); QRect rect(left,top,right-left,bottom-top);
line_space_rects.append(rect); line_space_rects.append(rect);
// cout << space << " ";
} }
// cout << "space: " << space << " " << area1.right() << " " << area2.left() << endl;
} }
// cout << endl << "maxSpace " << maxSpace << " ----------------------------------------------- " << endl << endl;
space_rects.append(line_space_rects); space_rects.append(line_space_rects);
if(hor_space_stat.contains(maxSpace)){ if(hor_space_stat.contains(maxSpace)){
@ -2073,12 +2047,13 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
// we will use the concept of line and line sorting here once again // we will use the concept of line and line sorting here once again
/** /**
1. we will first add spaces regionWise 1. We will sort all the texts in the region by Y
2. Then we will sort all the texts in the region by Y 2. After that, we will create a line containing all overlapping Y
3. After that, we will create a line containing all overlapping Y 3. Now, we will sort texts in every line by X
4. Now, we will sort texts in every line by X 4. We will now add spaces between two words in a line
5. And, finally we will extract all the space separated texts from each region and 5. And, then we will extract all the space separated texts from each region and
make m_words nice again. make m_words nice again.
6. Then we will merge all the texts from every region to make one TextList and assign it to m_words
**/ **/
// m_spaces;m_words; // m_spaces;m_words;
@ -2089,38 +2064,21 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
// we will only change the texts under RegionTexts, not the area // we will only change the texts under RegionTexts, not the area
for(j = 0 ; j < tree.length() ; j++){ for(j = 0 ; j < tree.length() ; j++){
RegionText tmp = tree.at(j); RegionText tmp = tree.at(j);
QRect area = tmp.area();
TextList tmpList = tmp.text(); TextList tmpList = tmp.text();
// 1. adding space // 1. sorting by Y
// TextList::Iterator it1 = m_tmp_words.begin(), itEnd1 = m_tmp_words.end();
// for( ; it1 != itEnd1 ; it1++){
// QRect entArea = (*it1)->area.geometry(pageWidth,pageHeight);
// QPoint center = entArea.center();
// QString text = (*it1)->text();
// // if some space is in the region, add its TinyTextEntity to the tmpList
// if(area.contains(center) && text == spaceStr){
// tmpList.append((*it1));
// }
// }
// now we have to keep tmpList in order and then set tmp with the tmpList
// 2. sorting by Y
qSort(tmpList.begin(),tmpList.end(),compareTinyTextEntityY); qSort(tmpList.begin(),tmpList.end(),compareTinyTextEntityY);
//print the tmpList //print the tmpList
cout << "printing the tmpList " << " ..................................... " << endl; // cout << "printing the tmpList " << " ..................................... " << endl;
for( i = 0 ; i < tmpList.length() ; i++){ // for( i = 0 ; i < tmpList.length() ; i++){
TinyTextEntity* ent = tmpList.at(i); // TinyTextEntity* ent = tmpList.at(i);
cout << ent->text().toAscii().data(); // cout << ent->text().toAscii().data();
} // }
cout << endl << endl; // cout << endl << endl;
// 3. create line by Y overlap // 2. create line by Y overlap
TextList::Iterator it = tmpList.begin(), itEnd = tmpList.end(); TextList::Iterator it = tmpList.begin(), itEnd = tmpList.end();
int newLeft,newRight,newTop,newBottom; int newLeft,newRight,newTop,newBottom;
@ -2180,7 +2138,6 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
} }
// // when we have found a new line // // when we have found a new line
// // create a new TextList containing only one element and append it to the m_lines
if(!found){ if(!found){
TextList tmp; TextList tmp;
tmp.append((*it)); tmp.append((*it));
@ -2189,7 +2146,7 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
} }
} }
// 4. sort texts in each line by X // 3. sort texts in each line by X
for(i = 0 ; i < m_lines.length() ; i++){ for(i = 0 ; i < m_lines.length() ; i++){
TextList list = m_lines.at(i); TextList list = m_lines.at(i);
@ -2197,10 +2154,10 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
qSort(list.begin(),list.end(),compareTinyTextEntityX); qSort(list.begin(),list.end(),compareTinyTextEntityX);
m_lines.replace(i,list); m_lines.replace(i,list);
printTextList(i,list); // printTextList(i,list);
} }
// Bonus ;): Now, we add space in between texts in a region // 4. Now, we add space in between texts in a region
for(i = 0 ; i < m_lines.length() ; i++){ for(i = 0 ; i < m_lines.length() ; i++){
TextList list = m_lines.at(i); TextList list = m_lines.at(i);
@ -2258,26 +2215,74 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
tree.replace(j,tmp); tree.replace(j,tmp);
} }
TextList tmp;
int count = 0;
// Merge all the texts from each region
TextList tmp;
for(i = 0 ; i < tree.length() ; i++){ for(i = 0 ; i < tree.length() ; i++){
TextList list = tree.at(i).text(); TextList list = tree.at(i).text();
cout << "node: " << i << endl << endl;
for(j = 0 ; j < list.length() ; j++){ for(j = 0 ; j < list.length() ; j++){
TinyTextEntity *ent = list.at(j); TinyTextEntity *ent = list.at(j);
cout << ent->text().toAscii().data();
if(ent->text() == spaceStr)
count++;
tmp.append(ent); tmp.append(ent);
} }
cout << endl << endl; }
copy(tmp);
// break the words into characters/smallest part that was primarily
while(tmp.length()) tmp.pop_back();
int count = 0;
for(int i = 0 ; i < m_words.length() ; i++){
TinyTextEntity *ent = m_words.at(i);
QRect rect = ent->area.geometry(pageWidth,pageHeight);
// the spaces contains only one character, so we can skip them
if(ent->text() == spaceStr){
tmp.append(ent);
}
else{
int key = rect.left() * rect.top()
+ rect.right() * rect.bottom();
RegionText word_text = m_word_chars_map.value(key);
TextList list = word_text.text();
count = m_word_chars_map.count(key);
if(count > 1){
cout << "count : " << count << endl;
QMap<int, RegionText>::iterator it = m_word_chars_map.find(key);
while( it != m_word_chars_map.end() && it.key() == key ){
word_text = it.value();
it++;
list = word_text.text();
QRect regionRect = word_text.area();
if(regionRect.left() == rect.left() && regionRect.top() == rect.top())
break;
}
}
tmp.append(list);
}
} }
copy(tmp); copy(tmp);
// print the final text
for( i = 0 ; i < m_words.length() ; i++){
TinyTextEntity* ent = m_words.at(i);
cout << ent->text().toAscii().data();
}
} }

@ -25,9 +25,6 @@ namespace Okular
class PagePrivate; class PagePrivate;
typedef QList< TinyTextEntity* > TextList; typedef QList< TinyTextEntity* > TextList;
/** list of RegionText -- keeps a bunch of TextList with their bounding rectangles **/
typedef QList<RegionText> RegionTextList;
typedef bool ( *TextComparisonFunction )( const QStringRef & from, const QStringRef & to, typedef bool ( *TextComparisonFunction )( const QStringRef & from, const QStringRef & to,
int *fromLength, int *toLength ); int *fromLength, int *toLength );
@ -39,6 +36,8 @@ We will make a line of TextList and also store the bounding rectangle of line
typedef QList<TextList> SortedTextList; typedef QList<TextList> SortedTextList;
typedef QList<QRect> LineRect; typedef QList<QRect> LineRect;
/** list of RegionText -- keeps a bunch of TextList with their bounding rectangles **/
typedef QList<RegionText> RegionTextList;
class TextPagePrivate class TextPagePrivate
{ {
@ -68,6 +67,8 @@ class TextPagePrivate
**/ **/
void addNecessarySpace(RegionTextList tree); void addNecessarySpace(RegionTextList tree);
QMap<int, RegionText> m_word_chars_map;
RegionTextList m_region_words; RegionTextList m_region_words;
TextList m_spaces; TextList m_spaces;
TextList m_words; TextList m_words;

Loading…
Cancel
Save