From a80922d45e66605075a2838ee8836cfe8219bfe7 Mon Sep 17 00:00:00 2001 From: Jaan Vajakas Date: Tue, 25 Feb 2014 23:57:06 +0100 Subject: [PATCH] Improve XY Cut layout recognition code It was a simple bug in the XY Cut layout recognition code that made it too eager to see columns everywhere. Also removed the dependence of the layout analysis algorithms on the display DPI (introduced by the recently added feature of using KScreen) to make their behavior more predictable and reproducible. BUGS: 326207 BUGS: 331090 FIXED-IN: 4.13.0 REVIEW: 115759 --- core/textpage.cpp | 76 +++++++------ tests/searchtest.cpp | 248 +++++++++++++++++++++++++------------------ 2 files changed, 192 insertions(+), 132 deletions(-) diff --git a/core/textpage.cpp b/core/textpage.cpp index a95f7c6b2..348dea084 100644 --- a/core/textpage.cpp +++ b/core/textpage.cpp @@ -62,37 +62,49 @@ static bool CaseSensitiveCmpFn( const QStringRef & from, const QStringRef & to ) return from.compare( to, Qt::CaseSensitive ) == 0; } + /** - * If the vertical arm of one rectangle fully contains the other (example below) - * -------- ---- ----- first - * ---- -------- ----- second - * or we can make it overlap of spaces by threshold% -*/ -static bool doesConsumeY(const QRect& first, const QRect& second, int threshold) + * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line + * overlap within @p threshold percent, i. e. iff the ratio of the length of the + * intersection of the segments to the length of the shortest of the two input segments + * is not smaller than the threshold. + */ +static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold) { - // if one consumes another fully - if(first.top() <= second.top() && first.bottom() >= second.bottom()) + // check if one consumes another fully (speed optimization) + + if (left1 <= left2 && right1 >= right2) return true; - if(first.top() >= second.top() && first.bottom() <= second.bottom()) + if (left1 >= left2 && right1 <= right2) return true; - // or if there is overlap of space by more than 80% - // there is overlap - if(second.bottom() >= first.top() && first.bottom() >= second.top()) + // check if there is overlap above threshold + if (right2 >= left1 && right1 >= left2) { - const int overlap = (second.bottom() >= first.bottom()) ? first.bottom() - second.top() - : second.bottom() - first.top(); - //we will divide by the smaller rectangle to calculate the overlap - const int percentage = (first.height() < second.height()) ? overlap * 100 / (first.bottom() - first.top()) - : overlap * 100 / (second.bottom() - second.top()); + double overlap = (right2 >= right1) ? right1 - left2 + : right2 - left1; + + double length1 = right1 - left1, + length2 = right2 - left2; - if(percentage >= threshold) return true; + return overlap * 100 >= threshold * qMin(length1, length2); } return false; } +static bool doesConsumeY(const QRect& first, const QRect& second, int threshold) +{ + return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold); +} + +static bool doesConsumeY(const NormalizedRect& first, const NormalizedRect& second, int threshold) +{ + return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold); +} + + /* Rationale behind TinyTextEntity: @@ -764,7 +776,7 @@ RegularAreaRect* TextPage::findText( int searchID, const QString &query, SearchD // we have a '-' just followed by a '\n' character // check if the string contains a '-' character // if the '-' is the last entry -static int stringLengthAdaptedWithHyphen(const QString &str, const TextList::ConstIterator &it, const TextList::ConstIterator &textListEnd, PagePrivate *page) +static int stringLengthAdaptedWithHyphen(const QString &str, const TextList::ConstIterator &it, const TextList::ConstIterator &textListEnd) { int len = str.length(); @@ -786,11 +798,8 @@ static int stringLengthAdaptedWithHyphen(const QString &str, const TextList::Con else { // 2. if the next word is in a different line or not - const int pageWidth = page->m_page->width(); - const int pageHeight = page->m_page->height(); - - const QRect hyphenArea = (*it)->area.roundedGeometry(pageWidth, pageHeight); - const QRect lookaheadArea = (*(it + 1))->area.roundedGeometry(pageWidth, pageHeight); + const NormalizedRect& hyphenArea = (*it)->area; + const NormalizedRect& lookaheadArea = (*(it + 1))->area; // lookahead to check whether both the '-' rect and next character rect overlap if( !doesConsumeY( hyphenArea, lookaheadArea, 70 ) ) @@ -852,7 +861,7 @@ RegularAreaRect* TextPagePrivate::findTextInternalForward( int searchID, const Q { const TinyTextEntity* curEntity = *it; const QString& str = curEntity->text(); - int len = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd(), m_page); + int len = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd()); if (offset >= len) { @@ -970,7 +979,7 @@ RegularAreaRect* TextPagePrivate::findTextInternalBackward( int searchID, const const TinyTextEntity* curEntity = *it; const QString& str = curEntity->text(); - int len = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd(), m_page); + int len = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd()); if (offset <= 0) { @@ -1447,7 +1456,7 @@ static void calculateStatisticalInformation(const QList &wor if(space != 0 && space != pageWidth) { // increase the count of the space amount - if(hor_space_stat.contains(space)) hor_space_stat[space] = hor_space_stat[space]++; + if(hor_space_stat.contains(space)) hor_space_stat[space]++; else hor_space_stat[space] = 1; int left,right,top,bottom; @@ -1468,14 +1477,14 @@ static void calculateStatisticalInformation(const QList &wor if(hor_space_stat.contains(maxSpace)) { if(hor_space_stat[maxSpace] != 1) - hor_space_stat[maxSpace] = hor_space_stat[maxSpace]--; + hor_space_stat[maxSpace]--; else hor_space_stat.remove(maxSpace); } if(maxSpace != 0) { if (col_space_stat.contains(maxSpace)) - col_space_stat[maxSpace] = col_space_stat[maxSpace]++; + col_space_stat[maxSpace]++; else col_space_stat[maxSpace] = 1; //store the max rect of each line @@ -1866,8 +1875,13 @@ WordsWithCharacters addNecessarySpace(RegionTextList tree, int pageWidth, int pa */ void TextPagePrivate::correctTextOrder() { - const int pageWidth = m_page->m_page->width(); - const int pageHeight = m_page->m_page->height(); + //m_page->m_page->width() and m_page->m_page->height() are in pixels at + //100% zoom level, and thus depend on display DPI. We scale pageWidth and + //pageHeight to remove the dependence. Otherwise bugs would be more difficult + //to reproduce and Okular could fail in extreme cases like a large TV with low DPI. + const double scalingFactor = 2000.0 / (m_page->m_page->width() + m_page->m_page->height()); + const int pageWidth = (int) (scalingFactor * m_page->m_page->width() ); + const int pageHeight = (int) (scalingFactor * m_page->m_page->height()); TextList characters = m_words; diff --git a/tests/searchtest.cpp b/tests/searchtest.cpp index caceb6fd3..e24ab24e7 100644 --- a/tests/searchtest.cpp +++ b/tests/searchtest.cpp @@ -46,6 +46,8 @@ class SearchTest : public QObject void testHyphenAtEndOfLineWithoutYOverlap(); void testHyphenWithYOverlap(); void testHyphenAtEndOfPage(); + void testOneColumn(); + void testTwoColumns(); }; void SearchTest::initTestCase() @@ -54,23 +56,28 @@ void SearchTest::initTestCase() Okular::SettingsCore::instance( "searchtest" ); } -static Okular::TextPage* createTextPage(const QString text[], const Okular::NormalizedRect rect[], - int n, Okular::Page*& page) { +static void createTextPage(const QVector& text, const QVector& rect, + Okular::TextPage*& tp, Okular::Page*& page) { - Okular::TextPage* tp = new Okular::TextPage(); - for (int i = 0; i < n; i++) { + tp = new Okular::TextPage(); + for (int i = 0; i < text.size(); i++) { tp->append(text[i], new Okular::NormalizedRect(rect[i])); } - //We must create a Page object because TextPage::stringLengthAdaptedWithHyphen uses - //m_page->width() and m_page->height() (for some dubious reason). + //The Page::setTextPage method invokes the layout analysis algorithms tested by some tests here + //and also sets the tp->d->m_page field (the latter was used in older versions of Okular by + //TextPage::stringLengthAdaptedWithHyphen). //Note that calling "delete page;" will delete the TextPage as well. page = new Okular::Page(1, 100, 100, Okular::Rotation0); page -> setTextPage(tp); - - return tp; } +#define CREATE_PAGE \ + QCOMPARE(text.size(), rect.size()); \ + Okular::Page* page; \ + Okular::TextPage* tp; \ + createTextPage(text, rect, tp, page); + #define TEST_NEXT_PREV(searchType, expectedStatus) \ { \ Okular::RegularAreaRect* result = tp->findText(0, searchString, searchType, Qt::CaseSensitive, NULL); \ @@ -130,16 +137,14 @@ void SearchTest::testNextAndPrevious() for (int i = 0; i < TEST_NEXT_PREV_SITUATION_COUNT; i++) { const QVector& text = texts[i]; const QString& searchString = searchStrings[i]; - const int n = text.size(); - Okular::NormalizedRect* rect = new Okular::NormalizedRect[n]; \ + QVector rect; \ - for (int i = 0; i < n; i++) { - rect[i] = Okular::NormalizedRect(0.1*i, 0.0, 0.1*(i+1), 0.1); \ + for (int i = 0; i < text.size(); i++) { + rect << Okular::NormalizedRect(0.1*i, 0.0, 0.1*(i+1), 0.1); \ } - Okular::Page* page; - Okular::TextPage* tp = createTextPage(text.data(), rect, n, page); + CREATE_PAGE; //Test 3 of the 8 cases listed above: //FromTop, Next-Next (match) and Next-Next (no match) @@ -156,7 +161,6 @@ void SearchTest::testNextAndPrevious() TEST_NEXT_PREV(Okular::PreviousResult, false); delete page; - delete[] rect; } } @@ -194,15 +198,13 @@ void SearchTest::test311232() void SearchTest::test323262() { - QString text[] = { - "a\n" - }; - Okular::NormalizedRect rect[] = { - Okular::NormalizedRect(1, 2, 3, 4) - }; + QVector text; + text << "a\n"; + + QVector rect; + rect << Okular::NormalizedRect(1, 2, 3, 4); - Okular::Page* page; - Okular::TextPage* tp = createTextPage(text, rect, 1, page); + CREATE_PAGE; Okular::RegularAreaRect* result = tp->findText(0, "a", Okular::FromBottom, Qt::CaseSensitive, NULL); QVERIFY(result); @@ -213,17 +215,15 @@ void SearchTest::test323262() void SearchTest::test323263() { - QString text[] = { - "a", "a", "b" - }; - Okular::NormalizedRect rect[] = { - Okular::NormalizedRect(0, 0, 1, 1), - Okular::NormalizedRect(1, 0, 2, 1), - Okular::NormalizedRect(2, 0, 3, 1) - }; + QVector text; + text << "a" << "a" << "b"; + + QVector rect; + rect << Okular::NormalizedRect(0, 0, 1, 1) + << Okular::NormalizedRect(1, 0, 2, 1) + << Okular::NormalizedRect(2, 0, 3, 1); - Okular::Page* page; - Okular::TextPage* tp = createTextPage(text, rect, 3, page); + CREATE_PAGE; Okular::RegularAreaRect* result = tp->findText(0, "ab", Okular::FromTop, Qt::CaseSensitive, NULL); QVERIFY(result); @@ -248,15 +248,13 @@ void SearchTest::testDottedI() //mode as well (QString::compare does not match them, at least in non-Turkish locales, since it follows //the Unicode case-folding rules http://www.unicode.org/Public/6.2.0/ucd/CaseFolding.txt). - QString text[] = { - QString::fromUtf8("İ") - }; - Okular::NormalizedRect rect[] = { - Okular::NormalizedRect(1, 2, 3, 4) - }; + QVector text; + text << QString::fromUtf8("İ"); - Okular::Page* page; - Okular::TextPage* tp = createTextPage(text, rect, 1, page); + QVector rect; + rect << Okular::NormalizedRect(1, 2, 3, 4); + + CREATE_PAGE; Okular::RegularAreaRect* result = tp->findText(0, QString::fromUtf8("İ"), Okular::FromTop, Qt::CaseInsensitive, NULL); QVERIFY(result); @@ -265,34 +263,29 @@ void SearchTest::testDottedI() delete page; } -void SearchTest::testHyphenAtEndOfLineWithoutYOverlap() { - QString text[] = { - "super-", - "cali-\n", - "fragilistic", "-", - "expiali", "-\n", - "docious" - }; - - Okular::NormalizedRect rect[] = { - Okular::NormalizedRect(0.4, 0.0, 0.9, 0.1), - Okular::NormalizedRect(0.0, 0.1, 0.6, 0.2), - Okular::NormalizedRect(0.0, 0.2, 0.8, 0.3), Okular::NormalizedRect(0.8, 0.2, 0.9, 0.3), - Okular::NormalizedRect(0.0, 0.3, 0.8, 0.4), Okular::NormalizedRect(0.8, 0.3, 0.9, 0.4), - Okular::NormalizedRect(0.0, 0.4, 0.7, 0.5) - }; - - size_t n = sizeof(text)/sizeof(QString); - QCOMPARE(n, sizeof(rect)/sizeof(Okular::NormalizedRect)); - - Okular::Page* page; - Okular::TextPage* tp = createTextPage(text, rect, n, page); +void SearchTest::testHyphenAtEndOfLineWithoutYOverlap() +{ + QVector text; + text << "super-" + << "cali-\n" + << "fragilistic" << "-" + << "expiali" << "-\n" + << "docious"; + + QVector rect; + rect << Okular::NormalizedRect(0.4, 0.0, 0.9, 0.1) + << Okular::NormalizedRect(0.0, 0.1, 0.6, 0.2) + << Okular::NormalizedRect(0.0, 0.2, 0.8, 0.3) << Okular::NormalizedRect(0.8, 0.2, 0.9, 0.3) + << Okular::NormalizedRect(0.0, 0.3, 0.8, 0.4) << Okular::NormalizedRect(0.8, 0.3, 0.9, 0.4) + << Okular::NormalizedRect(0.0, 0.4, 0.7, 0.5); + + CREATE_PAGE; Okular::RegularAreaRect* result = tp->findText(0, "supercalifragilisticexpialidocious", Okular::FromTop, Qt::CaseSensitive, NULL); QVERIFY(result); Okular::RegularAreaRect expected; - for (unsigned i = 0; i < n; i++) { + for (int i = 0; i < text.size(); i++) { expected.append(rect[i]); } expected.simplify(); @@ -302,69 +295,63 @@ void SearchTest::testHyphenAtEndOfLineWithoutYOverlap() { delete page; } -static bool testHyphen(Okular::NormalizedRect rect[], QString searchString) { - QString text[] = { - "a-", - "b" - }; - - int n = 2; - - Okular::Page* page; - Okular::TextPage* tp = createTextPage(text, rect, n, page); - - Okular::RegularAreaRect* result = tp->findText(0, searchString, - Okular::FromTop, Qt::CaseSensitive, NULL); - - bool found = result; - - delete result; - delete page; - - return found; +#define CREATE_PAGE_AND_TEST_SEARCH(searchString, matchExpected) \ +{ \ + CREATE_PAGE; \ + \ + Okular::RegularAreaRect* result = tp->findText(0, searchString, \ + Okular::FromTop, Qt::CaseSensitive, NULL); \ + \ + QCOMPARE(!!result, matchExpected); \ + \ + delete result; \ + delete page; \ } -void SearchTest::testHyphenWithYOverlap() { - Okular::NormalizedRect rect[2]; +void SearchTest::testHyphenWithYOverlap() +{ + QVector text; + text << "a-" + << "b"; + + QVector rect(2); //different lines (50% y-coordinate overlap), first rectangle has larger height rect[0] = Okular::NormalizedRect(0.0, 0.0, 0.9, 0.35); rect[1] = Okular::NormalizedRect(0.0, 0.3, 0.2, 0.4); - QVERIFY(testHyphen(rect, "ab")); + CREATE_PAGE_AND_TEST_SEARCH("ab", true); //different lines (50% y-coordinate overlap), second rectangle has larger height rect[0] = Okular::NormalizedRect(0.0, 0.0, 0.9, 0.1); rect[1] = Okular::NormalizedRect(0.0, 0.05, 0.2, 0.4); - QVERIFY(testHyphen(rect, "ab")); + CREATE_PAGE_AND_TEST_SEARCH("ab", true); //same line (90% y-coordinate overlap), first rectangle has larger height rect[0] = Okular::NormalizedRect(0.0, 0.0, 0.4, 0.2); rect[1] = Okular::NormalizedRect(0.4, 0.11, 0.6, 0.21); - QVERIFY(!testHyphen(rect, "ab")); - QVERIFY(testHyphen(rect, "a-b")); + CREATE_PAGE_AND_TEST_SEARCH("ab", false); + CREATE_PAGE_AND_TEST_SEARCH("a-b", true); //same line (90% y-coordinate overlap), second rectangle has larger height rect[0] = Okular::NormalizedRect(0.0, 0.0, 0.4, 0.1); rect[1] = Okular::NormalizedRect(0.4, 0.01, 0.6, 0.2); - QVERIFY(!testHyphen(rect, "ab")); - QVERIFY(testHyphen(rect, "a-b")); + CREATE_PAGE_AND_TEST_SEARCH("ab", false); + CREATE_PAGE_AND_TEST_SEARCH("a-b", true); } -void SearchTest::testHyphenAtEndOfPage() { +void SearchTest::testHyphenAtEndOfPage() +{ //Tests for segmentation fault that would occur if //we tried look ahead (for determining whether the //next character is at the same line) at the end of the page. - QString text[] = { - "a-" - }; + QVector text; + text << "a-"; - Okular::NormalizedRect rect[] = { - Okular::NormalizedRect(0, 0, 1, 1) - }; + QVector rect; + rect << Okular::NormalizedRect(0, 0, 1, 1); - Okular::Page* page; - Okular::TextPage* tp = createTextPage(text, rect, 1, page); + CREATE_PAGE; { Okular::RegularAreaRect* result = tp->findText(0, "a", @@ -383,6 +370,65 @@ void SearchTest::testHyphenAtEndOfPage() { delete page; } +void SearchTest::testOneColumn() +{ + //Tests that the layout analysis algorithm does not create too many columns. + //Bug 326207 was caused by the fact that if all the horizontal breaks in a line + //had the same length and were smaller than vertical breaks between lines then + //the horizontal breaks were treated as column separators. + //(Note that "same length" means "same length after rounding rectangles to integer pixels". + //The resolution used by the XY Cut algorithm with a square page is 1000 x 1000, + //and the horizontal spaces in the example are 0.1, so they are indeed both exactly 100 pixels.) + + QVector text; + text << "Only" << "one" << "column" + << "here"; + + //characters and line breaks have length 0.05, word breaks 0.1 + QVector rect; + rect << Okular::NormalizedRect(0.0, 0.0, 0.2, 0.1) + << Okular::NormalizedRect(0.3, 0.0, 0.5, 0.1) + << Okular::NormalizedRect(0.6, 0.0, 0.9, 0.1) + << Okular::NormalizedRect(0.0, 0.15, 0.2, 0.25); + + CREATE_PAGE; + + Okular::RegularAreaRect* result = tp->findText(0, "Only one column", + Okular::FromTop, Qt::CaseSensitive, NULL); + QVERIFY(result); + delete result; + + delete page; +} + +void SearchTest::testTwoColumns() +{ + //Tests that the layout analysis algorithm can detect two columns. + + QVector text; + text << "This" << "text" << "in" << "two" + << "is" << "set" << "columns."; + + //characters, word breaks and line breaks have length 0.05 + QVector rect; + rect << Okular::NormalizedRect(0.0, 0.0, 0.20, 0.1) + << Okular::NormalizedRect(0.25, 0.0, 0.45, 0.1) + << Okular::NormalizedRect(0.6, 0.0, 0.7, 0.1) + << Okular::NormalizedRect(0.75, 0.0, 0.9, 0.1) + << Okular::NormalizedRect(0.0, 0.15, 0.1, 0.25) + << Okular::NormalizedRect(0.15, 0.15, 0.3, 0.25) + << Okular::NormalizedRect(0.6, 0.15, 1.0, 0.25); + + CREATE_PAGE; + + Okular::RegularAreaRect* result = tp->findText(0, "This text in", + Okular::FromTop, Qt::CaseSensitive, NULL); + QVERIFY(!result); + delete result; + + delete page; +} + QTEST_KDEMAIN( SearchTest, GUI ) #include "searchtest.moc"