/* * Kchmviewer - a CHM and EPUB file viewer with broad language support * Copyright (C) 2004-2014 George Yunaev, gyunaev@ulduzsoft.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include "ebook.h" #include "ebook_search.h" #include "helper_search_index.h" static const int DICT_VERSION = 4; namespace QtAs { // Those characters are splitters (i.e. split the word), but added themselves into dictionary too. // This makes the dictionary MUCH larger, but ensure that for the piece of "window->print" both // search for "print" and "->print" will find it. static const char SPLIT_CHARACTERS[] = "!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`"; // Those characters are parts of word - for example, '_' is here, and search for _debug will find only _debug. static const char WORD_CHARACTERS[] = "$_"; struct Term { Term() : frequency(-1) { } Term(const QString &t, int f, const QVector &l) : term(t) , frequency(f) , documents(l) { } QString term; int frequency; QVector documents; bool operator<(const Term &i2) const { return frequency < i2.frequency; } }; QDataStream &operator>>(QDataStream &s, Document &l) { s >> l.docNumber; s >> l.frequency; return s; } QDataStream &operator<<(QDataStream &s, const Document l) { s << (short)l.docNumber; s << (short)l.frequency; return s; } Index::Index() : QObject(nullptr) { lastWindowClosed = false; connect(qApp, &QGuiApplication::lastWindowClosed, this, &Index::setLastWinClosed); } void Index::setLastWinClosed() { lastWindowClosed = true; } bool Index::makeIndex(const QList &docs, EBook *chmFile) { if (docs.isEmpty()) return false; docList = docs; if (chmFile->hasFeature(EBook::FEATURE_ENCODING)) entityDecoder.changeEncoding(QTextCodec::codecForName(chmFile->currentEncoding().toUtf8())); QList::ConstIterator it = docList.constBegin(); int steps = docList.count() / 100; if (!steps) steps++; int prog = 0; for (int i = 0; it != docList.constEnd(); ++it, ++i) { if (lastWindowClosed) return false; QUrl filename = *it; QStringList terms; if (parseDocumentToStringlist(chmFile, filename, terms)) { for (QStringList::ConstIterator tit = terms.constBegin(); tit != terms.constEnd(); ++tit) insertInDict(*tit, i); } if (i % steps == 0) { prog++; prog = qMin(prog, 99); emit indexingProgress(prog, tr("Processing document %1").arg((*it).path())); } } emit indexingProgress(100, tr("Processing completed")); return true; } void Index::insertInDict(const QString &str, int docNum) { Entry *e = nullptr; if (!dict.isEmpty()) e = dict[str]; if (e) { if (e->documents.last().docNumber != docNum) e->documents.append(Document(docNum, 1)); else e->documents.last().frequency++; } else { dict.insert(str, new Entry(docNum)); } } bool Index::parseDocumentToStringlist(EBook *chmFile, const QUrl &filename, QStringList &tokenlist) { QString parsedbuf, parseentity, text; if (!chmFile->getFileContentAsString(text, filename) || text.isEmpty()) { qWarning("Search index generator: could not retrieve the document content for %s", qPrintable(filename.toString())); return false; } m_charssplit = SPLIT_CHARACTERS; m_charsword = WORD_CHARACTERS; tokenlist.clear(); // State machine states enum state_t { STATE_OUTSIDE_TAGS, // outside HTML tags; parse text STATE_IN_HTML_TAG, // inside HTML tags; wait for end tag STATE_IN_QUOTES, // inside HTML tags and inside quotes; wait for end quote (in var QuoteChar) STATE_IN_HTML_ENTITY // inside HTML entity; parse the entity }; state_t state = STATE_OUTSIDE_TAGS; QChar QuoteChar; // used in STATE_IN_QUOTES for (int j = 0; j < text.length(); j++) { QChar ch = text[j]; if ((j % 20000) == 0) qApp->processEvents(QEventLoop::ExcludeUserInputEvents); if (state == STATE_IN_HTML_TAG) { // We are inside HTML tag. // Ignore everything until we see '>' (end of HTML tag) or quote char (quote start) if (ch == '"' || ch == '\'') { state = STATE_IN_QUOTES; QuoteChar = ch; } else if (ch == '>') state = STATE_OUTSIDE_TAGS; continue; } else if (state == STATE_IN_QUOTES) { // We are inside quoted text inside HTML tag. // Ignore everything until we see the quote character again if (ch == QuoteChar) state = STATE_IN_HTML_TAG; continue; } else if (state == STATE_IN_HTML_ENTITY) { // We are inside encoded HTML entity (like  ). // Collect to parsedbuf everything until we see ; if (ch.isLetterOrNumber()) { // get next character of this entity parseentity.append(ch); continue; } // The entity ended state = STATE_OUTSIDE_TAGS; // Some shitty HTML does not terminate entities correctly. Screw it. if (ch != ';' && ch != '<') { if (parseentity.isEmpty()) { // straight '&' symbol. Add and continue. parsedbuf += QLatin1String("&"); } else qWarning("Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", qPrintable(parseentity), ch.toLatin1()); j--; // parse this character again, but in different state continue; } // Don't we have a space? if (parseentity.toLower() != QLatin1String("nbsp")) { QString entity = entityDecoder.decode(parseentity); if (entity.isNull()) { // decodeEntity() already printed error message // qWarning( "Index::parseDocument: failed to decode entity &%s;", parsedbuf.ascii() ); continue; } parsedbuf += entity; continue; } else ch = ' '; // We got a space, so treat it like it, and not add it to parsebuf } // // Now process STATE_OUTSIDE_TAGS // // Check for start of HTML tag, and switch to STATE_IN_HTML_TAG if it is if (ch == '<') { state = STATE_IN_HTML_TAG; goto tokenize_buf; } // Check for start of HTML entity if (ch == '&') { state = STATE_IN_HTML_ENTITY; parseentity = QString(); continue; } // Replace quote by ' - quotes are used in search window to set the phrase if (ch == '"') ch = '\''; // Ok, we have a valid character outside HTML tags, and probably some in buffer already. // If it is char or letter, add it and continue if (ch.isLetterOrNumber() || m_charsword.indexOf(ch) != -1) { parsedbuf.append(ch); continue; } // If it is a split char, add the word to the dictionary, and then add the char itself. if (m_charssplit.indexOf(ch) != -1) { if (!parsedbuf.isEmpty()) tokenlist.push_back(parsedbuf.toLower()); tokenlist.push_back(ch.toLower()); parsedbuf = QString(); continue; } tokenize_buf: // Just add the word; it is most likely a space or terminated by tokenizer. if (!parsedbuf.isEmpty()) { tokenlist.push_back(parsedbuf.toLower()); parsedbuf = QString(); } } // Add the last word if still here - for broken htmls. if (!parsedbuf.isEmpty()) tokenlist.push_back(parsedbuf.toLower()); return true; } void Index::writeDict(QDataStream &stream) { stream << DICT_VERSION; stream << m_charssplit; stream << m_charsword; // Document list stream << docList; // Dictionary for (QHash::ConstIterator it = dict.constBegin(); it != dict.constEnd(); ++it) { stream << it.key(); stream << (int)it.value()->documents.count(); stream << it.value()->documents; } } bool Index::readDict(QDataStream &stream) { dict.clear(); docList.clear(); QString key; int version, numOfDocs; stream >> version; if (version < 2) return false; stream >> m_charssplit; stream >> m_charsword; // Read the document list stream >> docList; while (!stream.atEnd()) { stream >> key; stream >> numOfDocs; QVector docs(numOfDocs); stream >> docs; dict.insert(key, new Entry(docs)); } return dict.size() > 0; } QList Index::query(const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords, EBook *chmFile) { QList termList; QStringList::ConstIterator it = terms.begin(); for (it = terms.begin(); it != terms.end(); ++it) { Entry *e = nullptr; if (dict[*it]) { e = dict[*it]; termList.append(Term(*it, e->documents.count(), e->documents)); } else { return QList(); } } if (termList.isEmpty()) return QList(); std::sort(termList.begin(), termList.end()); QVector minDocs = termList.takeFirst().documents; for (const Term &t : qAsConst(termList)) { const QVector docs = t.documents; for (QVector::Iterator minDoc_it = minDocs.begin(); minDoc_it != minDocs.end();) { bool found = false; for (QVector::ConstIterator doc_it = docs.constBegin(); doc_it != docs.constEnd(); ++doc_it) { if ((*minDoc_it).docNumber == (*doc_it).docNumber) { (*minDoc_it).frequency += (*doc_it).frequency; found = true; break; } } if (!found) minDoc_it = minDocs.erase(minDoc_it); else ++minDoc_it; } } QList results; std::sort(minDocs.begin(), minDocs.end()); if (termSeq.isEmpty()) { for (const Document &doc : qAsConst(minDocs)) results << docList.at((int)doc.docNumber); return results; } QUrl fileName; for (const Document &doc : qAsConst(minDocs)) { fileName = docList[(int)doc.docNumber]; if (searchForPhrases(termSeq, seqWords, fileName, chmFile)) results << fileName; } return results; } bool Index::searchForPhrases(const QStringList &phrases, const QStringList &words, const QUrl &filename, EBook *chmFile) { QStringList parsed_document; if (!parseDocumentToStringlist(chmFile, filename, parsed_document)) return false; miniDict.clear(); // Initialize the dictionary with the words in phrase(s) for (const QString &word : words) miniDict.insert(word, new PosEntry(0)); // Fill the dictionary with the words from the document unsigned int word_offset = 3; for (QStringList::ConstIterator it = parsed_document.constBegin(); it != parsed_document.constEnd(); it++, word_offset++) { PosEntry *entry = miniDict[*it]; if (entry) entry->positions.append(word_offset); } // Dump it /* QDictIterator it( miniDict ); for( ; it.current(); ++it ) { QString text( it.currentKey() ); QValueList pos = miniDict[text]->positions; for ( unsigned int i = 1; i < pos.size(); i++ ) text += " " + QString::number( pos[i] ); qDebug( "%s", text.ascii()); } */ QList first_word_positions; for (QStringList::ConstIterator phrase_it = phrases.constBegin(); phrase_it != phrases.constEnd(); phrase_it++) { QStringList phrasewords = phrase_it->split(' '); first_word_positions = miniDict[phrasewords[0]]->positions; for (int j = 1; j < phrasewords.count(); ++j) { QList next_word_it = miniDict[phrasewords[j]]->positions; QList::iterator dict_it = first_word_positions.begin(); while (dict_it != first_word_positions.end()) { if (next_word_it.indexOf(*dict_it + 1) != -1) { (*dict_it)++; ++dict_it; } else dict_it = first_word_positions.erase(dict_it); } } } return !first_word_positions.isEmpty(); } };