You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
461 lines
13 KiB
461 lines
13 KiB
/* |
|
Kchmviewer - a CHM and EPUB file viewer with broad language support |
|
SPDX-FileCopyrightText: 2004-2014 George Yunaev <gyunaev@ulduzsoft.com>
|
|
|
SPDX-License-Identifier: GPL-3.0-or-later |
|
*/ |
|
|
|
#include <QApplication> |
|
#include <QTextCodec> |
|
|
|
#include "ebook.h" |
|
#include "ebook_search.h" |
|
#include "helper_search_index.h" |
|
|
|
// Version number of the serialized dictionary; Index::writeDict() stores it as the first value.
static constexpr int DICT_VERSION = 4;
|
|
|
namespace QtAs |
|
{ |
|
// These characters are splitters: each one ends the current word and is also added to the
// dictionary as a token of its own. This makes the dictionary MUCH larger, but ensures that for
// a piece of text such as "window->print" both a search for "print" and for "->print" will find it.
|
#define SPLIT_CHARACTERS QStringLiteral("!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`") |
|
|
|
// These characters are treated as part of a word: for example, '_' is listed here, so a search for "_debug" finds only "_debug".
|
#define WORD_CHARACTERS QStringLiteral("$_") |
|
|
|
struct Term { |
|
Term() |
|
: frequency(-1) |
|
{ |
|
} |
|
Term(const QString &t, int f, const QVector<Document> &l) |
|
: term(t) |
|
, frequency(f) |
|
, documents(l) |
|
{ |
|
} |
|
QString term; |
|
int frequency; |
|
QVector<Document> documents; |
|
bool operator<(const Term &i2) const |
|
{ |
|
return frequency < i2.frequency; |
|
} |
|
}; |
|
|
|
// Reads one Document from the stream: the document number first, then the frequency.
QDataStream &operator>>(QDataStream &s, Document &l)
{
    return s >> l.docNumber >> l.frequency;
}
|
|
|
// Writes one Document to the stream as two `short` values:
// the document number followed by the frequency.
QDataStream &operator<<(QDataStream &s, const Document l)
{
    const short number = static_cast<short>(l.docNumber);
    const short occurrences = static_cast<short>(l.frequency);
    return s << number << occurrences;
}
|
|
|
// Creates a parentless index object and subscribes to the application's
// lastWindowClosed signal so that setLastWinClosed() is invoked when it fires.
Index::Index()
    : QObject(nullptr)
    , lastWindowClosed(false)
{
    connect(qApp, &QGuiApplication::lastWindowClosed, this, &Index::setLastWinClosed);
}
|
|
|
void Index::setLastWinClosed() |
|
{ |
|
lastWindowClosed = true; |
|
} |
|
|
|
bool Index::makeIndex(const QList<QUrl> &docs, EBook *chmFile) |
|
{ |
|
if (docs.isEmpty()) { |
|
return false; |
|
} |
|
|
|
docList = docs; |
|
|
|
if (chmFile->hasFeature(EBook::FEATURE_ENCODING)) { |
|
entityDecoder.changeEncoding(QTextCodec::codecForName(chmFile->currentEncoding().toUtf8())); |
|
} |
|
|
|
QList<QUrl>::ConstIterator it = docList.constBegin(); |
|
int steps = docList.count() / 100; |
|
|
|
if (!steps) { |
|
steps++; |
|
} |
|
|
|
int prog = 0; |
|
|
|
for (int i = 0; it != docList.constEnd(); ++it, ++i) { |
|
if (lastWindowClosed) { |
|
return false; |
|
} |
|
|
|
const QUrl &filename = *it; |
|
QStringList terms; |
|
|
|
if (parseDocumentToStringlist(chmFile, filename, terms)) { |
|
for (QStringList::ConstIterator tit = terms.constBegin(); tit != terms.constEnd(); ++tit) { |
|
insertInDict(*tit, i); |
|
} |
|
} |
|
|
|
if (i % steps == 0) { |
|
prog++; |
|
prog = qMin(prog, 99); |
|
Q_EMIT indexingProgress(prog, tr("Processing document %1").arg((*it).path())); |
|
} |
|
} |
|
|
|
Q_EMIT indexingProgress(100, tr("Processing completed")); |
|
return true; |
|
} |
|
|
|
void Index::insertInDict(const QString &str, int docNum) |
|
{ |
|
Entry *e = nullptr; |
|
if (!dict.isEmpty()) { |
|
e = dict[str]; |
|
} |
|
|
|
if (e) { |
|
if (e->documents.last().docNumber != docNum) { |
|
e->documents.append(Document(docNum, 1)); |
|
} else { |
|
e->documents.last().frequency++; |
|
} |
|
} else { |
|
dict.insert(str, new Entry(docNum)); |
|
} |
|
} |
|
|
|
// Extracts the searchable tokens of one HTML document.
//
// chmFile    - the book the document content is read from.
// filename   - URL of the document inside the book.
// tokenlist  - receives the lower-cased tokens in document order; cleared first.
//
// Returns false (after logging a warning) when the content cannot be retrieved
// or is empty, true otherwise.
//
// Text inside HTML tags is skipped. Outside tags, letters, digits and the
// characters in m_charsword build up words; every character in m_charssplit
// ends the current word and is emitted as a token of its own; any other
// character (whitespace, for instance) just ends the current word. HTML
// entities are decoded through entityDecoder and become part of the current
// word, except "nbsp" which is handled like a space.
bool Index::parseDocumentToStringlist(EBook *chmFile, const QUrl &filename, QStringList &tokenlist)
{
    QString text;

    if (!chmFile->getFileContentAsString(text, filename) || text.isEmpty()) {
        qWarning("Search index generator: could not retrieve the document content for %s", qPrintable(filename.toString()));
        return false;
    }

    m_charssplit = SPLIT_CHARACTERS;
    m_charsword = WORD_CHARACTERS;

    tokenlist.clear();

    // State machine states
    enum state_t {
        STATE_OUTSIDE_TAGS, // outside HTML tags; parse text
        STATE_IN_HTML_TAG, // inside HTML tags; wait for end tag
        STATE_IN_QUOTES, // inside HTML tags and inside quotes; wait for end quote (in quoteChar)
        STATE_IN_HTML_ENTITY // inside HTML entity; parse the entity
    };

    state_t state = STATE_OUTSIDE_TAGS;
    QChar quoteChar; // used in STATE_IN_QUOTES
    QString parsedbuf; // the word collected so far
    QString parseentity; // the entity name collected so far (without '&' and ';')

    // Emits the word collected so far (if any) as a lower-cased token and resets the buffer.
    const auto flushWord = [&tokenlist, &parsedbuf]() {
        if (!parsedbuf.isEmpty()) {
            tokenlist.push_back(parsedbuf.toLower());
            parsedbuf = QString();
        }
    };

    for (int j = 0; j < text.length(); j++) {
        QChar ch = text.at(j);

        // Let the event loop run from time to time while parsing large documents.
        if ((j % 20000) == 0) {
            qApp->processEvents(QEventLoop::ExcludeUserInputEvents);
        }

        if (state == STATE_IN_HTML_TAG) {
            // We are inside an HTML tag.
            // Ignore everything until we see '>' (end of HTML tag) or a quote char (quote start).
            if (ch == QLatin1Char('"') || ch == QLatin1Char('\'')) {
                state = STATE_IN_QUOTES;
                quoteChar = ch;
            } else if (ch == QLatin1Char('>')) {
                state = STATE_OUTSIDE_TAGS;
            }

            continue;
        }

        if (state == STATE_IN_QUOTES) {
            // We are inside quoted text inside an HTML tag.
            // Ignore everything until we see the same quote character again.
            if (ch == quoteChar) {
                state = STATE_IN_HTML_TAG;
            }

            continue;
        }

        if (state == STATE_IN_HTML_ENTITY) {
            // We are inside an encoded HTML entity (like &nbsp;).
            // Collect its name into parseentity until a non-alphanumeric character ends it.
            if (ch.isLetterOrNumber()) {
                parseentity.append(ch);
                continue;
            }

            // The entity ended
            state = STATE_OUTSIDE_TAGS;

            // Some HTML does not terminate entities correctly.
            if (ch != QLatin1Char(';') && ch != QLatin1Char('<')) {
                if (parseentity.isEmpty()) {
                    // A straight '&' symbol. Add it and continue.
                    parsedbuf += QLatin1String("&");
                } else {
                    qWarning("Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", qPrintable(parseentity), ch.toLatin1());
                }

                j--; // parse this character again, but in different state
                continue;
            }

            // An entity terminated directly by '<' is accepted, but the '<' itself
            // starts an HTML tag: step back so it is parsed again in STATE_OUTSIDE_TAGS.
            // Without this the tag start is swallowed and the tag's name and
            // attributes end up in the index as if they were document text.
            if (ch == QLatin1Char('<')) {
                j--;
            }

            if (parseentity.toLower() != QLatin1String("nbsp")) {
                const QString entity = entityDecoder.decode(parseentity);

                // A null result means the entity could not be decoded; skip it.
                if (!entity.isNull()) {
                    parsedbuf += entity;
                }

                continue;
            }

            // &nbsp; is a space: handle it as one below instead of adding it to the word.
            ch = QLatin1Char(' ');
        }

        //
        // Now process STATE_OUTSIDE_TAGS
        //

        // Start of an HTML tag: the current word ends here.
        if (ch == QLatin1Char('<')) {
            state = STATE_IN_HTML_TAG;
            flushWord();
            continue;
        }

        // Start of an HTML entity; the current word is kept open so that the
        // decoded entity becomes part of it.
        if (ch == QLatin1Char('&')) {
            state = STATE_IN_HTML_ENTITY;
            parseentity = QString();
            continue;
        }

        // Replace a double quote by ' - double quotes are used in the search window to set the phrase
        if (ch == QLatin1Char('"')) {
            ch = QLatin1Char('\'');
        }

        // A letter, a digit or a word character extends the current word.
        if (ch.isLetterOrNumber() || m_charsword.indexOf(ch) != -1) {
            parsedbuf.append(ch);
            continue;
        }

        // A split character ends the current word and is added as a token itself.
        if (m_charssplit.indexOf(ch) != -1) {
            flushWord();
            tokenlist.push_back(QString(ch.toLower()));
            continue;
        }

        // Anything else (most likely a space) just ends the current word.
        flushWord();
    }

    // Add the last word if still here - for broken htmls.
    flushWord();

    return true;
}
|
|
|
// Serializes the index to `stream`.
//
// Layout: DICT_VERSION, the split characters, the word characters, the document
// list, and then for every dictionary word its key, the number of documents and
// the document vector itself.
void Index::writeDict(QDataStream &stream)
{
    stream << DICT_VERSION;
    stream << m_charssplit;
    stream << m_charsword;

    // Document list
    stream << docList;

    // Dictionary
    for (auto it = dict.constBegin(); it != dict.constEnd(); ++it) {
        const Entry *entry = it.value();

        // Lookups done with dict[...] on unknown words leave null placeholders
        // in the hash; they carry no data and must not be dereferenced.
        if (!entry) {
            continue;
        }

        stream << it.key();
        stream << static_cast<int>(entry->documents.count());
        stream << entry->documents;
    }
}
|
|
|
// Replaces the current index with the one serialized in `stream`
// (the layout produced by writeDict()).
//
// Returns true when at least one dictionary word was loaded. Returns false when
// the stored version is older than 2, when the stream reports an error, or when
// the dictionary is empty; after a stream error the index is left empty.
bool Index::readDict(QDataStream &stream)
{
    // The dictionary holds heap-allocated entries; free them before dropping the pointers.
    for (Entry *entry : qAsConst(dict)) {
        delete entry;
    }

    dict.clear();
    docList.clear();

    int version = 0;
    stream >> version;

    if (version < 2) {
        return false;
    }

    stream >> m_charssplit;
    stream >> m_charsword;

    // Read the document list
    stream >> docList;

    bool ok = (stream.status() == QDataStream::Ok);

    while (ok && !stream.atEnd()) {
        QString key;
        int numOfDocs = 0;
        QVector<Document> docs;

        stream >> key;
        // The count is stored redundantly: the vector deserializer reads its own
        // length, so the value is consumed but not used for allocation. This
        // keeps a corrupt count from requesting a bogus vector size.
        stream >> numOfDocs;
        stream >> docs;

        if (stream.status() != QDataStream::Ok) {
            ok = false;
            break;
        }

        // Should a key appear twice, release the entry that is about to be replaced.
        delete dict.take(key);
        dict.insert(key, new Entry(docs));
    }

    if (!ok) {
        // Truncated or corrupt data: do not keep a partially loaded index.
        for (Entry *entry : qAsConst(dict)) {
            delete entry;
        }

        dict.clear();
        docList.clear();
        return false;
    }

    return dict.size() > 0;
}
|
|
|
// Finds the documents that contain every term in `terms`.
//
// terms     - words that must all be present in a document.
// termSeq   - phrases; when not empty, each candidate document is additionally
//             checked with searchForPhrases().
// seqWords  - the words the phrases consist of (passed on to searchForPhrases()).
// chmFile   - the book the documents are read from for the phrase check.
//
// Returns the URLs of the matching documents in the order produced by sorting
// the Document records; an empty list when any term is unknown or `terms` is empty.
QList<QUrl> Index::query(const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords, EBook *chmFile)
{
    QList<Term> termList;

    for (const QString &term : terms) {
        // value() is used on purpose: operator[] would insert a null entry into
        // the dictionary for every unknown search term.
        const Entry *entry = dict.value(term, nullptr);

        if (!entry) {
            // A term that is not in the dictionary cannot be in any document.
            return QList<QUrl>();
        }

        termList.append(Term(term, entry->documents.count(), entry->documents));
    }

    if (termList.isEmpty()) {
        return QList<QUrl>();
    }

    // Start from the term found in the fewest documents and intersect with the rest.
    std::sort(termList.begin(), termList.end());

    QVector<Document> minDocs = termList.takeFirst().documents;

    for (const Term &t : qAsConst(termList)) {
        const QVector<Document> &docs = t.documents;

        for (QVector<Document>::Iterator minDoc_it = minDocs.begin(); minDoc_it != minDocs.end();) {
            const auto match = std::find_if(docs.constBegin(), docs.constEnd(), [&minDoc_it](const Document &candidate) {
                return candidate.docNumber == minDoc_it->docNumber;
            });

            if (match == docs.constEnd()) {
                // This document lacks the term: drop it.
                minDoc_it = minDocs.erase(minDoc_it);
            } else {
                // Accumulate the frequencies of all terms for the final ordering.
                minDoc_it->frequency += match->frequency;
                ++minDoc_it;
            }
        }
    }

    std::sort(minDocs.begin(), minDocs.end());

    const bool phraseSearch = !termSeq.isEmpty();
    QList<QUrl> results;

    for (const Document &doc : qAsConst(minDocs)) {
        const int docIndex = static_cast<int>(doc.docNumber);

        // Ignore document numbers that do not refer to an entry of docList
        // (for example from a damaged index) instead of indexing out of range.
        if (docIndex < 0 || docIndex >= docList.count()) {
            continue;
        }

        const QUrl fileName = docList.at(docIndex);

        if (!phraseSearch || searchForPhrases(termSeq, seqWords, fileName, chmFile)) {
            results << fileName;
        }
    }

    return results;
}
|
|
|
bool Index::searchForPhrases(const QStringList &phrases, const QStringList &words, const QUrl &filename, EBook *chmFile) |
|
{ |
|
QStringList parsed_document; |
|
|
|
if (!parseDocumentToStringlist(chmFile, filename, parsed_document)) { |
|
return false; |
|
} |
|
|
|
miniDict.clear(); |
|
|
|
// Initialize the dictionary with the words in phrase(s) |
|
for (const QString &word : words) { |
|
miniDict.insert(word, new PosEntry(0)); |
|
} |
|
|
|
// Fill the dictionary with the words from the document |
|
unsigned int word_offset = 3; |
|
for (QStringList::ConstIterator it = parsed_document.constBegin(); it != parsed_document.constEnd(); it++, word_offset++) { |
|
PosEntry *entry = miniDict[*it]; |
|
|
|
if (entry) { |
|
entry->positions.append(word_offset); |
|
} |
|
} |
|
|
|
// Dump it |
|
/* |
|
QDictIterator<PosEntry> it( miniDict ); |
|
for( ; it.current(); ++it ) |
|
{ |
|
QString text( it.currentKey() ); |
|
QValueList<uint> pos = miniDict[text]->positions; |
|
for ( unsigned int i = 1; i < pos.size(); i++ ) |
|
text += " " + QString::number( pos[i] ); |
|
|
|
qDebug( "%s", text.ascii()); |
|
} |
|
*/ |
|
|
|
QList<uint> first_word_positions; |
|
|
|
for (QStringList::ConstIterator phrase_it = phrases.constBegin(); phrase_it != phrases.constEnd(); phrase_it++) { |
|
QStringList phrasewords = phrase_it->split(QLatin1Char(' ')); |
|
first_word_positions = miniDict[phrasewords[0]]->positions; |
|
|
|
for (int j = 1; j < phrasewords.count(); ++j) { |
|
QList<uint> next_word_it = miniDict[phrasewords[j]]->positions; |
|
QList<uint>::iterator dict_it = first_word_positions.begin(); |
|
|
|
while (dict_it != first_word_positions.end()) { |
|
if (next_word_it.indexOf(*dict_it + 1) != -1) { |
|
(*dict_it)++; |
|
++dict_it; |
|
} else { |
|
dict_it = first_word_positions.erase(dict_it); |
|
} |
|
} |
|
} |
|
} |
|
|
|
return !first_word_positions.isEmpty(); |
|
} |
|
|
|
};
|
|
|