You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
493 lines
12 KiB
493 lines
12 KiB
/* |
|
* Kchmviewer - a CHM and EPUB file viewer with broad language support |
|
* Copyright (C) 2004-2014 George Yunaev, gyunaev@ulduzsoft.com |
|
* |
|
* This program is free software: you can redistribute it and/or modify |
|
* it under the terms of the GNU General Public License as published by |
|
* the Free Software Foundation, either version 3 of the License, or |
|
* (at your option) any later version. |
|
* |
|
* This program is distributed in the hope that it will be useful, |
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
* GNU General Public License for more details. |
|
* |
|
* You should have received a copy of the GNU General Public License |
|
* along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
*/ |
|
|
|
#include <QApplication> |
|
#include <QTextCodec> |
|
|
|
#include "ebook.h" |
|
#include "ebook_search.h" |
|
#include "helper_search_index.h" |
|
|
|
// Format version tag written as the first field of a serialized dictionary (see Index::writeDict).
static const int DICT_VERSION = 4;
|
|
|
namespace QtAs { |
|
|
|
// Splitter characters: each one terminates the current word and is also stored
// in the dictionary as a token of its own. This makes the dictionary MUCH larger,
// but guarantees that for the text "window->print" a search for either "print"
// or "->print" will find it.
static const char SPLIT_CHARACTERS[] = "!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`";

// Characters treated as part of a word. For example '_' is listed here, so a
// search for _debug will find only _debug.
static const char WORD_CHARACTERS[] = "$_";
|
|
|
|
|
struct Term |
|
{ |
|
Term() : frequency(-1) {} |
|
Term( const QString &t, int f, QVector<Document> l ) : term( t ), frequency( f ), documents( l ) {} |
|
QString term; |
|
int frequency; |
|
QVector<Document>documents; |
|
bool operator<( const Term &i2 ) const { return frequency < i2.frequency; } |
|
}; |
|
|
|
|
|
// Deserializes one Document (document number, then frequency) from the stream.
// NOTE(review): operator<< writes both fields as 'short'; this presumably relies
// on Document's fields being 16-bit integers - confirm against the Document declaration.
QDataStream &operator>>( QDataStream &s, Document &l )
{
	return s >> l.docNumber >> l.frequency;
}
|
|
|
// Serializes one Document to the stream: document number first, then frequency,
// each narrowed to 'short' before being written.
QDataStream &operator<<( QDataStream &s, const Document &l )
{
	const short number = (short) l.docNumber;
	const short freq = (short) l.frequency;

	return s << number << freq;
}
|
|
|
// Creates a parentless index object and subscribes to the application's
// lastWindowClosed() signal, so that a running makeIndex() can notice the
// application is shutting down and abort.
Index::Index()
	: QObject( 0 ),
	  lastWindowClosed( false )
{
	connect( qApp, SIGNAL( lastWindowClosed() ), this, SLOT( setLastWinClosed() ) );
}
|
|
|
void Index::setLastWinClosed() |
|
{ |
|
lastWindowClosed = true; |
|
} |
|
|
|
|
|
bool Index::makeIndex(const QList< QUrl >& docs, EBook *chmFile ) |
|
{ |
|
if ( docs.isEmpty() ) |
|
return false; |
|
|
|
docList = docs; |
|
|
|
if ( chmFile->hasFeature( EBook::FEATURE_ENCODING ) ) |
|
entityDecoder.changeEncoding( QTextCodec::codecForName( chmFile->currentEncoding().toUtf8() ) ); |
|
|
|
QList< QUrl >::ConstIterator it = docList.constBegin(); |
|
int steps = docList.count() / 100; |
|
|
|
if ( !steps ) |
|
steps++; |
|
|
|
int prog = 0; |
|
|
|
for ( int i = 0; it != docList.constEnd(); ++it, ++i ) |
|
{ |
|
if ( lastWindowClosed ) |
|
return false; |
|
|
|
QUrl filename = *it; |
|
QStringList terms; |
|
|
|
if ( parseDocumentToStringlist( chmFile, filename, terms ) ) |
|
{ |
|
for ( QStringList::ConstIterator tit = terms.constBegin(); tit != terms.constEnd(); ++tit ) |
|
insertInDict( *tit, i ); |
|
} |
|
|
|
if ( i%steps == 0 ) |
|
{ |
|
prog++; |
|
prog = qMin( prog, 99 ); |
|
emit indexingProgress( prog, tr("Processing document %1") .arg( (*it).path() ) ); |
|
} |
|
} |
|
|
|
emit indexingProgress( 100, tr("Processing completed") ); |
|
return true; |
|
} |
|
|
|
|
|
void Index::insertInDict( const QString &str, int docNum ) |
|
{ |
|
Entry *e = 0; |
|
if ( dict.count() ) |
|
e = dict[ str ]; |
|
|
|
if ( e ) |
|
{ |
|
if ( e->documents.last().docNumber != docNum ) |
|
e->documents.append( Document(docNum, 1 ) ); |
|
else |
|
e->documents.last().frequency++; |
|
} |
|
else |
|
{ |
|
dict.insert( str, new Entry( docNum ) ); |
|
} |
|
} |
|
|
|
|
|
bool Index::parseDocumentToStringlist(EBook *chmFile, const QUrl& filename, QStringList& tokenlist ) |
|
{ |
|
QString parsedbuf, parseentity, text; |
|
|
|
if ( !chmFile->getFileContentAsString( text, filename ) |
|
|| text.isEmpty() ) |
|
{ |
|
qWarning( "Search index generator: could not retrieve the document content for %s", qPrintable( filename.toString() ) ); |
|
return false; |
|
} |
|
|
|
m_charssplit = SPLIT_CHARACTERS; |
|
m_charsword = WORD_CHARACTERS; |
|
|
|
tokenlist.clear(); |
|
|
|
// State machine states |
|
enum state_t |
|
{ |
|
STATE_OUTSIDE_TAGS, // outside HTML tags; parse text |
|
STATE_IN_HTML_TAG, // inside HTML tags; wait for end tag |
|
STATE_IN_QUOTES, // inside HTML tags and inside quotes; wait for end quote (in var QuoteChar) |
|
STATE_IN_HTML_ENTITY // inside HTML entity; parse the entity |
|
}; |
|
|
|
state_t state = STATE_OUTSIDE_TAGS; |
|
QChar QuoteChar; // used in STATE_IN_QUOTES |
|
|
|
for ( int j = 0; j < text.length(); j++ ) |
|
{ |
|
QChar ch = text[j]; |
|
|
|
if ( (j % 20000) == 0 ) |
|
qApp->processEvents( QEventLoop::ExcludeUserInputEvents ); |
|
|
|
if ( state == STATE_IN_HTML_TAG ) |
|
{ |
|
// We are inside HTML tag. |
|
// Ignore everything until we see '>' (end of HTML tag) or quote char (quote start) |
|
if ( ch == '"' || ch == '\'' ) |
|
{ |
|
state = STATE_IN_QUOTES; |
|
QuoteChar = ch; |
|
} |
|
else if ( ch == '>' ) |
|
state = STATE_OUTSIDE_TAGS; |
|
|
|
continue; |
|
} |
|
else if ( state == STATE_IN_QUOTES ) |
|
{ |
|
// We are inside quoted text inside HTML tag. |
|
// Ignore everything until we see the quote character again |
|
if ( ch == QuoteChar ) |
|
state = STATE_IN_HTML_TAG; |
|
|
|
continue; |
|
} |
|
else if ( state == STATE_IN_HTML_ENTITY ) |
|
{ |
|
// We are inside encoded HTML entity (like ). |
|
// Collect to parsedbuf everything until we see ; |
|
if ( ch.isLetterOrNumber() ) |
|
{ |
|
// get next character of this entity |
|
parseentity.append( ch ); |
|
continue; |
|
} |
|
|
|
// The entity ended |
|
state = STATE_OUTSIDE_TAGS; |
|
|
|
// Some shitty HTML does not terminate entities correctly. Screw it. |
|
if ( ch != ';' && ch != '<' ) |
|
{ |
|
if ( parseentity.isEmpty() ) |
|
{ |
|
// straight '&' symbol. Add and continue. |
|
parsedbuf += "&"; |
|
} |
|
else |
|
qWarning( "Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", qPrintable( parseentity ), ch.toLatin1() ); |
|
|
|
j--; // parse this character again, but in different state |
|
continue; |
|
} |
|
|
|
// Don't we have a space? |
|
if ( parseentity.toLower() != "nbsp" ) |
|
{ |
|
QString entity = entityDecoder.decode( parseentity ); |
|
|
|
if ( entity.isNull() ) |
|
{ |
|
// decodeEntity() already printed error message |
|
//qWarning( "Index::parseDocument: failed to decode entity &%s;", parsedbuf.ascii() ); |
|
continue; |
|
} |
|
|
|
parsedbuf += entity; |
|
continue; |
|
} |
|
else |
|
ch = ' '; // We got a space, so treat it like it, and not add it to parsebuf |
|
} |
|
|
|
// |
|
// Now process STATE_OUTSIDE_TAGS |
|
// |
|
|
|
// Check for start of HTML tag, and switch to STATE_IN_HTML_TAG if it is |
|
if ( ch == '<' ) |
|
{ |
|
state = STATE_IN_HTML_TAG; |
|
goto tokenize_buf; |
|
} |
|
|
|
// Check for start of HTML entity |
|
if ( ch == '&' ) |
|
{ |
|
state = STATE_IN_HTML_ENTITY; |
|
parseentity = QString::null; |
|
continue; |
|
} |
|
|
|
// Replace quote by ' - quotes are used in search window to set the phrase |
|
if ( ch == '"' ) |
|
ch = '\''; |
|
|
|
// Ok, we have a valid character outside HTML tags, and probably some in buffer already. |
|
// If it is char or letter, add it and continue |
|
if ( ch.isLetterOrNumber() || m_charsword.indexOf( ch ) != -1 ) |
|
{ |
|
parsedbuf.append( ch ); |
|
continue; |
|
} |
|
|
|
// If it is a split char, add the word to the dictionary, and then add the char itself. |
|
if ( m_charssplit.indexOf( ch ) != -1 ) |
|
{ |
|
if ( !parsedbuf.isEmpty() ) |
|
tokenlist.push_back( parsedbuf.toLower() ); |
|
|
|
tokenlist.push_back( ch.toLower() ); |
|
parsedbuf = QString::null; |
|
continue; |
|
} |
|
|
|
tokenize_buf: |
|
// Just add the word; it is most likely a space or terminated by tokenizer. |
|
if ( !parsedbuf.isEmpty() ) |
|
{ |
|
tokenlist.push_back( parsedbuf.toLower() ); |
|
parsedbuf = QString::null; |
|
} |
|
} |
|
|
|
// Add the last word if still here - for broken htmls. |
|
if ( !parsedbuf.isEmpty() ) |
|
tokenlist.push_back( parsedbuf.toLower() ); |
|
|
|
return true; |
|
} |
|
|
|
|
|
// Serializes the index to 'stream'.
//
// Layout: DICT_VERSION, the split characters, the word characters, the document
// list, and then for every dictionary word: the word, the number of its
// documents, and the document vector itself.
void Index::writeDict( QDataStream& stream )
{
	stream << DICT_VERSION;
	stream << m_charssplit;
	stream << m_charsword;

	// Document list
	stream << docList;

	// Dictionary
	for ( QHash<QString, Entry *>::ConstIterator it = dict.constBegin(); it != dict.constEnd(); ++it )
	{
		const Entry *entry = it.value();

		// The dictionary may hold null placeholders (QHash::operator[] creates one
		// when an absent key is looked up); they carry no data and must not be dereferenced.
		if ( !entry )
			continue;

		stream << it.key();
		stream << (int) entry->documents.count();
		stream << entry->documents;
	}
}
|
|
|
|
|
// Replaces the current index with the one serialized in 'stream'
// (the layout produced by writeDict()).
//
// Returns true if at least one dictionary word was loaded. Returns false if the
// stored version is older than 2, if the stream reports a read error (truncated
// or corrupt data), or if the dictionary is empty; after a read error the index
// is left empty rather than partially filled.
bool Index::readDict( QDataStream& stream )
{
	// The dictionary owns its entries; release them before dropping the pointers
	qDeleteAll( dict );
	dict.clear();
	docList.clear();

	int version = 0;
	stream >> version;

	if ( stream.status() != QDataStream::Ok || version < 2 )
		return false;

	stream >> m_charssplit;
	stream >> m_charsword;

	// Read the document list
	stream >> docList;

	while ( !stream.atEnd() )
	{
		QString key;
		int numOfDocs = 0;
		QVector<Document> docs;

		stream >> key;

		// The explicit count is redundant: the vector serialization carries its own
		// size. It is read only to stay in sync with the format, and is deliberately
		// not used to pre-size the vector, since it comes from the file unchecked.
		stream >> numOfDocs;
		stream >> docs;

		if ( stream.status() != QDataStream::Ok )
		{
			qDeleteAll( dict );
			dict.clear();
			docList.clear();
			return false;
		}

		// Should the same word appear twice, do not leak the earlier entry
		delete dict.take( key );
		dict.insert( key, new Entry( docs ) );
	}

	return dict.size() > 0;
}
|
|
|
|
|
// Returns the documents that contain ALL of the given terms, sorted using
// Document's ordering after the per-term frequencies have been summed up.
//
// terms    - the words that must all be present in a document.
// termSeq  - phrases to verify; if not empty, each candidate document is
//            re-parsed and kept only if searchForPhrases() accepts it.
// seqWords - the individual words of those phrases.
// chmFile  - the book to read documents from during phrase verification.
//
// Returns an empty list if 'terms' is empty or any term is not in the dictionary.
QList< QUrl > Index::query(const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords, EBook *chmFile )
{
	QList<Term> termList;

	for ( QStringList::ConstIterator it = terms.constBegin(); it != terms.constEnd(); ++it )
	{
		// value() rather than operator[]: the latter would insert a null entry
		// into the dictionary for every unknown search term.
		const Entry *e = dict.value( *it );

		if ( !e )
			return QList< QUrl >();

		termList.append( Term( *it, e->documents.count(), e->documents ) );
	}

	if ( termList.isEmpty() )
		return QList< QUrl >();

	// Start from the term found in the fewest documents, then intersect with the rest
	qSort( termList );

	QVector<Document> minDocs = termList.takeFirst().documents;

	for ( QList<Term>::ConstIterator term_it = termList.constBegin(); term_it != termList.constEnd(); ++term_it )
	{
		const QVector<Document> &docs = term_it->documents;
		QVector<Document>::Iterator minDoc_it = minDocs.begin();

		while ( minDoc_it != minDocs.end() )
		{
			bool found = false;

			for ( QVector<Document>::ConstIterator doc_it = docs.constBegin(); doc_it != docs.constEnd(); ++doc_it )
			{
				if ( (*minDoc_it).docNumber == (*doc_it).docNumber )
				{
					(*minDoc_it).frequency += (*doc_it).frequency;
					found = true;
					break;
				}
			}

			// Drop the candidates which do not contain this term
			if ( found )
				++minDoc_it;
			else
				minDoc_it = minDocs.erase( minDoc_it );
		}
	}

	qSort( minDocs );

	QList< QUrl > results;

	for ( QVector<Document>::ConstIterator it = minDocs.constBegin(); it != minDocs.constEnd(); ++it )
	{
		const int docNum = (int) (*it).docNumber;

		// Guard against a dictionary which refers to documents outside the document list
		if ( docNum < 0 || docNum >= docList.count() )
			continue;

		const QUrl fileName = docList.at( docNum );

		if ( termSeq.isEmpty() || searchForPhrases( termSeq, seqWords, fileName, chmFile ) )
			results << fileName;
	}

	return results;
}
|
|
|
|
|
bool Index::searchForPhrases( const QStringList &phrases, const QStringList &words, const QUrl &filename, EBook * chmFile ) |
|
{ |
|
QStringList parsed_document; |
|
|
|
if ( !parseDocumentToStringlist( chmFile, filename, parsed_document ) ) |
|
return false; |
|
|
|
miniDict.clear(); |
|
|
|
// Initialize the dictionary with the words in phrase(s) |
|
for ( QStringList::ConstIterator cIt = words.begin(); cIt != words.end(); ++cIt ) |
|
miniDict.insert( *cIt, new PosEntry( 0 ) ); |
|
|
|
// Fill the dictionary with the words from the document |
|
unsigned int word_offset = 3; |
|
for ( QStringList::ConstIterator it = parsed_document.constBegin(); it != parsed_document.constEnd(); it++, word_offset++ ) |
|
{ |
|
PosEntry * entry = miniDict[ *it ]; |
|
|
|
if ( entry ) |
|
entry->positions.append( word_offset ); |
|
} |
|
|
|
// Dump it |
|
/* |
|
QDictIterator<PosEntry> it( miniDict ); |
|
for( ; it.current(); ++it ) |
|
{ |
|
QString text( it.currentKey() ); |
|
QValueList<uint> pos = miniDict[text]->positions; |
|
for ( unsigned int i = 1; i < pos.size(); i++ ) |
|
text += " " + QString::number( pos[i] ); |
|
|
|
qDebug( "%s", text.ascii()); |
|
} |
|
*/ |
|
|
|
QList<uint> first_word_positions; |
|
|
|
for ( QStringList::ConstIterator phrase_it = phrases.constBegin(); phrase_it != phrases.constEnd(); phrase_it++ ) |
|
{ |
|
QStringList phrasewords = phrase_it->split( ' ' ); |
|
first_word_positions = miniDict[ phrasewords[0] ]->positions; |
|
|
|
for ( int j = 1; j < phrasewords.count(); ++j ) |
|
{ |
|
QList<uint> next_word_it = miniDict[ phrasewords[j] ]->positions; |
|
QList<uint>::iterator dict_it = first_word_positions.begin(); |
|
|
|
while ( dict_it != first_word_positions.end() ) |
|
{ |
|
if ( next_word_it.indexOf( *dict_it + 1 ) != -1 ) |
|
{ |
|
(*dict_it)++; |
|
++dict_it; |
|
} |
|
else |
|
dict_it = first_word_positions.erase( dict_it ); |
|
} |
|
} |
|
} |
|
|
|
if ( first_word_positions.count() ) |
|
return true; |
|
|
|
return false; |
|
} |
|
|
|
|
|
};
|
|
|