You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

225 lines
5.2 KiB

/*
* Kchmviewer - a CHM and EPUB file viewer with broad language support
* Copyright (C) 2004-2014 George Yunaev, gyunaev@ulduzsoft.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <QApplication>
#include "ebook.h"
#include "ebook_search.h"
// Helper class to simplify state management and data keeping
class SearchDataKeeper
{
public:
SearchDataKeeper() { m_inPhrase = false; }
void beginPhrase()
{
phrase_terms.clear();
m_inPhrase = true;
}
void endPhrase()
{
m_inPhrase = false;
phrasewords += phrase_terms;
phrases.push_back( phrase_terms.join(" ") );
}
bool isInPhrase() const { return m_inPhrase; }
void addTerm( const QString& term )
{
if ( !term.isEmpty() )
{
terms.push_back( term );
if ( m_inPhrase )
phrase_terms.push_back( term );
}
}
// Should contain all the search terms present in query, includind those from phrases. One element - one term .
QStringList terms;
// Should contain phrases present in query without quotes. One element - one phrase.
QStringList phrases;
// Should contain all the terms present in all the phrases (but not outside).
QStringList phrasewords;
private:
bool m_inPhrase;
QStringList phrase_terms;
};
// Creates a search object which has no index yet; one becomes available
// through loadIndex() or generateIndex().
EBookSearch::EBookSearch()
    : m_Index( nullptr )
{
}
// Releases the index owned by this object (deleting a null pointer is a no-op).
EBookSearch::~EBookSearch()
{
    delete m_Index;
}
// Replaces the current index (if any) with a new one whose dictionary is read
// from the stream. Returns the result reported by the index's readDict().
// Note that the newly created index stays installed even when reading fails.
bool EBookSearch::loadIndex( QDataStream & stream )
{
    // Get rid of the index held so far before installing the new one
    delete m_Index;

    QtAs::Index * const freshIndex = new QtAs::Index();
    m_Index = freshIndex;

    const bool loaded = freshIndex->readDict( stream );
    return loaded;
}
// Builds a new search index from the HTML documents of the ebook and writes
// its dictionary into the stream.
//
// ebookFile - the ebook whose files are enumerated and indexed
// stream    - receives the dictionary of the generated index
//
// Returns false when the files could not be enumerated or when indexing
// failed (in the latter case no index is kept); true otherwise.
bool EBookSearch::generateIndex( EBook * ebookFile, QDataStream & stream )
{
    emit progressStep( 0, "Generating the list of documents" );
    processEvents();

    // Ask the ebook for everything it contains
    QList< QUrl > everyFile;

    if ( !ebookFile->enumerateFiles( everyFile ) )
        return false;

    // Start over with a brand new index which reports its progress through us
    delete m_Index;
    m_Index = new QtAs::Index();
    connect( m_Index, &QtAs::Index::indexingProgress, this, &EBookSearch::updateProgress );

    // Only the HTML documents are indexed; everything else is skipped
    QList< QUrl > htmlFiles;

    for ( QList< QUrl >::const_iterator it = everyFile.constBegin(); it != everyFile.constEnd(); ++it )
    {
        const QString path = it->path();
        const bool isHtml = path.endsWith( ".html", Qt::CaseInsensitive )
                            || path.endsWith( ".htm", Qt::CaseInsensitive )
                            || path.endsWith( ".xhtml", Qt::CaseInsensitive );

        if ( isHtml )
            htmlFiles.push_back( *it );
    }

    if ( m_Index->makeIndex( htmlFiles, ebookFile ) )
    {
        m_Index->writeDict( stream );
        m_keywordDocuments.clear();
        return true;
    }

    // Indexing did not succeed, so do not keep the index around
    delete m_Index;
    m_Index = nullptr;
    return false;
}
// Asks the index to stop the generation in progress by forwarding the request
// to its setLastWinClosed(). Does nothing when there is no index.
void EBookSearch::cancelIndexGeneration()
{
    // m_Index is null until loadIndex()/generateIndex() creates it, and is reset
    // to null when generateIndex() fails, so it must be checked before use.
    if ( m_Index )
        m_Index->setLastWinClosed();
}
void EBookSearch::updateProgress(int value, const QString & stepName)
{
emit progressStep( value, stepName );
}
void EBookSearch::processEvents()
{
// Do it up to ten times; some events generate other events
for ( int i = 0; i < 10; i++ )
qApp->processEvents( QEventLoop::ExcludeUserInputEvents );
}
bool EBookSearch::searchQuery(const QString & query, QList< QUrl > * results, EBook *ebookFile, unsigned int limit)
{
// We should have index
if ( !m_Index )
return false;
// Characters which split the words. We need to make them separate tokens
QString splitChars = m_Index->getCharsSplit();
// Characters which are part of the word. We should keep them apart.
QString partOfWordChars = m_Index->getCharsPartOfWord();
// Variables to store current state
SearchDataKeeper keeper;
QString term;
for ( const QChar &iChar : query )
{
const QChar ch = iChar.toLower();
// a quote either begins or ends the phrase
if ( ch == '"' )
{
keeper.addTerm( term );
if ( keeper.isInPhrase() )
keeper.endPhrase();
else
keeper.beginPhrase();
continue;
}
// If new char does not stop the word, add ot and continue
if ( ch.isLetterOrNumber() || partOfWordChars.indexOf( ch ) != -1 )
{
term.append( ch );
continue;
}
// If it is a split char, add this term and split char as separate term
if ( splitChars.indexOf( ch ) != -1 )
{
// Add existing term if present
keeper.addTerm( term );
// Change the term variable, so it will be added when we exit this block
term = ch;
}
// Just add the word; it is most likely a space or terminated by tokenizer.
keeper.addTerm( term );
term = QString();
}
keeper.addTerm( term );
if ( keeper.isInPhrase() )
return false;
QList< QUrl > foundDocs = m_Index->query( keeper.terms, keeper.phrases, keeper.phrasewords, ebookFile );
for ( QList< QUrl >::iterator it = foundDocs.begin(); it != foundDocs.end() && limit > 0; ++it, limit-- )
results->push_back( *it );
return true;
}
// Tells whether an index object is currently held, i.e. whether m_Index is set.
bool EBookSearch::hasIndex() const
{
    const bool indexPresent = ( nullptr != m_Index );
    return indexPresent;
}