/*
* Kchmviewer - a CHM and EPUB file viewer with broad language support
* Copyright (C) 2004-2014 George Yunaev, gyunaev@ulduzsoft.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#include
#include
#include
#include "ebook_chm.h"
#include "ebook_chm_encoding.h"
#include "bitfiddle.h"
// Big-enough buffer size for use with various routines.
#define BUF_SIZE 4096
// NOTE(review): not referenced in the visible part of this file; presumably the
// length of a shared scratch buffer (1024 characters plus a terminator) -- confirm.
#define COMMON_BUF_LEN 1025
// NOTE(review): presumably the byte sizes of one record in the /#TOPICS and
// /#URLTBL lookup tables that load() resolves -- confirm against the table readers.
#define TOPICS_ENTRY_LEN 16
#define URLTBL_ENTRY_LEN 12
// Parser trace macro. It takes a fully parenthesized argument list, e.g.
// DEBUGPARSER(("tag: '%s'\n", qPrintable(tag))). Swap the two definitions below
// to route the traces to qDebug; as shipped the macro expands to nothing.
//#define DEBUGPARSER(A) qDebug A
#define DEBUGPARSER(A)
// URL scheme string associated with CHM documents (its users are outside the
// visible part of this file).
static const char *URL_SCHEME_CHM = "ms-its";
// Constructs a book object with no archive attached.
//
// Every member that close()/load() later maintain is put into its "nothing
// loaded" state here, so the object can be queried safely before load().
EBook_CHM::EBook_CHM()
    : EBook()
{
    // Runtime options are taken from the KCHMVIEWEROPTS environment variable.
    m_envOptions = qgetenv("KCHMVIEWEROPTS");
    m_chmFile = nullptr;
    m_filename = m_font = QString();
    m_textCodec = nullptr;
    m_textCodecForSpecialFiles = nullptr;
    m_detectedLCID = 0;
    m_currentEncoding = QStringLiteral("UTF-8");
    m_htmlEntityDecoder = nullptr;
    // These flags are otherwise only written by load(). Give them a defined
    // value so hasFeature() does not read indeterminate memory on a book that
    // has never been loaded.
    m_lookupTablesValid = false;
    m_tocAvailable = false;
    m_indexAvailable = false;
}
// Releases the archive handle (if any) when the book object goes away.
EBook_CHM::~EBook_CHM()
{
    // Qualified call: inside a destructor the call resolves to this class's
    // close() in any case, so spell that out.
    EBook_CHM::close();
}
void EBook_CHM::close()
{
if (m_chmFile == nullptr)
return;
chm_close(m_chmFile);
m_chmFile = nullptr;
m_filename = m_font = QString();
m_home.clear();
m_topicsFile.clear();
m_indexFile.clear();
m_textCodec = nullptr;
m_textCodecForSpecialFiles = nullptr;
m_detectedLCID = 0;
m_currentEncoding = QStringLiteral("UTF-8");
}
// Returns the stored book title, passed through encodeWithCurrentCodec().
QString EBook_CHM::title() const
{
    QString decodedTitle = encodeWithCurrentCodec(m_title);
    return decodedTitle;
}
// Returns the URL of the book's home page, built from the stored home path
// with pathToUrl().
QUrl EBook_CHM::homeUrl() const
{
    QUrl home = pathToUrl(m_home);
    return home;
}
// Tells whether the book supports the given feature.
//
// TOC and index availability are the flags computed by load(); encoding
// selection is always supported; any other feature code is reported as
// unsupported.
bool EBook_CHM::hasFeature(EBook::Feature code) const
{
    if (code == FEATURE_TOC)
        return m_tocAvailable;

    if (code == FEATURE_INDEX)
        return m_indexAvailable;

    return code == FEATURE_ENCODING;
}
bool EBook_CHM::getTableOfContents(QList &toc) const
{
if (parseBinaryTOC(toc))
return true;
// Parse the plain text TOC
QList parsed;
if (!parseFileAndFillArray(m_topicsFile, parsed, false))
return false;
// Find out the root offset, and reduce the indent level to it
// so the toc starts from zero offset.
int root_offset = -1;
// Fill up the real toc
toc.reserve(parsed.size());
for (const ParsedEntry &e : qAsConst(parsed)) {
if (root_offset == -1)
root_offset = e.indent;
EBookTocEntry entry;
entry.iconid = (EBookTocEntry::Icon)e.iconid;
entry.indent = e.indent - root_offset;
entry.name = e.name;
if (!e.urls.empty())
entry.url = e.urls[0];
toc.append(entry);
}
return true;
}
bool EBook_CHM::getIndex(QList &index) const
{
// Parse the plain text index
QList parsed;
if (!parseFileAndFillArray(m_indexFile, parsed, true))
return false;
// Find out the root offset, and reduce the indent level to it
// so the index starts from zero offset.
int root_offset = 0;
// Fill up the real index
index.reserve(parsed.size());
// Find the index root offset
const QList &parsedList = parsed;
for (const ParsedEntry &e : parsedList) {
if (e.urls.empty())
continue;
root_offset = qMin(root_offset, e.indent);
}
// And apply the index
for (const ParsedEntry &e : parsedList) {
if (e.urls.empty())
continue;
EBookIndexEntry entry;
entry.name = e.name;
entry.urls = e.urls;
entry.seealso = e.seealso;
// If the index array is empty, make sure the first entry is on root offset
if (index.isEmpty())
entry.indent = root_offset;
else
entry.indent = e.indent - root_offset;
index.append(entry);
printf("%d: %s\n", entry.indent, qPrintable(entry.name));
}
return true;
}
// Reads the object addressed by `url` as text into `str`.
// The URL is mapped to an in-archive path and handed to getTextContent();
// the result of that call is returned.
bool EBook_CHM::getFileContentAsString(QString &str, const QUrl &url) const
{
    const QString path = urlToPath(url);
    return getTextContent(str, path);
}
// Reads the object addressed by `url` as raw bytes into `data`.
// The URL is mapped to an in-archive path and handed to getBinaryContent();
// the result of that call is returned.
bool EBook_CHM::getFileContentAsBinary(QByteArray &data, const QUrl &url) const
{
    const QString path = urlToPath(url);
    return getBinaryContent(data, path);
}
// Loads the whole object stored under the in-archive path `url` into `data`.
//
// returns false when the object cannot be resolved or when RetrieveObject()
//         reports zero; true otherwise. `data` is resized to the object's
//         length as soon as the object has been resolved.
bool EBook_CHM::getBinaryContent(QByteArray &data, const QString &url) const
{
    chmUnitInfo unit;
    if (ResolveObject(url, &unit)) {
        // Make room first, then let the reader fill the buffer in place.
        data.resize(unit.length);
        unsigned char *destination = reinterpret_cast<unsigned char *>(data.data());
        return RetrieveObject(&unit, destination, 0, unit.length) != 0;
    }
    return false;
}
// Loads the object stored under the in-archive path `url` and converts it to
// a string.
//
// internal_encoding  when true the bytes are converted with QString's own
//                    const char* constructor; otherwise they go through
//                    encodeWithCurrentCodec().
// returns false when the object cannot be read or is empty, true otherwise.
bool EBook_CHM::getTextContent(QString &str, const QString &url, bool internal_encoding) const
{
    QByteArray raw;
    if (!getBinaryContent(raw, url))
        return false;

    const unsigned int size = raw.size();
    if (size == 0)
        return false;

    // Both conversions below take a C string, so terminate the data explicitly.
    raw.resize(size + 1);
    raw[size] = '\0';

    if (internal_encoding)
        str = QString(raw.constData());
    else
        str = encodeWithCurrentCodec(raw.constData());
    return true;
}
int EBook_CHM::getContentSize(const QString &url)
{
chmUnitInfo ui;
if (!ResolveObject(url, &ui))
return -1;
return ui.length;
}
// Opens the CHM archive `archiveName` and collects the book metadata.
//
// A previously opened archive is closed first. After a successful open the
// encoding state is reset, the /#WINDOWS and /#SYSTEM information is read,
// the text encoding is guessed, and the availability of the lookup tables,
// the table of contents and the index is determined.
//
// archiveName  path of the archive; a leading "file://" is stripped.
// returns      false when chm_open() fails, true otherwise.
bool EBook_CHM::load(const QString &archiveName)
{
    // If the name has a file:// prefix, drop its seven characters.
    const QString filename = archiveName.startsWith(QLatin1String("file://")) ? archiveName.mid(7) : archiveName;

    if (m_chmFile != nullptr) {
        close();
    }

#if defined(WIN32)
    // chm_open on Windows OS uses the following prototype:
    // struct chmFile* chm_open(BSTR filename);
    //
    // however internally it simply passes the filename
    // directly to CreateFileW function without any conversion.
    // Thus we need to pass it as WCHAR * and not BSTR.
    m_chmFile = chm_open((BSTR)filename.toStdWString().c_str());
#else
    m_chmFile = chm_open(QFile::encodeName(filename));
#endif
    if (!m_chmFile) {
        return false;
    }

    m_filename = filename;

    // Start from the default encoding setup.
    m_textCodec = nullptr;
    m_textCodecForSpecialFiles = nullptr;
    m_currentEncoding = QStringLiteral("UTF-8");

    // Pull encoding, title, context file and so on out of /#WINDOWS and
    // /#SYSTEM, then guess the text encoding.
    getInfoFromWindows();
    getInfoFromSystem();
    guessTextEncoding();

    // The lookup tables count as valid only when all four objects resolve;
    // resolution stops at the first one that is missing.
    m_lookupTablesValid = ResolveObject(QStringLiteral("/#TOPICS"), &m_chmTOPICS)
        && ResolveObject(QStringLiteral("/#STRINGS"), &m_chmSTRINGS)
        && ResolveObject(QStringLiteral("/#URLTBL"), &m_chmURLTBL)
        && ResolveObject(QStringLiteral("/#URLSTR"), &m_chmURLSTR);
    if (m_lookupTablesValid) {
        fillTopicsUrlMap();
    }

    // Some CHM files ship toc and index files without naming them properly,
    // so fall back to the conventional file names when they exist.
    if (m_topicsFile.isEmpty() && hasFile(QStringLiteral("/toc.hhc"))) {
        m_topicsFile = "/toc.hhc";
    }
    if (m_indexFile.isEmpty() && hasFile(QStringLiteral("/index.hhk"))) {
        m_indexFile = "/index.hhk";
    }

    // A TOC/index is available through the named text file, or through the
    // binary structures when the lookup tables are usable.
    m_tocAvailable = !m_topicsFile.isEmpty() || (m_lookupTablesValid && hasFile(QStringLiteral("/#TOCIDX")));
    m_indexAvailable = !m_indexFile.isEmpty() || (m_lookupTablesValid && hasFile(QStringLiteral("/$WWKeywordLinks/BTree")));

    return true;
}
int EBook_CHM::findStringInQuotes(const QString &tag, int offset, QString &value, bool firstquote, bool decodeentities) const
{
int qbegin = tag.indexOf('"', offset);
if (qbegin == -1)
qFatal("EBook_CHMImpl::findStringInQuotes: cannot find first quote in tag: '%s'", qPrintable(tag));
int qend = firstquote ? tag.indexOf('"', qbegin + 1) : tag.lastIndexOf('"');
if (qend == -1 || qend <= qbegin)
qFatal("EBook_CHMImpl::findStringInQuotes: cannot find last quote in tag: '%s'", qPrintable(tag));
// If we do not need to decode HTML entities, just return.
if (decodeentities) {
QString htmlentity = QString();
bool fill_entity = false;
value.reserve(qend - qbegin); // to avoid multiple memory allocations
for (int i = qbegin + 1; i < qend; i++) {
if (!fill_entity) {
if (tag[i] == '&') // HTML entity starts
fill_entity = true;
else
value.append(tag[i]);
} else {
if (tag[i] == ';') // HTML entity ends
{
// If entity is an ASCII code, just decode it
QString decode = m_htmlEntityDecoder.decode(htmlentity);
if (decode.isNull())
break;
value.append(decode);
htmlentity = QString();
fill_entity = false;
} else
htmlentity.append(tag[i]);
}
}
} else
value = tag.mid(qbegin + 1, qend - qbegin - 1);
return qend + 1;
}
bool EBook_CHM::parseFileAndFillArray(const QString &file, QList &data, bool asIndex) const
{
QString src;
const int MAX_NEST_DEPTH = 256;
if (!getTextContent(src, file) || src.isEmpty())
return false;
/*
// Save the index for debugging purposes
QFile outfile( "parsed.htm" );
if ( outfile.open( QIODevice::WriteOnly ) )
{
QTextStream textstream( &outfile );
textstream << src;
outfile.close();
}
*/
EBookTocEntry::Icon defaultimagenum = EBookTocEntry::IMAGE_AUTO;
int pos = 0, indent = 0, root_indent_offset = 0;
bool in_object = false, root_indent_offset_set = false;
ParsedEntry entry;
entry.iconid = defaultimagenum;
// Split the HHC file by HTML tags
int stringlen = src.length();
while (pos < stringlen && (pos = src.indexOf('<', pos)) != -1) {
int i, word_end = 0;
for (i = ++pos; i < stringlen; i++) {
// If a " or ' is found, skip to the next one.
if ((src[i] == '"' || src[i] == '\'')) {
// find where quote ends, either by another quote, or by '>' symbol (some people don't know HTML)
int nextpos = src.indexOf(src[i], i + 1);
if (nextpos == -1 && (nextpos = src.indexOf('>', i + 1)) == -1) {
qWarning("EBook_CHMImpl::ParseHhcAndFillTree: corrupted TOC: %s", qPrintable(src.mid(i)));
return false;
}
i = nextpos;
} else if (src[i] == '>')
break;
else if (!src[i].isLetterOrNumber() && src[i] != '/' && !word_end)
word_end = i;
}
QString tagword, tag = src.mid(pos, i - pos);
if (word_end)
tagword = src.mid(pos, word_end - pos).toLower();
else
tagword = tag.toLower();
// DEBUGPARSER(("tag: '%s', tagword: '%s'\n", qPrintable( tag ), qPrintable( tagword ) ));
//