/* * Kchmviewer - a CHM and EPUB file viewer with broad language support * Copyright (C) 2004-2014 George Yunaev, gyunaev@ulduzsoft.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "helper_entitydecoder.h" HelperEntityDecoder::HelperEntityDecoder(QTextCodec *encoder) { changeEncoding( encoder ); } static inline QString encodeWithCodec( QTextCodec *encoder, const QByteArray& str ) { return (encoder ? encoder->toUnicode( str.constData () ) : str); } void HelperEntityDecoder::changeEncoding(QTextCodec *encoder) { // Set up m_entityDecodeMap characters according to current textCodec m_entityDecodeMap.clear(); m_entityDecodeMap[QStringLiteral("AElig")] = encodeWithCodec( encoder, "\306"); // capital AE diphthong (ligature) m_entityDecodeMap[QStringLiteral("Aacute")] = encodeWithCodec( encoder, "\301"); // capital A, acute accent m_entityDecodeMap[QStringLiteral("Acirc")] = encodeWithCodec( encoder, "\302"); // capital A, circumflex accent m_entityDecodeMap[QStringLiteral("Agrave")] = encodeWithCodec( encoder, "\300"); // capital A, grave accent m_entityDecodeMap[QStringLiteral("Aring")] = encodeWithCodec( encoder, "\305"); // capital A, ring m_entityDecodeMap[QStringLiteral("Atilde")] = encodeWithCodec( encoder, "\303"); // capital A, tilde m_entityDecodeMap[QStringLiteral("Auml")] = encodeWithCodec( encoder, "\304"); // capital A, dieresis or umlaut mark m_entityDecodeMap[QStringLiteral("Ccedil")] = encodeWithCodec( encoder, "\307"); // capital C, cedilla m_entityDecodeMap[QStringLiteral("Dstrok")] = encodeWithCodec( encoder, "\320"); // whatever m_entityDecodeMap[QStringLiteral("ETH")] = encodeWithCodec( encoder, "\320"); // capital Eth, Icelandic m_entityDecodeMap[QStringLiteral("Eacute")] = encodeWithCodec( encoder, "\311"); // capital E, acute accent m_entityDecodeMap[QStringLiteral("Ecirc")] = encodeWithCodec( encoder, "\312"); // capital E, circumflex accent m_entityDecodeMap[QStringLiteral("Egrave")] = encodeWithCodec( encoder, "\310"); // capital E, grave accent m_entityDecodeMap[QStringLiteral("Euml")] = encodeWithCodec( encoder, "\313"); // capital E, dieresis or umlaut mark m_entityDecodeMap[QStringLiteral("Iacute")] = encodeWithCodec( encoder, "\315"); // capital I, acute accent m_entityDecodeMap[QStringLiteral("Icirc")] = encodeWithCodec( encoder, "\316"); // capital I, circumflex accent m_entityDecodeMap[QStringLiteral("Igrave")] = encodeWithCodec( encoder, "\314"); // capital I, grave accent m_entityDecodeMap[QStringLiteral("Iuml")] = encodeWithCodec( encoder, "\317"); // capital I, dieresis or umlaut mark m_entityDecodeMap[QStringLiteral("Ntilde")] = encodeWithCodec( encoder, "\321"); // capital N, tilde m_entityDecodeMap[QStringLiteral("Oacute")] = encodeWithCodec( encoder, "\323"); // capital O, acute accent m_entityDecodeMap[QStringLiteral("Ocirc")] = encodeWithCodec( encoder, "\324"); // capital O, circumflex accent m_entityDecodeMap[QStringLiteral("Ograve")] = encodeWithCodec( encoder, "\322"); // capital O, grave accent m_entityDecodeMap[QStringLiteral("Oslash")] = encodeWithCodec( encoder, "\330"); // capital O, slash m_entityDecodeMap[QStringLiteral("Otilde")] = encodeWithCodec( encoder, "\325"); // capital O, tilde m_entityDecodeMap[QStringLiteral("Ouml")] = encodeWithCodec( encoder, "\326"); // capital O, dieresis or umlaut mark m_entityDecodeMap[QStringLiteral("THORN")] = encodeWithCodec( encoder, "\336"); // capital THORN, Icelandic m_entityDecodeMap[QStringLiteral("Uacute")] = encodeWithCodec( encoder, "\332"); // capital U, acute accent m_entityDecodeMap[QStringLiteral("Ucirc")] = encodeWithCodec( encoder, "\333"); // capital U, circumflex accent m_entityDecodeMap[QStringLiteral("Ugrave")] = encodeWithCodec( encoder, "\331"); // capital U, grave accent m_entityDecodeMap[QStringLiteral("Uuml")] = encodeWithCodec( encoder, "\334"); // capital U, dieresis or umlaut mark m_entityDecodeMap[QStringLiteral("Yacute")] = encodeWithCodec( encoder, "\335"); // capital Y, acute accent m_entityDecodeMap[QStringLiteral("OElig")] = encodeWithCodec( encoder, "\338"); // capital Y, acute accent m_entityDecodeMap[QStringLiteral("oelig")] = encodeWithCodec( encoder, "\339"); // capital Y, acute accent m_entityDecodeMap[QStringLiteral("aacute")] = encodeWithCodec( encoder, "\341"); // small a, acute accent m_entityDecodeMap[QStringLiteral("acirc")] = encodeWithCodec( encoder, "\342"); // small a, circumflex accent m_entityDecodeMap[QStringLiteral("aelig")] = encodeWithCodec( encoder, "\346"); // small ae diphthong (ligature) m_entityDecodeMap[QStringLiteral("agrave")] = encodeWithCodec( encoder, "\340"); // small a, grave accent m_entityDecodeMap[QStringLiteral("aring")] = encodeWithCodec( encoder, "\345"); // small a, ring m_entityDecodeMap[QStringLiteral("atilde")] = encodeWithCodec( encoder, "\343"); // small a, tilde m_entityDecodeMap[QStringLiteral("auml")] = encodeWithCodec( encoder, "\344"); // small a, dieresis or umlaut mark m_entityDecodeMap[QStringLiteral("ccedil")] = encodeWithCodec( encoder, "\347"); // small c, cedilla m_entityDecodeMap[QStringLiteral("eacute")] = encodeWithCodec( encoder, "\351"); // small e, acute accent m_entityDecodeMap[QStringLiteral("ecirc")] = encodeWithCodec( encoder, "\352"); // small e, circumflex accent m_entityDecodeMap[QStringLiteral("Scaron")] = encodeWithCodec( encoder, "\352"); // small e, circumflex accent m_entityDecodeMap[QStringLiteral("egrave")] = encodeWithCodec( encoder, "\350"); // small e, grave accent m_entityDecodeMap[QStringLiteral("eth")] = encodeWithCodec( encoder, "\360"); // small eth, Icelandic m_entityDecodeMap[QStringLiteral("euml")] = encodeWithCodec( encoder, "\353"); // small e, dieresis or umlaut mark m_entityDecodeMap[QStringLiteral("iacute")] = encodeWithCodec( encoder, "\355"); // small i, acute accent m_entityDecodeMap[QStringLiteral("icirc")] = encodeWithCodec( encoder, "\356"); // small i, circumflex accent m_entityDecodeMap[QStringLiteral("igrave")] = encodeWithCodec( encoder, "\354"); // small i, grave accent m_entityDecodeMap[QStringLiteral("iuml")] = encodeWithCodec( encoder, "\357"); // small i, dieresis or umlaut mark m_entityDecodeMap[QStringLiteral("ntilde")] = encodeWithCodec( encoder, "\361"); // small n, tilde m_entityDecodeMap[QStringLiteral("oacute")] = encodeWithCodec( encoder, "\363"); // small o, acute accent m_entityDecodeMap[QStringLiteral("ocirc")] = encodeWithCodec( encoder, "\364"); // small o, circumflex accent m_entityDecodeMap[QStringLiteral("ograve")] = encodeWithCodec( encoder, "\362"); // small o, grave accent m_entityDecodeMap[QStringLiteral("oslash")] = encodeWithCodec( encoder, "\370"); // small o, slash m_entityDecodeMap[QStringLiteral("otilde")] = encodeWithCodec( encoder, "\365"); // small o, tilde m_entityDecodeMap[QStringLiteral("ouml")] = encodeWithCodec( encoder, "\366"); // small o, dieresis or umlaut mark m_entityDecodeMap[QStringLiteral("szlig")] = encodeWithCodec( encoder, "\337"); // small sharp s, German (sz ligature) m_entityDecodeMap[QStringLiteral("thorn")] = encodeWithCodec( encoder, "\376"); // small thorn, Icelandic m_entityDecodeMap[QStringLiteral("uacute")] = encodeWithCodec( encoder, "\372"); // small u, acute accent m_entityDecodeMap[QStringLiteral("ucirc")] = encodeWithCodec( encoder, "\373"); // small u, circumflex accent m_entityDecodeMap[QStringLiteral("ugrave")] = encodeWithCodec( encoder, "\371"); // small u, grave accent m_entityDecodeMap[QStringLiteral("uuml")] = encodeWithCodec( encoder, "\374"); // small u, dieresis or umlaut mark m_entityDecodeMap[QStringLiteral("yacute")] = encodeWithCodec( encoder, "\375"); // small y, acute accent m_entityDecodeMap[QStringLiteral("yuml")] = encodeWithCodec( encoder, "\377"); // small y, dieresis or umlaut mark m_entityDecodeMap[QStringLiteral("iexcl")] = encodeWithCodec( encoder, "\241"); m_entityDecodeMap[QStringLiteral("cent")] = encodeWithCodec( encoder, "\242"); m_entityDecodeMap[QStringLiteral("pound")] = encodeWithCodec( encoder, "\243"); m_entityDecodeMap[QStringLiteral("curren")] = encodeWithCodec( encoder, "\244"); m_entityDecodeMap[QStringLiteral("yen")] = encodeWithCodec( encoder, "\245"); m_entityDecodeMap[QStringLiteral("brvbar")] = encodeWithCodec( encoder, "\246"); m_entityDecodeMap[QStringLiteral("sect")] = encodeWithCodec( encoder, "\247"); m_entityDecodeMap[QStringLiteral("uml")] = encodeWithCodec( encoder, "\250"); m_entityDecodeMap[QStringLiteral("ordf")] = encodeWithCodec( encoder, "\252"); m_entityDecodeMap[QStringLiteral("laquo")] = encodeWithCodec( encoder, "\253"); m_entityDecodeMap[QStringLiteral("not")] = encodeWithCodec( encoder, "\254"); m_entityDecodeMap[QStringLiteral("shy")] = encodeWithCodec( encoder, "\255"); m_entityDecodeMap[QStringLiteral("macr")] = encodeWithCodec( encoder, "\257"); m_entityDecodeMap[QStringLiteral("deg")] = encodeWithCodec( encoder, "\260"); m_entityDecodeMap[QStringLiteral("plusmn")] = encodeWithCodec( encoder, "\261"); m_entityDecodeMap[QStringLiteral("sup1")] = encodeWithCodec( encoder, "\271"); m_entityDecodeMap[QStringLiteral("sup2")] = encodeWithCodec( encoder, "\262"); m_entityDecodeMap[QStringLiteral("sup3")] = encodeWithCodec( encoder, "\263"); m_entityDecodeMap[QStringLiteral("acute")] = encodeWithCodec( encoder, "\264"); m_entityDecodeMap[QStringLiteral("micro")] = encodeWithCodec( encoder, "\265"); m_entityDecodeMap[QStringLiteral("para")] = encodeWithCodec( encoder, "\266"); m_entityDecodeMap[QStringLiteral("middot")] = encodeWithCodec( encoder, "\267"); m_entityDecodeMap[QStringLiteral("cedil")] = encodeWithCodec( encoder, "\270"); m_entityDecodeMap[QStringLiteral("ordm")] = encodeWithCodec( encoder, "\272"); m_entityDecodeMap[QStringLiteral("raquo")] = encodeWithCodec( encoder, "\273"); m_entityDecodeMap[QStringLiteral("frac14")] = encodeWithCodec( encoder, "\274"); m_entityDecodeMap[QStringLiteral("frac12")] = encodeWithCodec( encoder, "\275"); m_entityDecodeMap[QStringLiteral("frac34")] = encodeWithCodec( encoder, "\276"); m_entityDecodeMap[QStringLiteral("iquest")] = encodeWithCodec( encoder, "\277"); m_entityDecodeMap[QStringLiteral("times")] = encodeWithCodec( encoder, "\327"); m_entityDecodeMap[QStringLiteral("divide")] = encodeWithCodec( encoder, "\367"); m_entityDecodeMap[QStringLiteral("copy")] = encodeWithCodec( encoder, "\251"); // copyright sign m_entityDecodeMap[QStringLiteral("reg")] = encodeWithCodec( encoder, "\256"); // registered sign m_entityDecodeMap[QStringLiteral("nbsp")] = encodeWithCodec( encoder, "\240"); // non breaking space m_entityDecodeMap[QStringLiteral("fnof")] = QChar((unsigned short) 402); m_entityDecodeMap[QStringLiteral("Delta")] = QChar((unsigned short) 916); m_entityDecodeMap[QStringLiteral("Pi")] = QChar((unsigned short) 928); m_entityDecodeMap[QStringLiteral("Sigma")] = QChar((unsigned short) 931); m_entityDecodeMap[QStringLiteral("beta")] = QChar((unsigned short) 946); m_entityDecodeMap[QStringLiteral("gamma")] = QChar((unsigned short) 947); m_entityDecodeMap[QStringLiteral("delta")] = QChar((unsigned short) 948); m_entityDecodeMap[QStringLiteral("eta")] = QChar((unsigned short) 951); m_entityDecodeMap[QStringLiteral("theta")] = QChar((unsigned short) 952); m_entityDecodeMap[QStringLiteral("lambda")] = QChar((unsigned short) 955); m_entityDecodeMap[QStringLiteral("mu")] = QChar((unsigned short) 956); m_entityDecodeMap[QStringLiteral("nu")] = QChar((unsigned short) 957); m_entityDecodeMap[QStringLiteral("pi")] = QChar((unsigned short) 960); m_entityDecodeMap[QStringLiteral("rho")] = QChar((unsigned short) 961); m_entityDecodeMap[QStringLiteral("lsquo")] = QChar((unsigned short) 8216); m_entityDecodeMap[QStringLiteral("rsquo")] = QChar((unsigned short) 8217); m_entityDecodeMap[QStringLiteral("rdquo")] = QChar((unsigned short) 8221); m_entityDecodeMap[QStringLiteral("bdquo")] = QChar((unsigned short) 8222); m_entityDecodeMap[QStringLiteral("trade")] = QChar((unsigned short) 8482); m_entityDecodeMap[QStringLiteral("ldquo")] = QChar((unsigned short) 8220); m_entityDecodeMap[QStringLiteral("ndash")] = QChar((unsigned short) 8211); m_entityDecodeMap[QStringLiteral("mdash")] = QChar((unsigned short) 8212); m_entityDecodeMap[QStringLiteral("bull")] = QChar((unsigned short) 8226); m_entityDecodeMap[QStringLiteral("hellip")] = QChar((unsigned short) 8230); m_entityDecodeMap[QStringLiteral("emsp")] = QChar((unsigned short) 8195); m_entityDecodeMap[QStringLiteral("rarr")] = QChar((unsigned short) 8594); m_entityDecodeMap[QStringLiteral("rArr")] = QChar((unsigned short) 8658); m_entityDecodeMap[QStringLiteral("crarr")] = QChar((unsigned short) 8629); m_entityDecodeMap[QStringLiteral("le")] = QChar((unsigned short) 8804); m_entityDecodeMap[QStringLiteral("ge")] = QChar((unsigned short) 8805); m_entityDecodeMap[QStringLiteral("lte")] = QChar((unsigned short) 8804); // wrong, but used somewhere m_entityDecodeMap[QStringLiteral("gte")] = QChar((unsigned short) 8805); // wrong, but used somewhere m_entityDecodeMap[QStringLiteral("dagger")] = QChar((unsigned short) 8224); m_entityDecodeMap[QStringLiteral("Dagger")] = QChar((unsigned short) 8225); m_entityDecodeMap[QStringLiteral("euro")] = QChar((unsigned short) 8364); m_entityDecodeMap[QStringLiteral("asymp")] = QChar((unsigned short) 8776); m_entityDecodeMap[QStringLiteral("isin")] = QChar((unsigned short) 8712); m_entityDecodeMap[QStringLiteral("notin")] = QChar((unsigned short) 8713); m_entityDecodeMap[QStringLiteral("prod")] = QChar((unsigned short) 8719); m_entityDecodeMap[QStringLiteral("ne")] = QChar((unsigned short) 8800); m_entityDecodeMap[QStringLiteral("amp")] = QStringLiteral("&"); // ampersand m_entityDecodeMap[QStringLiteral("gt")] = QStringLiteral(">"); // greater than m_entityDecodeMap[QStringLiteral("lt")] = QStringLiteral("<"); // less than m_entityDecodeMap[QStringLiteral("quot")] = QStringLiteral("\""); // double quote m_entityDecodeMap[QStringLiteral("apos")] = QStringLiteral("'"); // single quote m_entityDecodeMap[QStringLiteral("frasl")] = QStringLiteral("/"); m_entityDecodeMap[QStringLiteral("minus")] = QStringLiteral("-"); m_entityDecodeMap[QStringLiteral("oplus")] = QStringLiteral("+"); m_entityDecodeMap[QStringLiteral("Prime")] = QStringLiteral("\""); } QString HelperEntityDecoder::decode( const QString &entity ) const { // If entity is an ASCII code like 〽 - just decode it if ( entity.isEmpty() ) { return QLatin1String(""); } else if ( entity[0] == '#' ) { bool valid; unsigned int ascode = entity.midRef(1).toUInt( &valid ); if ( !valid ) { qWarning ( "HelperEntityDecoder::decode: could not decode HTML entity '%s'", qPrintable( entity ) ); return QString(); } return (QString) (QChar( ascode )); } else { QMap::const_iterator it = m_entityDecodeMap.find( entity ); if ( it == m_entityDecodeMap.end() ) { qWarning ("HelperEntityDecoder::decode: could not decode HTML entity '%s'", qPrintable( entity )); return QLatin1String(""); } return *it; } }