okular/generators/chm/lib/helper_entitydecoder.cpp

/*
 *  Kchmviewer - a CHM and EPUB file viewer with broad language support
 *  Copyright (C) 2004-2014 George Yunaev, gyunaev@ulduzsoft.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <QTextCodec>

#include "helper_entitydecoder.h"


HelperEntityDecoder::HelperEntityDecoder(QTextCodec *encoder)
{
	changeEncoding( encoder );

}

static inline QString encodeWithCodec( QTextCodec *encoder, const QByteArray& str )
{
	return (encoder ? encoder->toUnicode( str.constData () ) : str);
}

void HelperEntityDecoder::changeEncoding(QTextCodec *encoder)
{
	// Set up m_entityDecodeMap characters according to current textCodec
	m_entityDecodeMap.clear();

	m_entityDecodeMap["AElig"]	= encodeWithCodec( encoder, "\306"); // capital AE diphthong (ligature)
	m_entityDecodeMap["Aacute"]	= encodeWithCodec( encoder, "\301"); // capital A, acute accent
	m_entityDecodeMap["Acirc"]	= encodeWithCodec( encoder, "\302"); // capital A, circumflex accent
	m_entityDecodeMap["Agrave"]	= encodeWithCodec( encoder, "\300"); // capital A, grave accent
	m_entityDecodeMap["Aring"]	= encodeWithCodec( encoder, "\305"); // capital A, ring
	m_entityDecodeMap["Atilde"]	= encodeWithCodec( encoder, "\303"); // capital A, tilde
	m_entityDecodeMap["Auml"]	= encodeWithCodec( encoder, "\304"); // capital A, dieresis or umlaut mark
	m_entityDecodeMap["Ccedil"]	= encodeWithCodec( encoder, "\307"); // capital C, cedilla
	m_entityDecodeMap["Dstrok"]	= encodeWithCodec( encoder, "\320"); // whatever
	m_entityDecodeMap["ETH"]	= encodeWithCodec( encoder, "\320"); // capital Eth, Icelandic
	m_entityDecodeMap["Eacute"]	= encodeWithCodec( encoder, "\311"); // capital E, acute accent
	m_entityDecodeMap["Ecirc"]	= encodeWithCodec( encoder, "\312"); // capital E, circumflex accent
	m_entityDecodeMap["Egrave"]	= encodeWithCodec( encoder, "\310"); // capital E, grave accent
	m_entityDecodeMap["Euml"]	= encodeWithCodec( encoder, "\313"); // capital E, dieresis or umlaut mark
	m_entityDecodeMap["Iacute"]	= encodeWithCodec( encoder, "\315"); // capital I, acute accent
	m_entityDecodeMap["Icirc"]	= encodeWithCodec( encoder, "\316"); // capital I, circumflex accent
	m_entityDecodeMap["Igrave"]	= encodeWithCodec( encoder, "\314"); // capital I, grave accent
	m_entityDecodeMap["Iuml"]	= encodeWithCodec( encoder, "\317"); // capital I, dieresis or umlaut mark
	m_entityDecodeMap["Ntilde"]	= encodeWithCodec( encoder, "\321"); // capital N, tilde
	m_entityDecodeMap["Oacute"]	= encodeWithCodec( encoder, "\323"); // capital O, acute accent
	m_entityDecodeMap["Ocirc"]	= encodeWithCodec( encoder, "\324"); // capital O, circumflex accent
	m_entityDecodeMap["Ograve"]	= encodeWithCodec( encoder, "\322"); // capital O, grave accent
	m_entityDecodeMap["Oslash"]	= encodeWithCodec( encoder, "\330"); // capital O, slash
	m_entityDecodeMap["Otilde"]	= encodeWithCodec( encoder, "\325"); // capital O, tilde
	m_entityDecodeMap["Ouml"]	= encodeWithCodec( encoder, "\326"); // capital O, dieresis or umlaut mark
	m_entityDecodeMap["THORN"]	= encodeWithCodec( encoder, "\336"); // capital THORN, Icelandic
	m_entityDecodeMap["Uacute"]	= encodeWithCodec( encoder, "\332"); // capital U, acute accent
	m_entityDecodeMap["Ucirc"]	= encodeWithCodec( encoder, "\333"); // capital U, circumflex accent
	m_entityDecodeMap["Ugrave"]	= encodeWithCodec( encoder, "\331"); // capital U, grave accent
	m_entityDecodeMap["Uuml"]	= encodeWithCodec( encoder, "\334"); // capital U, dieresis or umlaut mark
	m_entityDecodeMap["Yacute"]	= encodeWithCodec( encoder, "\335"); // capital Y, acute accent
	m_entityDecodeMap["OElig"]	= encodeWithCodec( encoder, "\338"); // capital Y, acute accent
	m_entityDecodeMap["oelig"]	= encodeWithCodec( encoder, "\339"); // capital Y, acute accent

	m_entityDecodeMap["aacute"]	= encodeWithCodec( encoder, "\341"); // small a, acute accent
	m_entityDecodeMap["acirc"]	= encodeWithCodec( encoder, "\342"); // small a, circumflex accent
	m_entityDecodeMap["aelig"]	= encodeWithCodec( encoder, "\346"); // small ae diphthong (ligature)
	m_entityDecodeMap["agrave"]	= encodeWithCodec( encoder, "\340"); // small a, grave accent
	m_entityDecodeMap["aring"]	= encodeWithCodec( encoder, "\345"); // small a, ring
	m_entityDecodeMap["atilde"]	= encodeWithCodec( encoder, "\343"); // small a, tilde
	m_entityDecodeMap["auml"]	= encodeWithCodec( encoder, "\344"); // small a, dieresis or umlaut mark
	m_entityDecodeMap["ccedil"]	= encodeWithCodec( encoder, "\347"); // small c, cedilla
	m_entityDecodeMap["eacute"]	= encodeWithCodec( encoder, "\351"); // small e, acute accent
	m_entityDecodeMap["ecirc"]	= encodeWithCodec( encoder, "\352"); // small e, circumflex accent
	m_entityDecodeMap["Scaron"]	= encodeWithCodec( encoder, "\352"); // small e, circumflex accent
	m_entityDecodeMap["egrave"]	= encodeWithCodec( encoder, "\350"); // small e, grave accent
	m_entityDecodeMap["eth"]	= encodeWithCodec( encoder, "\360"); // small eth, Icelandic
	m_entityDecodeMap["euml"]	= encodeWithCodec( encoder, "\353"); // small e, dieresis or umlaut mark
	m_entityDecodeMap["iacute"]	= encodeWithCodec( encoder, "\355"); // small i, acute accent
	m_entityDecodeMap["icirc"]	= encodeWithCodec( encoder, "\356"); // small i, circumflex accent
	m_entityDecodeMap["igrave"]	= encodeWithCodec( encoder, "\354"); // small i, grave accent
	m_entityDecodeMap["iuml"]	= encodeWithCodec( encoder, "\357"); // small i, dieresis or umlaut mark
	m_entityDecodeMap["ntilde"]	= encodeWithCodec( encoder, "\361"); // small n, tilde
	m_entityDecodeMap["oacute"]	= encodeWithCodec( encoder, "\363"); // small o, acute accent
	m_entityDecodeMap["ocirc"]	= encodeWithCodec( encoder, "\364"); // small o, circumflex accent
	m_entityDecodeMap["ograve"]	= encodeWithCodec( encoder, "\362"); // small o, grave accent
	m_entityDecodeMap["oslash"]	= encodeWithCodec( encoder, "\370"); // small o, slash
	m_entityDecodeMap["otilde"]	= encodeWithCodec( encoder, "\365"); // small o, tilde
	m_entityDecodeMap["ouml"]	= encodeWithCodec( encoder, "\366"); // small o, dieresis or umlaut mark
	m_entityDecodeMap["szlig"]	= encodeWithCodec( encoder, "\337"); // small sharp s, German (sz ligature)
	m_entityDecodeMap["thorn"]	= encodeWithCodec( encoder, "\376"); // small thorn, Icelandic
	m_entityDecodeMap["uacute"]	= encodeWithCodec( encoder, "\372"); // small u, acute accent
	m_entityDecodeMap["ucirc"]	= encodeWithCodec( encoder, "\373"); // small u, circumflex accent
	m_entityDecodeMap["ugrave"]	= encodeWithCodec( encoder, "\371"); // small u, grave accent
	m_entityDecodeMap["uuml"]	= encodeWithCodec( encoder, "\374"); // small u, dieresis or umlaut mark
	m_entityDecodeMap["yacute"]	= encodeWithCodec( encoder, "\375"); // small y, acute accent
	m_entityDecodeMap["yuml"]	= encodeWithCodec( encoder, "\377"); // small y, dieresis or umlaut mark

	m_entityDecodeMap["iexcl"]	= encodeWithCodec( encoder, "\241");
	m_entityDecodeMap["cent"]	= encodeWithCodec( encoder, "\242");
	m_entityDecodeMap["pound"]	= encodeWithCodec( encoder, "\243");
	m_entityDecodeMap["curren"]	= encodeWithCodec( encoder, "\244");
	m_entityDecodeMap["yen"]	= encodeWithCodec( encoder, "\245");
	m_entityDecodeMap["brvbar"]	= encodeWithCodec( encoder, "\246");
	m_entityDecodeMap["sect"]	= encodeWithCodec( encoder, "\247");
	m_entityDecodeMap["uml"]	= encodeWithCodec( encoder, "\250");
	m_entityDecodeMap["ordf"]	= encodeWithCodec( encoder, "\252");
	m_entityDecodeMap["laquo"]	= encodeWithCodec( encoder, "\253");
	m_entityDecodeMap["not"]	= encodeWithCodec( encoder, "\254");
	m_entityDecodeMap["shy"]	= encodeWithCodec( encoder, "\255");
	m_entityDecodeMap["macr"]	= encodeWithCodec( encoder, "\257");
	m_entityDecodeMap["deg"]	= encodeWithCodec( encoder, "\260");
	m_entityDecodeMap["plusmn"]	= encodeWithCodec( encoder, "\261");
	m_entityDecodeMap["sup1"]	= encodeWithCodec( encoder, "\271");
	m_entityDecodeMap["sup2"]	= encodeWithCodec( encoder, "\262");
	m_entityDecodeMap["sup3"]	= encodeWithCodec( encoder, "\263");
	m_entityDecodeMap["acute"]	= encodeWithCodec( encoder, "\264");
	m_entityDecodeMap["micro"]	= encodeWithCodec( encoder, "\265");
	m_entityDecodeMap["para"]	= encodeWithCodec( encoder, "\266");
	m_entityDecodeMap["middot"]	= encodeWithCodec( encoder, "\267");
	m_entityDecodeMap["cedil"]	= encodeWithCodec( encoder, "\270");
	m_entityDecodeMap["ordm"]	= encodeWithCodec( encoder, "\272");
	m_entityDecodeMap["raquo"]	= encodeWithCodec( encoder, "\273");
	m_entityDecodeMap["frac14"]	= encodeWithCodec( encoder, "\274");
	m_entityDecodeMap["frac12"]	= encodeWithCodec( encoder, "\275");
	m_entityDecodeMap["frac34"]	= encodeWithCodec( encoder, "\276");
	m_entityDecodeMap["iquest"]	= encodeWithCodec( encoder, "\277");
	m_entityDecodeMap["times"]	= encodeWithCodec( encoder, "\327");
	m_entityDecodeMap["divide"]	= encodeWithCodec( encoder, "\367");

	m_entityDecodeMap["copy"]	= encodeWithCodec( encoder, "\251"); // copyright sign
	m_entityDecodeMap["reg"]	= encodeWithCodec( encoder, "\256"); // registered sign
	m_entityDecodeMap["nbsp"]	= encodeWithCodec( encoder, "\240"); // non breaking space

	m_entityDecodeMap["fnof"]	= QChar((unsigned short) 402);

	m_entityDecodeMap["Delta"]	= QChar((unsigned short) 916);
	m_entityDecodeMap["Pi"]	= QChar((unsigned short) 928);
	m_entityDecodeMap["Sigma"]	= QChar((unsigned short) 931);

	m_entityDecodeMap["beta"]	= QChar((unsigned short) 946);
	m_entityDecodeMap["gamma"]	= QChar((unsigned short) 947);
	m_entityDecodeMap["delta"]	= QChar((unsigned short) 948);
	m_entityDecodeMap["eta"]	= QChar((unsigned short) 951);
	m_entityDecodeMap["theta"]	= QChar((unsigned short) 952);
	m_entityDecodeMap["lambda"]	= QChar((unsigned short) 955);
	m_entityDecodeMap["mu"]	= QChar((unsigned short) 956);
	m_entityDecodeMap["nu"]	= QChar((unsigned short) 957);
	m_entityDecodeMap["pi"]	= QChar((unsigned short) 960);
	m_entityDecodeMap["rho"]	= QChar((unsigned short) 961);

	m_entityDecodeMap["lsquo"]	= QChar((unsigned short) 8216);
	m_entityDecodeMap["rsquo"]	= QChar((unsigned short) 8217);
	m_entityDecodeMap["rdquo"]	= QChar((unsigned short) 8221);
	m_entityDecodeMap["bdquo"]	= QChar((unsigned short) 8222);
	m_entityDecodeMap["trade"]  = QChar((unsigned short) 8482);
	m_entityDecodeMap["ldquo"]  = QChar((unsigned short) 8220);
	m_entityDecodeMap["ndash"]  = QChar((unsigned short) 8211);
	m_entityDecodeMap["mdash"]  = QChar((unsigned short) 8212);
	m_entityDecodeMap["bull"]  = QChar((unsigned short) 8226);
	m_entityDecodeMap["hellip"]  = QChar((unsigned short) 8230);
	m_entityDecodeMap["emsp"]  = QChar((unsigned short) 8195);
	m_entityDecodeMap["rarr"]  = QChar((unsigned short) 8594);
	m_entityDecodeMap["rArr"]  = QChar((unsigned short) 8658);
	m_entityDecodeMap["crarr"]  = QChar((unsigned short) 8629);
	m_entityDecodeMap["le"]  = QChar((unsigned short) 8804);
	m_entityDecodeMap["ge"]  = QChar((unsigned short) 8805);
	m_entityDecodeMap["lte"]  = QChar((unsigned short) 8804); // wrong, but used somewhere
	m_entityDecodeMap["gte"]  = QChar((unsigned short) 8805); // wrong, but used somewhere
	m_entityDecodeMap["dagger"]  = QChar((unsigned short) 8224);
	m_entityDecodeMap["Dagger"]  = QChar((unsigned short) 8225);
	m_entityDecodeMap["euro"]  = QChar((unsigned short) 8364);
	m_entityDecodeMap["asymp"]  = QChar((unsigned short) 8776);
	m_entityDecodeMap["isin"]  = QChar((unsigned short) 8712);
	m_entityDecodeMap["notin"]  = QChar((unsigned short) 8713);
	m_entityDecodeMap["prod"]  = QChar((unsigned short) 8719);
	m_entityDecodeMap["ne"]  = QChar((unsigned short) 8800);

	m_entityDecodeMap["amp"]	= "&";	// ampersand
	m_entityDecodeMap["gt"] = ">";	// greater than
	m_entityDecodeMap["lt"] = "<"; 	// less than
	m_entityDecodeMap["quot"] = "\""; // double quote
	m_entityDecodeMap["apos"] = "'"; 	// single quote
	m_entityDecodeMap["frasl"]  = "/";
	m_entityDecodeMap["minus"]  = "-";
	m_entityDecodeMap["oplus"] = "+";
	m_entityDecodeMap["Prime"] = "\"";
}


QString HelperEntityDecoder::decode( const QString &entity ) const
{
	// If entity is an ASCII code like &#12349; - just decode it
	if ( entity.isEmpty() )
	{
		return "";
	}
	else if ( entity[0] == '#' )
	{
		bool valid;
		unsigned int ascode = entity.mid(1).toUInt( &valid );

		if ( !valid )
		{
			qWarning ( "HelperEntityDecoder::decode: could not decode HTML entity '%s'", qPrintable( entity ) );
			return QString();
		}

		return (QString) (QChar( ascode ));
	}
	else
	{
		QMap<QString, QString>::const_iterator it = m_entityDecodeMap.find( entity );

		if ( it == m_entityDecodeMap.end() )
		{
			qWarning ("HelperEntityDecoder::decode: could not decode HTML entity '%s'", qPrintable( entity ));
			return "";
		}

		return *it;
	}
}