You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1296 lines
43 KiB
1296 lines
43 KiB
/*
    This file was taken from the KDE 4.x libraries and backported to Qt 3.

    Copyright (C) 1999 Lars Knoll (knoll@kde.org)
    Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
    Copyright (C) 2003 Apple Computer, Inc.
    Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/
|
//---------------------------------------------------------------------------- |
|
// |
|
// decoder for input stream |
|
|
|
#include "encodingdetector.h" |
|
|
|
#undef DECODE_DEBUG |
|
//#define DECODE_DEBUG |
|
|
|
#define MAX_BUFFER 16*1024 |
|
|
|
#include <assert.h> |
|
#include <stdlib.h> |
|
|
|
#include "encodingdetector_ja_p.h" |
|
|
|
#include <qregexp.h> |
|
#include <qtextcodec.h> |
|
|
|
#include <kglobal.h> |
|
#include <kcharsets.h> |
|
#include <kdebug.h> |
|
#include <klocale.h> |
|
|
|
#include <ctype.h> |
|
|
|
// The following table was taken from libpango 1.19.3 and slightly modified.
// Multiple scripts per language were removed, and the entries were reordered so
// that simple substring matching works: a longer language code has to come
// before any shorter code it starts with. For example, "bam" is placed before
// "ba" so that the first match is likely to be the right one. Otherwise "ba"
// would match "bam" first, and the search would have to continue to reach the
// "bam" entry that is actually wanted.
// The original file is called pango-script-lang-table.h
|
|
|
/* pango-script-lang-table.h: |
|
* |
|
* Generated by gen-script-for-lang-new.c |
|
* Date: 2007-10-26 |
|
* Source: fontconfig-2.4.91 |
|
* |
|
* Do not edit. // I did. Sue me ;) |
|
*/ |
|
typedef struct _PangoScriptForLang { |
|
const char lang[6]; |
|
EncodingDetector::AutoDetectScript scripts[1]; |
|
} PangoScriptForLang; |
|
|
|
// EncodingDetector does not know every script that Pango knows. Each Pango

// script name used by the table below is therefore mapped, via a macro, onto an

// EncodingDetector::AutoDetectScript value so that the table entries can keep

// their original Pango spelling.

//

// Using EncodingDetector::CentralEuropean for the appropriate countries

// might give better results in some cases.

//

// One especially important omission (many speakers/literates) is the lack of

// Indian scripts.



// Scripts without an EncodingDetector counterpart: all of them expand to

// EncodingDetector::None.

#define PANGO_SCRIPT_ARMENIAN EncodingDetector::None

#define PANGO_SCRIPT_BENGALI EncodingDetector::None

#define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None

#define PANGO_SCRIPT_CHEROKEE EncodingDetector::None

#define PANGO_SCRIPT_DEVANAGARI EncodingDetector::None

#define PANGO_SCRIPT_ETHIOPIC EncodingDetector::None

#define PANGO_SCRIPT_GUJARATI EncodingDetector::None

#define PANGO_SCRIPT_GURMUKHI EncodingDetector::None

#define PANGO_SCRIPT_KANNADA EncodingDetector::None

#define PANGO_SCRIPT_KHMER EncodingDetector::None

#define PANGO_SCRIPT_LAO EncodingDetector::None

#define PANGO_SCRIPT_MALAYALAM EncodingDetector::None

#define PANGO_SCRIPT_MONGOLIAN EncodingDetector::None

#define PANGO_SCRIPT_MYANMAR EncodingDetector::None

#define PANGO_SCRIPT_ORIYA EncodingDetector::None

#define PANGO_SCRIPT_SINHALA EncodingDetector::None

#define PANGO_SCRIPT_SYRIAC EncodingDetector::None

#define PANGO_SCRIPT_TAGALOG EncodingDetector::None

#define PANGO_SCRIPT_TAMIL EncodingDetector::None

#define PANGO_SCRIPT_TIBETAN EncodingDetector::None

#define PANGO_SCRIPT_TELUGU EncodingDetector::None



// Scripts that do have an EncodingDetector counterpart. Defining these aliases

// avoids changing the table even more.

// NOTE(review): Georgian is mapped to SouthEasternEurope - confirm that this is

// the intended script group.

#define PANGO_SCRIPT_ARABIC EncodingDetector::Arabic

#define PANGO_SCRIPT_CYRILLIC EncodingDetector::Cyrillic

#define PANGO_SCRIPT_GEORGIAN EncodingDetector::SouthEasternEurope

#define PANGO_SCRIPT_GREEK EncodingDetector::Greek

#define PANGO_SCRIPT_HEBREW EncodingDetector::Hebrew

#define PANGO_SCRIPT_LATIN EncodingDetector::WesternEuropean

#define PANGO_SCRIPT_THAI EncodingDetector::Thai
|
|
|
|
|
// Language code -> script table, terminated by an entry with an empty code.
//
// Ordering rule: a code must appear BEFORE every shorter code that is a prefix
// of it ("bam" before "ba", "kok" before "ko", ...), so that a first-match scan
// finds the most specific entry. Keep this rule when adding entries.
// Entries whose original Pango script was replaced by an EncodingDetector
// specific value keep the Pango line as a comment above them.
static const PangoScriptForLang pango_script_for_lang[] = {
  { "aa",    { PANGO_SCRIPT_LATIN/*62*/ } },
  { "ab",    { PANGO_SCRIPT_CYRILLIC/*90*/ } },
  { "af",    { PANGO_SCRIPT_LATIN/*69*/ } },
  { "am",    { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
  { "ar",    { PANGO_SCRIPT_ARABIC/*125*/ } },
  { "ast",   { PANGO_SCRIPT_LATIN/*66*/ } },      // must precede "as"
  { "as",    { PANGO_SCRIPT_BENGALI/*89*/ } },
  { "ava",   { PANGO_SCRIPT_CYRILLIC/*67*/ } },
  { "ay",    { PANGO_SCRIPT_LATIN/*60*/ } },
  { "az-ir", { PANGO_SCRIPT_ARABIC/*129*/ } },
  { "az",    { PANGO_SCRIPT_CYRILLIC/*80*/ } }, //, PANGO_SCRIPT_LATIN/*68*/ } },
  { "bam",   { PANGO_SCRIPT_LATIN/*60*/ } },
  { "ba",    { PANGO_SCRIPT_CYRILLIC/*82*/ } },
  { "be",    { PANGO_SCRIPT_CYRILLIC/*68*/ } },
  { "bg",    { PANGO_SCRIPT_CYRILLIC/*60*/ } },
  { "bho",   { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, // must precede "bh"
  { "bh",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
  { "bin",   { PANGO_SCRIPT_LATIN/*76*/ } },      // must precede "bi"
  { "bi",    { PANGO_SCRIPT_LATIN/*58*/ } },
  { "bn",    { PANGO_SCRIPT_BENGALI/*89*/ } },
  { "bo",    { PANGO_SCRIPT_TIBETAN/*95*/ } },
  { "br",    { PANGO_SCRIPT_LATIN/*64*/ } },
  { "bs",    { PANGO_SCRIPT_LATIN/*62*/ } },
  { "bua",   { PANGO_SCRIPT_CYRILLIC/*70*/ } },
  { "ca",    { PANGO_SCRIPT_LATIN/*74*/ } },
  { "ce",    { PANGO_SCRIPT_CYRILLIC/*67*/ } },
  { "chm",   { PANGO_SCRIPT_CYRILLIC/*76*/ } },
  { "chr",   { PANGO_SCRIPT_CHEROKEE/*85*/ } },
  { "ch",    { PANGO_SCRIPT_LATIN/*58*/ } },
  { "co",    { PANGO_SCRIPT_LATIN/*84*/ } },
  { "cs",    { PANGO_SCRIPT_LATIN/*82*/ } },
  { "cu",    { PANGO_SCRIPT_CYRILLIC/*103*/ } },
  { "cv",    { PANGO_SCRIPT_CYRILLIC/*72*/ } }, //, PANGO_SCRIPT_LATIN/*2*/ } },
  { "cy",    { PANGO_SCRIPT_LATIN/*78*/ } },
  { "da",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "de",    { PANGO_SCRIPT_LATIN/*59*/ } },
  { "dz",    { PANGO_SCRIPT_TIBETAN/*95*/ } },
  { "el",    { PANGO_SCRIPT_GREEK/*69*/ } },
  { "en",    { PANGO_SCRIPT_LATIN/*72*/ } },
  { "eo",    { PANGO_SCRIPT_LATIN/*64*/ } },
  { "es",    { PANGO_SCRIPT_LATIN/*66*/ } },
//  { "et",    { PANGO_SCRIPT_LATIN/*64*/ } },
  { "et",    { EncodingDetector::Baltic } },
  { "eu",    { PANGO_SCRIPT_LATIN/*56*/ } },
  { "fa",    { PANGO_SCRIPT_ARABIC/*129*/ } },
  { "fi",    { PANGO_SCRIPT_LATIN/*62*/ } },
  { "fj",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "fo",    { PANGO_SCRIPT_LATIN/*68*/ } },
  { "fr",    { PANGO_SCRIPT_LATIN/*84*/ } },
  { "ful",   { PANGO_SCRIPT_LATIN/*62*/ } },
  { "fur",   { PANGO_SCRIPT_LATIN/*66*/ } },
  { "fy",    { PANGO_SCRIPT_LATIN/*75*/ } },
  { "ga",    { PANGO_SCRIPT_LATIN/*80*/ } },
  { "gd",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "gez",   { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
  { "gl",    { PANGO_SCRIPT_LATIN/*66*/ } },
  { "gn",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "gu",    { PANGO_SCRIPT_GUJARATI/*78*/ } },
  { "gv",    { PANGO_SCRIPT_LATIN/*54*/ } },
  { "haw",   { PANGO_SCRIPT_LATIN/*62*/ } },      // must precede "ha"
  { "ha",    { PANGO_SCRIPT_LATIN/*60*/ } },
  { "he",    { PANGO_SCRIPT_HEBREW/*27*/ } },
  { "hi",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
  { "ho",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "hr",    { PANGO_SCRIPT_LATIN/*62*/ } },
  { "hu",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "hy",    { PANGO_SCRIPT_ARMENIAN/*77*/ } },
  { "ia",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "ibo",   { PANGO_SCRIPT_LATIN/*58*/ } },
  { "id",    { PANGO_SCRIPT_LATIN/*54*/ } },
  { "ie",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "ik",    { PANGO_SCRIPT_CYRILLIC/*68*/ } },
  { "io",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "is",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "it",    { PANGO_SCRIPT_LATIN/*72*/ } },
  { "iu",    { PANGO_SCRIPT_CANADIAN_ABORIGINAL/*161*/ } },
//  { "ja",    { PANGO_SCRIPT_HAN/*6356*/, PANGO_SCRIPT_KATAKANA/*88*/, PANGO_SCRIPT_HIRAGANA/*85*/ } },
  { "ja",    { EncodingDetector::Japanese } },
  { "kaa",   { PANGO_SCRIPT_CYRILLIC/*78*/ } },
  { "ka",    { PANGO_SCRIPT_GEORGIAN/*33*/ } },
  { "ki",    { PANGO_SCRIPT_LATIN/*56*/ } },
  { "kk",    { PANGO_SCRIPT_CYRILLIC/*77*/ } },
  { "kl",    { PANGO_SCRIPT_LATIN/*81*/ } },
  { "km",    { PANGO_SCRIPT_KHMER/*70*/ } },
  { "kn",    { PANGO_SCRIPT_KANNADA/*80*/ } },
  { "kok",   { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, // must precede "ko"
//  { "ko",    { PANGO_SCRIPT_HANGUL/*2443*/ } },
  { "ko",    { EncodingDetector::Korean } },
  { "ks",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
  { "ku-ir", { PANGO_SCRIPT_ARABIC/*32*/ } },
  { "kum",   { PANGO_SCRIPT_CYRILLIC/*66*/ } },   // must precede "ku"
  { "ku",    { PANGO_SCRIPT_CYRILLIC/*60*/ } }, //, PANGO_SCRIPT_LATIN/*4*/ } },
  { "kv",    { PANGO_SCRIPT_CYRILLIC/*70*/ } },
  { "kw",    { PANGO_SCRIPT_LATIN/*64*/ } },
  { "ky",    { PANGO_SCRIPT_CYRILLIC/*70*/ } },
  { "la",    { PANGO_SCRIPT_LATIN/*68*/ } },
  { "lb",    { PANGO_SCRIPT_LATIN/*75*/ } },
  { "lez",   { PANGO_SCRIPT_CYRILLIC/*67*/ } },
  { "ln",    { PANGO_SCRIPT_LATIN/*78*/ } },
  { "lo",    { PANGO_SCRIPT_LAO/*65*/ } },
//  { "lt",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "lt",    { EncodingDetector::Baltic } },
//  { "lv",    { PANGO_SCRIPT_LATIN/*78*/ } },
  { "lv",    { EncodingDetector::Baltic } },
  { "mg",    { PANGO_SCRIPT_LATIN/*56*/ } },
  { "mh",    { PANGO_SCRIPT_LATIN/*62*/ } },
  { "mi",    { PANGO_SCRIPT_LATIN/*64*/ } },
  { "mk",    { PANGO_SCRIPT_CYRILLIC/*42*/ } },
  { "ml",    { PANGO_SCRIPT_MALAYALAM/*78*/ } },
  { "mn",    { PANGO_SCRIPT_MONGOLIAN/*130*/ } },
  { "mo",    { PANGO_SCRIPT_CYRILLIC/*66*/ } }, //, PANGO_SCRIPT_LATIN/*62*/ } },
  { "mr",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
  { "mt",    { PANGO_SCRIPT_LATIN/*72*/ } },
  { "my",    { PANGO_SCRIPT_MYANMAR/*48*/ } },
  { "nb",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "nds",   { PANGO_SCRIPT_LATIN/*59*/ } },
  { "ne",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
  { "nl",    { PANGO_SCRIPT_LATIN/*82*/ } },
  { "nn",    { PANGO_SCRIPT_LATIN/*76*/ } },
  { "no",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "nr",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "nso",   { PANGO_SCRIPT_LATIN/*58*/ } },
  { "ny",    { PANGO_SCRIPT_LATIN/*54*/ } },
  { "oc",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "om",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "or",    { PANGO_SCRIPT_ORIYA/*79*/ } },
  { "os",    { PANGO_SCRIPT_CYRILLIC/*66*/ } },
  { "pa",    { PANGO_SCRIPT_GURMUKHI/*63*/ } },
  { "pl",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "ps-af", { PANGO_SCRIPT_ARABIC/*49*/ } },
  { "ps-pk", { PANGO_SCRIPT_ARABIC/*49*/ } },
  { "pt",    { PANGO_SCRIPT_LATIN/*82*/ } },
  { "rm",    { PANGO_SCRIPT_LATIN/*66*/ } },
  { "ro",    { PANGO_SCRIPT_LATIN/*62*/ } },
  { "ru",    { PANGO_SCRIPT_CYRILLIC/*66*/ } },
  { "sah",   { PANGO_SCRIPT_CYRILLIC/*76*/ } },
  { "sa",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
  { "sco",   { PANGO_SCRIPT_LATIN/*56*/ } },
  { "sel",   { PANGO_SCRIPT_CYRILLIC/*66*/ } },
  { "se",    { PANGO_SCRIPT_LATIN/*66*/ } },
  { "sh",    { PANGO_SCRIPT_CYRILLIC/*76*/ } },
  { "si",    { PANGO_SCRIPT_SINHALA/*77*/ } },
  { "sk",    { PANGO_SCRIPT_LATIN/*86*/ } },
  { "sl",    { PANGO_SCRIPT_LATIN/*62*/ } },
  { "sma",   { PANGO_SCRIPT_LATIN/*60*/ } },
  { "smj",   { PANGO_SCRIPT_LATIN/*60*/ } },
  { "smn",   { PANGO_SCRIPT_LATIN/*68*/ } },
  { "sms",   { PANGO_SCRIPT_LATIN/*80*/ } },
  { "sm",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "so",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "sq",    { PANGO_SCRIPT_LATIN/*56*/ } },
  { "sr",    { PANGO_SCRIPT_CYRILLIC/*76*/ } },
  { "ss",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "st",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "sv",    { PANGO_SCRIPT_LATIN/*68*/ } },
  { "sw",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "syr",   { PANGO_SCRIPT_SYRIAC/*45*/ } },
  { "ta",    { PANGO_SCRIPT_TAMIL/*48*/ } },
  { "te",    { PANGO_SCRIPT_TELUGU/*80*/ } },
  { "tg",    { PANGO_SCRIPT_CYRILLIC/*78*/ } },
  { "th",    { PANGO_SCRIPT_THAI/*86*/ } },
  { "ti-er", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
  { "ti-et", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
  { "tig",   { PANGO_SCRIPT_ETHIOPIC/*221*/ } },
  { "tk",    { PANGO_SCRIPT_CYRILLIC/*74*/ } },
  { "tl",    { PANGO_SCRIPT_TAGALOG/*19*/ } },
  { "tn",    { PANGO_SCRIPT_LATIN/*58*/ } },
  { "to",    { PANGO_SCRIPT_LATIN/*52*/ } },
//  { "tr",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "tr",    { EncodingDetector::Turkish } },
  { "ts",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "tt",    { PANGO_SCRIPT_CYRILLIC/*76*/ } },
  { "tw",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "tyv",   { PANGO_SCRIPT_CYRILLIC/*70*/ } },
  { "ug",    { PANGO_SCRIPT_ARABIC/*125*/ } },
  { "uk",    { PANGO_SCRIPT_CYRILLIC/*72*/ } },
  { "ur",    { PANGO_SCRIPT_ARABIC/*145*/ } },
  { "uz",    { PANGO_SCRIPT_CYRILLIC/*68*/ } },
  { "ven",   { PANGO_SCRIPT_LATIN/*62*/ } },
  { "vi",    { PANGO_SCRIPT_LATIN/*186*/ } },
  { "vot",   { PANGO_SCRIPT_LATIN/*62*/ } },
  { "vo",    { PANGO_SCRIPT_LATIN/*54*/ } },
  { "wa",    { PANGO_SCRIPT_LATIN/*70*/ } },
  { "wen",   { PANGO_SCRIPT_LATIN/*76*/ } },
  { "wo",    { PANGO_SCRIPT_LATIN/*66*/ } },
  { "xh",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "yap",   { PANGO_SCRIPT_LATIN/*58*/ } },
  { "yi",    { PANGO_SCRIPT_HEBREW/*27*/ } },
  { "yo",    { PANGO_SCRIPT_LATIN/*114*/ } },
//  { "zh-cn", { PANGO_SCRIPT_HAN/*6763*/ } },
  { "zh-cn", { EncodingDetector::ChineseSimplified } },
//  { "zh-hk", { PANGO_SCRIPT_HAN/*2213*/ } },
  { "zh-hk", { EncodingDetector::ChineseTraditional } },
//  { "zh-mo", { PANGO_SCRIPT_HAN/*2213*/ } },
  { "zh-mo", { EncodingDetector::ChineseTraditional } },
//  { "zh-sg", { PANGO_SCRIPT_HAN/*6763*/ } },
  { "zh-sg", { EncodingDetector::ChineseSimplified } },
//  { "zh-tw", { PANGO_SCRIPT_HAN/*13063*/ } },
  { "zh-tw", { EncodingDetector::ChineseTraditional } },
  { "zu",    { PANGO_SCRIPT_LATIN/*52*/ } },
  { "\x00",  { EncodingDetector::None } } //end mark
};
|
|
|
// MIB numbers of the codecs this file has to recognise explicitly; they are
// compared against QTextCodec::mibEnum(). Listed in ascending numeric order.
enum MIB
{
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
|
|
|
// Returns true when the codec's MIB number identifies UTF-16 (plain, BE or LE)
// or UCS-2, false for every other codec. 'codec' must not be null.
static bool is16Bit(QTextCodec* codec)
{
    const int mib = codec->mibEnum();
    return mib == MibUtf16
        || mib == MibUtf16BE
        || mib == MibUtf16LE
        || mib == MibUcs2;
}
|
|
|
class EncodingDetectorPrivate |
|
{ |
|
public: |
|
QTextCodec *m_codec; |
|
QTextDecoder *m_decoder; // utf16 |
|
QTextCodec *m_defaultCodec; |
|
QCString m_storeDecoderName; |
|
|
|
EncodingDetector::EncodingChoiceSource m_source; |
|
EncodingDetector::AutoDetectScript m_autoDetectLanguage; |
|
|
|
bool m_visualRTL : 1; |
|
bool m_seenBody : 1; |
|
bool m_writtingHappened : 1; |
|
bool m_analyzeCalled : 1; //for decode() |
|
int m_multiByte; |
|
|
|
QCString m_bufferForDefferedEncDetection; |
|
|
|
EncodingDetectorPrivate() |
|
: m_codec(QTextCodec::codecForMib(MibLatin1)) |
|
, m_decoder(m_codec->makeDecoder()) |
|
, m_defaultCodec(m_codec) |
|
, m_source(EncodingDetector::DefaultEncoding) |
|
, m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection) |
|
, m_visualRTL(false) |
|
, m_seenBody(false) |
|
, m_writtingHappened(false) |
|
, m_analyzeCalled(false) |
|
, m_multiByte(0) |
|
{ |
|
} |
|
|
|
EncodingDetectorPrivate(QTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script) |
|
: m_codec(codec) |
|
, m_decoder(m_codec->makeDecoder()) |
|
, m_defaultCodec(m_codec) |
|
, m_source(source) |
|
, m_autoDetectLanguage(script) |
|
, m_visualRTL(false) |
|
, m_seenBody(false) |
|
, m_writtingHappened(false) |
|
, m_analyzeCalled(false) |
|
, m_multiByte(0) |
|
{ |
|
} |
|
|
|
~EncodingDetectorPrivate() |
|
{ |
|
delete m_decoder; |
|
} |
|
}; |
|
|
|
|
|
// Chooses between the two Arabic 8-bit encodings.
// Returns "cp1256" as soon as one of the first 'size' bytes of 'ptr' lies in a
// range this heuristic treats as cp1256-only; otherwise returns "iso-8859-6".
static QCString automaticDetectionForArabic( const unsigned char* ptr, int size )
{
    for ( int pos = 0; pos < size; ++pos ) {
        const unsigned char c = ptr[ pos ];

        const bool cp1256Byte =
               ( c >= 0x80 && c <= 0x9F )
            || ( c >= 0xA1 && c <= 0xA3 )
            || ( c >= 0xA5 && c <= 0xAB )
            || ( c >= 0xAE && c <= 0xBA )
            || ( c >= 0xBC && c <= 0xBE )
            || c == 0xC0
            || ( c >= 0xDB && c <= 0xDF )
            || c >= 0xF3;

        if ( cp1256Byte )
            return "cp1256";
    }

    return "iso-8859-6";
}
|
|
|
// Chooses between the two Baltic 8-bit encodings.
// Scans the buffer front to back: a byte in 0x80..0x9E settles on "cp1257",
// a byte 0xA1 or 0xA5 settles on "iso-8859-13". Whichever is met first wins;
// if neither occurs the answer is "iso-8859-13".
static QCString automaticDetectionForBaltic( const unsigned char* ptr, int size )
{
    int pos = 0;
    while ( pos < size ) {
        const unsigned char c = ptr[ pos++ ];

        if ( c >= 0x80 && c <= 0x9E )
            return "cp1257";

        if ( c == 0xA1 || c == 0xA5 )
            break;      // decided: same result as the default below
    }

    return "iso-8859-13";
}
|
|
|
// Chooses between the Central European 8-bit encodings.
//
//  - A byte 0x81, 0x83, 0x90 or 0x98 ends the scan at once with "ibm852".
//  - Any other byte in 0x80..0x9F makes "cp1250" the candidate.
//  - One of 0xA5 0xAE 0xBE 0xC3 0xD0 0xE3 0xF0 makes "iso-8859-2" the candidate,
//    but only while no candidate has been chosen yet.
// The scan always continues after choosing a candidate, because a later byte
// may still prove ibm852. Without any candidate the result is "iso-8859-3".
//
// NOTE(review): ISO-8859-3 is the Latin-3 (South European) set; confirm that
// this fallback is intended rather than "iso-8859-2".
static QCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
{
    QCString charset;
    for ( int i = 0; i < size; ++i ) {
        const unsigned char c = ptr[ i ];

        if ( c >= 0x80 && c <= 0x9F ) {
            if ( c == 0x81 || c == 0x83 || c == 0x90 || c == 0x98 )
                return "ibm852";

            charset = "cp1250";     // maybe ibm852 ? keep looking
            continue;
        }

        if ( c == 0xA5 || c == 0xAE || c == 0xBE || c == 0xC3 || c == 0xD0 || c == 0xE3 || c == 0xF0 ) {
            if ( charset.isNull() )
                charset = "iso-8859-2";     // maybe ibm852 ? keep looking
        }
    }

    if ( charset.isNull() )
        charset = "iso-8859-3";

    return charset;
}
|
|
|
// Adds 'weight' to the cp1251 score when the cp1251 reading of a letter is
// strictly more frequent than the KOI8 reading; otherwise, if the letter was
// seen at all, adds it to the KOI8 score (ties go to KOI8).
static void tallyCyrillicVote( int cp1251Count, int koiCount, int weight,
                               int& cp1251Score, int& koiScore )
{
    if (cp1251Count > koiCount)
        cp1251Score += weight;
    else if (cp1251Count || koiCount)
        koiScore += weight;
}

// Guesses the encoding of Cyrillic text.
//
// Counts, starting at the SECOND byte (index 1, because the "st" digraph test
// looks at the previous byte), how often selected byte values occur, and stops
// early once 1000 bytes of the 0xC0..0xFF range have been seen.
//
// Returns:
//   ""        fewer than 8 bytes in the counted ranges - cannot decide
//   "UTF-8"   bytes 0xD0/0xD1 make up more than a third of the counted bytes
//   "ibm866"  bytes 0xA0..0xAF outnumber the 0xC0..0xFF bytes
//   "cp1251" / "koi8-u"  by per-letter voting (a, o, i, s, digraph "st");
//             if the vote is closer than 10 points, by which of the ranges
//             0xE0..0xFF (cp1251) and 0xC0..0xDF (koi) is more populated.
static QCString automaticDetectionForCyrillic( const unsigned char* ptr, int size)
{
#ifdef DECODE_DEBUG
    kWarning() << "EncodingDetector: Cyr heuristics";
#endif

    int utf8_mark = 0;

    int koi_st = 0,        cp1251_st = 0;
    int koi_o = 0,         cp1251_o = 0;
    int koi_a = 0,         cp1251_a = 0;
    int koi_s = 0,         cp1251_s = 0;
    int koi_i = 0,         cp1251_i = 0;
    int koi_o_capital = 0, cp1251_o_capital = 0;
    int koi_a_capital = 0, cp1251_a_capital = 0;
    int koi_i_capital = 0, cp1251_i_capital = 0;
    int koi_s_capital = 0;
    // Byte 0xD1 is always consumed by the utf8_mark test below, so the cp1251
    // "capital s" counter never leaves zero.
    const int cp1251_s_capital = 0;

    int cp1251_small_range = 0;
    int koi_small_range = 0;
    int ibm866_small_range = 0;

    for (int i = 1; i < size && cp1251_small_range + koi_small_range < 1000; ++i)
    {
        const unsigned char cur = ptr[i];

        if (cur > 0xdf)
        {
            ++cp1251_small_range;

            switch (cur)
            {
            case 0xee: ++cp1251_o; break;                       //small o
            case 0xe0: ++cp1251_a; break;                       //small a
            case 0xe8: ++cp1251_i; break;                       //small i
            case 0xf1: ++cp1251_s; break;                       //small s
            case 0xf2:                                          //small st
                if (ptr[i-1] == 0xf1)
                    ++cp1251_st;
                break;
            case 0xef: ++koi_o_capital; break;
            case 0xe1: ++koi_a_capital; break;
            case 0xe9: ++koi_i_capital; break;
            case 0xf3: ++koi_s_capital; break;
            default:   break;
            }
        }
        else if (cur > 0xbf)
        {
            ++koi_small_range;

            switch (cur)
            {
            case 0xd0:
            case 0xd1: ++utf8_mark; break;
            case 0xcf: ++koi_o; break;                          //small o
            case 0xc1: ++koi_a; break;                          //small a
            case 0xc9: ++koi_i; break;                          //small i
            case 0xd3: ++koi_s; break;                          //small s
            case 0xd4:                                          //small st
                if (ptr[i-1] == 0xd3)
                    ++koi_st;
                break;
            case 0xce: ++cp1251_o_capital; break;
            case 0xc0: ++cp1251_a_capital; break;
            case 0xc8: ++cp1251_i_capital; break;
            default:   break;
            }
        }
        else if (cur > 0x9f && cur < 0xb0) //first 16 letterz is 60%
        {
            ++ibm866_small_range;
        }
    }

    const int upperHalfTotal = cp1251_small_range + koi_small_range;
    const int countedTotal = upperHalfTotal + ibm866_small_range;

    //cannot decide?
    if (countedTotal < 8)
        return "";

    if (3*utf8_mark > countedTotal)
    {
#ifdef DECODE_DEBUG
        kWarning() << "Cyr Enc Detection: UTF8";
#endif
        return "UTF-8";
    }

    if (ibm866_small_range > upperHalfTotal)
        return "ibm866";

    int koi_score = 0;
    int cp1251_score = 0;

    // Digraph "st": seen more than once in one reading and never in the other.
    if (cp1251_st == 0 && koi_st > 1)
        koi_score += 10;
    else if (koi_st == 0 && cp1251_st > 1)
        cp1251_score += 10;

    // Digraph "st" seen in both readings: a clear majority (integer ratio
    // above 2) is worth 20 points.
    if (cp1251_st && koi_st)
    {
        if (cp1251_st/koi_st > 2)
            cp1251_score += 20;
        else if (koi_st/cp1251_st > 2)
            koi_score += 20;
    }

    // Small letters weigh 10, capitals 9.
    tallyCyrillicVote(cp1251_a, koi_a, 10, cp1251_score, koi_score);
    tallyCyrillicVote(cp1251_o, koi_o, 10, cp1251_score, koi_score);
    tallyCyrillicVote(cp1251_i, koi_i, 10, cp1251_score, koi_score);
    tallyCyrillicVote(cp1251_s, koi_s, 10, cp1251_score, koi_score);

    tallyCyrillicVote(cp1251_a_capital, koi_a_capital, 9, cp1251_score, koi_score);
    tallyCyrillicVote(cp1251_o_capital, koi_o_capital, 9, cp1251_score, koi_score);
    tallyCyrillicVote(cp1251_i_capital, koi_i_capital, 9, cp1251_score, koi_score);
    tallyCyrillicVote(cp1251_s_capital, koi_s_capital, 9, cp1251_score, koi_score);

#ifdef DECODE_DEBUG
    kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
#endif

    const int gap = koi_score - cp1251_score;
    if (gap > -10 && gap < 10)
    {
        //fallback...
        cp1251_score = cp1251_small_range;
        koi_score = koi_small_range;
    }

    if (cp1251_score > koi_score)
        return "cp1251";

    return "koi8-u";
}
|
|
|
// Chooses between the two Greek 8-bit encodings.
// Returns "cp1253" as soon as one byte of the buffer is in the set this
// heuristic treats as cp1253-only; otherwise returns "iso-8859-7".
static QCString automaticDetectionForGreek( const unsigned char* ptr, int size )
{
    for ( int pos = 0; pos < size; ++pos ) {
        const unsigned char c = ptr[ pos ];

        const bool cp1253Byte =
               c == 0x80
            || ( c >= 0x82 && c <= 0x87 )
            || c == 0x89
            || c == 0x8B
            || ( c >= 0x91 && c <= 0x97 )
            || c == 0x99
            || c == 0x9B
            || c == 0xA4
            || c == 0xA5
            || c == 0xAE;

        if ( cp1253Byte )
            return "cp1253";
    }

    return "iso-8859-7";
}
|
|
|
// Chooses between the two Hebrew 8-bit encodings.
// Scans front to back: a byte from the cp1255 set settles on "cp1255", a byte
// 0xDF settles on "iso-8859-8-i". Whichever is met first wins; if neither
// occurs the answer is "iso-8859-8-i".
static QCString automaticDetectionForHebrew( const unsigned char* ptr, int size )
{
    int pos = 0;
    while ( pos < size ) {
        const unsigned char c = ptr[ pos++ ];

        const bool cp1255Byte =
               c == 0x80
            || ( c >= 0x82 && c <= 0x89 )
            || c == 0x8B
            || ( c >= 0x91 && c <= 0x99 )
            || c == 0x9B
            || c == 0xA1
            || ( c >= 0xBF && c <= 0xC9 )
            || ( c >= 0xCB && c <= 0xD8 );

        if ( cp1255Byte )
            return "cp1255";

        if ( c == 0xDF )
            break;      // decided: same result as the default below
    }

    return "iso-8859-8-i";
}
|
|
|
// Asks JapaneseCode::guess_jp() to classify the buffer and translates its
// verdict into a codec name: "jis7", "eucjp", "sjis" or "utf8".
// Any other verdict yields an empty string.
static QCString automaticDetectionForJapanese( const unsigned char* ptr, int size )
{
    JapaneseCode guesser;
    const char* codecName = "";

    switch ( guesser.guess_jp( reinterpret_cast<const char*>( ptr ), size ) ) {
    case JapaneseCode::JIS:
        codecName = "jis7";
        break;
    case JapaneseCode::EUC:
        codecName = "eucjp";
        break;
    case JapaneseCode::SJIS:
        codecName = "sjis";
        break;
    case JapaneseCode::UTF8:
        codecName = "utf8";
        break;
    default:
        break;
    }

    return codecName;
}
|
|
|
// Chooses between the two Turkish 8-bit encodings.
// Returns "cp1254" as soon as one byte of the buffer is 0x80, 0x82..0x8C,
// 0x91..0x9C or 0x9F; otherwise returns "iso-8859-9".
static QCString automaticDetectionForTurkish( const unsigned char* ptr, int size )
{
    for ( int pos = 0; pos < size; ++pos ) {
        const unsigned char c = ptr[ pos ];

        const bool cp1254Byte =
               c == 0x80
            || ( c >= 0x82 && c <= 0x8C )
            || ( c >= 0x91 && c <= 0x9C )
            || c == 0x9F;

        if ( cp1254Byte )
            return "cp1254";
    }

    return "iso-8859-9";
}
|
|
|
// Guesses the encoding of Western European text.
//
// Scans the first 'size' bytes of 'ptr' and looks only at non-ASCII bytes
// (values above 0x7F):
//   "UTF-8"        a byte 0xC2..0xEF directly followed by a byte 0x80..0xBF
//   "cp1252"       a byte 0x80..0x9F; cp1252 assigns printable characters
//                  there, whereas ISO-8859-15 has only C1 control codes
//   "iso-8859-15"  non-ASCII bytes were seen but none of the above matched
//   ""             the buffer is pure ASCII (or empty) - nothing to decide
// The first byte that matches one of the first two rules decides.
static QCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
{
    bool sawNonAscii = false;

    for (int i = 0; i < size; ++i)
    {
        const unsigned char c = ptr[i];
        if (c <= 0x7f)
            continue;           // plain ASCII carries no information

        sawNonAscii = true;

        // Lead byte of a 2- or 3-byte UTF-8 sequence plus one continuation byte.
        if (c > 0xc1 && c < 0xf0 && i+1 < size && ptr[i+1] > 0x7f && ptr[i+1] < 0xc0)
            return "UTF-8";

        // c is already known to be >= 0x80 here.
        if (c <= 0x9f)
            return "cp1252";
    }

    if (sawNonAscii)
        return "iso-8859-15";

    return "";
}
|
|
|
bool EncodingDetector::errorsIfUtf8 (const char* data, int length) |
|
{ |
|
if (d->m_codec->mibEnum()!=MibUtf8) |
|
return false; //means no errors |
|
// #define highest1Bits (unsigned char)0x80 |
|
// #define highest2Bits (unsigned char)0xC0 |
|
// #define highest3Bits (unsigned char)0xE0 |
|
// #define highest4Bits (unsigned char)0xF0 |
|
// #define highest5Bits (unsigned char)0xF8 |
|
static const unsigned char highest1Bits = 0x80; |
|
static const unsigned char highest2Bits = 0xC0; |
|
static const unsigned char highest3Bits = 0xE0; |
|
static const unsigned char highest4Bits = 0xF0; |
|
static const unsigned char highest5Bits = 0xF8; |
|
|
|
for (int i=0; i<length; ++i) |
|
{ |
|
unsigned char c = data[i]; |
|
|
|
if (d->m_multiByte>0) |
|
{ |
|
if ((c & highest2Bits) == 0x80) |
|
{ |
|
--(d->m_multiByte); |
|
continue; |
|
} |
|
#ifdef DECODE_DEBUG |
|
kWarning() << "EncDetector: Broken UTF8"; |
|
#endif |
|
return true; |
|
} |
|
|
|
// most significant bit zero, single char |
|
if ((c & highest1Bits) == 0x00) |
|
continue; |
|
|
|
// 110xxxxx => init 1 following bytes |
|
if ((c & highest3Bits) == 0xC0) |
|
{ |
|
d->m_multiByte = 1; |
|
continue; |
|
} |
|
|
|
// 1110xxxx => init 2 following bytes |
|
if ((c & highest4Bits) == 0xE0) |
|
{ |
|
d->m_multiByte = 2; |
|
continue; |
|
} |
|
|
|
// 11110xxx => init 3 following bytes |
|
if ((c & highest5Bits) == 0xF0) |
|
{ |
|
d->m_multiByte = 3; |
|
continue; |
|
} |
|
#ifdef DECODE_DEBUG |
|
kWarning() << "EncDetector:_Broken UTF8"; |
|
#endif |
|
return true; |
|
} |
|
return false; |
|
} |
|
|
|
// Creates a detector whose private state is default-constructed.
EncodingDetector::EncodingDetector()
    : d(new EncodingDetectorPrivate())
{
}
|
|
|
// Creates a detector that starts with the given codec, records 'source' as
// the origin of that choice and uses 'script' for automatic detection.
EncodingDetector::EncodingDetector(QTextCodec* codec,
                                   EncodingChoiceSource source,
                                   AutoDetectScript script)
    : d(new EncodingDetectorPrivate(codec, source, script))
{
}
|
|
|
// Releases the private state allocated by the constructors.
EncodingDetector::~EncodingDetector()
{
    delete d;
}
|
|
|
// Stores the script to be used for automatic encoding detection.
void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang )
{
    d->m_autoDetectLanguage = lang;
}
|
// Returns the script currently stored for automatic encoding detection.
EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const
{
    const AutoDetectScript script = d->m_autoDetectLanguage;
    return script;
}
|
|
|
// Returns the recorded origin of the current codec choice.
EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const
{
    const EncodingChoiceSource origin = d->m_source;
    return origin;
}
|
|
|
const char* EncodingDetector::encoding() const |
|
{ |
|
d->m_storeDecoderName = d->m_codec->name(); |
|
return d->m_storeDecoderName.data(); |
|
} |
|
|
|
bool EncodingDetector::visuallyOrdered() const |
|
{ |
|
return d->m_visualRTL; |
|
} |
|
|
|
// const QTextCodec* EncodingDetector::codec() const |
|
// { |
|
// return d->m_codec; |
|
// } |
|
|
|
// Returns the decoder belonging to the current codec. The decoder stays owned
// by the detector and is replaced whenever the codec changes.
QTextDecoder* EncodingDetector::decoder()
{
    QTextDecoder* const current = d->m_decoder;
    return current;
}
|
|
|
bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type) |
|
{ |
|
QTextCodec *codec; |
|
QCString enc(_encoding); |
|
if(/*enc.isNull() || */enc.isEmpty()) |
|
{ |
|
if (type==DefaultEncoding) |
|
codec=d->m_defaultCodec; |
|
else |
|
return false; |
|
} |
|
else |
|
{ |
|
//QString->QTextCodec |
|
|
|
enc = enc.lower(); |
|
// hebrew visually ordered |
|
if(enc=="visual") |
|
enc="iso8859-8"; |
|
bool b; |
|
codec = KGlobal::charsets()->codecForName(enc, b); |
|
if (!b) |
|
return false; |
|
} |
|
|
|
if (d->m_codec->mibEnum()==codec->mibEnum()) |
|
return true; |
|
|
|
if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec)) |
|
{ |
|
//Sometimes the codec specified is absurd, i.e. UTF-16 despite |
|
//us decoding a meta tag as ASCII. In that case, ignore it. |
|
return false; |
|
} |
|
|
|
if (codec->mibEnum() == Mib8859_8) |
|
{ |
|
//We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself. |
|
codec = QTextCodec::codecForName("iso8859-8-i"); |
|
|
|
// visually ordered unless one of the following |
|
if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical")) |
|
d->m_visualRTL = true; |
|
} |
|
|
|
d->m_codec = codec; |
|
d->m_source = type; |
|
delete d->m_decoder; |
|
d->m_decoder = d->m_codec->makeDecoder(); |
|
#ifdef DECODE_DEBUG |
|
kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name(); |
|
#endif |
|
return true; |
|
} |
|
|
|
bool EncodingDetector::analyze(const QByteArray &data) |
|
{ |
|
return analyze( data.data(), data.size() ); |
|
} |
|
|
|
bool EncodingDetector::analyze(const char *data, int len) |
|
{ |
|
// Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. |
|
// maximumBOMLength = 10 |
|
// Even if the user has chosen utf16 we still need to auto-detect the endianness |
|
if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) |
|
{ |
|
// Extract the first three bytes. |
|
const uchar *udata = (const uchar *)data; |
|
uchar c1 = *udata++; |
|
uchar c2 = *udata++; |
|
uchar c3 = *udata++; |
|
|
|
// Check for the BOM |
|
const char *autoDetectedEncoding; |
|
if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) |
|
{ |
|
autoDetectedEncoding = "ISO-10646-UCS-2"; |
|
} |
|
else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) |
|
{ |
|
autoDetectedEncoding = "UTF-8"; |
|
} |
|
else if (c1 == 0x00 || c2 == 0x00) |
|
{ |
|
uchar c4 = *udata++; |
|
uchar c5 = *udata++; |
|
uchar c6 = *udata++; |
|
uchar c7 = *udata++; |
|
uchar c8 = *udata++; |
|
uchar c9 = *udata++; |
|
uchar c10 = *udata++; |
|
|
|
int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0); |
|
int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0); |
|
if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0)) |
|
autoDetectedEncoding = "ISO-10646-UCS-2"; |
|
else |
|
autoDetectedEncoding = 0; |
|
} |
|
else |
|
{ |
|
autoDetectedEncoding = 0; |
|
} |
|
|
|
// If we found a BOM, use the encoding it implies. |
|
if (autoDetectedEncoding != 0) |
|
{ |
|
d->m_source = BOM; |
|
d->m_codec = QTextCodec::codecForName(autoDetectedEncoding); |
|
assert(d->m_codec); |
|
//enc = d->m_codec->name(); |
|
delete d->m_decoder; |
|
d->m_decoder = d->m_codec->makeDecoder(); |
|
#ifdef DECODE_DEBUG |
|
kWarning() << "Detection by BOM"; |
|
#endif |
|
if (is16Bit(d->m_codec) && c2==0x00) |
|
{ |
|
// utf16LE, we need to put the decoder in LE mode |
|
char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00}; |
|
d->m_decoder->toUnicode(reverseUtf16, 2); |
|
} |
|
return true; |
|
} |
|
} |
|
|
|
//exit from routine in case it was called to only detect byte order for utf-16 |
|
if (d->m_source==UserChosenEncoding) |
|
{ |
|
#ifdef DECODE_DEBUG |
|
kWarning() << "EncodingDetector: UserChosenEncoding exit "; |
|
#endif |
|
|
|
if (errorsIfUtf8(data, len)) |
|
setEncoding("",DefaultEncoding); |
|
return true; |
|
} |
|
#if 0 //This is for plaintext, so don't try to parse HTML headers -- ahartmetz |
|
if (!d->m_seenBody) |
|
{ |
|
// we still don't have an encoding, and are in the head |
|
// the following tags are allowed in <head>: |
|
// SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE |
|
const char *ptr = data; |
|
const char *pEnd = data+len; |
|
|
|
while(ptr != pEnd) |
|
{ |
|
if(*ptr!='<') |
|
{ |
|
++ptr; |
|
continue; |
|
} |
|
++ptr; |
|
// Handle comments. |
|
if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') |
|
{ |
|
ptr += 3; |
|
skipComment(ptr, pEnd); |
|
continue; |
|
} |
|
|
|
// Handle XML header, which can have encoding in it. |
|
if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l') |
|
{ |
|
const char *end = ptr; |
|
while (*end != '>' && end < pEnd) |
|
end++; |
|
if (*end == '\0' || end == pEnd) |
|
break; |
|
QCString str(ptr, end - ptr + 1); |
|
int length; |
|
int pos = findXMLEncoding(str, length); |
|
// also handles the case when specified encoding aint correct |
|
if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader)) |
|
{ |
|
return true; |
|
} |
|
} |
|
|
|
//look for <meta>, stop if we reach <body> |
|
while ( |
|
!((*ptr >= 'a') && (*ptr <= 'z') || |
|
(*ptr >= 'A') && (*ptr <= 'Z')) |
|
&& ptr < pEnd |
|
) |
|
++ptr; |
|
|
|
char tmp[5]; |
|
int length=0; |
|
const char* max=ptr+4; |
|
if (pEnd<max) |
|
max=pEnd; |
|
while ( |
|
((*ptr >= 'a') && (*ptr <= 'z') || |
|
(*ptr >= 'A') && (*ptr <= 'Z') || |
|
(*ptr >= '0') && (*ptr <= '9')) |
|
&& ptr < max |
|
) |
|
{ |
|
tmp[length] = tolower( *ptr ); |
|
++ptr; |
|
++length; |
|
} |
|
tmp[length] = 0; |
|
if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a') |
|
{ |
|
// found a meta tag... |
|
const char* end = ptr; |
|
while(*end != '>' && *end != '\0' && end<pEnd) |
|
end++; |
|
//if ( *end == '\0' ) break; |
|
QCString str( ptr, (end-ptr)+1); |
|
str = str.lower(); |
|
int pos=0; |
|
//if( (pos = str.find("http-equiv", pos)) == -1) break; |
|
//if( (pos = str.find("content-type", pos)) == -1) break; |
|
if( (pos = str.find("charset")) == -1) |
|
continue; |
|
pos+=6; |
|
// skip to '=' |
|
if( (pos = str.find('=', pos)) == -1) |
|
continue; |
|
|
|
// skip whitespace before encoding itself |
|
while (pos < (int)str.length() && str[pos] <= ' ') |
|
++pos; |
|
if ( pos == (int)str.length()) |
|
continue; |
|
|
|
int endpos = pos; |
|
while( endpos < str.length() && |
|
(str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' |
|
&& str[endpos] != ';' && str[endpos] != '>') ) |
|
++endpos; |
|
#ifdef DECODE_DEBUG |
|
kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data(); |
|
#endif |
|
if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag)) |
|
return true; |
|
} |
|
else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y') |
|
{ |
|
d->m_seenBody=true; |
|
break; |
|
} |
|
} |
|
} |
|
|
|
if (d->m_source==EncodingFromHTTPHeader) |
|
return true; |
|
#endif |
|
//if (len<20) //make a guess even if the file is short -- ahartmetz |
|
if (len < 1) |
|
{ |
|
setEncoding("",DefaultEncoding); |
|
return false; |
|
} |
|
#ifdef DECODE_DEBUG |
|
kDebug( 6005 ) << "EncodingDetector: using heuristics (" << strlen(data) << ")"; |
|
#endif |
|
|
|
switch ( d->m_autoDetectLanguage ) |
|
{ |
|
case EncodingDetector::Arabic: |
|
return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding); |
|
// break; |
|
case EncodingDetector::Baltic: |
|
return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding); |
|
// break; |
|
case EncodingDetector::CentralEuropean: |
|
return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding); |
|
break; |
|
case EncodingDetector::Cyrillic: |
|
return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding); |
|
// break; |
|
case EncodingDetector::Greek: |
|
return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding); |
|
// break; |
|
case EncodingDetector::Hebrew: |
|
return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding); |
|
// break; |
|
case EncodingDetector::Japanese: |
|
return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding); |
|
// break; |
|
case EncodingDetector::Turkish: |
|
return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding); |
|
// break; |
|
case EncodingDetector::WesternEuropean: |
|
if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding)) |
|
return true; |
|
else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for khtml |
|
{ |
|
return setEncoding("iso-8859-15",AutoDetectedEncoding); |
|
} |
|
else //use default provided by eg katepart |
|
{ |
|
return setEncoding("",DefaultEncoding); |
|
} |
|
// break; |
|
case EncodingDetector::SemiautomaticDetection: |
|
case EncodingDetector::ChineseSimplified: |
|
case EncodingDetector::ChineseTraditional: |
|
case EncodingDetector::Korean: |
|
case EncodingDetector::Thai: |
|
case EncodingDetector::Unicode: |
|
case EncodingDetector::NorthernSaami: |
|
case EncodingDetector::SouthEasternEurope: |
|
case EncodingDetector::None: |
|
// huh. somethings broken in this code ### FIXME |
|
//enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback. |
|
break; |
|
} |
|
|
|
setEncoding("",DefaultEncoding); |
|
return true; |
|
} |
|
|
|
|
|
// Maps a translated script name back to its AutoDetectScript value.
//
// lang is compared against the i18n() translations (context
// "@item Text character set") of the script names, i.e. the same strings
// that nameForScript() returns.
//
// Returns EncodingDetector::None when lang is empty or matches no known name.
//
// Every name nameForScript() can produce is recognised here, including
// "Chinese Traditional", "Chinese Simplified", "Korean" and "Thai", so that
// scriptForName(nameForScript(s)) gives s back for all named scripts.
EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const QString& lang)
{
    if (lang.isEmpty())
        return EncodingDetector::None;

    // The i18n() calls keep their string literals inline so that message
    // extraction still sees them.
    if (lang == i18n("@item Text character set", "Unicode"))
        return EncodingDetector::Unicode;
    if (lang == i18n("@item Text character set", "Cyrillic"))
        return EncodingDetector::Cyrillic;
    if (lang == i18n("@item Text character set", "Western European"))
        return EncodingDetector::WesternEuropean;
    if (lang == i18n("@item Text character set", "Central European"))
        return EncodingDetector::CentralEuropean;
    if (lang == i18n("@item Text character set", "Greek"))
        return EncodingDetector::Greek;
    if (lang == i18n("@item Text character set", "Hebrew"))
        return EncodingDetector::Hebrew;
    if (lang == i18n("@item Text character set", "Turkish"))
        return EncodingDetector::Turkish;
    if (lang == i18n("@item Text character set", "Japanese"))
        return EncodingDetector::Japanese;
    if (lang == i18n("@item Text character set", "Baltic"))
        return EncodingDetector::Baltic;
    if (lang == i18n("@item Text character set", "Arabic"))
        return EncodingDetector::Arabic;

    // Names that nameForScript() hands out but that previously could not be
    // mapped back to their script.
    if (lang == i18n("@item Text character set", "Chinese Traditional"))
        return EncodingDetector::ChineseTraditional;
    if (lang == i18n("@item Text character set", "Chinese Simplified"))
        return EncodingDetector::ChineseSimplified;
    if (lang == i18n("@item Text character set", "Korean"))
        return EncodingDetector::Korean;
    if (lang == i18n("@item Text character set", "Thai"))
        return EncodingDetector::Thai;

    return EncodingDetector::None;
}
|
|
|
// Tells whether the given script is in the set this class reports as having
// automatic detection. Returns true for the scripts listed below and false
// for every other AutoDetectScript value.
bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script)
{
    bool supported = false;

    switch (script)
    {
        case EncodingDetector::Arabic:
        case EncodingDetector::Baltic:
        case EncodingDetector::CentralEuropean:
        case EncodingDetector::Cyrillic:
        case EncodingDetector::Greek:
        case EncodingDetector::Hebrew:
        case EncodingDetector::Japanese:
        case EncodingDetector::Turkish:
        case EncodingDetector::WesternEuropean:
        case EncodingDetector::ChineseTraditional:
        case EncodingDetector::ChineseSimplified:
        case EncodingDetector::Unicode:
            supported = true;
            break;

        default:
            // Any script not named above is reported as unsupported.
            break;
    }

    return supported;
}
|
|
|
// Returns the translated, user-visible name of a script.
//
// Each name is looked up through i18n() with the context
// "@item Text character set". Scripts without a name here (every value not
// listed in the switch) yield a default-constructed QString.
QString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script)
{
    switch (script)
    {
        case EncodingDetector::Arabic:
            return i18n("@item Text character set", "Arabic");

        case EncodingDetector::Baltic:
            return i18n("@item Text character set", "Baltic");

        case EncodingDetector::CentralEuropean:
            return i18n("@item Text character set", "Central European");

        case EncodingDetector::Cyrillic:
            return i18n("@item Text character set", "Cyrillic");

        case EncodingDetector::Greek:
            return i18n("@item Text character set", "Greek");

        case EncodingDetector::Hebrew:
            return i18n("@item Text character set", "Hebrew");

        case EncodingDetector::Japanese:
            return i18n("@item Text character set", "Japanese");

        case EncodingDetector::Turkish:
            return i18n("@item Text character set", "Turkish");

        case EncodingDetector::WesternEuropean:
            return i18n("@item Text character set", "Western European");

        case EncodingDetector::ChineseTraditional:
            return i18n("@item Text character set", "Chinese Traditional");

        case EncodingDetector::ChineseSimplified:
            return i18n("@item Text character set", "Chinese Simplified");

        case EncodingDetector::Korean:
            return i18n("@item Text character set", "Korean");

        case EncodingDetector::Thai:
            return i18n("@item Text character set", "Thai");

        case EncodingDetector::Unicode:
            return i18n("@item Text character set", "Unicode");

        //case EncodingDetector::SemiautomaticDetection:
        default:
            break;
    }

    // No name is defined for the remaining scripts.
    return QString();
}
|
|
|
// Chooses the script to auto-detect for a language/locale code.
//
// Walks pango_script_for_lang in table order and returns the script of the
// first entry whose language tag is a prefix of lc. If no entry matches, the
// script stored in the table's terminating entry is returned.
//
// Assumes the table ends with an entry whose language tag is empty (or a null
// pointer); the table definition is outside this function -- confirm there.
EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const QString &lc)
{
    // It might make sense to do something special if the locale ends with
    // ".UTF-8" or "@utf8"

    // There is obvious optimization potential...
    int i = 0;
    for ( ;; ++i ) {
        const char *langStr = pango_script_for_lang[i].lang;

        // Stop explicitly at the terminating entry instead of relying on
        // startsWith() matching its empty tag: if that match ever fails the
        // loop would run past the end of the table.
        // NOTE(review): a null lc looks like such a case with Qt 3's
        // QString::startsWith() -- confirm.
        if ( !langStr || *langStr == '\0' )
            break;

        if ( lc.startsWith( QString::fromAscii( langStr ) ) )
            return pango_script_for_lang[i].scripts[0];
    }

    // Same value the prefix match on the terminating entry used to produce.
    return pango_script_for_lang[i].scripts[0];
}
|
|
|
#undef DECODE_DEBUG |
|
|
|
|