From bfb7e863cc6cd20f9f32389bee289607a212b12b Mon Sep 17 00:00:00 2001 From: Silent Date: Sun, 14 Oct 2018 14:09:35 +0200 Subject: [PATCH] Update uchardet to 0.0.6 to improve UTF-8 detection quality This PR updates uchardet (https://www.freedesktop.org/wiki/Software/uchardet/) to the newest stable version (0.0.6). This version seems to improve UTF-8 detection quality, so updating this dependency is expected to squash numerous charset detection issues. Fixes #4878, Fixes #4767, Fixes #4428, Fixes #4246, Fixes #3705, Fixes #3588, Fixes #431, Closes #4925 --- PowerEditor/src/uchardet/CharDistribution.h | 10 +- PowerEditor/src/uchardet/JpCntx.cpp | 22 +- PowerEditor/src/uchardet/LangGreekModel.cpp | 242 ---- .../src/uchardet/LangHungarianModel.cpp | 238 ---- .../uchardet/LangModels/LangArabicModel.cpp | 265 +++++ .../{ => LangModels}/LangBulgarianModel.cpp | 64 +- .../uchardet/LangModels/LangDanishModel.cpp | 198 ++++ .../LangModels/LangEsperantoModel.cpp | 141 +++ .../uchardet/LangModels/LangFrenchModel.cpp | 206 ++++ .../uchardet/LangModels/LangGermanModel.cpp | 168 +++ .../uchardet/LangModels/LangGreekModel.cpp | 229 ++++ .../{ => LangModels}/LangHebrewModel.cpp | 41 +- .../LangModels/LangHungarianModel.cpp | 169 +++ .../LangRussianModel.cpp} | 154 +-- .../uchardet/LangModels/LangSpanishModel.cpp | 201 ++++ .../src/uchardet/LangModels/LangThaiModel.cpp | 265 +++++ .../uchardet/LangModels/LangTurkishModel.cpp | 173 +++ .../LangModels/LangVietnameseModel.cpp | 247 ++++ PowerEditor/src/uchardet/LangThaiModel.cpp | 220 ---- PowerEditor/src/uchardet/README.TXT | 10 - PowerEditor/src/uchardet/README.md | 294 +++++ PowerEditor/src/uchardet/nsBig5Prober.h | 2 +- PowerEditor/src/uchardet/nsCharSetProber.cpp | 10 +- .../src/uchardet/nsCodingStateMachine.h | 211 ++-- PowerEditor/src/uchardet/nsEUCTWProber.h | 2 +- .../src/uchardet/nsEscCharsetProber.cpp | 10 +- PowerEditor/src/uchardet/nsEscSM.cpp | 37 +- PowerEditor/src/uchardet/nsGB2312Prober.h | 4 +- 
PowerEditor/src/uchardet/nsHebrewProber.cpp | 2 +- PowerEditor/src/uchardet/nsLatin1Prober.h | 2 +- .../src/uchardet/nsMBCSGroupProber.cpp | 9 +- PowerEditor/src/uchardet/nsMBCSSM.cpp | 1020 +++++++++-------- PowerEditor/src/uchardet/nsPkgInt.h | 182 ++- .../src/uchardet/nsSBCSGroupProber.cpp | 72 +- PowerEditor/src/uchardet/nsSBCSGroupProber.h | 2 +- .../src/uchardet/nsSBCharSetProber.cpp | 38 +- PowerEditor/src/uchardet/nsSBCharSetProber.h | 306 ++--- PowerEditor/src/uchardet/nsSJISProber.cpp | 1 - PowerEditor/src/uchardet/nsSJISProber.h | 2 +- .../src/uchardet/nsUniversalDetector.cpp | 108 +- .../src/uchardet/nsUniversalDetector.h | 2 +- PowerEditor/src/uchardet/uchardet.cpp | 27 +- PowerEditor/src/uchardet/uchardet.h | 12 +- PowerEditor/visual.net/notepadPlus.vcxproj | 20 +- 44 files changed, 3836 insertions(+), 1802 deletions(-) delete mode 100644 PowerEditor/src/uchardet/LangGreekModel.cpp delete mode 100644 PowerEditor/src/uchardet/LangHungarianModel.cpp create mode 100644 PowerEditor/src/uchardet/LangModels/LangArabicModel.cpp rename PowerEditor/src/uchardet/{ => LangModels}/LangBulgarianModel.cpp (89%) create mode 100644 PowerEditor/src/uchardet/LangModels/LangDanishModel.cpp create mode 100644 PowerEditor/src/uchardet/LangModels/LangEsperantoModel.cpp create mode 100644 PowerEditor/src/uchardet/LangModels/LangFrenchModel.cpp create mode 100644 PowerEditor/src/uchardet/LangModels/LangGermanModel.cpp create mode 100644 PowerEditor/src/uchardet/LangModels/LangGreekModel.cpp rename PowerEditor/src/uchardet/{ => LangModels}/LangHebrewModel.cpp (92%) create mode 100644 PowerEditor/src/uchardet/LangModels/LangHungarianModel.cpp rename PowerEditor/src/uchardet/{LangCyrillicModel.cpp => LangModels/LangRussianModel.cpp} (80%) create mode 100644 PowerEditor/src/uchardet/LangModels/LangSpanishModel.cpp create mode 100644 PowerEditor/src/uchardet/LangModels/LangThaiModel.cpp create mode 100644 PowerEditor/src/uchardet/LangModels/LangTurkishModel.cpp create mode 100644 
PowerEditor/src/uchardet/LangModels/LangVietnameseModel.cpp delete mode 100644 PowerEditor/src/uchardet/LangThaiModel.cpp delete mode 100644 PowerEditor/src/uchardet/README.TXT create mode 100644 PowerEditor/src/uchardet/README.md diff --git a/PowerEditor/src/uchardet/CharDistribution.h b/PowerEditor/src/uchardet/CharDistribution.h index 5b22295d..814aeb39 100644 --- a/PowerEditor/src/uchardet/CharDistribution.h +++ b/PowerEditor/src/uchardet/CharDistribution.h @@ -50,7 +50,7 @@ public: CharDistributionAnalysis() {Reset(PR_FALSE);} //feed a block of data and do distribution analysis - void HandleData(const char*, PRUint32) {} + void HandleData(const char* /*aBuf*/, PRUint32 /*aLen*/) {} //Feed a character with known length void HandleOneChar(const char* aStr, PRUint32 aCharLen) @@ -96,7 +96,7 @@ protected: //we do not handle character base on its original encoding string, but //convert this encoding string to a number, here called order. //This allow multiple encoding of a language to share one frequency table - virtual PRInt32 GetOrder(const char* ) {return -1;} + virtual PRInt32 GetOrder(const char* /*str*/) {return -1;} //If this flag is set to PR_TRUE, detection is done and conclusion has been made PRBool mDone; @@ -128,12 +128,12 @@ public: EUCTWDistributionAnalysis(); protected: - //for euc-TW encoding, we are interested + //for EUC-TW encoding, we are interested // first byte range: 0xc4 -- 0xfe // second byte range: 0xa1 -- 0xfe //no validation needed here. 
State machine has done that - PRInt32 GetOrder(const char* str) { - if ((unsigned char)*str >= (unsigned char)0xc4) + PRInt32 GetOrder(const char* str) + { if ((unsigned char)*str >= (unsigned char)0xc4) return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1; else return -1; diff --git a/PowerEditor/src/uchardet/JpCntx.cpp b/PowerEditor/src/uchardet/JpCntx.cpp index 7da04139..f834e3c3 100644 --- a/PowerEditor/src/uchardet/JpCntx.cpp +++ b/PowerEditor/src/uchardet/JpCntx.cpp @@ -195,16 +195,16 @@ float JapaneseContextAnalysis::GetConfidence(void) PRInt32 SJISContextAnalysis::GetOrder(const char* str, PRUint32 *charLen) { //find out current char's byte length - if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f || - (unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xfc ) + if (((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) || + ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xfc)) *charLen = 2; - else + else *charLen = 1; //return its order if it is hiragana - if (*str == '\202' && - (unsigned char)*(str+1) >= (unsigned char)0x9f && - (unsigned char)*(str+1) <= (unsigned char)0xf1) + if (*str == '\202' && + (unsigned char)*(str+1) >= (unsigned char)0x9f && + (unsigned char)*(str+1) <= (unsigned char)0xf1) return (unsigned char)*(str+1) - (unsigned char)0x9f; return -1; } @@ -213,17 +213,17 @@ PRInt32 EUCJPContextAnalysis::GetOrder(const char* str, PRUint32 *charLen) { //find out current char's byte length if ((unsigned char)*str == (unsigned char)0x8e || - (unsigned char)*str >= (unsigned char)0xa1 && - (unsigned char)*str <= (unsigned char)0xfe) + ((unsigned char)*str >= (unsigned char)0xa1 && + (unsigned char)*str <= (unsigned char)0xfe)) *charLen = 2; else if ((unsigned char)*str == (unsigned char)0x8f) - *charLen = 3; + *charLen = 3; else - *charLen = 1; + 
*charLen = 1; //return its order if it is hiragana if ((unsigned char)*str == (unsigned char)0xa4 && - (unsigned char)*(str+1) >= (unsigned char)0xa1 && + (unsigned char)*(str+1) >= (unsigned char)0xa1 && (unsigned char)*(str+1) <= (unsigned char)0xf3) return (unsigned char)*(str+1) - (unsigned char)0xa1; return -1; diff --git a/PowerEditor/src/uchardet/LangGreekModel.cpp b/PowerEditor/src/uchardet/LangGreekModel.cpp deleted file mode 100644 index d90ced9d..00000000 --- a/PowerEditor/src/uchardet/LangGreekModel.cpp +++ /dev/null @@ -1,242 +0,0 @@ -/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is Mozilla Communicator client code. - * - * The Initial Developer of the Original Code is - * Netscape Communications Corporation. - * Portions created by the Initial Developer are Copyright (C) 1998 - * the Initial Developer. All Rights Reserved. - * - * Contributor(s): - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. 
If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. - * - * ***** END LICENSE BLOCK ***** */ - -#include "nsSBCharSetProber.h" -/**************************************************************** -255: Control characters that usually does not exist in any text -254: Carriage/Return -253: symbol (punctuation) that does not belong to word -252: 0 - 9 - -*****************************************************************/ - -//Character Mapping Table: -static const unsigned char Latin7_CharToOrderMap[] = -{ -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40 - 79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, //50 -253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60 - 78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, //70 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //80 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //90 -+253,233, 90,253,253,253,253,253,253,253,253,253,253, 74,253,253, //a0 -253,253,253,253,247,248, 61, 36, 46, 71, 73,253, 54,253,108,123, //b0 -110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0 - 35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0 -124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 
16, 10, 6, 30, 4, //e0 - 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0 -}; - - - -static const unsigned char win1253_CharToOrderMap[] = -{ -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40 - 79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, //50 -253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60 - 78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, //70 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //80 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //90 -+253,233, 61,253,253,253,253,253,253,253,253,253,253, 74,253,253, //a0 -253,253,253,253,247,253,253, 36, 46, 71, 73,253, 54,253,108,123, //b0 -110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0 - 35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0 -124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0 - 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0 -}; - -//Model Table: -//total sequences: 100% -//first 512 sequences: 98.2851% -//first 1024 sequences:1.7001% -//rest sequences: 0.0359% -//negative sequences: 0.0148% -static const PRUint8 GreekLangModel[] = -{ -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0, -3,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,0,3,3,0,3,2,3,3,0,3,2,3,3,3,0,0,3,0,3,0,3,3,2,0,0,0, -2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, -0,2,3,2,2,3,3,3,3,3,3,3,3,0,3,3,3,3,0,2,3,3,0,3,3,3,3,2,3,3,3,0, 
-2,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,2,1,3,3,3,3,2,3,3,2,3,3,2,0, -0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,0, -2,0,1,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,2,3,0,0,0,0,3,3,0,3,1,3,3,3,0,3,3,0,3,3,3,3,0,0,0,0, -2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,0,3,0,3,3,3,3,3,0,3,2,2,2,3,0,2,3,3,3,3,3,2,3,3,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,3,2,2,2,3,3,3,3,0,3,1,3,3,3,3,2,3,3,3,3,3,3,3,2,2,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,2,0,3,0,0,0,3,3,2,3,3,3,3,3,0,0,3,2,3,0,2,3,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,3,3,3,3,0,0,3,3,0,2,3,0,3,0,3,3,3,0,0,3,0,3,0,2,2,3,3,0,0, -0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,2,0,3,2,3,3,3,3,0,3,3,3,3,3,0,3,3,2,3,2,3,3,2,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,2,3,2,3,3,3,3,3,3,0,2,3,2,3,2,2,2,3,2,3,3,2,3,0,2,2,2,3,0, -2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,0,0,0,3,3,3,2,3,3,0,0,3,0,3,0,0,0,3,2,0,3,0,3,0,0,2,0,2,0, -0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,0,0,0,3,3,0,3,3,3,0,0,1,2,3,0, -3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,2,0,0,3,2,2,3,3,0,3,3,3,3,3,2,1,3,0,3,2,3,3,2,1,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,3,0,2,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,3,0,3,2,3,0,0,3,3,3,0, -3,0,0,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,0,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,2,0,3,2,3,0,0,3,2,3,0, 
-2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,1,2,2,3,3,3,3,3,3,0,2,3,0,3,0,0,0,3,3,0,3,0,2,0,0,2,3,1,0, -2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,3,3,3,3,0,3,0,3,3,2,3,0,3,3,3,3,3,3,0,3,3,3,0,2,3,0,0,3,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,3,3,3,0,0,3,0,0,0,3,3,0,3,0,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,0,0,0,3,3,3,3,3,3,0,0,3,0,2,0,0,0,3,3,0,3,0,3,0,0,2,0,2,0, -0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,3,0,3,0,2,0,3,2,0,3,2,3,2,3,0,0,3,2,3,2,3,3,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,0,0,2,3,3,3,3,3,0,0,0,3,0,2,1,0,0,3,2,2,2,0,3,0,0,2,2,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,3,3,3,2,0,3,0,3,0,3,3,0,2,1,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,3,3,3,0,3,3,3,3,3,3,0,2,3,0,3,0,0,0,2,1,0,2,2,3,0,0,2,2,2,0, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,0,0,2,3,3,3,2,3,0,0,1,3,0,2,0,0,0,0,3,0,1,0,2,0,0,1,1,1,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,1,0,3,0,0,0,3,2,0,3,2,3,3,3,0,0,3,0,3,2,2,2,1,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,3,3,3,0,0,3,0,0,0,0,2,0,2,3,3,2,2,2,2,3,0,2,0,2,2,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,2,0,0,0,0,0,0,2,3,0,2,0,2,3,2,0,0,3,0,3,0,3,1,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,3,2,3,3,2,2,3,0,2,0,3,0,0,0,2,0,0,0,0,1,2,0,2,0,2,0, -0,2,0,2,0,2,2,0,0,1,0,2,2,2,0,2,2,2,0,2,2,2,0,0,2,0,0,1,0,0,0,0, -0,2,0,3,3,2,0,0,0,0,0,0,1,3,0,2,0,2,2,2,0,0,2,0,3,0,0,2,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,2,3,2,0,2,2,0,2,0,2,2,0,2,0,2,2,2,0,0,0,0,0,0,2,3,0,0,0,2, 
-0,1,2,0,0,0,0,2,2,0,0,0,2,1,0,2,2,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0, -0,0,2,1,0,2,3,2,2,3,2,3,2,0,0,3,3,3,0,0,3,2,0,0,0,1,1,0,2,0,2,2, -0,2,0,2,0,2,2,0,0,2,0,2,2,2,0,2,2,2,2,0,0,2,0,0,0,2,0,1,0,0,0,0, -0,3,0,3,3,2,2,0,3,0,0,0,2,2,0,2,2,2,1,2,0,0,1,2,2,0,0,3,0,0,0,2, -0,1,2,0,0,0,1,2,0,0,0,0,0,0,0,2,2,0,1,0,0,2,0,0,0,2,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,3,3,2,2,0,0,0,2,0,2,3,3,0,2,0,0,0,0,0,0,2,2,2,0,2,2,0,2,0,2, -0,2,2,0,0,2,2,2,2,1,0,0,2,2,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0, -0,2,0,3,2,3,0,0,0,3,0,0,2,2,0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,0,2, -0,0,2,2,0,0,2,2,2,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,2,0,0,3,2,0,2,2,2,2,2,0,0,0,2,0,0,0,0,2,0,1,0,0,2,0,1,0,0,0, -0,2,2,2,0,2,2,0,1,2,0,2,2,2,0,2,2,2,2,1,2,2,0,0,2,0,0,0,0,0,0,0, -0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, -0,2,0,2,0,2,2,0,0,0,0,1,2,1,0,0,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,3,2,3,0,0,2,0,0,0,2,2,0,2,0,0,0,1,0,0,2,0,2,0,2,2,0,0,0,0, -0,0,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0, -0,2,2,3,2,2,0,0,0,0,0,0,1,3,0,2,0,2,2,0,0,0,1,0,2,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,0,2,0,3,2,0,2,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -0,0,2,0,0,0,0,1,1,0,0,2,1,2,0,2,2,0,1,0,0,1,0,0,0,2,0,0,0,0,0,0, -0,3,0,2,2,2,0,0,2,0,0,0,2,0,0,0,2,3,0,2,0,0,0,0,0,0,2,2,0,0,0,2, -0,1,2,0,0,0,1,2,2,1,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,1,2,0,2,2,0,2,0,0,2,0,0,0,0,1,2,1,0,2,1,0,0,0,0,0,0,0,0,0,0, -0,0,2,0,0,0,3,1,2,2,0,2,0,0,0,0,2,0,0,0,2,0,0,3,0,0,0,0,2,2,2,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,1,0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,2, -0,2,2,0,0,2,2,2,2,2,0,1,2,0,0,0,2,2,0,1,0,2,0,0,2,2,0,0,0,0,0,0, -0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,2, 
-0,1,2,0,0,0,0,2,2,1,0,1,0,1,0,2,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0, -0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,2,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,2, -0,2,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0, -0,2,2,2,2,0,0,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,1, -0,0,2,0,0,0,0,1,2,0,0,0,0,0,0,2,2,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0, -0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,2,2,2,0,0,0,2,0,0,0,0,0,0,0,0,2, -0,0,1,0,0,0,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, -0,3,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,2, -0,0,2,0,0,0,0,2,2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,0,2,2,1,0,0,0,0,0,0,2,0,0,2,0,2,2,2,0,0,0,0,0,0,2,0,0,0,0,2, -0,0,2,0,0,2,0,2,2,0,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0, -0,0,3,0,0,0,2,2,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0, -0,2,2,2,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1, -0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, -0,2,0,0,0,2,0,0,0,0,0,1,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2,0,0,0, -0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,2,0,2,0,0,0, -0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -}; - -const SequenceModel Latin7Model ( - Latin7_CharToOrderMap, - GreekLangModel, - (float)0.982851, - PR_FALSE, - "ISO-8859-7" -); - -const SequenceModel Win1253Model( - win1253_CharToOrderMap, - GreekLangModel, - (float)0.982851, - PR_FALSE, - "windows-1253" -); diff --git a/PowerEditor/src/uchardet/LangHungarianModel.cpp b/PowerEditor/src/uchardet/LangHungarianModel.cpp deleted file mode 100644 index 856644af..00000000 --- a/PowerEditor/src/uchardet/LangHungarianModel.cpp +++ /dev/null @@ -1,238 +0,0 @@ -/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is Mozilla Communicator client code. - * - * The Initial Developer of the Original Code is - * Netscape Communications Corporation. - * Portions created by the Initial Developer are Copyright (C) 1998 - * the Initial Developer. All Rights Reserved. - * - * Contributor(s): - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. 
If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. - * - * ***** END LICENSE BLOCK ***** */ - -#include "nsSBCharSetProber.h" -/**************************************************************** -255: Control characters that usually does not exist in any text -254: Carriage/Return -253: symbol (punctuation) that does not belong to word -252: 0 - 9 - -*****************************************************************/ - -//Character Mapping Table: -static const unsigned char Latin2_HungarianCharToOrderMap[] = -{ -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47, - 46, 71, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253, -253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8, - 23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253, -159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174, -175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190, -191,192,193,194,195,196,197, 75,198,199,200,201,202,203,204,205, - 79,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220, -221, 51, 81,222, 78,223,224,225,226, 44,227,228,229, 61,230,231, -232,233,234, 58,235, 66, 59,236,237,238, 60, 69, 63,239,240,241, - 82, 14, 74,242, 70, 80,243, 72,244, 15, 83, 77, 84, 30, 76, 85, -245,246,247, 25, 73, 42, 
24,248,249,250, 31, 56, 29,251,252,253, -}; - -static const unsigned char win1250HungarianCharToOrderMap[] = -{ -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47, - 46, 72, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253, -253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8, - 23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253, -161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176, -177,178,179,180, 78,181, 69,182,183,184,185,186,187,188,189,190, -191,192,193,194,195,196,197, 76,198,199,200,201,202,203,204,205, - 81,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220, -221, 51, 83,222, 80,223,224,225,226, 44,227,228,229, 61,230,231, -232,233,234, 58,235, 66, 59,236,237,238, 60, 70, 63,239,240,241, - 84, 14, 75,242, 71, 82,243, 73,244, 15, 85, 79, 86, 30, 77, 87, -245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253, -}; - -//Model Table: -//total sequences: 100% -//first 512 sequences: 94.7368% -//first 1024 sequences:5.2623% -//rest sequences: 0.8894% -//negative sequences: 0.0009% -static const PRUint8 HungarianLangModel[] = -{ -0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2, -3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0, -3,2,1,3,3,3,3,3,2,3,3,3,3,3,1,1,2,3,3,3,3,3,3,3,1,1,3,2,0,1,1,1, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, -3,3,3,3,3,3,3,3,3,3,3,1,1,2,3,3,3,1,3,3,3,3,3,1,3,3,2,2,0,3,2,3, -0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0, -3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,3,3,2,3,3,2,2,3,2,3,2,0,3,2,2, 
-0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0, -3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,1,2,3,2,2,3,1,2,3,3,2,2,0,3,3,3, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,0,2,3,2, -0,0,0,1,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,3,3,3,3,1,1,1,3,3,2,1,3,2,2,3,2,1,3,2,2,1,0,3,3,1, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -3,2,2,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,3,2,2,3,1,1,3,2,0,1,1,1, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, -3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,2,1,3,3,3,3,3,2,2,1,3,3,3,0,1,1,2, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,2,0,3,2,3, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0, -3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,1,3,2,2,2,3,1,1,3,3,1,1,0,3,3,2, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,2,3,3,3,3,3,1,2,3,2,2,0,2,2,2, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -3,3,3,2,2,2,3,1,3,3,2,2,1,3,3,3,1,1,3,1,2,3,2,3,2,2,2,1,0,2,2,2, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0, -3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,2,2,3,2,1,0,3,2,0,1,1,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,1,0,3,3,3,3,0,2,3,0,0,2,1,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,2,2,3,3,2,2,2,2,3,3,0,1,2,3,2,3,2,2,3,2,1,2,0,2,2,2, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0, -3,3,3,3,3,3,1,2,3,3,3,2,1,2,3,3,2,2,2,3,2,3,3,1,3,3,1,1,0,2,3,2, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -3,3,3,1,2,2,2,2,3,3,3,1,1,1,3,3,1,1,3,1,1,3,2,1,2,3,1,1,0,2,2,2, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -3,3,3,2,1,2,1,1,3,3,1,1,1,1,3,3,1,1,2,2,1,2,1,1,2,2,1,1,0,2,2,1, 
-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -3,3,3,1,1,2,1,1,3,3,1,0,1,1,3,3,2,0,1,1,2,3,1,0,2,2,1,0,0,1,3,2, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -3,2,1,3,3,3,3,3,1,2,3,2,3,3,2,1,1,3,2,3,2,1,2,2,0,1,2,1,0,0,1,1, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, -3,3,3,3,2,2,2,2,3,1,2,2,1,1,3,3,0,3,2,1,2,3,2,1,3,3,1,1,0,2,1,3, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -3,3,3,2,2,2,3,2,3,3,3,2,1,1,3,3,1,1,1,2,2,3,2,3,2,2,2,1,0,2,2,1, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -1,0,0,3,3,3,3,3,0,0,3,3,2,3,0,0,0,2,3,3,1,0,1,2,0,0,1,1,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,1,2,3,3,3,3,3,1,2,3,3,2,2,1,1,0,3,3,2,2,1,2,2,1,0,2,2,0,1,1,1, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,2,2,1,3,1,2,3,3,2,2,1,1,2,2,1,1,1,1,3,2,1,1,1,1,2,1,0,1,2,1, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0, -2,3,3,1,1,1,1,1,3,3,3,0,1,1,3,3,1,1,1,1,1,2,2,0,3,1,1,2,0,2,1,1, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, -3,1,0,1,2,1,2,2,0,1,2,3,1,2,0,0,0,2,1,1,1,1,1,2,0,0,1,1,0,0,0,0, -1,2,1,2,2,2,1,2,1,2,0,2,0,2,2,1,1,2,1,1,2,1,1,1,0,1,0,0,0,1,1,0, -1,1,1,2,3,2,3,3,0,1,2,2,3,1,0,1,0,2,1,2,2,0,1,1,0,0,1,1,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,0,0,3,3,2,2,1,0,0,3,2,3,2,0,0,0,1,1,3,0,0,1,1,0,0,2,1,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,1,1,2,2,3,3,1,0,1,3,2,3,1,1,1,0,1,1,1,1,1,3,1,0,0,2,2,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,1,1,1,2,2,2,1,0,1,2,3,3,2,0,0,0,2,1,1,1,2,1,1,1,0,1,1,1,0,0,0, -1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,2,1,1,1,1,1,1,0,1,1,1,0,0,1,1, -3,2,2,1,0,0,1,1,2,2,0,3,0,1,2,1,1,0,0,1,1,1,0,1,1,1,1,0,2,1,1,1, -2,2,1,1,1,2,1,2,1,1,1,1,1,1,1,2,1,1,1,2,3,1,1,1,1,1,1,1,1,1,0,1, -2,3,3,0,1,0,0,0,3,3,1,0,0,1,2,2,1,0,0,0,0,2,0,0,1,1,1,0,2,1,1,1, 
-2,1,1,1,1,1,1,2,1,1,0,1,1,0,1,1,1,0,1,2,1,1,0,1,1,1,1,1,1,1,0,1, -2,3,3,0,1,0,0,0,2,2,0,0,0,0,1,2,2,0,0,0,0,1,0,0,1,1,0,0,2,0,1,0, -2,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1, -3,2,2,0,1,0,1,0,2,3,2,0,0,1,2,2,1,0,0,1,1,1,0,0,2,1,0,1,2,2,1,1, -2,1,1,1,1,1,1,2,1,1,1,1,1,1,0,2,1,0,1,1,0,1,1,1,0,1,1,2,1,1,0,1, -2,2,2,0,0,1,0,0,2,2,1,1,0,0,2,1,1,0,0,0,1,2,0,0,2,1,0,0,2,1,1,1, -2,1,1,1,1,2,1,2,1,1,1,2,2,1,1,2,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1, -1,2,3,0,0,0,1,0,3,2,1,0,0,1,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,2,1, -1,1,0,0,0,1,0,1,1,1,1,1,2,0,0,1,0,0,0,2,0,0,1,1,1,1,1,1,1,1,0,1, -3,0,0,2,1,2,2,1,0,0,2,1,2,2,0,0,0,2,1,1,1,0,1,1,0,0,1,1,2,0,0,0, -1,2,1,2,2,1,1,2,1,2,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,0,0,1, -1,3,2,0,0,0,1,0,2,2,2,0,0,0,2,2,1,0,0,0,0,3,1,1,1,1,0,0,2,1,1,1, -2,1,0,1,1,1,0,1,1,1,1,1,1,1,0,2,1,0,0,1,0,1,1,0,1,1,1,1,1,1,0,1, -2,3,2,0,0,0,1,0,2,2,0,0,0,0,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,1,0, -2,1,1,1,1,2,1,2,1,2,0,1,1,1,0,2,1,1,1,2,1,1,1,1,0,1,1,1,1,1,0,1, -3,1,1,2,2,2,3,2,1,1,2,2,1,1,0,1,0,2,2,1,1,1,1,1,0,0,1,1,0,1,1,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,2,2,0,0,0,0,0,2,2,0,0,0,0,2,2,1,0,0,0,1,1,0,0,1,2,0,0,2,1,1,1, -2,2,1,1,1,2,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,1,1,0,1,2,1,1,1,0,1, -1,0,0,1,2,3,2,1,0,0,2,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,0,0,0, -1,2,1,2,1,2,1,1,1,2,0,2,1,1,1,0,1,2,0,0,1,1,1,0,0,0,0,0,0,0,0,0, -2,3,2,0,0,0,0,0,1,1,2,1,0,0,1,1,1,0,0,0,0,2,0,0,1,1,0,0,2,1,1,1, -2,1,1,1,1,1,1,2,1,0,1,1,1,1,0,2,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1, -1,2,2,0,1,1,1,0,2,2,2,0,0,0,3,2,1,0,0,0,1,1,0,0,1,1,0,1,1,1,0,0, -1,1,0,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,0,0,1,1,1,0,1,0,1, -2,1,0,2,1,1,2,2,1,1,2,1,1,1,0,0,0,1,1,0,1,1,1,1,0,0,1,1,1,0,0,0, -1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,0, -1,2,3,0,0,0,1,0,2,2,0,0,0,0,2,2,0,0,0,0,0,1,0,0,1,0,0,0,2,0,1,0, -2,1,1,1,1,1,0,2,0,0,0,1,2,1,1,1,1,0,1,2,0,1,0,1,0,1,1,1,0,1,0,1, -2,2,2,0,0,0,1,0,2,1,2,0,0,0,1,1,2,0,0,0,0,1,0,0,1,1,0,0,2,1,0,1, 
-2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1, -1,2,2,0,0,0,1,0,2,2,2,0,0,0,1,1,0,0,0,0,0,1,1,0,2,0,0,1,1,1,0,1, -1,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,1,1,0,1,0,1,1,1,1,1,0,0,0,1, -1,0,0,1,0,1,2,1,0,0,1,1,1,2,0,0,0,1,1,0,1,0,1,1,0,0,1,0,0,0,0,0, -0,2,1,2,1,1,1,1,1,2,0,2,0,1,1,0,1,2,1,0,1,1,1,0,0,0,0,0,0,1,0,0, -2,1,1,0,1,2,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,2,1,0,1, -2,2,1,1,1,1,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,0,1,0,1,1,1,1,1,0,1, -1,2,2,0,0,0,0,0,1,1,0,0,0,0,2,1,0,0,0,0,0,2,0,0,2,2,0,0,2,0,0,1, -2,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1, -1,1,2,0,0,3,1,0,2,1,1,1,0,0,1,1,1,0,0,0,1,1,0,0,0,1,0,0,1,0,1,0, -1,2,1,0,1,1,1,2,1,1,0,1,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,0,0,1,0,0, -2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,2,0,0,0, -2,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,1,0,1, -2,1,1,1,2,1,1,1,0,1,1,2,1,0,0,0,0,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,1,0,1,1,1,1,1,0,0,1,1,2,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,1,0,0,0, -1,2,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0, -2,0,0,0,1,1,1,1,0,0,1,1,0,0,0,0,0,1,1,1,2,0,0,1,0,0,1,0,1,0,0,0, -0,1,1,1,1,1,1,1,1,2,0,1,1,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0, -1,0,0,1,1,1,1,1,0,0,2,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0, -0,1,1,1,1,1,1,0,1,1,0,1,0,1,1,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0, -1,0,0,1,1,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, -0,1,1,1,1,1,0,0,1,1,0,1,0,1,0,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0, -0,0,0,1,0,0,0,0,0,0,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,1,1,1,0,1,0,0,1,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0, -2,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,0,1,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0, -0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0, -}; - -const SequenceModel Latin2HungarianModel( - Latin2_HungarianCharToOrderMap, - 
HungarianLangModel, - (float)0.947368, - PR_TRUE, - "ISO-8859-2"); - -const SequenceModel Win1250HungarianModel( - win1250HungarianCharToOrderMap, - HungarianLangModel, - (float)0.947368, - PR_TRUE, - "windows-1250"); diff --git a/PowerEditor/src/uchardet/LangModels/LangArabicModel.cpp b/PowerEditor/src/uchardet/LangModels/LangArabicModel.cpp new file mode 100644 index 00000000..ba404ceb --- /dev/null +++ b/PowerEditor/src/uchardet/LangModels/LangArabicModel.cpp @@ -0,0 +1,265 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Arabic *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-13 18:33:58.848027 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Iso_8859_6_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 52, 72, 61, 68, 74, 69, 59, 78, 60, 90, 86, 67, 65, 71, 75, /* 4X */ + 64, 85, 76, 55, 57, 79, 81, 70, 82, 87, 91,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 37, 58, 49, 47, 38, 54, 66, 46, 39, 88, 63, 45, 51, 43, 40, /* 6X */ + 62, 89, 42, 44, 41, 50, 77, 73, 83, 56, 80,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,ILL,ILL,ILL,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,SYM,ILL,ILL, /* AX */ + ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,ILL,ILL,ILL,SYM, /* BX */ + ILL, 32, 34, 15, 35, 22, 31, 0, 9, 8, 7, 27, 19, 18, 25, 11, /* CX */ + 30, 5, 26, 12, 21, 23, 28,SYM, 33, 10, 29,ILL,ILL,ILL,ILL,ILL, /* DX */ + 36, 13, 14, 17, 1, 3, 6, 16, 4, 24, 2,SYM,SYM,SYM,SYM,SYM, /* EX */ + SYM,SYM,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1256_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 52, 72, 61, 68, 74, 69, 59, 78, 60, 90, 86, 67, 65, 71, 75, /* 4X */ + 64, 85, 76, 55, 57, 79, 81, 70, 82, 87, 91,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 37, 58, 49, 47, 38, 54, 66, 46, 39, 88, 63, 45, 51, 43, 40, /* 6X */ + 62, 89, 42, 44, 41, 50, 77, 73, 83, 56, 80,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
SYM, 48,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 95,SYM, 96, 92, 97, 98, /* 8X */ + 53,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 84,SYM, 99,SYM,100,SYM,SYM,101, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,102,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 103, 32, 34, 15, 35, 22, 31, 0, 9, 8, 7, 27, 19, 18, 25, 11, /* CX */ + 30, 5, 26, 12, 21, 23, 28,SYM, 20, 33, 10, 29, 36, 13, 14, 17, /* DX */ + 104, 1, 93, 3, 6, 16, 4,105,106, 94,107,108, 24, 2,109,110, /* EX */ + SYM,SYM,SYM,SYM,111,SYM,SYM,SYM,SYM,112,SYM,113,114,SYM,SYM,115, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1479 + * First 512 sequences: 0.9696025116913417 + * Next 512 sequences (512-1024): 0.029166911858880054 + * Rest: 0.0012305764497782395 + * Negative sequences: TODO + */ +static const PRUint8 ArabicLangModel[] = +{ + 2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,1,3,1,3,3,3,3,2,2,3, + 3,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2, + 1,2,3,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,0,3,1,3,3,3,3,2,2,3, + 2,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,1,3,2,3,3,3,2,2,2,2, + 0,2,1,3,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2, + 2,2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,3,2,3,2,3,3,2,3, + 1,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 3,2,3,3,3,2,2,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,0,3,2,2,3,2,2,2,3,2, + 0,3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,3,3,2,2, + 0,3,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
2,0,0,0,0,0,0,0,0,1,3,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,2,2,1,2,2,2,2,2,2,2, + 1,2,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,0,3,2,0,2,2,3,0,3,2,0,3,3,3,0,2,0, + 0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,0,2,0,0,3,3,2,3,0,2,0,2, + 2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,3,3,2,3,3,1,0,0,2,2,0,1,0,1,0,1, + 0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,3,3,3,3,2,3,2,3,2,3,2,3,2,2,2,2,2,2,2,2,2,2,1,3,2,2,2, + 1,3,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,0,2,1,3,2,0,3,2,0,2,0,3,0,2,0, + 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,3,3,3,3,3,3,3,2,3,3,3,3,3,3,0,3,3,3,3,3,3,0,3,2,3,2,3,2,3,2,2, + 0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,2,3,3,1,3,2,1,2,0,2,2,0,3,2,2,0,0,2,0,2,1,2,0,3,0, + 0,1,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,2,3,2,2,2,2,2,2,2,2,2,2,1,0,2,3,3,0,1,3,0, + 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,2,3,3,2,1,3,3,3,3,0,2,3,0,3,2,2,0,3,2,0,3,2,3,0,2,0, + 0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,3,1,2,1,0,1,0,0,1,0,3,2,0,2,2,2, + 0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,3,3,2,3,3,2,2,3,2,3,2,2,0,2,1,2,1,1,0,2,1,0,0,0,1,0,2, + 1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,3,3,3,3,2,3,2,3,3,2,1,2,2,2,3,3,2,2,2,0,0,0,2,3,1,0,0,2,1,2, + 0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,3,3,3,1,2,3,2,0,2,3,3,3,2,3,0,2,2,2,3,2,2,0,3,0,2,2,2,3,2,3,1, + 
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,2,3,3,2,3,0,3,2,0,2,1,3,0,2,0,0,2,2,2,0,0,0,2,0,0, + 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0, + 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,2,3,2,3,2,3,2,2,0,0,2,0,0,1,3,2,0,3,0,1,2,0,2,0,2,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,2,2,3,3,2,2,0,2,2,1,2,2,2,2,0,0,0,0,1,2,2,0,0,1,0,2, + 2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,2,3,2,2,1,1,2,3,1,2,2,0,0,0,0,0,0,1,0,0,2,0,1,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,2,2,3,2,3,2,0,2,0,1,2,0,2,1,2,0,0,0,2,2,0,0,0,2,0,2, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,2,2,3,2,1,2,2,2,0,0,2,0,0,2,2,1,0,2,1,0,2,0,2,0,2,0, + 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,2,2,2,2,2,2,0,0,0,2,2,0,3,3,0,2,0,0,0,0,2,2,0,0,0,0,0,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,2,3,2,2,3,2,2,2,2,2,2,0,2,2,2,2,2,2,0,1,0,1,2,0,1,1,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,1,1,1,0,0,2,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,2,3,2,2,1,2,3,2,0,0,0,2,0,0,3,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,3,2,2,2,3,2,2,0,2,0,2,2,2,2,0,1,2,1,1,0,2,0,1,0,3,1,2,0,1,2,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,2,2,2,3,2,1,2,1,1,0,2,2,0,2,0,2,2,0,0,0,2,0,0,2,2,1,2,0,0,0,0, + 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,0,1,0,1,1,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0, + 0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,2,2,1,2,2,2,2,2,1,2,0,2,1,2,0,0,1,0,1,0,1,0,0,0,1,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,2,1,1,2,2,2,2,2,0,2,0,2,1,2,0,0,1,0,0,0,2,0,0,0,1,2, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,2,1,2,2,2,2,2,2,0,2,0,2,1,2,0,0,1,0,0,0,1,0,0,0,1,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,1,0,1,2,2,2,2,2,1,1,2,0,2,2,2,0,0,2,0,0,0,1,0,0,0,2,2, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,2,2,2,2,2,1,2,2,2,1,0,1,1,1,0,0,0,0,2,0,2,0,0,0,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,2,2,2,2,1,2,2,2,0,1,0,2,1,2,0,0,0,0,2,0,1,0,0,0,0,2, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,2,2,2,2,0,1,2,1,1,2,0,2,1,0,0,0,1,0,1,0,0,0,0,0,0,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,2,2,1,2,0,0,2,1,2,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,2,2,2,1,0,0,1,2,0,2,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,2,2,2,2,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,2,2,1,1,1,1,1,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0, + 2,2,1,0,2,2,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,2,2,2,2,1,0,0,1,2,0,0,1,2,0,0,0,0,0,1,0,0,0,0,0,0,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,1,1,1,0,2,2,2,2,1,0,2,0,1,0,2,0,0,0,0,0,0,2,0,0,0,0,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
0,0,0,0,0,2,2,2,2,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,2,0,0,0,0,0,1,0, + 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,2,2,2,1,1,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,1, + 2,2,2,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,1,2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,1,0,2,2,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,1,0,0,1,2,2,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,1,0,1,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,1,2,1,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,0,1,1,0,1,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,0,0,2,0,2,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,1,0,2,1,1,0,0,0,0,0,0,1,0,0,2,0,1,0,2,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,1,0,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_6ArabicModel = +{ + Iso_8859_6_CharToOrderMap, + ArabicLangModel, + 64, + (float)0.9696025116913417, + PR_FALSE, + "ISO-8859-6" +}; + +const SequenceModel Windows_1256ArabicModel = +{ + Windows_1256_CharToOrderMap, + ArabicLangModel, + 64, + (float)0.9696025116913417, + PR_FALSE, + 
"WINDOWS-1256" +}; \ No newline at end of file diff --git a/PowerEditor/src/uchardet/LangBulgarianModel.cpp b/PowerEditor/src/uchardet/LangModels/LangBulgarianModel.cpp similarity index 89% rename from PowerEditor/src/uchardet/LangBulgarianModel.cpp rename to PowerEditor/src/uchardet/LangModels/LangBulgarianModel.cpp index 77686607..18c58ee2 100644 --- a/PowerEditor/src/uchardet/LangBulgarianModel.cpp +++ b/PowerEditor/src/uchardet/LangModels/LangBulgarianModel.cpp @@ -35,12 +35,12 @@ * * ***** END LICENSE BLOCK ***** */ -#include "nsSBCharSetProber.h" +#include "../nsSBCharSetProber.h" /**************************************************************** -255: Control characters that usually does not exist in any text -254: Carriage/Return -253: symbol (punctuation) that does not belong to word -252: 0 - 9 +CTR: Control characters that usually does not exist in any text +RET: Carriage/Return +SYM: symbol (punctuation) that does not belong to word +NUM: 0 - 9 *****************************************************************/ @@ -50,14 +50,14 @@ static const unsigned char Latin5_BulgarianCharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40 -110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253, //50 -253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60 -116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM, 77, 90, 99,100, 
72,109,107,101, 79,185, 81,102, 76, 94, 82, //40 +110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60 +116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, //70 194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209, //80 210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225, //90 81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238, //a0 @@ -65,27 +65,27 @@ static const unsigned char Latin5_BulgarianCharToOrderMap[] = 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, //c0 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //d0 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, //e0 - 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0 + 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,NUM,SYM, //f0 }; static const unsigned char win1251BulgarianCharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40 -110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253, //50 -253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60 -116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40 +110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 65, 69, 70, 66, 63, 68,112,103, 
92,194,104, 95, 86, 87, 71, //60 +116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, //70 206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220, //80 -221, 78, 64, 83,121, 98,117,105,222,223,224,225,226,227,228,229, //90 +221, 78, 64, 83,121, 98,117,105,ILL,223,224,225,226,227,228,229, //90 88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240, //a0 73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250, //b0 31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //c0 - 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,252, 60, 56, //d0 + 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,NUM, 60, 56, //d0 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //e0 - 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, //f0 + 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,SYM, 42, 16, //f0 }; //Model Table: @@ -226,18 +226,22 @@ static const PRUint8 BulgarianLangModel[] = 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, }; -const SequenceModel Latin5BulgarianModel( +const SequenceModel Latin5BulgarianModel = +{ Latin5_BulgarianCharToOrderMap, BulgarianLangModel, + 64, (float)0.969392, PR_FALSE, "ISO-8859-5" -); +}; -const SequenceModel Win1251BulgarianModel( +const SequenceModel Win1251BulgarianModel = +{ win1251BulgarianCharToOrderMap, BulgarianLangModel, + 64, (float)0.969392, PR_FALSE, - "windows-1251" -); + "WINDOWS-1251" +}; diff --git a/PowerEditor/src/uchardet/LangModels/LangDanishModel.cpp b/PowerEditor/src/uchardet/LangModels/LangDanishModel.cpp new file mode 100644 index 00000000..46b6f289 --- /dev/null +++ b/PowerEditor/src/uchardet/LangModels/LangDanishModel.cpp @@ -0,0 +1,198 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in 
compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Danish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-02-19 17:56:42.163975 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. 
+ * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 53, 42,SYM,SYM, 54,SYM,SYM,SYM, 55, 56, 57,SYM, /* BX */ + 58, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 59, 34, 60, 50, /* CX */ + 43, 47, 51, 36, 52, 61, 30,SYM, 19, 62, 37, 44, 31, 46, 63, 48, /* DX */ + 64, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 65, 34, 66, 50, /* EX */ + 43, 47, 51, 36, 52, 67, 30,SYM, 19, 68, 37, 44, 31, 46, 69, 70, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 42,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 71, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 72, 34, 73, 50, /* CX */ + 43, 47, 51, 36, 52, 74, 30,SYM, 19, 75, 37, 44, 31, 46, 76, 48, /* DX */ + 77, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 78, 34, 79, 50, /* EX */ + 43, 47, 51, 36, 52, 80, 30,SYM, 19, 81, 37, 44, 31, 46, 82, 83, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 84,SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 85,ILL, 86,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 87,ILL, 88, 89, /* 9X */ + 
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 42,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 90, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 91, 34, 92, 50, /* CX */ + 43, 47, 51, 36, 52, 93, 30,SYM, 19, 94, 37, 44, 31, 46, 95, 48, /* DX */ + 96, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 97, 34, 98, 50, /* EX */ + 43, 47, 51, 36, 52, 99, 30,SYM, 19,100, 37, 44, 31, 46,101,102, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 964 + * First 512 sequences: 0.9968082796759031 + * Next 512 sequences (512-1024): 0.0031917203240968304 + * Rest: 3.903127820947816e-17 + * Negative sequences: TODO + */ +static const PRUint8 DanishLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,3,2,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,3,3,3,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,3,3,2,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,3,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,3,3,3,2,2,2,2,3,2, + 3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,2,3,2,3,3,3,3,3,2,2,2,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,3,0, + 3,3,3,3,3,3,3,2,3,3,3,2,2,3,3,3,3,2,3,3,3,3,3,3,2,2,2,2,2,0, + 3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,2,2,2,2,3,3,3,2,2,0,0,2,0, + 3,3,3,3,3,3,3,2,3,3,2,2,2,2,2,3,3,2,2,3,3,3,3,3,2,2,0,0,2,0, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,2,3,0,2,2,3,2,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,2,3,3,2,2,0,2,0,2,0, + 3,3,3,3,3,3,2,2,3,3,2,2,3,2,3,2,3,2,2,3,3,3,3,3,2,3,2,2,2,0, + 
3,3,3,3,2,2,3,3,3,2,3,3,3,2,3,3,0,2,2,2,2,0,0,3,0,0,2,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2,0,0,0,2,2,2,0,0,0, + 3,3,3,3,2,0,3,3,3,2,3,3,2,2,3,3,0,2,2,2,0,0,0,0,0,0,0,0,0,0, + 2,3,3,3,0,3,3,3,3,2,3,3,3,3,3,3,2,2,2,0,0,0,0,0,2,0,0,0,0,0, + 3,3,2,3,3,3,3,3,3,3,2,2,2,2,2,2,3,2,2,3,3,2,3,2,2,0,0,0,0,0, + 3,3,2,3,3,3,2,2,3,3,2,3,2,2,0,2,3,2,3,0,3,0,0,2,3,2,2,0,2,2, + 3,2,2,2,3,3,2,2,2,3,0,2,2,2,0,2,2,0,2,0,2,0,0,0,2,2,2,0,0,0, + 3,2,2,2,3,3,2,2,0,3,0,2,2,0,0,2,2,2,2,2,2,0,0,2,2,0,2,0,0,0, + 3,2,0,2,2,3,2,0,2,2,0,0,2,2,2,2,2,2,2,2,0,0,0,0,2,2,0,0,2,0, + 2,3,2,2,2,0,2,2,2,2,2,2,2,0,2,2,0,2,0,0,0,0,0,0,2,0,0,0,0,0, + 0,0,0,0,3,2,2,2,2,2,0,0,0,0,2,2,3,0,2,0,0,0,0,0,0,0,0,0,0,2, +}; + + +const SequenceModel Iso_8859_15DanishModel = +{ + Iso_8859_15_CharToOrderMap, + DanishLangModel, + 30, + (float)0.9968082796759031, + PR_TRUE, + "ISO-8859-15" +}; + +const SequenceModel Iso_8859_1DanishModel = +{ + Iso_8859_1_CharToOrderMap, + DanishLangModel, + 30, + (float)0.9968082796759031, + PR_TRUE, + "ISO-8859-1" +}; + +const SequenceModel Windows_1252DanishModel = +{ + Windows_1252_CharToOrderMap, + DanishLangModel, + 30, + (float)0.9968082796759031, + PR_TRUE, + "WINDOWS-1252" +}; \ No newline at end of file diff --git a/PowerEditor/src/uchardet/LangModels/LangEsperantoModel.cpp b/PowerEditor/src/uchardet/LangModels/LangEsperantoModel.cpp new file mode 100644 index 00000000..4993abcd --- /dev/null +++ b/PowerEditor/src/uchardet/LangModels/LangEsperantoModel.cpp @@ -0,0 +1,141 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Esperanto *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-04 01:27:38.177516 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). 
+ * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_3_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 18, 17, 10, 2, 19, 15, 21, 3, 11, 9, 7, 13, 4, 1, /* 4X */ + 14, 32, 5, 8, 6, 12, 16, 27, 33, 25, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 18, 17, 10, 2, 19, 15, 21, 3, 11, 9, 7, 13, 4, 1, /* 6X */ + 14, 32, 5, 8, 6, 12, 16, 27, 33, 25, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 56,SYM,SYM,SYM,ILL, 34,SYM,SYM, 57, 53, 58, 28,SYM,ILL, 40, /* AX */ + SYM, 59,SYM,SYM,SYM,SYM, 34,SYM,SYM, 60, 53, 61, 28,SYM,ILL, 40, /* BX */ + 44, 29, 46,ILL, 43, 62, 24, 38, 41, 31, 48, 50, 54, 35, 49, 52, /* CX */ + ILL, 42, 63, 30, 47, 64, 36,SYM, 22, 51, 39, 55, 37, 23, 26, 45, /* DX */ + 44, 29, 46,ILL, 43, 65, 24, 38, 41, 31, 48, 50, 54, 35, 49, 52, /* EX */ + ILL, 42, 66, 30, 47, 67, 36,SYM, 22, 51, 39, 55, 37, 23, 26,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 989 + * First 512 sequences: 0.9942980632768038 + * Next 512 sequences (512-1024): 0.0057019367231962385 + * Rest: -5.0306980803327406e-17 + * Negative sequences: TODO + */ +static const PRUint8 EsperantoLangModel[] = +{ + 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,2,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,0,0,0,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,0,2,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,2,2,2,3,0,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,0,3,3,3,2,2,2, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,2,3,3,3,3,0,0,2,3,2,2,2,3,3,2,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,2,2,3,2,2,0,3,3,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,0,0,0,3,0,2,0,3,2,3,2,2,0, + 3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,2,3,3,2,3,3,3,0,0,0,3,2,0,2,3,2,2,0,0,0, + 3,3,3,3,3,3,2,3,3,2,3,2,3,3,3,3,3,2,2,2,3,3,0,0,2,3,0,3,2,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,2,3,2,3,3,2,2,0,2,2,2,2,2,2,0,0,0,0,0,0,3,3,2,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,3,2,3,2,0,0,0,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,2,3,3,3,2,0,0,0,2,3,2,2,0,3,2,2,0,0,0, + 3,3,3,3,2,3,3,3,3,2,2,2,3,2,3,2,0,2,2,2,2,3,0,0,0,2,2,0,0,3,2,2,0,0,0, + 3,3,3,3,3,3,2,3,3,2,3,0,3,3,2,2,3,2,2,2,2,3,0,2,2,3,2,2,2,2,2,3,0,2,0, + 3,3,3,3,2,3,2,2,2,2,2,3,3,2,2,2,0,0,2,0,2,2,0,0,2,2,0,0,0,3,2,2,0,0,0, + 3,3,3,3,0,3,3,3,3,3,2,0,3,2,2,2,0,3,2,2,3,3,0,0,0,3,0,0,0,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,2,2,2,3,2,0,2,0,0,0,3,2,0,0,3,3,3,0,0,0, + 3,3,3,3,0,3,3,3,2,2,2,2,3,3,2,3,2,0,2,3,0,0,0,0,0,2,0,0,0,0,0,2,0,3,0, + 3,3,3,3,3,2,2,3,3,3,2,2,3,2,2,2,2,3,3,2,2,0,0,0,0,3,2,2,0,2,2,2,2,0,0, + 3,3,3,3,3,3,3,3,2,2,2,0,3,3,2,0,2,0,2,2,0,2,0,0,0,2,0,2,0,2,2,2,0,2,0, + 3,3,3,3,0,0,2,3,0,0,2,2,3,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,3,3,2,3,3,3,3,3,3,2,2,2,2,3,2,0,2,2,3,2,0,0,2,0,3,0,0,0,0,0,0,0,0, + 3,3,3,3,0,0,2,2,0,2,3,2,3,3,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,2,3,3,2,3,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,2,0,2,0,2,0,0,0, + 3,3,3,3,2,2,3,2,0,2,0,2,3,2,2,0,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0, + 
3,3,3,3,2,2,2,2,3,2,0,0,2,0,0,0,0,0,0,2,0,2,0,0,0,2,0,3,0,0,2,0,0,0,0, + 3,3,2,2,2,2,0,2,0,2,0,0,3,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,2,0,3,3,3,3,3,2,3,0,0,2,2,2,2,3,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,3,2,2,2,2,2,2,0,0,2,2,2,0,2,2,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0, + 2,2,2,0,3,3,3,3,3,2,2,0,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0, + 2,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0, + 2,2,2,3,0,0,2,2,0,0,0,0,2,2,2,2,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0, + 3,3,3,2,2,0,2,0,0,0,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_3EsperantoModel = +{ + Iso_8859_3_CharToOrderMap, + EsperantoLangModel, + 35, + (float)0.9942980632768038, + PR_FALSE, + "ISO-8859-3" +}; \ No newline at end of file diff --git a/PowerEditor/src/uchardet/LangModels/LangFrenchModel.cpp b/PowerEditor/src/uchardet/LangModels/LangFrenchModel.cpp new file mode 100644 index 00000000..4c05498a --- /dev/null +++ b/PowerEditor/src/uchardet/LangModels/LangFrenchModel.cpp @@ -0,0 +1,206 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: French *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-03 21:10:27.685575 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 4X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 6X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 56,SYM,SYM,SYM,SYM,SYM,SYM, 51,SYM, 35,ILL, 57,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 51,SYM, 35,ILL, 58, 59, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 60,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 24, 38, 32, 46, 49, 61, 47, 27, 23, 14, 28, 41, 62, 39, 33, 36, /* CX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 63, /* DX */ + 24, 38, 32, 46, 49, 64, 47, 27, 23, 14, 28, 41, 65, 39, 33, 36, /* EX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 66, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 4X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 6X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 67,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 24, 38, 32, 46, 49, 68, 47, 27, 23, 14, 28, 41, 69, 39, 33, 36, /* CX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 70, /* DX */ + 24, 38, 32, 46, 49, 71, 47, 27, 23, 14, 28, 41, 72, 39, 33, 36, /* EX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 73, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 4X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 6X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 51,SYM, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 74, 75,SYM,SYM, 76,SYM,SYM,SYM, 35, 35, 77,SYM, /* BX */ + 24, 38, 32, 46, 49, 78, 47, 27, 23, 14, 28, 41, 79, 39, 33, 36, /* CX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 80, /* DX */ + 24, 38, 32, 46, 49, 81, 47, 27, 23, 14, 28, 41, 82, 39, 33, 36, /* EX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 83, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * 
Total sequences: 914 + * First 512 sequences: 0.997057879992383 + * Next 512 sequences (512-1024): 0.002942120007616917 + * Rest: 3.8163916471489756e-17 + * Negative sequences: TODO + */ +static const PRUint8 FrenchLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,0,0,0,2,0,2,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,0,3,3,0,0,3,0,0,2,3,0,0,0,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,2,2,3,0,0,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,0,3,3,3,2,3,2,0,2,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,3,0,2,3,2,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,2,3,3,3,2,3,3,3,0,2,0,0,0, + 3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,2,2,2,3,3,2,2,3,3,2,0,2,0,3,3,2,3,2,0,0,0,0,0, + 3,3,3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,3,3,3,2,3,0,0,2,2,2,2,0,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,0,3,3,0,0,3,3,0,0,2,3,0,3,3, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,3,3,2,3,3,2,0,0,0,0,0,2,0, + 3,3,3,2,3,3,3,2,3,3,3,2,2,3,3,3,2,2,2,3,0,0,3,3,0,3,0,0,2,2,3,2,2,2,3,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,2,2,2,3,3,0,3,3,0,0,3,0,2,2,2,3,2,0,0,2,0,0, + 3,3,3,2,3,3,3,3,3,3,2,2,3,2,3,0,0,2,2,3,0,0,3,3,0,0,2,2,3,2,2,3,2,0,0,0,0,0, + 3,3,3,3,3,2,3,2,3,3,2,3,3,3,3,2,0,2,3,2,0,0,3,3,0,2,2,0,3,0,2,2,3,0,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,0,3,2,2,0,3,0,0,2,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,3,2,2,3,3,0,2,3,3,0,0,0,0,2,0,2,0,2,0,0,0,0,0, + 3,2,3,2,3,3,0,2,3,3,0,0,0,2,3,0,2,2,0,0,0,0,2,3,0,0,2,0,3,0,0,0,0,0,0,2,0,0, + 3,3,3,2,3,3,3,3,3,3,2,2,2,3,3,2,0,3,0,0,0,0,0,3,0,2,0,0,3,0,0,0,0,0,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,2,2,0,3,2,0,0,3,2,0,3,0,0,0,0,0,0,3,2,0,2,0,0, + 3,3,3,3,3,3,3,3,3,3,0,2,0,3,3,0,0,2,2,0,0,0,3,3,0,2,2,0,2,2,2,3,3,0,0,2,0,0, + 0,0,2,0,0,0,0,2,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,0,3,0,3,2,3,2,2,3,3,2,3,0,3,2,2,2,2,3,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0, + 
3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,2,2,2,0,3,2,0,0,2,2,0,0,0,0,0,0,0, + 0,3,0,3,0,3,3,3,0,0,3,3,2,3,0,3,3,2,3,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,3,2,3,2,2,2,3,3,2,2,2,2,3,0,0,0,0,0,0,0,0,0,3,2,0,0,0,0,0,0,2,0,0,0,0,0, + 3,3,3,2,3,3,2,3,3,3,0,0,2,3,2,2,2,2,2,3,0,0,3,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0, + 0,0,3,0,0,0,0,0,3,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,2,0,0,3,2,0,0,0,3,0,3,0,0,2,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,2,3,2,0,2,3,3,0,2,0,2,2,2,0,0,2,2,2,0,3,0,0,0,2,0,0,3,2,0,0,0,0,0,0,0, + 3,2,3,2,3,2,2,2,3,2,0,2,0,0,2,0,0,2,2,2,0,0,2,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0, + 0,2,0,3,0,0,3,3,0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,2,0,2,2,0,3,3,0,0,0,3,2,2,0,3,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,3,0,0,3,3,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,3,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,2,0,0,2,0,2,2,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,2,2,2,0,2,2,3,0,0,2,2,0,2,0,2,0,2,2,0,2,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Windows_1252FrenchModel = +{ + Windows_1252_CharToOrderMap, + FrenchLangModel, + 38, + (float)0.997057879992383, + PR_TRUE, + "WINDOWS-1252" +}; + +const SequenceModel Iso_8859_1FrenchModel = +{ + Iso_8859_1_CharToOrderMap, + FrenchLangModel, + 38, + (float)0.997057879992383, + PR_TRUE, + "ISO-8859-1" +}; + +const SequenceModel Iso_8859_15FrenchModel = +{ + Iso_8859_15_CharToOrderMap, + FrenchLangModel, + 38, + (float)0.997057879992383, + PR_TRUE, + "ISO-8859-15" +}; \ No newline at end of file diff --git a/PowerEditor/src/uchardet/LangModels/LangGermanModel.cpp b/PowerEditor/src/uchardet/LangModels/LangGermanModel.cpp new file mode 100644 index 00000000..7a2436b8 --- /dev/null +++ 
b/PowerEditor/src/uchardet/LangModels/LangGermanModel.cpp @@ -0,0 +1,168 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: German *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-03 22:50:46.518374 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 4X */ + 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 6X */ + 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 59,SYM,SYM,SYM,SYM,SYM,SYM, 36,SYM, 54,ILL, 42,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 36,SYM, 54,ILL, 42, 56, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 60,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* CX */ + 53, 39, 51, 34, 40, 55, 
26,SYM, 38, 58, 46, 61, 24, 45, 62, 27, /* DX */ + 41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* EX */ + 53, 39, 51, 34, 40, 55, 26,SYM, 38, 58, 46, 63, 24, 45, 64, 56, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 4X */ + 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 6X */ + 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 65,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* CX */ + 53, 39, 51, 34, 40, 55, 26,SYM, 38, 58, 46, 66, 24, 45, 67, 27, /* DX */ + 41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* EX */ + 53, 39, 51, 34, 40, 55, 26,SYM, 38, 58, 46, 68, 24, 45, 69, 56, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1188 + * First 512 sequences: 0.9934041448127945 + * Next 512 sequences (512-1024): 0.006482829516922903 + * Rest: 0.0001130256702826099 + * Negative sequences: TODO + */ +static const PRUint8 GermanLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,2,3,2, + 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,2,3,3,2,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,0,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,3,0,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,0,0,2,2, + 3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,2,3,2,2,3,2,3,3,3,0,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,2,3,2,2,3,2,3,3,2,0,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,2,2, + 3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,3,3,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,3,0,3,3,1,2, + 3,3,2,3,2,3,3,3,2,3,3,3,3,2,2,2,3,2,2,2,2,2,2,2,1,3,2,0,1,2,3, + 3,3,2,3,3,3,3,2,3,3,3,3,3,3,2,3,2,3,3,2,2,2,3,2,3,3,3,0,0,2,2, + 3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,2,2,2,3,2,3,2,3,3,2,0,2,2,1, + 3,3,3,3,3,3,3,3,2,3,3,3,2,2,3,3,2,2,2,2,2,2,3,2,3,3,3,0,0,2,0, + 3,3,3,3,3,3,3,3,2,3,3,3,1,3,2,2,3,3,3,2,2,2,3,2,3,3,3,0,1,2,1, + 3,3,3,3,3,3,3,2,2,3,3,3,2,3,3,2,3,3,2,2,2,2,3,2,3,2,3,0,0,2,0, + 3,3,2,3,3,3,3,3,3,3,3,3,2,2,2,2,2,3,3,2,2,2,3,2,2,2,2,0,0,2,0, + 3,3,3,3,3,3,2,2,2,2,3,3,1,2,2,2,2,2,2,2,2,2,3,3,3,2,3,0,0,0,0, + 3,2,2,3,3,3,3,2,2,3,3,3,2,3,2,3,2,2,2,3,3,2,2,2,3,3,3,0,0,2,2, + 3,2,2,3,2,3,2,0,2,2,2,3,1,2,2,2,2,2,2,2,2,2,2,1,0,2,3,0,0,2,1, + 2,3,3,3,3,2,3,3,3,3,3,2,3,3,3,2,2,3,2,0,2,2,0,0,0,0,0,2,0,0,2, + 3,2,2,3,2,3,2,2,2,2,3,3,2,2,2,1,2,1,2,0,2,0,3,2,3,2,2,0,0,2,0, + 2,3,3,0,3,1,3,3,3,3,0,0,3,2,3,3,2,2,2,1,1,0,0,0,0,0,0,2,0,0,0, + 3,3,3,2,3,3,2,2,2,3,2,3,3,3,2,2,3,2,3,2,2,2,0,2,2,2,1,0,0,1,0, + 2,3,3,2,3,0,3,3,2,3,0,1,3,3,3,2,2,3,2,2,2,2,0,0,0,0,1,3,1,0,0, + 3,2,2,3,2,2,3,2,1,2,2,2,0,2,2,3,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0, + 3,1,2,3,1,3,3,2,1,2,2,2,2,0,0,2,2,2,3,2,0,2,0,0,0,2,0,0,2,2,0, + 2,3,2,0,2,2,2,2,2,2,2,2,2,2,2,3,2,2,2,1,2,2,0,2,0,0,0,0,0,0,2, + 0,1,0,2,0,2,0,0,0,0,3,2,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Windows_1252GermanModel = +{ + Windows_1252_CharToOrderMap, + GermanLangModel, + 31, + 
(float)0.9934041448127945, + PR_TRUE, + "WINDOWS-1252" +}; + +const SequenceModel Iso_8859_1GermanModel = +{ + Iso_8859_1_CharToOrderMap, + GermanLangModel, + 31, + (float)0.9934041448127945, + PR_TRUE, + "ISO-8859-1" +}; \ No newline at end of file diff --git a/PowerEditor/src/uchardet/LangModels/LangGreekModel.cpp b/PowerEditor/src/uchardet/LangModels/LangGreekModel.cpp new file mode 100644 index 00000000..499affe7 --- /dev/null +++ b/PowerEditor/src/uchardet/LangModels/LangGreekModel.cpp @@ -0,0 +1,229 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Greek *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-05-25 15:21:50.073117 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Windows_1253_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 32, 46, 41, 40, 30, 52, 48, 42, 33, 56, 49, 39, 44, 36, 34, /* 4X */ + 47, 59, 35, 38, 37, 43, 54, 50, 58, 53, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 32, 46, 41, 40, 30, 52, 48, 42, 33, 56, 49, 39, 44, 36, 34, /* 6X */ + 47, 59, 35, 38, 37, 43, 54, 50, 58, 53, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, /* 9X */ + SYM,SYM, 17,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 62,SYM,SYM, 19, 22, 15,SYM, 16,SYM, 24, 28, /* BX */ + 55, 0, 25, 18, 20, 5, 29, 10, 26, 3, 8, 14, 13, 4, 31, 1, /* CX */ + 11, 6,ILL, 7, 2, 12, 27, 23, 45, 21, 51, 60, 17, 19, 22, 15, /* DX */ + 61, 0, 25, 18, 20, 5, 29, 10, 26, 3, 8, 14, 13, 4, 31, 1, /* EX */ + 11, 6, 9, 7, 2, 12, 27, 23, 45, 21, 51, 60, 16, 24, 28,ILL, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_7_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 32, 46, 41, 40, 30, 52, 48, 42, 33, 56, 49, 39, 44, 36, 34, /* 4X */ + 47, 59, 35, 38, 37, 43, 54, 50, 58, 53, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 32, 46, 41, 40, 30, 52, 48, 42, 33, 56, 49, 39, 44, 36, 34, /* 6X */ + 47, 59, 35, 38, 37, 43, 54, 50, 58, 53, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM, 17,SYM, 19, 22, 15,SYM, 16,SYM, 24, 28, /* BX */ + 55, 0, 25, 18, 20, 5, 29, 10, 26, 3, 8, 14, 13, 4, 31, 1, /* CX */ + 11, 6,ILL, 7, 2, 12, 27, 23, 45, 21, 51, 60, 17, 19, 22, 15, /* DX */ + 61, 0, 25, 18, 20, 5, 29, 10, 26, 3, 8, 14, 13, 4, 31, 1, /* EX */ + 11, 6, 9, 7, 2, 12, 27, 23, 45, 21, 51, 60, 16, 24, 28,ILL, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1579 + * First 512 sequences: 0.958419074626211 + * Next 512 sequences (512-1024): 0.03968891876305471 + * Rest: 0.0018920066107342773 + * Negative sequences: TODO + */ +static const PRUint8 GreekLangModel[] = +{ + 1,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,2,3,2,3,1,2, + 3,3,3,3,3,1,3,0,3,0,0,0,0,0,0,1,0,0,1,0,0,0,2, + 2,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,1,2,3,2,3,1,2, + 3,3,3,3,3,2,2,0,2,0,0,0,0,0,0,0,0,1,0,0,1,0,2, + 3,3,2,3,2,3,3,3,2,3,3,1,3,2,2,3,3,3,2,3,0,3,3, + 2,2,2,2,2,3,3,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,3,3,3,3,3,3,3,1,3,3,1,3,3,3,3,3,3,2, + 3,1,3,3,2,3,3,0,2,0,0,1,0,0,0,1,0,0,0,0,0,0,2, + 3,3,3,3,3,3,2,3,2,2,3,1,2,2,2,3,3,3,3,3,3,3,3, + 2,2,1,3,2,3,2,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0, + 2,3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,2,2,3,1,3,3,1, + 3,3,3,3,3,2,2,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,2, + 3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,3,3,3,3, + 3,3,2,3,2,3,2,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,2,3,2,3,3,0,3,3,3,3,2,3,3,3,2,3,2,3,3, + 3,3,2,2,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,2,3,3,2,3,2,3,2,3,2,3,3,3,3,1,3,3,3,3, + 2,3,2,2,2,3,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0, + 1,1,0,1,1,1,0,1,1,0,2,1,0,1,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, + 1,1,3,0,3,2,3,3,3,3,0,3,0,3,3,1,0,0,3,1,2,0,0, + 2,1,1,3,2,0,0,0,2,0,0,1,0,0,0,0,0,0,1,0,0,0,2, + 
3,3,3,3,2,3,3,2,1,1,3,2,3,1,3,3,3,3,1,3,0,3,3, + 1,2,1,1,1,2,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,3,2,3,2,3,3,3,3,2,3,0,3,3,2,2,3,3,2,3,1,2, + 3,0,3,3,2,1,3,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,2, + 3,3,1,3,2,3,1,2,1,2,3,3,2,3,1,3,3,3,1,3,1,3,3, + 1,2,3,0,3,2,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2, + 3,3,3,3,2,3,1,2,2,2,3,2,3,3,3,3,3,3,2,3,2,3,3, + 2,3,2,2,2,3,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,1,3,3,3,3,3,3,2,3,0,3,3,0,0,0,3,0,3,3,0, + 3,0,2,3,2,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 2,2,3,2,3,3,3,3,3,3,2,3,1,3,3,0,0,0,3,0,3,1,0, + 3,1,2,2,3,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 2,2,3,3,3,2,3,3,3,3,2,3,1,3,3,0,0,0,3,0,3,1,0, + 3,0,3,3,3,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 3,3,0,3,3,3,3,0,3,0,3,0,2,3,3,3,3,3,3,3,2,3,3, + 2,2,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,3,2,3,1,3,3,0,0,0,3,0,3,3,0, + 3,0,3,3,3,0,2,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 3,3,1,3,2,3,3,1,0,0,3,0,3,1,0,3,3,3,0,3,0,3,3, + 0,3,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,1,3,2,3,1,3,3,2,3,1,3,1,3,2,2,1,2,3,1,2,0,2, + 2,0,3,3,2,1,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,3,1,3,1,3,3,3,3,1,2,0,3,3,0,0,0,2,0,2,1,0, + 2,0,1,3,2,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 3,3,3,3,3,3,3,1,0,1,3,1,2,2,2,3,2,3,0,3,0,3,3, + 0,2,1,3,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,3,3,2,3,3,3,3,2,3,2,3,0,3,3,0,0,0,3,0,2,1,0, + 2,0,2,3,2,0,2,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,2, + 3,3,1,3,2,3,3,1,1,1,2,1,2,0,3,3,3,3,2,3,2,2,2, + 0,2,2,0,0,2,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,0,3,3,3,3,1,1,0,3,0,3,3,3,2,2,3,1,3,0,2,3, + 0,2,0,0,1,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,2,3,1,3,3,3,2,0,3,1,3,1,2,3,3,3,2,3,0,3,3, + 0,2,0,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,3,2,3,0,3,3,2,3,2,3,0,3,2,0,0,0,1,0,2,1,0, + 1,0,2,2,1,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,1,3,1,3,1,1,1,0,2,0,2,2,1,2,2,2,1,2,0,3,2, + 0,2,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,0,0,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,2,1,2,2,2,3,3,2,3,2,2,2,2,2,2,0, + 
3,3,1,3,1,3,0,0,1,0,3,1,2,1,1,2,2,3,1,2,0,2,2, + 0,3,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,0,1,1,0,0,0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0, + 0,0,0,1,1,0,0,2,0,2,2,1,3,3,3,2,3,2,2,2,2,2,0, + 0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, + 0,0,1,0,0,0,0,2,0,3,2,3,2,3,3,3,2,2,3,1,2,2,0, + 0,0,1,0,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0, + 0,1,0,1,0,0,0,2,0,2,2,2,3,3,2,2,2,2,2,2,2,2,0, + 0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,1,0,0,0,3,0,3,3,3,2,2,2,2,2,2,2,1,2,2,0, + 0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0, + 0,0,0,0,0,0,0,3,0,3,2,2,1,2,2,2,2,3,2,1,2,1,0, + 1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0, + 0,0,0,0,0,0,1,3,0,3,3,3,2,1,2,2,2,1,1,3,2,2,0, + 0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,2,0,2,2,2,1,1,3,2,2,1,2,2,2,2,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,3,0,3,3,2,1,1,2,2,2,2,1,1,2,2,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,3,0,2,2,2,2,1,1,2,2,1,2,1,2,1,0, + 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,2,0,2,2,2,2,1,2,1,2,2,2,3,2,1,0, + 0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,3,0,2,2,2,2,2,2,1,2,1,1,1,2,2,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,2,0,2,2,1,2,2,2,2,2,2,2,1,1,2,0, + 1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0, + 0,0,0,0,0,0,0,3,0,2,2,2,1,1,1,2,2,1,1,1,2,2,0, + 2,2,0,2,0,3,0,0,0,0,3,0,2,0,0,2,1,1,0,1,0,1,2, + 0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Windows_1253GreekModel = +{ + Windows_1253_CharToOrderMap, + GreekLangModel, + 46, + (float)0.958419074626211, + PR_FALSE, + "WINDOWS-1253" +}; + +const SequenceModel Iso_8859_7GreekModel = +{ + Iso_8859_7_CharToOrderMap, + GreekLangModel, + 46, + (float)0.958419074626211, + PR_FALSE, + "ISO-8859-7" +}; diff --git a/PowerEditor/src/uchardet/LangHebrewModel.cpp b/PowerEditor/src/uchardet/LangModels/LangHebrewModel.cpp similarity index 92% rename from PowerEditor/src/uchardet/LangHebrewModel.cpp 
rename to PowerEditor/src/uchardet/LangModels/LangHebrewModel.cpp index 99a36e72..af9ac2b0 100644 --- a/PowerEditor/src/uchardet/LangHebrewModel.cpp +++ b/PowerEditor/src/uchardet/LangModels/LangHebrewModel.cpp @@ -37,14 +37,14 @@ * * ***** END LICENSE BLOCK ***** */ -#include "nsSBCharSetProber.h" +#include "../nsSBCharSetProber.h" /**************************************************************** -255: Control characters that usually does not exist in any text -254: Carriage/Return -253: symbol (punctuation) that does not belong to word -252: 0 - 9 +CTR: Control characters that usually does not exist in any text +RET: Carriage/Return +SYM: symbol (punctuation) that does not belong to word +NUM: 0 - 9 *****************************************************************/ @@ -52,22 +52,22 @@ //Character Mapping Table: static const unsigned char win1255_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, //40 - 78,121, 86, 71, 67,102,107, 84,114,103,115,253,253,253,253,253, //50 -253, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, //60 - 66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,253,253,253,253,253, //70 -124,202,203,204,205, 40, 58,206,207,208,209,210,211,212,213,214, -215, 83, 52, 47, 46, 72, 32, 94,216,113,217,109,218,219,220,221, +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, //40 + 78,121, 86, 71, 67,102,107, 
84,114,103,115,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, //60 + 66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,SYM,SYM,SYM,SYM,SYM, //70 +124,ILL,203,204,205, 40, 58,206,207,208,ILL,210,ILL,ILL,ILL,ILL, +ILL, 83, 52, 47, 46, 72, 32, 94,216,113,ILL,109,ILL,ILL,ILL,ILL, 34,116,222,118,100,223,224,117,119,104,125,225,226, 87, 99,227, 106,122,123,228, 55,229,230,101,231,232,120,233, 48, 39, 57,234, 30, 59, 41, 88, 33, 37, 36, 31, 29, 35,235, 62, 28,236,126,237, -238, 38, 45,239,240,241,242,243,127,244,245,246,247,248,249,250, +238, 38, 45,239,240,241,242,243,127,ILL,ILL,ILL,ILL,ILL,ILL,ILL, 9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23, - 12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253, + 12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,ILL,ILL,128, 96,ILL, }; //Model Table: @@ -208,10 +208,13 @@ static const PRUint8 HebrewLangModel[] = 0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0, }; -const SequenceModel Win1255Model( +const SequenceModel Win1255Model = +{ win1255_CharToOrderMap, HebrewLangModel, + 64, (float)0.984004, PR_FALSE, - "windows-1255"); + "WINDOWS-1255" +}; diff --git a/PowerEditor/src/uchardet/LangModels/LangHungarianModel.cpp b/PowerEditor/src/uchardet/LangModels/LangHungarianModel.cpp new file mode 100644 index 00000000..54708b2e --- /dev/null +++ b/PowerEditor/src/uchardet/LangModels/LangHungarianModel.cpp @@ -0,0 +1,169 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Hungarian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-12 18:02:46.730481 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. 
+ * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_2_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 15, 23, 16, 0, 24, 13, 20, 7, 22, 9, 4, 12, 6, 8, /* 4X */ + 21, 34, 5, 3, 2, 19, 17, 32, 33, 18, 10,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 15, 23, 16, 0, 24, 13, 20, 7, 22, 9, 4, 12, 6, 8, /* 6X */ + 21, 34, 5, 3, 2, 19, 17, 32, 33, 18, 10,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 55,SYM, 42,SYM, 56, 46,SYM,SYM, 37, 52, 57, 58,SYM, 48, 59, /* AX */ + SYM, 60,SYM, 42,SYM, 61, 46,SYM,SYM, 37, 52, 62, 63,SYM, 48, 64, /* BX */ + 65, 11, 40, 36, 35, 66, 38, 39, 41, 14, 50, 67, 53, 28, 45, 68, /* CX */ + 49, 43, 54, 26, 69, 27, 25,SYM, 44, 70, 30, 31, 29, 47, 51, 71, /* DX */ + 72, 11, 40, 36, 35, 73, 38, 39, 41, 14, 50, 74, 53, 28, 45, 75, /* EX */ + 49, 43, 54, 26, 76, 27, 25,SYM, 44, 77, 30, 31, 29, 47, 51,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1250_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 15, 23, 16, 0, 24, 13, 20, 7, 22, 9, 4, 12, 6, 8, /* 4X */ + 21, 34, 5, 3, 2, 19, 17, 32, 33, 
18, 10,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 15, 23, 16, 0, 24, 13, 20, 7, 22, 9, 4, 12, 6, 8, /* 6X */ + 21, 34, 5, 3, 2, 19, 17, 32, 33, 18, 10,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 37,SYM, 46, 78, 48, 79, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 37,SYM, 46, 80, 48, 81, /* 9X */ + SYM,SYM,SYM, 42,SYM, 82,SYM,SYM,SYM,SYM, 52,SYM,SYM,SYM,SYM, 83, /* AX */ + SYM,SYM,SYM, 42,SYM,SYM,SYM,SYM,SYM, 84, 52,SYM, 85,SYM, 86, 87, /* BX */ + 88, 11, 40, 36, 35, 89, 38, 39, 41, 14, 50, 90, 53, 28, 45, 91, /* CX */ + 49, 43, 54, 26, 92, 27, 25,SYM, 44, 93, 30, 31, 29, 47, 51, 94, /* DX */ + 95, 11, 40, 36, 35, 96, 38, 39, 41, 14, 50, 97, 53, 28, 45, 98, /* EX */ + 49, 43, 54, 26, 99, 27, 25,SYM, 44,100, 30, 31, 29, 47, 51,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1084 + * First 512 sequences: 0.9748272224933486 + * Next 512 sequences (512-1024): 0.024983863604162403 + * Rest: 0.0001889139024889644 + * Negative sequences: TODO + */ +static const PRUint8 HungarianLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,1,0,2,2,0,0, + 3,2,3,3,3,3,3,3,2,3,3,2,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,2,2,1,2,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,2,3,2,2,3,3,3,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,3,3,3,2,3,2,2,3,3,3,3,3,2, + 3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,3,3,2, + 3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,2,2,3,3,3,2,3,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,1,3,3,3,2,3,3,2,3,0,2,2,2,2, + 3,2,3,3,3,3,3,2,2,3,3,2,3,3,0,3,3,3,2,3,3,3,2,3,3,0,2,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,2,2,2,3,2,2,2,2,2,3,3,2,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,2,3,3,3,3,3,2,2, + 1,2,3,3,3,3,3,3,2,3,3,0,3,3,2,3,3,3,2,2,2,3,3,3,2,0,0,0,2,0,0,0, + 3,3,3,2,3,2,2,3,3,2,3,3,3,2,3,3,2,2,2,3,2,3,2,2,2,2,3,2,2,2,2,3, + 
3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,2,3,3,3,3,2,3,2,2,3,3,2,3,2,2,2, + 0,1,3,3,3,3,3,2,2,3,3,0,3,3,2,3,3,3,0,0,2,3,2,3,0,0,0,0,0,2,0,0, + 3,3,2,3,3,3,2,3,3,2,2,3,2,1,3,3,3,2,2,3,1,2,2,2,2,2,3,3,3,2,2,2, + 3,3,3,3,2,3,3,3,3,2,2,3,3,2,3,2,2,3,2,3,2,2,3,2,2,3,3,3,3,2,2,2, + 3,3,2,2,2,2,2,3,3,2,0,3,0,2,3,2,2,2,1,2,2,0,2,1,2,3,2,3,3,2,2,2, + 3,3,3,3,2,2,3,3,3,2,3,3,3,2,3,3,2,3,1,3,3,2,2,2,2,2,2,2,2,2,2,3, + 3,2,3,3,3,3,3,2,2,3,2,3,3,3,0,3,3,2,2,2,2,2,2,3,2,0,0,0,1,0,0,0, + 3,3,2,2,2,2,2,3,3,2,0,3,2,2,2,2,2,2,2,3,2,0,2,2,2,2,2,2,3,2,2,2, + 3,3,3,3,3,3,2,3,3,2,2,3,1,2,3,2,2,2,2,3,2,3,3,3,2,2,2,2,3,3,2,0, + 3,3,3,2,2,2,3,2,3,2,2,3,2,2,3,2,3,2,0,3,2,2,2,2,2,2,3,0,2,2,3,2, + 3,3,2,3,2,2,2,3,3,3,3,2,2,2,3,2,2,2,2,2,3,0,0,2,2,2,2,0,3,0,0,0, + 3,3,2,2,2,3,2,3,3,0,0,2,2,2,3,2,2,2,2,3,0,2,2,2,2,3,2,3,2,3,2,2, + 2,0,3,3,3,3,3,0,0,3,3,0,2,3,0,3,3,3,0,0,2,2,2,2,1,0,0,0,0,0,0,0, + 2,2,3,3,3,3,3,3,2,3,3,2,3,3,2,3,3,3,0,0,2,3,3,2,2,2,0,0,1,2,2,0, + 2,2,3,3,3,3,2,3,2,3,3,2,2,2,2,3,3,2,0,0,2,2,3,2,2,1,0,0,1,2,1,0, + 0,2,3,2,2,3,3,2,2,2,3,0,3,3,0,2,2,3,0,2,1,2,3,2,2,0,0,0,0,0,0,0, + 0,0,3,2,3,2,3,0,0,3,2,0,2,3,0,0,2,2,0,0,1,0,2,0,0,0,0,0,0,0,0,0, + 2,2,3,3,3,2,3,0,0,2,2,0,0,3,0,2,2,2,0,0,2,2,3,2,1,0,0,0,0,0,0,0, + 2,2,2,2,3,2,2,2,0,3,2,0,2,2,0,2,2,3,0,2,2,0,2,2,2,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_2HungarianModel = +{ + Iso_8859_2_CharToOrderMap, + HungarianLangModel, + 32, + (float)0.9748272224933486, + PR_FALSE, + "ISO-8859-2" +}; + +const SequenceModel Windows_1250HungarianModel = +{ + Windows_1250_CharToOrderMap, + HungarianLangModel, + 32, + (float)0.9748272224933486, + PR_FALSE, + "WINDOWS-1250" +}; \ No newline at end of file diff --git a/PowerEditor/src/uchardet/LangCyrillicModel.cpp b/PowerEditor/src/uchardet/LangModels/LangRussianModel.cpp similarity index 80% rename from PowerEditor/src/uchardet/LangCyrillicModel.cpp rename to PowerEditor/src/uchardet/LangModels/LangRussianModel.cpp index 42f28876..a5320494 100644 --- 
a/PowerEditor/src/uchardet/LangCyrillicModel.cpp +++ b/PowerEditor/src/uchardet/LangModels/LangRussianModel.cpp @@ -35,7 +35,7 @@ * * ***** END LICENSE BLOCK ***** */ -#include "nsSBCharSetProber.h" +#include "../nsSBCharSetProber.h" @@ -43,18 +43,18 @@ //Character Mapping Table: static const unsigned char KOI8R_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, //80 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, //90 223,224,225, 68,226,227,228,229,230,231,232,233,234,235,236,237, //a0 -238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253, //b0 +238,239,240,241,242,243,244,245,246,247,248,249,250,251,NUM,SYM, //b0 27, 3, 21, 28, 13, 2, 39, 19, 26, 4, 23, 11, 8, 12, 5, 1, //c0 15, 16, 9, 7, 6, 14, 24, 10, 17, 18, 20, 25, 30, 29, 22, 54, //d0 59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 
36, 49, 38, 31, 34, //e0 @@ -63,18 +63,18 @@ static const unsigned char KOI8R_CharToOrderMap[] = static const unsigned char win1251_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, -207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, +207,208,209,210,211,212,213,214,ILL,216,217,218,219,220,221,222, 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, -239,240,241,242,243,244,245,246, 68,247,248,249,250,251,252,253, +239,240,241,242,243,244,245,246, 68,247,248,249,250,251,NUM,SYM, 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, @@ -83,14 +83,14 @@ static const unsigned char win1251_CharToOrderMap[] = static const unsigned char latin5_CharToOrderMap[] = { 
-255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, @@ -98,39 +98,39 @@ static const unsigned char latin5_CharToOrderMap[] = 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, -239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, +239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, }; static const unsigned char macCyrillic_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 
-252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, -239,240,241,242,243,244,245,246,247,248,249,250,251,252, 68, 16, +239,240,241,242,243,244,245,246,247,248,249,250,251,NUM, 68, 16, 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, - 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255, + 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,CTR, }; static const unsigned char IBM855_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 
-155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 191,192,193,194, 68,195,196,197,198,199,200,201,202,203,204,205, 206,207,208,209,210,211,212,213,214,215,216,217, 27, 59, 54, 70, 3, 37, 21, 44, 28, 58, 13, 41, 2, 48, 39, 53, 19, 46,218,219, @@ -138,19 +138,19 @@ static const unsigned char IBM855_CharToOrderMap[] = 230,231,232,233,234,235, 11, 36,236,237,238,239,240,241,242,243, 8, 49, 12, 38, 5, 31, 1, 34, 15,244,245,246,247, 35, 16,248, 43, 9, 45, 7, 32, 6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249, -250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255, +250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,NUM,CTR, }; static const unsigned char IBM866_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 
+CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, @@ -158,7 +158,7 @@ static const unsigned char IBM866_CharToOrderMap[] = 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, -239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, +239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, }; //Model Table: @@ -300,50 +300,62 @@ static const PRUint8 RussianLangModel[] = }; -const SequenceModel Koi8rModel( +const SequenceModel Koi8rRussianModel = +{ KOI8R_CharToOrderMap, RussianLangModel, + 64, (float)0.976601, PR_FALSE, "KOI8-R" -); +}; -const SequenceModel Win1251Model( +const SequenceModel Win1251RussianModel = +{ win1251_CharToOrderMap, RussianLangModel, + 64, (float)0.976601, PR_FALSE, - "windows-1251" -); + "WINDOWS-1251" +}; -const SequenceModel Latin5Model( +const SequenceModel Latin5RussianModel = +{ latin5_CharToOrderMap, RussianLangModel, + 64, (float)0.976601, PR_FALSE, "ISO-8859-5" -); +}; -const SequenceModel MacCyrillicModel( +const SequenceModel MacCyrillicRussianModel = +{ macCyrillic_CharToOrderMap, RussianLangModel, + 64, (float)0.976601, PR_FALSE, - "x-mac-cyrillic" -); + "MAC-CYRILLIC" +}; -const 
SequenceModel Ibm866Model( +const SequenceModel Ibm866RussianModel = +{ IBM866_CharToOrderMap, RussianLangModel, + 64, (float)0.976601, PR_FALSE, "IBM866" -); +}; -const SequenceModel Ibm855Model( +const SequenceModel Ibm855RussianModel = +{ IBM855_CharToOrderMap, RussianLangModel, + 64, (float)0.976601, PR_FALSE, "IBM855" -); +}; diff --git a/PowerEditor/src/uchardet/LangModels/LangSpanishModel.cpp b/PowerEditor/src/uchardet/LangModels/LangSpanishModel.cpp new file mode 100644 index 00000000..362bc5ea --- /dev/null +++ b/PowerEditor/src/uchardet/LangModels/LangSpanishModel.cpp @@ -0,0 +1,201 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Spanish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-12 18:39:02.290370 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 4X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 6X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 52,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 53, 22, 41, 43, /* CX */ + 49, 29, 38, 19, 50, 54, 34,SYM, 44, 51, 30, 55, 32, 42, 56, 57, /* DX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 58, 22, 41, 43, /* EX */ + 49, 29, 38, 19, 50, 59, 34,SYM, 44, 51, 30, 60, 32, 42, 61, 62, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 4X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 6X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 63,SYM, 64,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 65, 66,SYM,SYM, 67,SYM,SYM,SYM, 68, 69, 70,SYM, /* BX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 71, 22, 41, 43, /* CX */ + 49, 29, 38, 19, 50, 72, 34,SYM, 44, 51, 30, 73, 32, 42, 74, 75, /* DX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 76, 22, 41, 43, /* EX */ + 49, 29, 38, 19, 50, 77, 34,SYM, 44, 51, 30, 78, 32, 42, 79, 80, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 4X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 6X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 81,SYM,SYM,SYM,SYM,SYM,SYM, 82,SYM, 83,ILL, 84,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 85,SYM, 86,ILL, 87, 88, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 89,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 90, 22, 41, 43, /* CX */ + 49, 29, 38, 19, 50, 91, 34,SYM, 44, 51, 30, 92, 32, 42, 93, 94, /* DX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 95, 22, 41, 43, /* EX */ + 49, 29, 38, 19, 50, 96, 34,SYM, 44, 51, 30, 97, 32, 42, 98, 99, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * 
Total sequences: 897 + * First 512 sequences: 0.9970385677528184 + * Next 512 sequences (512-1024): 0.0029614322471815486 + * Rest: 4.597017211338539e-17 + * Negative sequences: TODO + */ +static const PRUint8 SpanishLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,2,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,3,3,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,3,2,2,3,3,2,2,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,2,3,3,3,0,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,2,0,3,2,2, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,3,3,2,2,0,2,2,0, + 3,3,3,2,3,3,3,3,2,2,2,3,3,2,2,3,2,3,3,3,3,2,3,2,2,3,3,2,0,0,2,2,2, + 3,3,3,3,3,3,3,3,2,3,3,3,2,2,3,2,2,3,2,3,3,0,3,2,2,3,3,0,0,0,2,2,2, + 3,3,3,3,3,3,3,3,2,3,3,3,2,2,2,2,2,3,0,3,3,2,3,0,2,3,3,3,0,0,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,0,2,0, + 3,3,3,3,3,3,2,2,2,2,2,3,3,3,3,2,2,3,0,3,2,0,3,2,0,3,3,2,2,0,3,2,2, + 3,3,3,2,3,3,3,3,2,3,3,3,2,3,3,0,2,2,2,3,3,0,3,2,0,3,3,2,0,0,3,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,2,3,2,3,2,2,3,3,0,3,2,2,0,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,0,3,3,0,2,2,2,2,2,3,3,0,3,2,2,2,3,2,0,0,3,2,3, + 3,3,3,2,2,3,3,3,2,3,2,3,2,2,2,2,3,2,0,3,0,0,3,2,0,2,2,2,0,0,3,2,0, + 3,3,3,3,3,3,3,3,2,2,2,3,2,2,2,2,2,2,0,3,2,0,0,2,2,2,2,2,0,0,2,2,0, + 3,3,3,2,2,3,2,2,2,0,2,3,0,2,0,2,2,2,2,3,0,0,3,0,0,2,3,2,0,0,0,0,0, + 0,0,0,3,3,0,3,3,3,3,3,0,3,3,2,3,2,0,3,0,0,0,0,0,0,0,0,0,2,0,0,0,0, + 3,3,3,3,2,3,3,3,3,3,2,3,3,0,2,0,2,3,2,2,2,0,3,2,2,2,3,0,2,0,2,2,2, + 2,3,2,0,2,2,0,2,2,2,0,3,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,0,2,2,3,3,3,2,3,2,3,3,3,0,2,0,0,2,0,2,2,0,0,0,0,0,0,0,0, + 3,3,3,2,0,3,2,2,2,2,0,3,2,2,0,0,0,0,0,3,0,0,2,2,0,2,3,0,0,0,2,0,2, + 3,3,3,2,0,3,2,0,2,2,2,3,2,2,2,3,0,2,0,3,2,3,2,0,3,3,2,2,0,0,2,0,0, + 
2,0,0,3,3,2,3,3,2,3,3,2,3,3,2,3,3,2,2,0,2,2,0,2,2,0,0,0,2,2,0,0,0, + 2,3,2,3,3,2,3,3,3,3,3,2,2,3,2,3,2,2,2,0,0,0,0,2,0,0,0,0,3,0,0,0,0, + 3,3,3,2,3,3,3,3,2,2,2,3,3,0,2,2,2,3,2,0,2,0,2,0,0,0,0,2,0,0,2,2,0, + 3,3,3,2,2,3,2,2,2,3,3,3,2,3,2,0,2,2,3,2,2,2,0,2,0,2,2,2,3,0,0,2,0, + 3,3,3,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,3,0,0,2,0,0,0,0,0,0,0, + 2,3,2,3,3,0,2,3,2,3,2,0,3,2,3,0,2,0,0,0,0,0,0,2,2,0,0,0,0,2,0,0,0, + 3,3,3,3,2,3,2,2,2,2,2,2,0,0,2,0,2,2,0,0,2,0,0,2,0,2,0,2,0,0,0,2,0, + 3,0,0,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_1SpanishModel = +{ + Iso_8859_1_CharToOrderMap, + SpanishLangModel, + 33, + (float)0.9970385677528184, + PR_TRUE, + "ISO-8859-1" +}; + +const SequenceModel Iso_8859_15SpanishModel = +{ + Iso_8859_15_CharToOrderMap, + SpanishLangModel, + 33, + (float)0.9970385677528184, + PR_TRUE, + "ISO-8859-15" +}; + +const SequenceModel Windows_1252SpanishModel = +{ + Windows_1252_CharToOrderMap, + SpanishLangModel, + 33, + (float)0.9970385677528184, + PR_TRUE, + "WINDOWS-1252" +}; \ No newline at end of file diff --git a/PowerEditor/src/uchardet/LangModels/LangThaiModel.cpp b/PowerEditor/src/uchardet/LangModels/LangThaiModel.cpp new file mode 100644 index 00000000..8e90afbb --- /dev/null +++ b/PowerEditor/src/uchardet/LangModels/LangThaiModel.cpp @@ -0,0 +1,265 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. 
+ * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Thai *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-04 03:05:06.182099 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. 
For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Tis_620_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 66, 70, 67, 80, 78, 87, 85, 73, 79, 93, 88, 84, 68, 77, 81, /* 4X */ + 75,101, 74, 61, 71, 86, 96, 90,103,100, 99,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 35, 64, 48, 52, 32, 60, 65, 54, 36, 97, 76, 46, 56, 41, 40, /* 6X */ + 59,104, 43, 45, 44, 55, 72, 82, 94, 57, 92,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + ILL, 3, 23,105, 15,106, 89, 5, 21, 63, 26, 31,102, 42, 69, 58, /* AX */ + 49, 91, 83, 34, 9, 17, 30, 12, 39, 1, 16, 19, 33, 62, 22, 47, /* BX */ + 38, 7, 10, 2, 50, 11,107, 8, 28, 37, 13, 18, 98, 4, 53, 95, /* CX */ + 14,SYM, 0, 29,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,ILL,ILL,ILL,SYM, /* DX */ + 6, 20, 27, 24, 25,108, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,109, /* EX */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,110,111,ILL,ILL,ILL,ILL, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_11_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 66, 70, 67, 80, 78, 87, 85, 73, 79, 93, 88, 84, 68, 77, 81, /* 4X */ + 75,101, 74, 61, 71, 86, 96, 90,103,100, 99,SYM,SYM,SYM,SYM,SYM, /* 5X */ 
+ SYM, 35, 64, 48, 52, 32, 60, 65, 54, 36, 97, 76, 46, 56, 41, 40, /* 6X */ + 59,104, 43, 45, 44, 55, 72, 82, 94, 57, 92,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 3, 23,112, 15,113, 89, 5, 21, 63, 26, 31,102, 42, 69, 58, /* AX */ + 49, 91, 83, 34, 9, 17, 30, 12, 39, 1, 16, 19, 33, 62, 22, 47, /* BX */ + 38, 7, 10, 2, 50, 11,114, 8, 28, 37, 13, 18, 98, 4, 53, 95, /* CX */ + 14,SYM, 0, 29,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,ILL,ILL,ILL,SYM, /* DX */ + 6, 20, 27, 24, 25,115, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,116, /* EX */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,117,118,ILL,ILL,ILL,ILL, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 2324 + * First 512 sequences: 0.8815720594354438 + * Next 512 sequences (512-1024): 0.0920860122682917 + * Rest: 0.026341928296264486 + * Negative sequences: TODO + */ +static const PRUint8 ThaiLangModel[] = +{ + 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3, + 0,2,3,0,0,3,2,3,0,0,2,0,0,0,0,2,0,1,1,1,0,2,0,0,0,0,1,0,0,0,1,1, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3, + 0,3,0,0,0,1,3,3,0,0,1,0,0,0,0,2,0,2,1,2,0,1,0,0,0,0,0,0,0,0,2,1, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,3,3,2,2,2,3,1,3,2, + 0,2,3,0,0,2,2,1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,2,1, + 3,3,3,3,3,2,3,3,3,3,2,3,3,3,2,3,2,3,3,3,3,3,3,3,3,3,2,3,2,3,2,3, + 0,2,1,0,0,3,2,1,0,0,0,0,0,0,0,1,0,3,3,1,0,1,0,0,0,0,3,0,0,0,1,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,2,2,2,3,3,2,2,1,2,2,2, + 0,2,0,0,0,0,2,2,0,0,1,0,0,0,0,2,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2, + 0,3,0,0,0,1,2,2,0,0,1,0,0,0,0,2,0,1,1,2,0,2,0,0,0,0,0,0,0,0,2,1, + 0,3,3,3,3,2,0,3,3,3,3,3,3,3,0,3,3,3,3,3,0,3,3,3,0,0,3,0,3,0,1,3, + 0,2,0,0,0,2,2,2,0,0,0,0,0,0,0,3,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,3, + 
3,3,3,3,3,2,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,2,2,1,0,2,1, + 0,2,2,0,1,2,2,1,0,0,1,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,1, + 3,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2,3,2,2,2,3,3,3,2,2,2,2,2,2,0,2,2, + 0,1,2,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,3,1,0,1,0,0,0,0,0,0,0,0,1,1, + 3,3,3,3,3,3,3,2,3,2,3,3,3,3,0,3,2,3,2,2,3,2,2,3,3,3,2,2,1,3,2,1, + 0,1,0,0,0,0,2,1,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,1, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,2,1,2,2, + 0,2,0,0,0,0,3,1,0,0,1,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,1, + 3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,2,2,3,2,2,2,2,1,3,2,2,2,2,1,3,1,2, + 0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1, + 3,3,3,1,2,1,2,1,2,3,3,1,1,2,2,3,2,1,2,1,1,1,2,1,1,1,1,1,3,3,0,1, + 0,0,0,0,0,1,1,3,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,3,3,3,3,3,2,3,2,2,2,2,3,3,3,2,2,1,1,1,2,2,1,2,1,3,3,2, + 0,1,0,0,0,0,2,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, + 0,3,3,3,3,1,3,3,3,3,3,2,3,3,0,3,3,3,3,3,3,3,3,2,3,3,3,3,2,0,2,2, + 0,2,1,0,0,0,2,2,0,0,1,0,0,0,0,1,0,1,1,0,0,2,0,0,0,0,1,0,0,0,1,1, + 3,3,3,1,3,2,2,3,3,2,2,3,1,1,2,2,1,2,1,2,1,3,1,1,1,1,1,2,0,3,0,1, + 0,0,2,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0, + 3,3,3,3,3,1,3,2,3,3,2,3,3,3,1,3,3,3,3,3,3,2,2,2,3,3,2,2,2,2,2,2, + 0,2,0,0,0,0,2,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,1, + 3,3,3,3,3,1,2,1,2,1,3,2,2,2,3,1,2,2,1,1,2,1,1,2,2,1,1,2,1,3,3,1, + 0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0, + 3,3,3,1,2,1,0,3,3,1,2,3,1,1,1,0,0,3,1,1,0,0,1,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,2,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,3,3,3,1,2,1,2,2,2,3,2,2,2,1,1,2,1,2,2,2,1,1,2,2,1,1,1,0,2,1, + 0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,3,0,0,0,0,0, + 0,3,3,3,3,1,0,3,2,2,2,3,3,3,0,3,3,3,3,3,0,1,2,2,0,0,1,0,0,0,3,3, + 0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0, + 3,3,3,3,3,1,3,2,2,2,1,1,2,2,3,2,1,2,1,1,2,3,3,2,2,2,1,2,0,3,1,2, + 
0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, + 3,1,3,2,3,1,2,2,3,2,3,3,3,2,0,1,3,1,1,1,2,2,1,2,1,1,1,1,1,1,1,0, + 0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,1,1,3,0,1,1,2,1,2,1,2,1,0,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,1,1, + 0,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,3,0,3,0,0,0,0,0,2,1,0,0,2,0,1,1,3,3,1,0,3,0,0,0,0,3,0,0,0,0,0, + 0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,3,2,2,0,0,3,3,3,0,2,3,1,0,2,2,2,2,3,0,1,1,3,0,0,1,0,0,0,1,2, + 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0, + 3,3,1,2,3,1,2,2,2,1,2,2,2,2,1,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1, + 0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,3,3,2,3,0,0,2,1,3,2,3,3,1,0,3,2,3,1,2,0,2,2,1,0,0,1,0,1,0,1,2, + 0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1, + 3,3,2,2,2,0,2,2,2,1,2,1,2,2,0,1,1,2,1,1,2,2,1,2,2,2,1,1,1,0,1,1, + 0,0,0,0,0,2,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0, + 0,3,3,3,2,2,3,2,2,2,1,3,2,2,0,3,2,2,3,1,3,1,2,2,3,2,1,2,1,0,2,1, + 0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0, + 3,2,1,1,2,1,2,2,2,1,1,2,2,1,1,1,2,1,1,1,2,1,1,1,2,1,1,1,1,0,1,0, + 0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, + 3,3,1,1,3,2,2,1,1,1,1,2,1,0,1,1,1,2,0,1,1,0,0,0,0,1,1,1,0,0,0,1, + 0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, + 2,0,0,2,2,0,0,0,2,3,0,3,2,3,3,0,2,0,0,0,2,0,1,2,2,1,0,2,2,1,0,0, + 1,2,0,1,0,1,1,1,1,1,2,3,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,1,2,2,1,1,1,1,1,1,1,1,2,2,3,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,1, + 0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, + 1,0,0,1,2,0,0,0,1,3,0,3,3,2,3,0,2,0,0,0,2,0,1,1,2,2,0,2,1,1,0,0, + 
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,3,1,0,0,0,3,3,0,2,3,3,2,0,3,0,0,0,2,0,1,1,2,0,0,1,1,0,0,0, + 3,1,1,2,1,0,1,1,1,1,2,0,2,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1, + 0,1,3,0,0,1,2,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,1,0,0,0,1,0, + 3,0,2,1,1,0,0,1,0,0,1,0,2,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,1,3,1,2,1,1,2,1,1,1,0,1,1,0,1,1,1,1,1,1,0,0,1,1,1,0,1,1,1,0,0, + 0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,1,1,0,0,0,1,3,0,3,2,2,2,0,2,0,0,0,2,0,1,2,2,1,0,2,3,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,2,2,0,0,0,2,2,0,1,3,2,1,0,2,0,0,0,3,0,1,1,1,1,0,0,1,0,0,0, + 3,1,1,1,1,0,2,1,1,0,0,1,2,1,0,1,1,1,2,1,1,1,1,1,2,1,2,1,1,0,1,1, + 0,0,0,0,0,0,1,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,3,3,0,0,0,2,2,0,2,2,2,1,0,2,0,0,0,2,0,1,1,1,2,0,1,1,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,2,3,0,0,0,2,1,0,2,2,2,1,0,1,0,0,0,1,0,3,2,1,2,0,1,1,0,0,0, + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,1,2,0,0,0,2,1,0,1,3,2,1,0,2,0,0,0,1,0,2,1,1,1,0,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,2,2,0,0,0,2,2,0,0,1,1,2,0,1,0,0,0,1,0,1,1,1,1,0,1,1,0,0,0, + 1,1,3,2,2,0,2,1,1,1,1,2,1,1,0,1,1,2,1,0,1,1,1,1,1,1,1,1,0,0,0,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,2,2,0,0,0,2,0,0,1,2,1,1,0,1,0,0,0,0,0,2,1,0,1,0,0,0,0,0,0, + 3,1,1,1,2,0,1,2,1,0,0,0,1,2,0,1,2,1,1,1,1,0,0,0,1,1,0,1,1,0,0,1, + 0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,0,3,0,0,0,0,0,2,0,0,1,0,0,1,0,2,2,0,0,1,0,0,0,0,0,0,2,0,1,0, + 
0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,1,1,0,1,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0,1,1,1,1,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,2,2,0,0,0,2,0,0,1,0,1,1,0,1,0,0,0,1,0,1,1,1,2,0,0,2,0,0,0, + 2,1,1,0,2,0,2,1,1,1,1,2,1,1,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,2,2,0,0,0,2,1,0,1,1,1,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,0,0,1,1,0,0,0,0,2,0,2,2,2,2,0,2,0,0,0,2,0,1,0,1,1,0,1,1,1,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,2,2,0,0,0,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,2,0,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, + 1,0,0,1,1,0,0,0,1,1,0,0,1,2,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0, + 1,0,1,2,1,0,1,1,1,1,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,0,0,1,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,2,1,0,0,0,2,0,0,2,1,1,2,0,0,0,0,0,0,0,2,1,1,2,0,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,1,2,0,0,0,2,1,0,2,1,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,2,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,0,0,1,1,0,0,0,1,0,0,0,2,0,0,0,2,0,0,0,0,0,1,1,1,1,0,1,1,1,0,0, + 0,1,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,1,1,0,0,0,0,1,1,1,1,2,0,0,1,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Tis_620ThaiModel = +{ + Tis_620_CharToOrderMap, + ThaiLangModel, + 64, + (float)0.8815720594354438, + PR_FALSE, + "TIS-620" +}; + +const SequenceModel 
Iso_8859_11ThaiModel = +{ + Iso_8859_11_CharToOrderMap, + ThaiLangModel, + 64, + (float)0.8815720594354438, + PR_FALSE, + "ISO-8859-11" +}; \ No newline at end of file diff --git a/PowerEditor/src/uchardet/LangModels/LangTurkishModel.cpp b/PowerEditor/src/uchardet/LangModels/LangTurkishModel.cpp new file mode 100644 index 00000000..e68bcf6a --- /dev/null +++ b/PowerEditor/src/uchardet/LangModels/LangTurkishModel.cpp @@ -0,0 +1,173 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Turkish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-04 02:24:44.730727 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Iso_8859_3_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 15, 21, 7, 1, 26, 22, 19, 6, 28, 9, 5, 11, 3, 14, /* 4X */ + 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 15, 21, 7, 1, 26, 22, 19, 2, 28, 9, 5, 11, 3, 14, /* 6X */ + 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 48,SYM,SYM,SYM,ILL, 49,SYM,SYM, 2, 17, 25, 50,SYM,ILL, 51, /* AX */ + SYM, 52,SYM,SYM,SYM,SYM, 53,SYM,SYM, 6, 17, 25, 54,SYM,ILL, 55, /* BX */ + 41, 36, 30,ILL, 39, 56, 57, 24, 42, 33, 58, 45, 59, 37, 31, 60, /* CX */ + ILL, 47, 61, 38, 62, 63, 27,SYM, 64, 65, 40, 35, 16, 66, 67, 68, /* DX */ + 41, 36, 30,ILL, 39, 69, 70, 24, 42, 33, 71, 45, 72, 37, 31, 73, /* EX */ + ILL, 47, 74, 38, 75, 76, 27,SYM, 77, 78, 40, 35, 16, 79, 80,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 15, 21, 7, 1, 26, 22, 19, 6, 28, 9, 5, 11, 3, 14, /* 4X */ + 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 15, 21, 7, 1, 26, 22, 19, 2, 28, 9, 5, 11, 3, 14, /* 6X */ + 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 81,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 41, 36, 30, 44, 39, 82, 46, 24, 42, 33, 83, 45, 84, 37, 31, 85, /* CX */ + 25, 47, 86, 38, 87, 88, 27,SYM, 43, 89, 40, 35, 16, 2, 17, 90, /* DX */ + 41, 36, 30, 44, 39, 91, 46, 24, 42, 33, 92, 45, 93, 37, 31, 94, /* EX */ + 25, 47, 95, 38, 96, 97, 27,SYM, 43, 98, 40, 35, 16, 6, 17, 99, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 935 + * First 512 sequences: 0.991865243864388 + * Next 512 sequences (512-1024): 0.008134756135611957 + * Rest: 2.949029909160572e-17 + * Negative sequences: TODO + */ +static const PRUint8 TurkishLangModel[] = +{ + 3,2,3,3,3,3,2,3,3,3,3,3,3,3,2,3,0,3,3,3,3,3,3,3,3,3,3,0,3,3,0,2,2,2,2,0, + 3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,3,2,0,3,0,2,0, + 3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,2,2,2,0,2,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,0,3,2,2,2,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,2,2,2,2,2,2,2, + 3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,2,3,0,3,2,2,2,2,3,0,2,2,2, + 3,2,0,3,3,3,3,3,3,3,3,3,2,3,2,3,0,3,3,2,3,3,2,3,2,3,2,0,0,0,0,0,2,0,0,0, + 3,3,3,2,3,3,3,3,2,2,2,2,3,3,3,2,3,0,2,2,2,2,2,2,0,0,0,3,2,3,2,2,0,0,0,0, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,2,2,2,3,0,2,3,2,2,3,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,2,2,2,2,3,0,2,3,2,2,3,0,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,2,3,3,0,2,3,0,2,2,0,0,2,2,2, + 3,3,3,2,3,3,3,3,2,2,3,3,3,3,3,3,3,2,3,3,0,3,2,3,2,0,2,2,0,2,3,2,2,2,2,2, + 3,3,3,3,3,3,0,3,3,3,3,3,2,3,2,3,0,3,3,3,3,3,3,3,3,3,2,0,2,2,0,0,2,2,0,0, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,2,2,2,3,2,2,0,2,3,0,2,2,0,0,2,0,2, + 
3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,3,0,2,2,2,0,0, + 3,3,3,3,3,3,3,3,0,2,2,3,3,3,3,3,3,0,2,2,2,2,0,2,0,0,0,3,2,2,2,0,0,2,0,0, + 2,2,2,3,3,3,0,3,3,3,3,3,0,3,2,3,0,3,3,3,3,3,2,3,3,3,3,0,2,0,0,0,0,0,0,0, + 3,3,3,0,2,3,3,2,3,3,2,3,3,2,2,3,3,2,0,2,2,2,2,2,3,0,2,2,0,0,2,2,0,0,0,0, + 3,3,3,2,2,3,3,3,2,2,0,3,3,3,3,2,3,0,2,2,0,3,3,0,0,0,0,2,0,0,2,2,0,0,0,0, + 3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,2,2,2,2,2,0,2,3,0,2,0,0,2,3,2,0,2,0,2, + 3,3,3,2,3,3,2,2,0,2,3,2,3,3,3,2,2,2,2,2,3,2,2,0,0,0,2,0,0,0,2,2,0,0,0,0, + 3,3,3,2,3,3,3,2,3,3,2,2,3,2,3,2,3,0,2,3,0,2,0,0,0,0,0,2,0,0,2,0,0,2,2,2, + 3,3,3,2,3,3,3,2,2,2,2,0,3,2,3,0,3,0,2,3,2,0,2,2,0,0,2,3,2,2,2,0,0,2,0,0, + 3,3,3,0,3,3,3,2,3,2,3,3,3,2,3,2,2,0,2,3,0,2,2,3,2,0,2,0,0,2,2,0,2,2,0,0, + 3,3,3,0,2,3,3,2,3,2,0,3,3,2,3,2,3,2,0,0,0,0,2,2,0,0,0,3,0,0,0,0,0,0,0,0, + 3,3,3,0,3,3,3,3,0,0,0,3,3,0,0,2,3,2,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,3,3,2,3,2,2,0,3,3,3,2,2,0,0,2,0,2,2,0,2,0,2,2,2,0,2,2,0,0,0,0, + 0,0,0,3,3,3,0,3,3,3,3,3,0,3,0,2,0,2,3,2,2,0,0,2,3,3,2,0,2,0,0,0,0,0,0,0, + 3,3,3,0,0,2,2,2,0,2,0,0,3,0,3,0,2,0,0,0,0,2,2,2,0,0,0,2,0,0,2,0,0,0,0,0, + 3,3,3,2,2,2,0,0,0,2,2,2,2,2,3,2,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,2,0,0, + 0,0,2,3,3,3,0,3,2,2,2,2,0,2,0,2,0,2,2,3,2,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0, + 0,0,0,2,0,2,0,2,2,0,0,2,0,2,0,0,0,2,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, + 3,2,2,0,0,0,2,0,2,0,0,0,0,2,2,0,0,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,0,0,0,0, + 2,0,2,2,2,2,0,2,2,0,2,2,2,0,0,2,0,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0, + 2,0,2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,2,2,0,2,0,0,2,2,0,0,0,2,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_3TurkishModel = +{ + Iso_8859_3_CharToOrderMap, + TurkishLangModel, + 36, + (float)0.991865243864388, + PR_FALSE, + "ISO-8859-3" +}; + +const SequenceModel Iso_8859_9TurkishModel = +{ + Iso_8859_9_CharToOrderMap, + TurkishLangModel, + 36, + (float)0.991865243864388, + PR_FALSE, + "ISO-8859-9" +}; \ No newline at 
end of file diff --git a/PowerEditor/src/uchardet/LangModels/LangVietnameseModel.cpp b/PowerEditor/src/uchardet/LangModels/LangVietnameseModel.cpp new file mode 100644 index 00000000..dff4a795 --- /dev/null +++ b/PowerEditor/src/uchardet/LangModels/LangVietnameseModel.cpp @@ -0,0 +1,247 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. 
If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Vietnamese *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-02-13 03:42:06.561440 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Windows_1258_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 6, 17, 3, 22, 21, 66, 5, 1, 4, 75, 24, 14, 8, 0, 9, /* 4X */ + 16, 36, 11, 19, 2, 7, 13, 69, 54, 20, 82,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 6, 17, 3, 22, 21, 66, 5, 1, 4, 75, 24, 14, 8, 0, 9, /* 6X */ + 16, 36, 11, 19, 2, 7, 13, 69, 54, 20, 82,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,101,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,100,ILL,ILL,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,100,ILL,ILL,102, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,103,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 12, 15, 25, 51, 97,104, 98, 91, 90, 62, 27,105,SYM, 47,106,107, /* CX */ + 10,108,SYM, 33, 29, 46, 93,SYM, 94, 58, 67,109, 96, 18,SYM, 99, /* DX */ + 12, 15, 25, 51, 97,110, 98, 91, 90, 62, 27,111,SYM, 47,112,113, /* EX */ + 10,114,SYM, 33, 29, 46, 93,SYM, 94, 58, 67,115, 96, 18,116,117, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Viscii_CharToOrderMap[] = +{ + CTR,CTR, 88,CTR,CTR, 95, 77,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR, 80,CTR,CTR,CTR,CTR, 79,CTR,CTR,CTR,CTR, 92,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 6, 17, 3, 22, 21, 66, 5, 1, 4, 75, 24, 14, 8, 0, 9, /* 4X */ + 16, 36, 11, 19, 2, 7, 13, 69, 54, 20, 82,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 6, 17, 3, 22, 21, 66, 5, 1, 4, 75, 24, 14, 8, 0, 9, /* 6X */ + 16, 36, 11, 19, 2, 7, 13, 69, 54, 20, 82,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 30, 57, 71, 65, 41, 43, 78, 49, 83, 
89, 23, 45, 39, 74, 28, 32, /* 8X */ + 53, 60, 84, 31, 37, 40, 38, 59, 42, 81, 44, 73, 35, 72, 48, 76, /* 9X */ + 86, 57, 71, 65, 41, 43, 78, 49, 83, 89, 23, 45, 39, 74, 28, 32, /* AX */ + 53, 60, 84, 87, 46, 31, 38, 59, 42, 56, 52, 55, 70, 46, 40, 18, /* BX */ + 12, 15, 25, 61, 34, 51, 88, 95, 90, 62, 27, 85, 50, 47, 64, 76, /* CX */ + 10, 52, 63, 33, 29, 30, 80, 55, 70, 58, 67, 79, 92, 68, 87, 18, /* DX */ + 12, 15, 25, 61, 34, 51, 26, 77, 90, 62, 27, 85, 50, 47, 64, 73, /* EX */ + 10, 56, 63, 33, 29, 86, 81, 44, 48, 58, 67, 72, 35, 68, 37, 26, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1494 + * First 512 sequences: 0.9321889118082535 + * Next 512 sequences (512-1024): 0.06092051479986333 + * Rest: 0.0068905733918831966 + * Negative sequences: TODO + */ +static const PRUint8 VietnameseLangModel[] = +{ + 3,3,3,3,3,3,3,2,2,3,0,2,3,1,1,1,1,2,3,3,2,3,3,3,2,1,2, + 3,0,3,2,2,2,3,1,0,1,1,2,0,0,1,0,1,0,2,2,1,0,0,0,3,0,0,2, + 2,1,2,0,3,0,3,3,2,3,0,2,3,0,2,3,0,0,3,1,3,3,1,3,1,3,3, + 3,3,3,3,3,3,3,3,3,0,3,3,3,2,3,3,3,3,2,3,3,3,3,3,2,3,2,0, + 2,3,2,2,3,1,3,3,1,3,1,3,3,2,2,3,2,0,3,2,2,3,1,3,0,3,0, + 3,1,3,3,3,3,2,3,2,0,0,2,1,2,2,2,2,0,0,1,3,2,3,2,2,2,2,0, + 2,3,2,2,3,0,3,3,2,3,0,2,2,1,2,3,1,1,2,2,2,3,1,0,2,2,0, + 0,0,3,2,3,2,3,3,3,1,1,2,0,0,2,0,3,0,0,2,0,2,2,0,2,3,1,1, + 3,1,3,3,3,3,3,2,3,3,1,3,2,2,3,3,2,2,0,3,1,3,3,3,2,0,3, + 3,3,1,0,0,3,1,3,0,2,0,2,3,3,2,0,0,2,3,0,0,0,1,0,1,0,0,2, + 2,3,2,2,3,1,3,3,1,3,0,3,3,0,2,2,0,1,3,2,2,3,1,1,1,2,3, + 0,0,3,3,1,2,2,0,1,0,2,2,0,0,1,1,3,3,0,0,0,1,1,2,1,0,3,0, + 3,2,3,3,3,2,2,3,3,3,0,3,0,2,3,0,2,3,0,3,3,2,3,0,2,0,0, + 0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2, + 3,1,3,2,3,2,3,1,3,2,0,3,1,2,3,2,2,2,0,3,3,3,2,2,2,3,0, + 2,1,3,1,3,3,0,2,0,0,0,1,0,1,3,0,3,0,0,2,2,0,3,0,2,0,3,1, + 2,1,0,2,3,0,3,3,2,3,0,0,3,0,2,3,2,2,3,2,2,3,2,0,0,1,0, + 0,2,3,3,3,2,2,1,0,0,0,2,0,3,3,0,1,2,2,0,0,3,2,2,1,2,1,1, + 3,2,3,2,3,2,3,3,3,2,0,3,3,2,3,3,2,3,0,3,2,2,3,0,2,0,0, + 
0,0,0,3,0,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2, + 0,0,0,0,3,0,3,2,0,3,0,1,3,0,0,3,0,1,3,0,0,1,0,3,0,3,0, + 2,3,3,3,3,3,3,3,2,0,1,3,3,1,3,3,3,3,3,2,2,0,1,2,2,3,3,0, + 3,2,3,2,3,2,3,3,2,3,0,3,2,2,3,2,1,2,3,3,3,3,3,0,2,1,2, + 3,1,2,2,3,2,0,2,0,0,2,2,1,0,3,3,2,3,0,1,2,2,2,3,3,1,2,0, + 3,0,0,0,3,0,0,2,3,3,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,0,3,0,3,3,0,2,0,1,3,0,1,1,0,0,2,1,1,3,1,1,0,2,1, + 2,1,2,1,0,1,0,0,0,0,2,1,0,3,2,3,3,1,3,0,3,2,3,3,3,0,0,0, + 0,2,2,1,3,2,3,3,2,3,0,0,3,2,3,2,2,2,3,2,2,3,2,1,1,2,1, + 3,2,2,3,3,2,1,0,0,0,3,2,0,3,2,3,2,1,0,1,2,2,3,0,2,0,0,1, + 3,0,3,3,3,1,0,2,3,3,0,1,0,0,1,0,3,0,0,1,3,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,3,2,0,3,0,3,2,1,3,0,3,0,0,2,0,2,1,0,2,2,3,1,0,0,0,0, + 2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, + 2,1,0,2,3,1,3,3,0,3,0,3,3,0,3,3,0,3,1,2,2,3,1,1,1,0,0, + 2,1,0,2,3,3,2,3,0,0,0,1,0,2,2,3,2,0,1,0,2,1,2,3,0,2,3,0, + 3,0,1,1,2,0,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,3,3,0,3,0,0,0,0,0,3,0,0,0,0,0,0,0,0, + 1,3,3,3,3,1,3,3,2,3,0,1,2,0,2,3,2,2,2,3,2,3,2,0,2,2,0, + 0,0,2,1,0,3,2,2,0,1,1,1,1,1,1,0,0,0,0,2,0,1,0,0,1,2,1,0, + 2,0,1,2,1,0,2,2,1,2,0,2,0,0,1,1,2,1,0,2,0,2,1,3,1,0,0, + 3,2,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0, + 3,2,3,2,2,2,3,2,3,3,0,3,0,2,3,1,2,2,0,3,2,3,3,0,2,0,0, + 0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 1,1,1,2,3,1,3,3,0,3,0,3,3,1,2,1,0,0,3,2,2,3,2,0,1,3,1, + 1,0,0,3,1,1,1,0,0,0,0,1,0,0,3,3,2,1,0,1,0,3,2,1,1,2,1,0, + 3,0,3,2,0,0,0,3,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,3,1,0,3,1,3,2,0,2,0,2,0,1,2,0,0,1,0,2,2,2,0,3,1,0,0, + 2,0,1,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,3,0,0,2,0,0,0,1, + 3,0,1,1,0,0,0,3,3,0,0,0,0,0,1,0,1,0,0,0,3,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,2,1,0,0,0,3,3,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0, + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,3,0,0,0,3,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,2,2,3,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0, + 0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,3,3,0,0,0,2,3,0,0,0,0,0,0,3,0,0,0,2,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,3,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,3,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,1,2,3,0,3,0,2,0,0,1,0,1,0,0,2,0,0,0,0,0,0,0,1,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,0,3,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,3,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,2,3,3,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,0,0,0,0,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,2,3,3,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,0,0,0,0,3,3,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,1,3,0,0,3,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,0,1,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0, + 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,1,3,3,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,1,1,3,0,0,2,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,2,3,0,0,2,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,1,3,1,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,1,0,0,0,2,2,0,0,0,0,0,0,0,3,0,0,0,2,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,0,0,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,1,1,1,0,0,0,3,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,2,3,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,0,2,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,2,1,2,0,3,3,0,1,0,0,0,2,0,3,1,2,2,0,1,3,0,2,0,2,0, + 2,0,2,1,1,0,1,2,0,0,0,1,0,0,1,0,0,0,0,1,2,0,0,1,1,2,0,2, +}; + + +const SequenceModel Windows_1258VietnameseModel = +{ + Windows_1258_CharToOrderMap, + VietnameseLangModel, + 55, + (float)0.9321889118082535, + PR_FALSE, + "WINDOWS-1258" +}; + +const SequenceModel VisciiVietnameseModel = +{ + Viscii_CharToOrderMap, + VietnameseLangModel, + 55, + (float)0.9321889118082535, + PR_FALSE, + "VISCII" +}; \ No newline at end of file diff --git a/PowerEditor/src/uchardet/LangThaiModel.cpp b/PowerEditor/src/uchardet/LangThaiModel.cpp deleted file mode 100644 index 11b8e75e..00000000 --- a/PowerEditor/src/uchardet/LangThaiModel.cpp +++ /dev/null @@ -1,220 +0,0 @@ -/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; 
c-basic-offset: 2 -*- */ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is Mozilla Communicator client code. - * - * The Initial Developer of the Original Code is - * Netscape Communications Corporation. - * Portions created by the Initial Developer are Copyright (C) 1998 - * the Initial Developer. All Rights Reserved. - * - * Contributor(s): - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. 
- * - * ***** END LICENSE BLOCK ***** */ - -#include "nsSBCharSetProber.h" - - -/**************************************************************** -255: Control characters that usually does not exist in any text -254: Carriage/Return -253: symbol (punctuation) that does not belong to word -252: 0 - 9 - -*****************************************************************/ - -//The following result for thai was collected from a limited sample (1M). - -//Character Mapping Table: -static const unsigned char TIS620CharToOrderMap[] = -{ -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,182,106,107,100,183,184,185,101, 94,186,187,108,109,110,111, //40 -188,189,190, 89, 95,112,113,191,192,193,194,253,253,253,253,253, //50 -253, 64, 72, 73,114, 74,115,116,102, 81,201,117, 90,103, 78, 82, //60 - 96,202, 91, 79, 84,104,105, 97, 98, 92,203,253,253,253,253,253, //70 -209,210,211,212,213, 88,214,215,216,217,218,219,220,118,221,222, -223,224, 99, 85, 83,225,226,227,228,229,230,231,232,233,234,235, -236, 5, 30,237, 24,238, 75, 8, 26, 52, 34, 51,119, 47, 58, 57, - 49, 53, 55, 43, 20, 19, 44, 14, 48, 3, 17, 25, 39, 62, 31, 54, - 45, 9, 16, 2, 61, 15,239, 12, 42, 46, 18, 21, 76, 4, 66, 63, - 22, 10, 1, 36, 23, 13, 40, 27, 32, 35, 86,240,241,242,243,244, - 11, 28, 41, 29, 33,245, 50, 37, 6, 7, 67, 77, 38, 93,246,247, - 68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253, -}; - - - - -//Model Table: -//total sequences: 100% -//first 512 sequences: 92.6386% -//first 1024 sequences:7.3177% -//rest sequences: 1.0230% -//negative sequences: 0.0436% -static const PRUint8 ThaiLangModel[] = -{ -0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3, -0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2, 
-3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3, -0,2,3,0,0,0,0,1,0,1,2,3,1,1,3,2,2,0,1,1,0,0,1,0,0,0,0,0,0,0,1,1, -3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,3,3,2,3,2,3,3,2,2,2, -3,1,2,3,0,3,3,2,2,1,2,3,3,1,2,0,1,3,0,1,0,0,1,0,0,0,0,0,0,0,1,1, -3,3,2,2,3,3,3,3,1,2,3,3,3,3,3,2,2,2,2,3,3,2,2,3,3,2,2,3,2,3,2,2, -3,3,1,2,3,1,2,2,3,3,1,0,2,1,0,0,3,1,2,1,0,0,1,0,0,0,0,0,0,1,0,1, -3,3,3,3,3,3,2,2,3,3,3,3,2,3,2,2,3,3,2,2,3,2,2,2,2,1,1,3,1,2,1,1, -3,2,1,0,2,1,0,1,0,1,1,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0, -3,3,3,2,3,2,3,3,2,2,3,2,3,3,2,3,1,1,2,3,2,2,2,3,2,2,2,2,2,1,2,1, -2,2,1,1,3,3,2,1,0,1,2,2,0,1,3,0,0,0,1,1,0,0,0,0,0,2,3,0,0,2,1,1, -3,3,2,3,3,2,0,0,3,3,0,3,3,0,2,2,3,1,2,2,1,1,1,0,2,2,2,0,2,2,1,1, -0,2,1,0,2,0,0,2,0,1,0,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0, -3,3,2,3,3,2,0,0,3,3,0,2,3,0,2,1,2,2,2,2,1,2,0,0,2,2,2,0,2,2,1,1, -0,2,1,0,2,0,0,2,0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0, -3,3,2,3,2,3,2,0,2,2,1,3,2,1,3,2,1,2,3,2,2,3,0,2,3,2,2,1,2,2,2,2, -1,2,2,0,0,0,0,2,0,1,2,0,1,1,1,0,1,0,3,1,1,0,0,0,0,0,0,0,0,0,1,0, -3,3,2,3,3,2,3,2,2,2,3,2,2,3,2,2,1,2,3,2,2,3,1,3,2,2,2,3,2,2,2,3, -3,2,1,3,0,1,1,1,0,2,1,1,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,2,0,0, -1,0,0,3,0,3,3,3,3,3,0,0,3,0,2,2,3,3,3,3,3,0,0,0,1,1,3,0,0,0,0,2, -0,0,1,0,0,0,0,0,0,0,2,3,0,0,0,3,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0, -2,0,3,3,3,3,0,0,2,3,0,0,3,0,3,3,2,3,3,3,3,3,0,0,3,3,3,0,0,0,3,3, -0,0,3,0,0,0,0,2,0,0,2,1,1,3,0,0,1,0,0,2,3,0,1,0,0,0,0,0,0,0,1,0, -3,3,3,3,2,3,3,3,3,3,3,3,1,2,1,3,3,2,2,1,2,2,2,3,1,1,2,0,2,1,2,1, -2,2,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0, -3,0,2,1,2,3,3,3,0,2,0,2,2,0,2,1,3,2,2,1,2,1,0,0,2,2,1,0,2,1,2,2, -0,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,2,1,3,3,1,1,3,0,2,3,1,1,3,2,1,1,2,0,2,2,3,2,1,1,1,1,1,2, -3,0,0,1,3,1,2,1,2,0,3,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0, -3,3,1,1,3,2,3,3,3,1,3,2,1,3,2,1,3,2,2,2,2,1,3,3,1,2,1,3,1,2,3,0, -2,1,1,3,2,2,2,1,2,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2, 
-3,3,2,3,2,3,3,2,3,2,3,2,3,3,2,1,0,3,2,2,2,1,2,2,2,1,2,2,1,2,1,1, -2,2,2,3,0,1,3,1,1,1,1,0,1,1,0,2,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,2,3,2,2,1,1,3,2,3,2,3,2,0,3,2,2,1,2,0,2,2,2,1,2,2,2,2,1, -3,2,1,2,2,1,0,2,0,1,0,0,1,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,2,3,1,2,3,3,2,2,3,0,1,1,2,0,3,3,2,2,3,0,1,1,3,0,0,0,0, -3,1,0,3,3,0,2,0,2,1,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,2,3,2,3,3,0,1,3,1,1,2,1,2,1,1,3,1,1,0,2,3,1,1,1,1,1,1,1,1, -3,1,1,2,2,2,2,1,1,1,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -3,2,2,1,1,2,1,3,3,2,3,2,2,3,2,2,3,1,2,2,1,2,0,3,2,1,2,2,2,2,2,1, -3,2,1,2,2,2,1,1,1,1,0,0,1,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,3,1,3,3,0,2,1,0,3,2,0,0,3,1,0,1,1,0,1,0,0,0,0,0,1, -1,0,0,1,0,3,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,0,2,2,2,3,0,0,1,3,0,3,2,0,3,2,2,3,3,3,3,3,1,0,2,2,2,0,2,2,1,2, -0,2,3,0,0,0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -3,0,2,3,1,3,3,2,3,3,0,3,3,0,3,2,2,3,2,3,3,3,0,0,2,2,3,0,1,1,1,3, -0,0,3,0,0,0,2,2,0,1,3,0,1,2,2,2,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1, -3,2,3,3,2,0,3,3,2,2,3,1,3,2,1,3,2,0,1,2,2,0,2,3,2,1,0,3,0,0,0,0, -3,0,0,2,3,1,3,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,1,3,2,2,2,1,2,0,1,3,1,1,3,1,3,0,0,2,1,1,1,1,2,1,1,1,0,2,1,0,1, -1,2,0,0,0,3,1,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,3,1,0,0,0,1,0, -3,3,3,3,2,2,2,2,2,1,3,1,1,1,2,0,1,1,2,1,2,1,3,2,0,0,3,1,1,1,1,1, -3,1,0,2,3,0,0,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,2,3,0,3,3,0,2,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0, -0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,2,3,1,3,0,0,1,2,0,0,2,0,3,3,2,3,3,3,2,3,0,0,2,2,2,0,0,0,2,2, -0,0,1,0,0,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, -0,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,1,2,3,1,3,3,0,0,1,0,3,0,0,0,0,0, -0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,1,2,3,1,2,3,1,0,3,0,2,2,1,0,2,1,1,2,0,1,0,0,1,1,1,1,0,1,0,0, -1,0,0,0,0,1,1,0,3,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
-3,3,3,3,2,1,0,1,1,1,3,1,2,2,2,2,2,2,1,1,1,1,0,3,1,0,1,3,1,1,1,1, -1,1,0,2,0,1,3,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1, -3,0,2,2,1,3,3,2,3,3,0,1,1,0,2,2,1,2,1,3,3,1,0,0,3,2,0,0,0,0,2,1, -0,1,0,0,0,0,1,2,0,1,1,3,1,1,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, -0,0,3,0,0,1,0,0,0,3,0,0,3,0,3,1,0,1,1,1,3,2,0,0,0,3,0,0,0,0,2,0, -0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0, -3,3,1,3,2,1,3,3,1,2,2,0,1,2,1,0,1,2,0,0,0,0,0,3,0,0,0,3,0,0,0,0, -3,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,0,1,2,0,3,3,3,2,2,0,1,1,0,1,3,0,0,0,2,2,0,0,0,0,3,1,0,1,0,0,0, -0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,0,2,3,1,2,0,0,2,1,0,3,1,0,1,2,0,1,1,1,1,3,0,0,3,1,1,0,2,2,1,1, -0,2,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,0,0,3,1,2,0,0,2,2,0,1,2,0,1,0,1,3,1,2,1,0,0,0,2,0,3,0,0,0,1,0, -0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,0,1,1,2,2,0,0,0,2,0,2,1,0,1,1,0,1,1,1,2,1,0,0,1,1,1,0,2,1,1,1, -0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1, -0,0,0,2,0,1,3,1,1,1,1,0,0,0,0,3,2,0,1,0,0,0,1,2,0,0,0,1,0,0,0,0, -0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,0,2,3,2,2,0,0,0,1,0,0,0,0,2,3,2,1,2,2,3,0,0,0,2,3,1,0,0,0,1,1, -0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0, -3,3,2,2,0,1,0,0,0,0,2,0,2,0,1,0,0,0,1,1,0,0,0,2,1,0,1,0,1,1,0,0, -0,1,0,2,0,0,1,0,3,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,1,0,0,1,0,0,0,0,0,1,1,2,0,0,0,0,1,0,0,1,3,1,0,0,0,0,1,1,0,0, -0,1,0,0,0,0,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0, -3,3,1,1,1,1,2,3,0,0,2,1,1,1,1,1,0,2,1,1,0,0,0,2,1,0,1,2,1,1,0,1, -2,1,0,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,3,1,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1, -0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
-3,3,2,0,0,0,0,0,0,1,2,1,0,1,1,0,2,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,2,0,0,0,1,3,0,1,0,0,0,2,0,0,0,0,0,0,0,1,2,0,0,0,0,0, -3,3,0,0,1,1,2,0,0,1,2,1,0,1,1,1,0,1,1,0,0,2,1,1,0,1,0,0,1,1,1,0, -0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,2,2,1,0,0,0,0,1,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0, -2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,3,0,0,1,1,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,1,0,1,2,0,1,2,0,0,1,1,0,2,0,1,0,0,1,0,0,0,0,1,0,0,0,2,0,0,0,0, -1,0,0,1,0,1,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,2,1,3,0,0,0,0,1,1,0,0,0,0,0,0,0,3, -1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,0,1,0,1,0,0,2,0,0,2,0,0,1,1,2,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0, -1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, -1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0, -2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,3,0,0,0, -2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0, 
-1,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,1,1,0,0,2,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -}; - - -const SequenceModel TIS620ThaiModel( - TIS620CharToOrderMap, - ThaiLangModel, - (float)0.926386, - PR_FALSE, - "TIS-620" -); diff --git a/PowerEditor/src/uchardet/README.TXT b/PowerEditor/src/uchardet/README.TXT deleted file mode 100644 index ad75bd56..00000000 --- a/PowerEditor/src/uchardet/README.TXT +++ /dev/null @@ -1,10 +0,0 @@ -Uchardet is a C language binding of the original C++ implementation of the universal charset detection library by Mozilla. -The source code of universalchardet is available at https://github.com/BYVoid/uchardet - -uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text. - -The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/ - -Techniques used by universalchardet are described at http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html - -Uchardet is licensed under Mozilla Public License Version 1.1 (http://www.mozilla.org/MPL/1.1/) diff --git a/PowerEditor/src/uchardet/README.md b/PowerEditor/src/uchardet/README.md new file mode 100644 index 00000000..b6a2feae --- /dev/null +++ b/PowerEditor/src/uchardet/README.md @@ -0,0 +1,294 @@ +# uchardet + +[uchardet](https://www.freedesktop.org/wiki/Software/uchardet/) is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text. 
Returned encoding names are [iconv](https://www.gnu.org/software/libiconv/)-compatible. + +uchardet started as a C language binding of the original C++ implementation of the universal charset detection library by Mozilla. It can now detect more charsets, and more reliably than the original implementation. + +The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/ + +Techniques used by universalchardet are described at http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html + +## Supported Languages/Encodings + + * International (Unicode) + * UTF-8 + * UTF-16BE / UTF-16LE + * UTF-32BE / UTF-32LE / X-ISO-10646-UCS-4-34121 / X-ISO-10646-UCS-4-21431 + * Arabic + * ISO-8859-6 + * WINDOWS-1256 + * Bulgarian + * ISO-8859-5 + * WINDOWS-1251 + * Chinese + * ISO-2022-CN + * BIG5 + * EUC-TW + * GB18030 + * HZ-GB-2312 + * Croatian: + * ISO-8859-2 + * ISO-8859-13 + * ISO-8859-16 + * Windows-1250 + * IBM852 + * MAC-CENTRALEUROPE + * Czech + * Windows-1250 + * ISO-8859-2 + * IBM852 + * MAC-CENTRALEUROPE + * Danish + * ISO-8859-1 + * ISO-8859-15 + * WINDOWS-1252 + * English + * ASCII + * Esperanto + * ISO-8859-3 + * Estonian + * ISO-8859-4 + * ISO-8859-13 + * ISO-8859-13 + * Windows-1252 + * Windows-1257 + * Finnish + * ISO-8859-1 + * ISO-8859-4 + * ISO-8859-9 + * ISO-8859-13 + * ISO-8859-15 + * WINDOWS-1252 + * French + * ISO-8859-1 + * ISO-8859-15 + * WINDOWS-1252 + * German + * ISO-8859-1 + * WINDOWS-1252 + * Greek + * ISO-8859-7 + * WINDOWS-1253 + * Hebrew + * ISO-8859-8 + * WINDOWS-1255 + * Hungarian: + * ISO-8859-2 + * WINDOWS-1250 + * Irish Gaelic + * ISO-8859-1 + * ISO-8859-9 + * ISO-8859-15 + * WINDOWS-1252 + * Italian + * ISO-8859-1 + * ISO-8859-3 + * ISO-8859-9 + * ISO-8859-15 + * WINDOWS-1252 + * Japanese + * ISO-2022-JP + * SHIFT_JIS + * EUC-JP + * Korean + * ISO-2022-KR + * EUC-KR / UHC + * Lithuanian + * ISO-8859-4 + * ISO-8859-10 + * ISO-8859-13 + * Latvian + * ISO-8859-4 + * ISO-8859-10 
+ * ISO-8859-13 + * Maltese + * ISO-8859-3 + * Polish: + * ISO-8859-2 + * ISO-8859-13 + * ISO-8859-16 + * Windows-1250 + * IBM852 + * MAC-CENTRALEUROPE + * Portuguese + * ISO-8859-1 + * ISO-8859-9 + * ISO-8859-15 + * WINDOWS-1252 + * Romanian: + * ISO-8859-2 + * ISO-8859-16 + * Windows-1250 + * IBM852 + * Russian + * ISO-8859-5 + * KOI8-R + * WINDOWS-1251 + * MAC-CYRILLIC + * IBM866 + * IBM855 + * Slovak + * Windows-1250 + * ISO-8859-2 + * IBM852 + * MAC-CENTRALEUROPE + * Slovene + * ISO-8859-2 + * ISO-8859-16 + * Windows-1250 + * IBM852 + * MAC-CENTRALEUROPE + * Spanish + * ISO-8859-1 + * ISO-8859-15 + * WINDOWS-1252 + * Swedish + * ISO-8859-1 + * ISO-8859-4 + * ISO-8859-9 + * ISO-8859-15 + * WINDOWS-1252 + * Thai + * TIS-620 + * ISO-8859-11 + * Turkish: + * ISO-8859-3 + * ISO-8859-9 + * Vietnamese: + * VISCII + * Windows-1258 + * Others + * WINDOWS-1252 + +## Installation + +### Debian/Ubuntu/Mint + + apt-get install uchardet libuchardet-dev + +### Mageia + + urpmi libuchardet libuchardet-devel + +### Fedora + + dnf install uchardet uchardet-devel + +### Gentoo + + emerge uchardet + +### Mac + + brew install uchardet + +### Windows + +Binary packages are provided in Fedora repository. There may exist other +pre-built packages but I am not aware of them. +Nevertheless the library is very easily and quickly compilable under +Windows as well, so finding a binary package is not necessary. +Some did it successfully with the [CMake Windows +installer](https://cmake.org/download/) and MinGW. It should be possible +to use MinGW-w64 instead of MinGW (in particular to build both 32 and +64-bit DLL libraries). + +Note also that it is very easily cross-buildable (for instance from a +GNU/Linux machine).
+ +### Build from source + +Releases are available from: +https://www.freedesktop.org/software/uchardet/releases/ + +If you prefer a development version, clone the git repository: + + git clone git://anongit.freedesktop.org/uchardet/uchardet + +The source can be browsed at: https://cgit.freedesktop.org/uchardet/uchardet/ + + cmake . + make + make install + +### Build with flatpak-builder + +Here is a working "module" section to include in your Flatpak's json manifest: + +``` +"modules": [ + { + "name": "uchardet", + "buildsystem": "cmake", + "builddir": true, + "config-opts": [ "-DCMAKE_INSTALL_LIBDIR=lib" ], + "sources": [ + { + ... + } + ] + } +] +``` + +## Usage + +### Command Line + +``` +uchardet Command Line Tool +Version 0.0.6 + +Authors: BYVoid, Jehan +Bug Report: https://bugs.freedesktop.org/enter_bug.cgi?product=uchardet + +Usage: + uchardet [Options] [File]... + +Options: + -v, --version Print version and build information. + -h, --help Print this help. +``` + +### Library + +See [uchardet.h](https://cgit.freedesktop.org/uchardet/uchardet/tree/src/uchardet.h) + +## Related Projects + + * [python-chardet](https://github.com/chardet/chardet) Python port + * [ruby-rchardet](http://rubyforge.org/projects/chardet/) Ruby port + * [juniversalchardet](http://code.google.com/p/juniversalchardet/) Java port of universalchardet + * [jchardet](http://jchardet.sourceforge.net/) Java port of chardet + * [nuniversalchardet](http://code.google.com/p/nuniversalchardet/) C# port of universalchardet + * [nchardet](http://www.conceptdevelopment.net/Localization/NCharDet/) C# port of chardet + * [uchardet-enhanced](https://bitbucket.org/medoc/uchardet-enhanced) A fork of mozilla universalchardet + * [rust-uchardet](https://github.com/emk/rust-uchardet) Rust language binding of uchardet + * [libchardet](https://ftp.oops.org/pub/oops/libchardet/) Another C/C++ API wrapping Mozilla code. 
+ +## Used by + +* [mpv](https://mpv.io/) for subtitle detection +* [Tepl](https://wiki.gnome.org/Projects/Tepl) +* [Nextcloud IOS app](https://github.com/nextcloud/ios) +* … + +## Licenses + +* [Mozilla Public License Version 1.1](http://www.mozilla.org/MPL/1.1/) +* [GNU General Public License, version 2.0](http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) or later. +* [GNU Lesser General Public License, version 2.1](http://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html) or later. + +See the file `COPYING` for the complete text of these 3 licenses. + +## Code of Conduct + +The `uchardet` project is hosted by [freedesktop.org](https://www.freedesktop.org/) +and as such follows its code of conduct. In other words, it means we +will treat anyone with respect and expect anyone to do the same. + +Please read [freedesktop.org Code of Conduct](https://www.freedesktop.org/wiki/CodeOfConduct). + +In case of any problem regarding abusive behavior in uchardet project, +please contact the maintainer (Jehan) or create a bug report (possibly +private if needed). diff --git a/PowerEditor/src/uchardet/nsBig5Prober.h b/PowerEditor/src/uchardet/nsBig5Prober.h index 5ae35764..7d13be8c 100644 --- a/PowerEditor/src/uchardet/nsBig5Prober.h +++ b/PowerEditor/src/uchardet/nsBig5Prober.h @@ -50,7 +50,7 @@ public: Reset();} virtual ~nsBig5Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "Big5";} + const char* GetCharSetName() {return "BIG5";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/PowerEditor/src/uchardet/nsCharSetProber.cpp b/PowerEditor/src/uchardet/nsCharSetProber.cpp index 5e45d2d8..6d31ef8b 100644 --- a/PowerEditor/src/uchardet/nsCharSetProber.cpp +++ b/PowerEditor/src/uchardet/nsCharSetProber.cpp @@ -35,7 +35,7 @@ * the terms of any one of the MPL, the GPL or the LGPL. 
* * ***** END LICENSE BLOCK ***** */ - + #include "nsCharSetProber.h" #include "prmem.h" @@ -74,9 +74,7 @@ PRBool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 a if (meetMSB && curPtr > prevPtr) while (prevPtr < curPtr) *newptr++ = *prevPtr++; - auto np = reinterpret_cast(newptr); - auto nb = reinterpret_cast(*newBuf); - newLen = static_cast(np - nb); + newLen = static_cast(newptr - *newBuf); return PR_TRUE; } @@ -121,9 +119,7 @@ PRBool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen while (prevPtr < curPtr) *newptr++ = *prevPtr++; - auto np = reinterpret_cast(newptr); - auto nb = reinterpret_cast(*newBuf); - newLen = static_cast(np - nb); + newLen = static_cast(newptr - *newBuf); return PR_TRUE; } diff --git a/PowerEditor/src/uchardet/nsCodingStateMachine.h b/PowerEditor/src/uchardet/nsCodingStateMachine.h index 07eadee9..819f9ab0 100644 --- a/PowerEditor/src/uchardet/nsCodingStateMachine.h +++ b/PowerEditor/src/uchardet/nsCodingStateMachine.h @@ -1,107 +1,104 @@ -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is mozilla.org code. - * - * The Initial Developer of the Original Code is - * Netscape Communications Corporation. - * Portions created by the Initial Developer are Copyright (C) 1998 - * the Initial Developer. All Rights Reserved. 
- * - * Contributor(s): - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. - * - * ***** END LICENSE BLOCK ***** */ -#ifndef nsCodingStateMachine_h__ -#define nsCodingStateMachine_h__ - -#include "nsPkgInt.h" - -typedef enum { - eStart = 0, - eError = 1, - eItsMe = 2 -} nsSMState; - -#define GETCLASS(c) GETFROMPCK(((unsigned char)(c)), mModel->classTable) - -//state machine model -struct SMModel -{ - nsPkgInt classTable; - PRUint32 classFactor; - nsPkgInt stateTable; - const PRUint32* charLenTable; - const char* name; - SMModel(){}; - SMModel(nsPkgInt a,PRUint32 b,nsPkgInt c,const PRUint32* d, const char* e): - classTable(a), classFactor(b), stateTable(c), charLenTable(d), name(e){}; -} ; - -class nsCodingStateMachine { -public: - nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; } - nsSMState NextState(char c){ - //for each byte we get its class , if it is first byte, we also get byte length - PRUint32 byteCls = GETCLASS(c); - if (mCurrentState == eStart) - { - mCurrentBytePos = 0; - mCurrentCharLen = mModel->charLenTable[byteCls]; - } - //from byte's class and stateTable, we get its next state - 
mCurrentState=(nsSMState)GETFROMPCK(mCurrentState*(mModel->classFactor)+byteCls, - mModel->stateTable); - mCurrentBytePos++; - return mCurrentState; - } - PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;} - void Reset(void) {mCurrentState = eStart;} - const char * GetCodingStateMachine() {return mModel->name;} - -protected: - nsSMState mCurrentState; - PRUint32 mCurrentCharLen; - PRUint32 mCurrentBytePos; - - const SMModel *mModel; -}; - -extern const SMModel UTF8SMModel; -extern const SMModel Big5SMModel; -extern const SMModel EUCJPSMModel; -extern const SMModel EUCKRSMModel; -extern const SMModel EUCTWSMModel; -extern const SMModel GB18030SMModel; -extern const SMModel SJISSMModel; - - -extern const SMModel HZSMModel; -extern const SMModel ISO2022CNSMModel; -extern const SMModel ISO2022JPSMModel; -extern const SMModel ISO2022KRSMModel; - -#endif /* nsCodingStateMachine_h__ */ - +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +#ifndef nsCodingStateMachine_h__ +#define nsCodingStateMachine_h__ + +#include "nsPkgInt.h" + +typedef enum { + eStart = 0, + eError = 1, + eItsMe = 2 +} nsSMState; + +#define GETCLASS(c) GETFROMPCK(((unsigned char)(c)), mModel->classTable) + +//state machine model +typedef struct +{ + nsPkgInt classTable; + PRUint32 classFactor; + nsPkgInt stateTable; + const PRUint32* charLenTable; + const char* name; +} SMModel; + +class nsCodingStateMachine { +public: + nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; } + nsSMState NextState(char c){ + //for each byte we get its class , if it is first byte, we also get byte length + PRUint32 byteCls = GETCLASS(c); + if (mCurrentState == eStart) + { + mCurrentBytePos = 0; + mCurrentCharLen = mModel->charLenTable[byteCls]; + } + //from byte's class and stateTable, we get its next state + mCurrentState=(nsSMState)GETFROMPCK(mCurrentState*(mModel->classFactor)+byteCls, + mModel->stateTable); + mCurrentBytePos++; + return mCurrentState; + } + PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;} + void 
Reset(void) {mCurrentState = eStart;} + const char * GetCodingStateMachine() {return mModel->name;} + +protected: + nsSMState mCurrentState; + PRUint32 mCurrentCharLen; + PRUint32 mCurrentBytePos; + + const SMModel *mModel; +}; + +extern const SMModel UTF8SMModel; +extern const SMModel Big5SMModel; +extern const SMModel EUCJPSMModel; +extern const SMModel EUCKRSMModel; +extern const SMModel EUCTWSMModel; +extern const SMModel GB18030SMModel; +extern const SMModel SJISSMModel; + + +extern const SMModel HZSMModel; +extern const SMModel ISO2022CNSMModel; +extern const SMModel ISO2022JPSMModel; +extern const SMModel ISO2022KRSMModel; + +#endif /* nsCodingStateMachine_h__ */ + diff --git a/PowerEditor/src/uchardet/nsEUCTWProber.h b/PowerEditor/src/uchardet/nsEUCTWProber.h index 911d50b0..ee6376e2 100644 --- a/PowerEditor/src/uchardet/nsEUCTWProber.h +++ b/PowerEditor/src/uchardet/nsEUCTWProber.h @@ -50,7 +50,7 @@ public: Reset();} virtual ~nsEUCTWProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "x-euc-tw";} + const char* GetCharSetName() {return "EUC-TW";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/PowerEditor/src/uchardet/nsEscCharsetProber.cpp b/PowerEditor/src/uchardet/nsEscCharsetProber.cpp index 128f0a22..464c7534 100644 --- a/PowerEditor/src/uchardet/nsEscCharsetProber.cpp +++ b/PowerEditor/src/uchardet/nsEscCharsetProber.cpp @@ -75,13 +75,17 @@ void nsEscCharSetProber::Reset(void) nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen) { - for (PRUint32 i = 0; i < aLen && mState == eDetecting; i++) + nsSMState codingState; + PRInt32 j; + PRUint32 i; + + for ( i = 0; i < aLen && mState == eDetecting; i++) { - for (PRInt32 j = mActiveSM-1; j>= 0; j--) + for (j = mActiveSM-1; j>= 0; j--) { if (mCodingSM[j]) { - nsSMState codingState = mCodingSM[j]->NextState(aBuf[i]); + codingState = 
mCodingSM[j]->NextState(aBuf[i]); if (codingState == eItsMe) { mState = eFoundIt; diff --git a/PowerEditor/src/uchardet/nsEscSM.cpp b/PowerEditor/src/uchardet/nsEscSM.cpp index 7b1de390..eed1b7cf 100644 --- a/PowerEditor/src/uchardet/nsEscSM.cpp +++ b/PowerEditor/src/uchardet/nsEscSM.cpp @@ -34,7 +34,6 @@ * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ - #include "nsCodingStateMachine.h" static const PRUint32 HZ_cls[ 256 / 8 ] = { @@ -84,12 +83,13 @@ PCK4BITS( 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0}; -const SMModel HZSMModel( - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls), +const SMModel HZSMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls }, 6, - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st), + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st }, HZCharLenTable, - "HZ-GB-2312"); + "HZ-GB-2312", +}; static const PRUint32 ISO2022CN_cls [ 256 / 8 ] = { @@ -141,12 +141,13 @@ PCK4BITS(eError,eError,eError,eError,eError,eItsMe,eError,eStart) //38-3f static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; -const SMModel ISO2022CNSMModel( - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls), +const SMModel ISO2022CNSMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls }, 9, - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st), + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st }, ISO2022CNCharLenTable, - "ISO-2022-CN"); + "ISO-2022-CN", +}; static const PRUint32 ISO2022JP_cls [ 256 / 8 ] = { PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 @@ -198,12 +199,13 @@ PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47 static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0}; -const SMModel 
ISO2022JPSMModel( - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls), +const SMModel ISO2022JPSMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls }, 10, - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st), + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st }, ISO2022JPCharLenTable, - "ISO-2022-JP"); + "ISO-2022-JP", +}; static const PRUint32 ISO2022KR_cls [ 256 / 8 ] = { PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 @@ -251,10 +253,11 @@ PCK4BITS(eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart) //20-27 static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0}; -const SMModel ISO2022KRSMModel( - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls), +const SMModel ISO2022KRSMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls }, 6, - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st), + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st }, ISO2022KRCharLenTable, - "ISO-2022-KR"); + "ISO-2022-KR", +}; diff --git a/PowerEditor/src/uchardet/nsGB2312Prober.h b/PowerEditor/src/uchardet/nsGB2312Prober.h index 4bdac3bb..26ebf844 100644 --- a/PowerEditor/src/uchardet/nsGB2312Prober.h +++ b/PowerEditor/src/uchardet/nsGB2312Prober.h @@ -42,7 +42,7 @@ #include "nsCodingStateMachine.h" #include "CharDistribution.h" -// We use gb18030 to replace gb2312, because 18030 is a superset. +// We use GB18030 to replace GB2312, because 18030 is a superset. 
class nsGB18030Prober: public nsCharSetProber { public: @@ -52,7 +52,7 @@ public: Reset();} virtual ~nsGB18030Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "gb18030";} + const char* GetCharSetName() {return "GB18030";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/PowerEditor/src/uchardet/nsHebrewProber.cpp b/PowerEditor/src/uchardet/nsHebrewProber.cpp index b148ce3f..c503617c 100644 --- a/PowerEditor/src/uchardet/nsHebrewProber.cpp +++ b/PowerEditor/src/uchardet/nsHebrewProber.cpp @@ -59,7 +59,7 @@ #define MIN_MODEL_DISTANCE (0.01) #define VISUAL_HEBREW_NAME ("ISO-8859-8") -#define LOGICAL_HEBREW_NAME ("windows-1255") +#define LOGICAL_HEBREW_NAME ("WINDOWS-1255") PRBool nsHebrewProber::isFinal(char c) { diff --git a/PowerEditor/src/uchardet/nsLatin1Prober.h b/PowerEditor/src/uchardet/nsLatin1Prober.h index 5145e965..59118a7a 100644 --- a/PowerEditor/src/uchardet/nsLatin1Prober.h +++ b/PowerEditor/src/uchardet/nsLatin1Prober.h @@ -48,7 +48,7 @@ public: nsLatin1Prober(void){Reset();} virtual ~nsLatin1Prober(void){} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "windows-1252";} + const char* GetCharSetName() {return "WINDOWS-1252";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/PowerEditor/src/uchardet/nsMBCSGroupProber.cpp b/PowerEditor/src/uchardet/nsMBCSGroupProber.cpp index 4fafb134..057ddb11 100644 --- a/PowerEditor/src/uchardet/nsMBCSGroupProber.cpp +++ b/PowerEditor/src/uchardet/nsMBCSGroupProber.cpp @@ -36,7 +36,6 @@ * the terms of any one of the MPL, the GPL or the LGPL. 
* * ***** END LICENSE BLOCK ***** */ - #include #include "nsMBCSGroupProber.h" @@ -45,13 +44,13 @@ #if defined(DEBUG_chardet) || defined(DEBUG_jgmyers) const char *ProberName[] = { - "UTF8", + "UTF-8", "SJIS", - "EUCJP", + "EUC-JP", "GB18030", - "EUCKR", + "EUC-KR", "Big5", - "EUCTW", + "EUC-TW", }; #endif diff --git a/PowerEditor/src/uchardet/nsMBCSSM.cpp b/PowerEditor/src/uchardet/nsMBCSSM.cpp index bedf2b76..50700968 100644 --- a/PowerEditor/src/uchardet/nsMBCSSM.cpp +++ b/PowerEditor/src/uchardet/nsMBCSSM.cpp @@ -1,507 +1,513 @@ -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is mozilla.org code. - * - * The Initial Developer of the Original Code is - * Netscape Communications Corporation. - * Portions created by the Initial Developer are Copyright (C) 1998 - * the Initial Developer. All Rights Reserved. - * - * Contributor(s): - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. 
If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. - * - * ***** END LICENSE BLOCK ***** */ - -#include "nsCodingStateMachine.h" - -/* -Modification from frank tang's original work: -. 0x00 is allowed as a legal character. Since some web pages contains this char in - text stream. -*/ - -// BIG5 - -static const PRUint32 BIG5_cls [ 256 / 8 ] = { -//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 -PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value -PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f -PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 -PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f -PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 -PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f -PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 -PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f -PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 -PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f -PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 -PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f -PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 -PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f -PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 -PCK4BITS(2,2,2,2,2,2,2,1), // 78 - 7f -PCK4BITS(4,4,4,4,4,4,4,4), // 80 - 87 -PCK4BITS(4,4,4,4,4,4,4,4), // 88 - 8f -PCK4BITS(4,4,4,4,4,4,4,4), // 90 - 97 -PCK4BITS(4,4,4,4,4,4,4,4), // 98 - 9f -PCK4BITS(4,3,3,3,3,3,3,3), // a0 - a7 -PCK4BITS(3,3,3,3,3,3,3,3), // a8 - af -PCK4BITS(3,3,3,3,3,3,3,3), // b0 - b7 -PCK4BITS(3,3,3,3,3,3,3,3), // b8 - bf -PCK4BITS(3,3,3,3,3,3,3,3), // c0 - c7 -PCK4BITS(3,3,3,3,3,3,3,3), // c8 - cf -PCK4BITS(3,3,3,3,3,3,3,3), // d0 - d7 -PCK4BITS(3,3,3,3,3,3,3,3), // d8 - df -PCK4BITS(3,3,3,3,3,3,3,3), // e0 - 
e7 -PCK4BITS(3,3,3,3,3,3,3,3), // e8 - ef -PCK4BITS(3,3,3,3,3,3,3,3), // f0 - f7 -PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff -}; - - -static const PRUint32 BIG5_st [ 3] = { -PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 -PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f -PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17 -}; - -static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0}; - -const SMModel Big5SMModel( -nsPkgInt( eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls ), - 5, -nsPkgInt( eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st ), - Big5CharLenTable, -"Big5"); - -static const PRUint32 EUCJP_cls [ 256 / 8 ] = { -//PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07 -PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07 -PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f -PCK4BITS(4,4,4,4,4,4,4,4), // 10 - 17 -PCK4BITS(4,4,4,5,4,4,4,4), // 18 - 1f -PCK4BITS(4,4,4,4,4,4,4,4), // 20 - 27 -PCK4BITS(4,4,4,4,4,4,4,4), // 28 - 2f -PCK4BITS(4,4,4,4,4,4,4,4), // 30 - 37 -PCK4BITS(4,4,4,4,4,4,4,4), // 38 - 3f -PCK4BITS(4,4,4,4,4,4,4,4), // 40 - 47 -PCK4BITS(4,4,4,4,4,4,4,4), // 48 - 4f -PCK4BITS(4,4,4,4,4,4,4,4), // 50 - 57 -PCK4BITS(4,4,4,4,4,4,4,4), // 58 - 5f -PCK4BITS(4,4,4,4,4,4,4,4), // 60 - 67 -PCK4BITS(4,4,4,4,4,4,4,4), // 68 - 6f -PCK4BITS(4,4,4,4,4,4,4,4), // 70 - 77 -PCK4BITS(4,4,4,4,4,4,4,4), // 78 - 7f -PCK4BITS(5,5,5,5,5,5,5,5), // 80 - 87 -PCK4BITS(5,5,5,5,5,5,1,3), // 88 - 8f -PCK4BITS(5,5,5,5,5,5,5,5), // 90 - 97 -PCK4BITS(5,5,5,5,5,5,5,5), // 98 - 9f -PCK4BITS(5,2,2,2,2,2,2,2), // a0 - a7 -PCK4BITS(2,2,2,2,2,2,2,2), // a8 - af -PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 -PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf -PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 -PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf -PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 -PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df -PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7 -PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef -PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7 
-PCK4BITS(0,0,0,0,0,0,0,5) // f8 - ff -}; - - -static const PRUint32 EUCJP_st [ 5] = { -PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07 -PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f -PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17 -PCK4BITS(eError,eError,eStart,eError,eError,eError, 3,eError),//18-1f -PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27 -}; - -static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0}; - -const SMModel EUCJPSMModel( - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls), - 6, - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st), - EUCJPCharLenTable, - "EUC-JP"); - -static const PRUint32 EUCKR_cls [ 256 / 8 ] = { -//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 -PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 -PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f -PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 -PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f -PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 -PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f -PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 -PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f -PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47 -PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f -PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57 -PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f -PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67 -PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f -PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77 -PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f -PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87 -PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f -PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 -PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f -PCK4BITS(0,2,2,2,2,2,2,2), // a0 - a7 -PCK4BITS(2,2,2,2,2,3,3,3), // a8 - af -PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 -PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf -PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 -PCK4BITS(2,3,2,2,2,2,2,2), // c8 - cf -PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 -PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df -PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7 -PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef 
-PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7 -PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff -}; - - -static const PRUint32 EUCKR_st [ 2] = { -PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07 -PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f -}; - -static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0}; - -const SMModel EUCKRSMModel ( - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls), - 4, - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st), - EUCKRCharLenTable, - "EUC-KR"); - -static const PRUint32 EUCTW_cls [ 256 / 8 ] = { -//PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07 -PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07 -PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f -PCK4BITS(2,2,2,2,2,2,2,2), // 10 - 17 -PCK4BITS(2,2,2,0,2,2,2,2), // 18 - 1f -PCK4BITS(2,2,2,2,2,2,2,2), // 20 - 27 -PCK4BITS(2,2,2,2,2,2,2,2), // 28 - 2f -PCK4BITS(2,2,2,2,2,2,2,2), // 30 - 37 -PCK4BITS(2,2,2,2,2,2,2,2), // 38 - 3f -PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 -PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f -PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 -PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f -PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 -PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f -PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 -PCK4BITS(2,2,2,2,2,2,2,2), // 78 - 7f -PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87 -PCK4BITS(0,0,0,0,0,0,6,0), // 88 - 8f -PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 -PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f -PCK4BITS(0,3,4,4,4,4,4,4), // a0 - a7 -PCK4BITS(5,5,1,1,1,1,1,1), // a8 - af -PCK4BITS(1,1,1,1,1,1,1,1), // b0 - b7 -PCK4BITS(1,1,1,1,1,1,1,1), // b8 - bf -PCK4BITS(1,1,3,1,3,3,3,3), // c0 - c7 -PCK4BITS(3,3,3,3,3,3,3,3), // c8 - cf -PCK4BITS(3,3,3,3,3,3,3,3), // d0 - d7 -PCK4BITS(3,3,3,3,3,3,3,3), // d8 - df -PCK4BITS(3,3,3,3,3,3,3,3), // e0 - e7 -PCK4BITS(3,3,3,3,3,3,3,3), // e8 - ef -PCK4BITS(3,3,3,3,3,3,3,3), // f0 - f7 -PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff -}; - - -static const PRUint32 EUCTW_st [ 6] = { -PCK4BITS(eError,eError,eStart, 3, 3, 3, 
4,eError),//00-07 -PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f -PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17 -PCK4BITS(eStart,eStart,eStart,eError,eError,eError,eError,eError),//18-1f -PCK4BITS( 5,eError,eError,eError,eStart,eError,eStart,eStart),//20-27 -PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f -}; - -static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3}; - -const SMModel EUCTWSMModel( - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls), - 7, - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st), - EUCTWCharLenTable, - "x-euc-tw"); - -/* obsolete GB2312 by gb18030 -static PRUint32 GB2312_cls [ 256 / 8 ] = { -//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 -PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 -PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f -PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 -PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f -PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 -PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f -PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 -PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f -PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47 -PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f -PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57 -PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f -PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67 -PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f -PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77 -PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f -PCK4BITS(1,0,0,0,0,0,0,0), // 80 - 87 -PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f -PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 -PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f -PCK4BITS(0,2,2,2,2,2,2,2), // a0 - a7 -PCK4BITS(2,2,3,3,3,3,3,3), // a8 - af -PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 -PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf -PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 -PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf -PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 -PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df -PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7 -PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef 
-PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7 -PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff -}; - - -static PRUint32 GB2312_st [ 2] = { -PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07 -PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f -}; - -static const PRUint32 GB2312CharLenTable[] = {0, 1, 2, 0}; - -SMModel GB2312SMModel = { - {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_cls }, - 4, - {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_st }, - GB2312CharLenTable, - "GB2312", -}; -*/ - -// the following state machine data was created by perl script in -// intl/chardet/tools. It should be the same as in PSM detector. -static const PRUint32 GB18030_cls [ 256 / 8 ] = { -PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 -PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f -PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 -PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f -PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 -PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f -PCK4BITS(3,3,3,3,3,3,3,3), // 30 - 37 -PCK4BITS(3,3,1,1,1,1,1,1), // 38 - 3f -PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 -PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f -PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 -PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f -PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 -PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f -PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 -PCK4BITS(2,2,2,2,2,2,2,4), // 78 - 7f -PCK4BITS(5,6,6,6,6,6,6,6), // 80 - 87 -PCK4BITS(6,6,6,6,6,6,6,6), // 88 - 8f -PCK4BITS(6,6,6,6,6,6,6,6), // 90 - 97 -PCK4BITS(6,6,6,6,6,6,6,6), // 98 - 9f -PCK4BITS(6,6,6,6,6,6,6,6), // a0 - a7 -PCK4BITS(6,6,6,6,6,6,6,6), // a8 - af -PCK4BITS(6,6,6,6,6,6,6,6), // b0 - b7 -PCK4BITS(6,6,6,6,6,6,6,6), // b8 - bf -PCK4BITS(6,6,6,6,6,6,6,6), // c0 - c7 -PCK4BITS(6,6,6,6,6,6,6,6), // c8 - cf -PCK4BITS(6,6,6,6,6,6,6,6), // d0 - d7 -PCK4BITS(6,6,6,6,6,6,6,6), // d8 - df -PCK4BITS(6,6,6,6,6,6,6,6), // e0 - e7 -PCK4BITS(6,6,6,6,6,6,6,6), // e8 - ef -PCK4BITS(6,6,6,6,6,6,6,6), // f0 - f7 -PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff -}; - - 
-static const PRUint32 GB18030_st [ 6] = { -PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07 -PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f -PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17 -PCK4BITS( 4,eError,eStart,eStart,eError,eError,eError,eError),//18-1f -PCK4BITS(eError,eError, 5,eError,eError,eError,eItsMe,eError),//20-27 -PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f -}; - -// To be accurate, the length of class 6 can be either 2 or 4. -// But it is not necessary to discriminate between the two since -// it is used for frequency analysis only, and we are validing -// each code range there as well. So it is safe to set it to be -// 2 here. -static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2}; - -const SMModel GB18030SMModel( - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls ), - 7, - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st ), - GB18030CharLenTable, - "GB18030"); - -// sjis - -static const PRUint32 SJIS_cls [ 256 / 8 ] = { -//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 -PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 -PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f -PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 -PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f -PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 -PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f -PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 -PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f -PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 -PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f -PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 -PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f -PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 -PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f -PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 -PCK4BITS(2,2,2,2,2,2,2,1), // 78 - 7f -PCK4BITS(3,3,3,3,3,3,3,3), // 80 - 87 -PCK4BITS(3,3,3,3,3,3,3,3), // 88 - 8f -PCK4BITS(3,3,3,3,3,3,3,3), // 90 - 97 -PCK4BITS(3,3,3,3,3,3,3,3), // 98 - 9f -//0xa0 is illegal in sjis encoding, but some 
pages does -//contain such byte. We need to be more error forgiven. -PCK4BITS(2,2,2,2,2,2,2,2), // a0 - a7 -PCK4BITS(2,2,2,2,2,2,2,2), // a8 - af -PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 -PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf -PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 -PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf -PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 -PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df -PCK4BITS(3,3,3,3,3,3,3,3), // e0 - e7 -PCK4BITS(3,3,3,3,3,4,4,4), // e8 - ef -PCK4BITS(4,4,4,4,4,4,4,4), // f0 - f7 -PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff -}; - - -static const PRUint32 SJIS_st [ 3] = { -PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 -PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f -PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17 -}; - -static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0}; - -const SMModel SJISSMModel( - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls), - 6, - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st), - SJISCharLenTable, - "Shift_JIS"); - - -static const PRUint32 UTF8_cls [ 256 / 8 ] = { -//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 -PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value -PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f -PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 -PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f -PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 -PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f -PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 -PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f -PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47 -PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f -PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57 -PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f -PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67 -PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f -PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77 -PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f -PCK4BITS(2,2,2,2,3,3,3,3), // 80 - 87 -PCK4BITS(4,4,4,4,4,4,4,4), // 88 - 8f -PCK4BITS(4,4,4,4,4,4,4,4), // 90 - 97 -PCK4BITS(4,4,4,4,4,4,4,4), // 98 
- 9f -PCK4BITS(5,5,5,5,5,5,5,5), // a0 - a7 -PCK4BITS(5,5,5,5,5,5,5,5), // a8 - af -PCK4BITS(5,5,5,5,5,5,5,5), // b0 - b7 -PCK4BITS(5,5,5,5,5,5,5,5), // b8 - bf -PCK4BITS(0,0,6,6,6,6,6,6), // c0 - c7 -PCK4BITS(6,6,6,6,6,6,6,6), // c8 - cf -PCK4BITS(6,6,6,6,6,6,6,6), // d0 - d7 -PCK4BITS(6,6,6,6,6,6,6,6), // d8 - df -PCK4BITS(7,8,8,8,8,8,8,8), // e0 - e7 -PCK4BITS(8,8,8,8,8,9,8,8), // e8 - ef -PCK4BITS(10,11,11,11,11,11,11,11), // f0 - f7 -PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff -}; - - -static const PRUint32 UTF8_st [ 26] = { -PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07 -PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//18-1f -PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//20-27 -PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//28-2f -PCK4BITS(eError,eError, 5, 5, 5, 5,eError,eError),//30-37 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//38-3f -PCK4BITS(eError,eError,eError, 5, 5, 5,eError,eError),//40-47 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//48-4f -PCK4BITS(eError,eError, 7, 7, 7, 7,eError,eError),//50-57 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//58-5f -PCK4BITS(eError,eError,eError,eError, 7, 7,eError,eError),//60-67 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//68-6f -PCK4BITS(eError,eError, 9, 9, 9, 9,eError,eError),//70-77 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//78-7f -PCK4BITS(eError,eError,eError,eError,eError, 9,eError,eError),//80-87 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//88-8f -PCK4BITS(eError,eError, 12, 12, 12, 12,eError,eError),//90-97 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//98-9f -PCK4BITS(eError,eError,eError,eError,eError, 12,eError,eError),//a0-a7 
-PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//a8-af -PCK4BITS(eError,eError, 12, 12, 12,eError,eError,eError),//b0-b7 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//b8-bf -PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eError,eError),//c0-c7 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf -}; - -static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, - 3, 3, 4, 4, 5, 5, 6, 6 }; - -const SMModel UTF8SMModel( - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls), - 16, - nsPkgInt(eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st), - UTF8CharLenTable, - "UTF-8"); - +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +#include "nsCodingStateMachine.h" + +/* +Modification from frank tang's original work: +. 0x00 is allowed as a legal character. Since some web pages contains this char in + text stream. +*/ + +// BIG5 + +static const PRUint32 BIG5_cls [ 256 / 8 ] = { +//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 +PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value +PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f +PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 +PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f +PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 +PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f +PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 +PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f +PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 +PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f +PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 +PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f +PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 +PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f +PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 +PCK4BITS(2,2,2,2,2,2,2,1), // 78 - 7f +PCK4BITS(4,4,4,4,4,4,4,4), // 80 - 87 +PCK4BITS(4,4,4,4,4,4,4,4), // 88 - 8f +PCK4BITS(4,4,4,4,4,4,4,4), // 90 - 97 +PCK4BITS(4,4,4,4,4,4,4,4), // 98 - 9f +PCK4BITS(4,3,3,3,3,3,3,3), // a0 - a7 +PCK4BITS(3,3,3,3,3,3,3,3), // a8 - af +PCK4BITS(3,3,3,3,3,3,3,3), // b0 - b7 +PCK4BITS(3,3,3,3,3,3,3,3), // b8 - bf +PCK4BITS(3,3,3,3,3,3,3,3), // c0 - c7 +PCK4BITS(3,3,3,3,3,3,3,3), // c8 - cf +PCK4BITS(3,3,3,3,3,3,3,3), // d0 - d7 +PCK4BITS(3,3,3,3,3,3,3,3), // d8 - df +PCK4BITS(3,3,3,3,3,3,3,3), // e0 - e7 
+PCK4BITS(3,3,3,3,3,3,3,3), // e8 - ef +PCK4BITS(3,3,3,3,3,3,3,3), // f0 - f7 +PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff +}; + + +static const PRUint32 BIG5_st [ 3] = { +PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 +PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f +PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17 +}; + +static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0}; + +SMModel const Big5SMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls }, + 5, + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st }, + Big5CharLenTable, + "BIG5", +}; + +static const PRUint32 EUCJP_cls [ 256 / 8 ] = { +//PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07 +PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07 +PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f +PCK4BITS(4,4,4,4,4,4,4,4), // 10 - 17 +PCK4BITS(4,4,4,5,4,4,4,4), // 18 - 1f +PCK4BITS(4,4,4,4,4,4,4,4), // 20 - 27 +PCK4BITS(4,4,4,4,4,4,4,4), // 28 - 2f +PCK4BITS(4,4,4,4,4,4,4,4), // 30 - 37 +PCK4BITS(4,4,4,4,4,4,4,4), // 38 - 3f +PCK4BITS(4,4,4,4,4,4,4,4), // 40 - 47 +PCK4BITS(4,4,4,4,4,4,4,4), // 48 - 4f +PCK4BITS(4,4,4,4,4,4,4,4), // 50 - 57 +PCK4BITS(4,4,4,4,4,4,4,4), // 58 - 5f +PCK4BITS(4,4,4,4,4,4,4,4), // 60 - 67 +PCK4BITS(4,4,4,4,4,4,4,4), // 68 - 6f +PCK4BITS(4,4,4,4,4,4,4,4), // 70 - 77 +PCK4BITS(4,4,4,4,4,4,4,4), // 78 - 7f +PCK4BITS(5,5,5,5,5,5,5,5), // 80 - 87 +PCK4BITS(5,5,5,5,5,5,1,3), // 88 - 8f +PCK4BITS(5,5,5,5,5,5,5,5), // 90 - 97 +PCK4BITS(5,5,5,5,5,5,5,5), // 98 - 9f +PCK4BITS(5,2,2,2,2,2,2,2), // a0 - a7 +PCK4BITS(2,2,2,2,2,2,2,2), // a8 - af +PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 +PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf +PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 +PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf +PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 +PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df +PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7 +PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef +PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7 +PCK4BITS(0,0,0,0,0,0,0,5) // 
f8 - ff +}; + + +static const PRUint32 EUCJP_st [ 5] = { +PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07 +PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f +PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17 +PCK4BITS(eError,eError,eStart,eError,eError,eError, 3,eError),//18-1f +PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27 +}; + +static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0}; + +const SMModel EUCJPSMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls }, + 6, + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st }, + EUCJPCharLenTable, + "EUC-JP", +}; + +static const PRUint32 EUCKR_cls [ 256 / 8 ] = { +//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 +PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 +PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f +PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 +PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f +PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 +PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f +PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 +PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f +PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47 +PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f +PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57 +PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f +PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67 +PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f +PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77 +PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f +PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87 +PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f +PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 +PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f +PCK4BITS(0,2,2,2,2,2,2,2), // a0 - a7 +PCK4BITS(2,2,2,2,2,3,3,3), // a8 - af +PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 +PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf +PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 +PCK4BITS(2,3,2,2,2,2,2,2), // c8 - cf +PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 +PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df +PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7 +PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef +PCK4BITS(2,2,2,2,2,2,2,2), // f0 - 
f7 +PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff +}; + + +static const PRUint32 EUCKR_st [ 2] = { +PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07 +PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f +}; + +static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0}; + +const SMModel EUCKRSMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls }, + 4, + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st }, + EUCKRCharLenTable, + "EUC-KR", +}; + +static const PRUint32 EUCTW_cls [ 256 / 8 ] = { +//PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07 +PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07 +PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f +PCK4BITS(2,2,2,2,2,2,2,2), // 10 - 17 +PCK4BITS(2,2,2,0,2,2,2,2), // 18 - 1f +PCK4BITS(2,2,2,2,2,2,2,2), // 20 - 27 +PCK4BITS(2,2,2,2,2,2,2,2), // 28 - 2f +PCK4BITS(2,2,2,2,2,2,2,2), // 30 - 37 +PCK4BITS(2,2,2,2,2,2,2,2), // 38 - 3f +PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 +PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f +PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 +PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f +PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 +PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f +PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 +PCK4BITS(2,2,2,2,2,2,2,2), // 78 - 7f +PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87 +PCK4BITS(0,0,0,0,0,0,6,0), // 88 - 8f +PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 +PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f +PCK4BITS(0,3,4,4,4,4,4,4), // a0 - a7 +PCK4BITS(5,5,1,1,1,1,1,1), // a8 - af +PCK4BITS(1,1,1,1,1,1,1,1), // b0 - b7 +PCK4BITS(1,1,1,1,1,1,1,1), // b8 - bf +PCK4BITS(1,1,3,1,3,3,3,3), // c0 - c7 +PCK4BITS(3,3,3,3,3,3,3,3), // c8 - cf +PCK4BITS(3,3,3,3,3,3,3,3), // d0 - d7 +PCK4BITS(3,3,3,3,3,3,3,3), // d8 - df +PCK4BITS(3,3,3,3,3,3,3,3), // e0 - e7 +PCK4BITS(3,3,3,3,3,3,3,3), // e8 - ef +PCK4BITS(3,3,3,3,3,3,3,3), // f0 - f7 +PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff +}; + + +static const PRUint32 EUCTW_st [ 6] = { +PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07 
+PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f +PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17 +PCK4BITS(eStart,eStart,eStart,eError,eError,eError,eError,eError),//18-1f +PCK4BITS( 5,eError,eError,eError,eStart,eError,eStart,eStart),//20-27 +PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f +}; + +static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3}; + +const SMModel EUCTWSMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls }, + 7, + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st }, + EUCTWCharLenTable, + "EUC-TW", +}; + +/* obsolete GB2312 by GB18030 +static PRUint32 GB2312_cls [ 256 / 8 ] = { +//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 +PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 +PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f +PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 +PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f +PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 +PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f +PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 +PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f +PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47 +PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f +PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57 +PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f +PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67 +PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f +PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77 +PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f +PCK4BITS(1,0,0,0,0,0,0,0), // 80 - 87 +PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f +PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 +PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f +PCK4BITS(0,2,2,2,2,2,2,2), // a0 - a7 +PCK4BITS(2,2,3,3,3,3,3,3), // a8 - af +PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 +PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf +PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 +PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf +PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 +PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df +PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7 +PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef +PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7 
+PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff +}; + + +static PRUint32 GB2312_st [ 2] = { +PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07 +PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f +}; + +static const PRUint32 GB2312CharLenTable[] = {0, 1, 2, 0}; + +SMModel GB2312SMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_cls }, + 4, + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_st }, + GB2312CharLenTable, + "GB2312", +}; +*/ + +// the following state machine data was created by perl script in +// intl/chardet/tools. It should be the same as in PSM detector. +static const PRUint32 GB18030_cls [ 256 / 8 ] = { +PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 +PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f +PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 +PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f +PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 +PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f +PCK4BITS(3,3,3,3,3,3,3,3), // 30 - 37 +PCK4BITS(3,3,1,1,1,1,1,1), // 38 - 3f +PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 +PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f +PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 +PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f +PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 +PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f +PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 +PCK4BITS(2,2,2,2,2,2,2,4), // 78 - 7f +PCK4BITS(5,6,6,6,6,6,6,6), // 80 - 87 +PCK4BITS(6,6,6,6,6,6,6,6), // 88 - 8f +PCK4BITS(6,6,6,6,6,6,6,6), // 90 - 97 +PCK4BITS(6,6,6,6,6,6,6,6), // 98 - 9f +PCK4BITS(6,6,6,6,6,6,6,6), // a0 - a7 +PCK4BITS(6,6,6,6,6,6,6,6), // a8 - af +PCK4BITS(6,6,6,6,6,6,6,6), // b0 - b7 +PCK4BITS(6,6,6,6,6,6,6,6), // b8 - bf +PCK4BITS(6,6,6,6,6,6,6,6), // c0 - c7 +PCK4BITS(6,6,6,6,6,6,6,6), // c8 - cf +PCK4BITS(6,6,6,6,6,6,6,6), // d0 - d7 +PCK4BITS(6,6,6,6,6,6,6,6), // d8 - df +PCK4BITS(6,6,6,6,6,6,6,6), // e0 - e7 +PCK4BITS(6,6,6,6,6,6,6,6), // e8 - ef +PCK4BITS(6,6,6,6,6,6,6,6), // f0 - f7 +PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff +}; + + +static const PRUint32 GB18030_st [ 6] = 
{ +PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07 +PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f +PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17 +PCK4BITS( 4,eError,eStart,eStart,eError,eError,eError,eError),//18-1f +PCK4BITS(eError,eError, 5,eError,eError,eError,eItsMe,eError),//20-27 +PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f +}; + +// To be accurate, the length of class 6 can be either 2 or 4. +// But it is not necessary to discriminate between the two since +// it is used for frequency analysis only, and we are validing +// each code range there as well. So it is safe to set it to be +// 2 here. +static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2}; + +const SMModel GB18030SMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls }, + 7, + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st }, + GB18030CharLenTable, + "GB18030", +}; + +// sjis + +static const PRUint32 SJIS_cls [ 256 / 8 ] = { +//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 +PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 +PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f +PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 +PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f +PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 +PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f +PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 +PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f +PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47 +PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f +PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57 +PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f +PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67 +PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f +PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77 +PCK4BITS(2,2,2,2,2,2,2,1), // 78 - 7f +PCK4BITS(3,3,3,3,3,3,3,3), // 80 - 87 +PCK4BITS(3,3,3,3,3,3,3,3), // 88 - 8f +PCK4BITS(3,3,3,3,3,3,3,3), // 90 - 97 +PCK4BITS(3,3,3,3,3,3,3,3), // 98 - 9f +//0xa0 is illegal in sjis encoding, but some pages does +//contain such byte. 
We need to be more error forgiven. +PCK4BITS(2,2,2,2,2,2,2,2), // a0 - a7 +PCK4BITS(2,2,2,2,2,2,2,2), // a8 - af +PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 +PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf +PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 +PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf +PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 +PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df +PCK4BITS(3,3,3,3,3,3,3,3), // e0 - e7 +PCK4BITS(3,3,3,3,3,4,4,4), // e8 - ef +PCK4BITS(4,4,4,4,4,4,4,4), // f0 - f7 +PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff +}; + + +static const PRUint32 SJIS_st [ 3] = { +PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 +PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f +PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17 +}; + +static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0}; + +const SMModel SJISSMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls }, + 6, + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st }, + SJISCharLenTable, + "SHIFT_JIS", +}; + + +static const PRUint32 UTF8_cls [ 256 / 8 ] = { +//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 +PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value +PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f +PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 +PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f +PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 +PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f +PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 +PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f +PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47 +PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f +PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57 +PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f +PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67 +PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f +PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77 +PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f +PCK4BITS(2,2,2,2,3,3,3,3), // 80 - 87 +PCK4BITS(4,4,4,4,4,4,4,4), // 88 - 8f +PCK4BITS(4,4,4,4,4,4,4,4), // 90 - 97 +PCK4BITS(4,4,4,4,4,4,4,4), // 98 - 9f +PCK4BITS(5,5,5,5,5,5,5,5), // a0 - 
a7 +PCK4BITS(5,5,5,5,5,5,5,5), // a8 - af +PCK4BITS(5,5,5,5,5,5,5,5), // b0 - b7 +PCK4BITS(5,5,5,5,5,5,5,5), // b8 - bf +PCK4BITS(0,0,6,6,6,6,6,6), // c0 - c7 +PCK4BITS(6,6,6,6,6,6,6,6), // c8 - cf +PCK4BITS(6,6,6,6,6,6,6,6), // d0 - d7 +PCK4BITS(6,6,6,6,6,6,6,6), // d8 - df +PCK4BITS(7,8,8,8,8,8,8,8), // e0 - e7 +PCK4BITS(8,8,8,8,8,9,8,8), // e8 - ef +PCK4BITS(10,11,11,11,11,11,11,11), // f0 - f7 +PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff +}; + + +static const PRUint32 UTF8_st [ 26] = { +PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07 +PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17 +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//18-1f +PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//20-27 +PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//28-2f +PCK4BITS(eError,eError, 5, 5, 5, 5,eError,eError),//30-37 +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//38-3f +PCK4BITS(eError,eError,eError, 5, 5, 5,eError,eError),//40-47 +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//48-4f +PCK4BITS(eError,eError, 7, 7, 7, 7,eError,eError),//50-57 +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//58-5f +PCK4BITS(eError,eError,eError,eError, 7, 7,eError,eError),//60-67 +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//68-6f +PCK4BITS(eError,eError, 9, 9, 9, 9,eError,eError),//70-77 +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//78-7f +PCK4BITS(eError,eError,eError,eError,eError, 9,eError,eError),//80-87 +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//88-8f +PCK4BITS(eError,eError, 12, 12, 12, 12,eError,eError),//90-97 +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//98-9f +PCK4BITS(eError,eError,eError,eError,eError, 12,eError,eError),//a0-a7 
+PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//a8-af +PCK4BITS(eError,eError, 12, 12, 12,eError,eError,eError),//b0-b7 +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//b8-bf +PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eError,eError),//c0-c7 +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf +}; + +static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, + 3, 3, 4, 4, 5, 5, 6, 6 }; + +const SMModel UTF8SMModel = { + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls }, + 16, + {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st }, + UTF8CharLenTable, + "UTF-8", +}; + diff --git a/PowerEditor/src/uchardet/nsPkgInt.h b/PowerEditor/src/uchardet/nsPkgInt.h index b1e66785..3caa9122 100644 --- a/PowerEditor/src/uchardet/nsPkgInt.h +++ b/PowerEditor/src/uchardet/nsPkgInt.h @@ -1,93 +1,89 @@ -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is mozilla.org code. - * - * The Initial Developer of the Original Code is - * Netscape Communications Corporation. - * Portions created by the Initial Developer are Copyright (C) 1998 - * the Initial Developer. All Rights Reserved. 
- * - * Contributor(s): - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. - * - * ***** END LICENSE BLOCK ***** */ - -#ifndef nsPkgInt_h__ -#define nsPkgInt_h__ -#include "nscore.h" - -typedef enum { - eIdxSft4bits = 3, - eIdxSft8bits = 2, - eIdxSft16bits = 1 -} nsIdxSft; - -typedef enum { - eSftMsk4bits = 7, - eSftMsk8bits = 3, - eSftMsk16bits = 1 -} nsSftMsk; - -typedef enum { - eBitSft4bits = 2, - eBitSft8bits = 3, - eBitSft16bits = 4 -} nsBitSft; - -typedef enum { - eUnitMsk4bits = 0x0000000FL, - eUnitMsk8bits = 0x000000FFL, - eUnitMsk16bits = 0x0000FFFFL -} nsUnitMsk; - -struct nsPkgInt { - nsIdxSft idxsft; - nsSftMsk sftmsk; - nsBitSft bitsft; - nsUnitMsk unitmsk; - const PRUint32* const data; - nsPkgInt(nsIdxSft a,nsSftMsk b, nsBitSft c,nsUnitMsk d,const PRUint32* const e) - :idxsft(a), sftmsk(b), bitsft(c), unitmsk(d), data(e){} - nsPkgInt(); - nsPkgInt operator= (const nsPkgInt&); -}; - - -#define PCK16BITS(a,b) ((PRUint32)(((b) << 16) | (a))) - -#define PCK8BITS(a,b,c,d) PCK16BITS( ((PRUint32)(((b) << 8) | (a))), \ - ((PRUint32)(((d) << 8) | (c)))) - -#define PCK4BITS(a,b,c,d,e,f,g,h) PCK8BITS( ((PRUint32)(((b) << 4) | (a))), \ - ((PRUint32)(((d) << 4) | (c))), \ - 
((PRUint32)(((f) << 4) | (e))), \ - ((PRUint32)(((h) << 4) | (g))) ) - -#define GETFROMPCK(i, c) \ - (((((c).data)[(i)>>(c).idxsft])>>(((i)&(c).sftmsk)<<(c).bitsft))&(c).unitmsk) - -#endif /* nsPkgInt_h__ */ - +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#ifndef nsPkgInt_h__ +#define nsPkgInt_h__ +#include "nscore.h" + +typedef enum { + eIdxSft4bits = 3, + eIdxSft8bits = 2, + eIdxSft16bits = 1 +} nsIdxSft; + +typedef enum { + eSftMsk4bits = 7, + eSftMsk8bits = 3, + eSftMsk16bits = 1 +} nsSftMsk; + +typedef enum { + eBitSft4bits = 2, + eBitSft8bits = 3, + eBitSft16bits = 4 +} nsBitSft; + +typedef enum { + eUnitMsk4bits = 0x0000000FL, + eUnitMsk8bits = 0x000000FFL, + eUnitMsk16bits = 0x0000FFFFL +} nsUnitMsk; + +typedef struct nsPkgInt { + nsIdxSft idxsft; + nsSftMsk sftmsk; + nsBitSft bitsft; + nsUnitMsk unitmsk; + const PRUint32* const data; +} nsPkgInt; + + +#define PCK16BITS(a,b) ((PRUint32)(((b) << 16) | (a))) + +#define PCK8BITS(a,b,c,d) PCK16BITS( ((PRUint32)(((b) << 8) | (a))), \ + ((PRUint32)(((d) << 8) | (c)))) + +#define PCK4BITS(a,b,c,d,e,f,g,h) PCK8BITS( ((PRUint32)(((b) << 4) | (a))), \ + ((PRUint32)(((d) << 4) | (c))), \ + ((PRUint32)(((f) << 4) | (e))), \ + ((PRUint32)(((h) << 4) | (g))) ) + +#define GETFROMPCK(i, c) \ + (((((c).data)[(i)>>(c).idxsft])>>(((i)&(c).sftmsk)<<(c).bitsft))&(c).unitmsk) + +#endif /* nsPkgInt_h__ */ + diff --git a/PowerEditor/src/uchardet/nsSBCSGroupProber.cpp b/PowerEditor/src/uchardet/nsSBCSGroupProber.cpp index d8fef879..b1a60cc0 100644 --- a/PowerEditor/src/uchardet/nsSBCSGroupProber.cpp +++ b/PowerEditor/src/uchardet/nsSBCSGroupProber.cpp @@ -46,42 +46,70 @@ nsSBCSGroupProber::nsSBCSGroupProber() { - mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model); - mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel); - mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model); - mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel); - mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model); - mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model); - mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model); - mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model); + mProbers[0] = new 
nsSingleByteCharSetProber(&Win1251RussianModel); + mProbers[1] = new nsSingleByteCharSetProber(&Koi8rRussianModel); + mProbers[2] = new nsSingleByteCharSetProber(&Latin5RussianModel); + mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicRussianModel); + mProbers[4] = new nsSingleByteCharSetProber(&Ibm866RussianModel); + mProbers[5] = new nsSingleByteCharSetProber(&Ibm855RussianModel); + + mProbers[6] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel); + mProbers[7] = new nsSingleByteCharSetProber(&Windows_1253GreekModel); + mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); - mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel); nsHebrewProber *hebprober = new nsHebrewProber(); // Notice: Any change in these indexes - 10,11,12 must be reflected // in the code below as well. - mProbers[11] = hebprober; - mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew - mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew + mProbers[10] = hebprober; + mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew + mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew // Tell the Hebrew prober about the logical and visual probers - if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null + if (mProbers[10] && mProbers[11] && mProbers[12]) // all are not null { - hebprober->SetModelProbers(mProbers[12], mProbers[13]); + hebprober->SetModelProbers(mProbers[11], mProbers[12]); } else // One or more is null. 
avoid any Hebrew probing, null them all { - for (PRUint32 i = 11; i <= 13; ++i) - { - delete mProbers[i]; - mProbers[i] = 0; + for (PRUint32 i = 10; i <= 12; ++i) + { + delete mProbers[i]; + mProbers[i] = 0; } } - // disable latin2 before latin1 is available, otherwise all latin1 - // will be detected as latin2 because of their similarity. - //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel); - //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel); + mProbers[13] = new nsSingleByteCharSetProber(&Tis_620ThaiModel); + mProbers[14] = new nsSingleByteCharSetProber(&Iso_8859_11ThaiModel); + + mProbers[15] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel); + mProbers[16] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel); + mProbers[17] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel); + + mProbers[18] = new nsSingleByteCharSetProber(&Iso_8859_1SpanishModel); + mProbers[19] = new nsSingleByteCharSetProber(&Iso_8859_15SpanishModel); + mProbers[20] = new nsSingleByteCharSetProber(&Windows_1252SpanishModel); + + mProbers[21] = new nsSingleByteCharSetProber(&Iso_8859_2HungarianModel); + mProbers[22] = new nsSingleByteCharSetProber(&Windows_1250HungarianModel); + + mProbers[23] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel); + mProbers[24] = new nsSingleByteCharSetProber(&Windows_1252GermanModel); + + mProbers[25] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel); + + mProbers[26] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel); + mProbers[27] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel); + + mProbers[28] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel); + mProbers[29] = new nsSingleByteCharSetProber(&Windows_1256ArabicModel); + + mProbers[30] = new nsSingleByteCharSetProber(&VisciiVietnameseModel); + mProbers[31] = new nsSingleByteCharSetProber(&Windows_1258VietnameseModel); + + mProbers[32] = new nsSingleByteCharSetProber(&Iso_8859_15DanishModel); + mProbers[33] = new 
nsSingleByteCharSetProber(&Iso_8859_1DanishModel); + mProbers[34] = new nsSingleByteCharSetProber(&Windows_1252DanishModel); Reset(); } diff --git a/PowerEditor/src/uchardet/nsSBCSGroupProber.h b/PowerEditor/src/uchardet/nsSBCSGroupProber.h index cfbf7e16..c1ea4a11 100644 --- a/PowerEditor/src/uchardet/nsSBCSGroupProber.h +++ b/PowerEditor/src/uchardet/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 14 +#define NUM_OF_SBCS_PROBERS 35 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/PowerEditor/src/uchardet/nsSBCharSetProber.cpp b/PowerEditor/src/uchardet/nsSBCharSetProber.cpp index 3a88fdf3..1f7f4731 100644 --- a/PowerEditor/src/uchardet/nsSBCharSetProber.cpp +++ b/PowerEditor/src/uchardet/nsSBCharSetProber.cpp @@ -35,7 +35,6 @@ * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ - #include #include "nsSBCharSetProber.h" @@ -48,18 +47,31 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 order = mModel->charToOrderMap[(unsigned char)aBuf[i]]; if (order < SYMBOL_CAT_ORDER) + { mTotalChar++; - if (order < SAMPLE_SIZE) + } + else if (order == ILL) + { + /* When encountering an illegal codepoint, no need + * to continue analyzing data. 
*/ + mState = eNotMe; + break; + } + else if (order == CTR) + { + mCtrlChar++; + } + if (order < mModel->freqCharCount) { mFreqChar++; - if (mLastOrder < SAMPLE_SIZE) + if (mLastOrder < mModel->freqCharCount) { mTotalSeqs++; if (!mReversed) - ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]); + ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*mModel->freqCharCount+order]]); else // reverse the order of the letters in the lookup - ++(mSeqCounters[mModel->precedenceMatrix[order*SAMPLE_SIZE+mLastOrder]]); + ++(mSeqCounters[mModel->precedenceMatrix[order*mModel->freqCharCount+mLastOrder]]); } } mLastOrder = order; @@ -86,6 +98,7 @@ void nsSingleByteCharSetProber::Reset(void) mSeqCounters[i] = 0; mTotalSeqs = 0; mTotalChar = 0; + mCtrlChar = 0; mFreqChar = 0; } @@ -103,6 +116,19 @@ float nsSingleByteCharSetProber::GetConfidence(void) if (mTotalSeqs > 0) { r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; + /* Multiply by a ratio of positive sequences per characters. + * This would help in particular to distinguish close winners. + * Indeed if you add a letter, you'd expect the positive sequence count + * to increase as well. If it doesn't, it may mean that this new codepoint + * may not have been a letter, but instead a symbol (or some other + * character). This could make the difference between very closely related + * charsets used for the same language. + */ + r = r * (mSeqCounters[POSITIVE_CAT] + (float) mSeqCounters[PROBABLE_CAT] / 4) / mTotalChar; + /* The more control characters (proportionnaly to the size of the text), the + * less confident we become in the current charset. 
+ */ + r = r * (mTotalChar - mCtrlChar) / mTotalChar; r = r*mFreqChar/mTotalChar; if (r >= (float)1.00) r = (float)0.99; @@ -112,7 +138,7 @@ float nsSingleByteCharSetProber::GetConfidence(void) #endif } -const char* nsSingleByteCharSetProber::GetCharSetName() +const char* nsSingleByteCharSetProber::GetCharSetName() { if (!mNameProber) return mModel->charsetName; diff --git a/PowerEditor/src/uchardet/nsSBCharSetProber.h b/PowerEditor/src/uchardet/nsSBCharSetProber.h index f2d055f9..211846e4 100644 --- a/PowerEditor/src/uchardet/nsSBCharSetProber.h +++ b/PowerEditor/src/uchardet/nsSBCharSetProber.h @@ -1,130 +1,176 @@ -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is Mozilla Universal charset detector code. - * - * The Initial Developer of the Original Code is - * Netscape Communications Corporation. - * Portions created by the Initial Developer are Copyright (C) 2001 - * the Initial Developer. All Rights Reserved. - * - * Contributor(s): - * Shy Shalom - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. 
If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. - * - * ***** END LICENSE BLOCK ***** */ -#ifndef nsSingleByteCharSetProber_h__ -#define nsSingleByteCharSetProber_h__ - -#include "nsCharSetProber.h" - -#define SAMPLE_SIZE 64 -#define SB_ENOUGH_REL_THRESHOLD 1024 -#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 -#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 -#define SYMBOL_CAT_ORDER 250 -#define NUMBER_OF_SEQ_CAT 4 -#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) -#define NEGATIVE_CAT 0 - -struct SequenceModel -{ - const unsigned char* const charToOrderMap; // [256] table use to find a char's order - const PRUint8* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency - float mTypicalPositiveRatio; // = freqSeqs / totalSeqs - PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) - const char* const charsetName; - SequenceModel(void); - SequenceModel(const unsigned char* const a, const PRUint8* const b,float c,PRBool d,const char* const e) - : charToOrderMap(a), precedenceMatrix(b), mTypicalPositiveRatio(c), keepEnglishLetter(d), charsetName(e){} - SequenceModel& operator=(const SequenceModel&); -} ; - - -class nsSingleByteCharSetProber : public nsCharSetProber{ -public: - nsSingleByteCharSetProber(const SequenceModel *model) - :mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); } - nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber) - :mModel(model), mReversed(reversed), 
mNameProber(nameProber) { Reset(); } - nsSingleByteCharSetProber(): mModel(0), mReversed(0){}; - virtual const char* GetCharSetName(); - virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - virtual nsProbingState GetState(void) {return mState;} - virtual void Reset(void); - virtual float GetConfidence(void); - virtual void SetOpion() {} - - // This feature is not implemented yet. any current language model - // contain this parameter as PR_FALSE. No one is looking at this - // parameter or calling this method. - // Moreover, the nsSBCSGroupProber which calls the HandleData of this - // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid - // of the English letters. - PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented) - nsSingleByteCharSetProber operator=(const nsSingleByteCharSetProber&) = delete; - -#ifdef DEBUG_chardet - virtual void DumpStatus(); -#endif - -protected: - nsProbingState mState; - const SequenceModel* const mModel; - const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup - - //char order of last character - unsigned char mLastOrder; - - PRUint32 mTotalSeqs; - PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT]; - - PRUint32 mTotalChar; - //characters that fall in our sampling range - PRUint32 mFreqChar; - - // Optional auxiliary prober for name decision. 
created and destroyed by the GroupProber - nsCharSetProber* mNameProber; - -}; - - -extern const SequenceModel Koi8rModel; -extern const SequenceModel Win1251Model; -extern const SequenceModel Latin5Model; -extern const SequenceModel MacCyrillicModel; -extern const SequenceModel Ibm866Model; -extern const SequenceModel Ibm855Model; -extern const SequenceModel Latin7Model; -extern const SequenceModel Win1253Model; -extern const SequenceModel Latin5BulgarianModel; -extern const SequenceModel Win1251BulgarianModel; -extern const SequenceModel Latin2HungarianModel; -extern const SequenceModel Win1250HungarianModel; -extern const SequenceModel Win1255Model; -extern const SequenceModel TIS620ThaiModel; - -#endif /* nsSingleByteCharSetProber_h__ */ - +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Universal charset detector code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 2001 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * Shy Shalom + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +#ifndef nsSingleByteCharSetProber_h__ +#define nsSingleByteCharSetProber_h__ + +#include "nsCharSetProber.h" + +/** Codepoints **/ + +/* Illegal codepoints.*/ +#define ILL 255 +/* Control character. */ +#define CTR 254 +/* Symbols and punctuation that do not belong to words. */ +#define SYM 253 +/* Return/Line feeds. */ +#define RET 252 +/* Numbers 0-9. */ +#define NUM 251 + +#define SB_ENOUGH_REL_THRESHOLD 1024 +#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 +#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 +#define SYMBOL_CAT_ORDER 250 + +#define NUMBER_OF_SEQ_CAT 4 +#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) +#define PROBABLE_CAT (NUMBER_OF_SEQ_CAT-2) +#define NEUTRAL_CAT (NUMBER_OF_SEQ_CAT-3) +#define NEGATIVE_CAT 0 + +typedef struct +{ + /* [256] table mapping codepoints to character orders. */ + const unsigned char* const charToOrderMap; + /* freqCharCount x freqCharCount table of 2-char sequence's frequencies. */ + const PRUint8* const precedenceMatrix; + /* The count of frequent characters.
*/ + int freqCharCount; + float mTypicalPositiveRatio; // = freqSeqs / totalSeqs + PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) + const char* const charsetName; +} SequenceModel; + + +class nsSingleByteCharSetProber : public nsCharSetProber{ +public: + nsSingleByteCharSetProber(const SequenceModel *model) + :mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); } + nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber) + :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } + + virtual const char* GetCharSetName(); + virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + virtual nsProbingState GetState(void) {return mState;} + virtual void Reset(void); + virtual float GetConfidence(void); + virtual void SetOpion() {} + + // This feature is not implemented yet. any current language model + // contain this parameter as PR_FALSE. No one is looking at this + // parameter or calling this method. + // Moreover, the nsSBCSGroupProber which calls the HandleData of this + // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid + // of the English letters. + PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented) + +#ifdef DEBUG_chardet + virtual void DumpStatus(); +#endif + +protected: + nsProbingState mState; + const SequenceModel* const mModel; + const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup + + //char order of last character + unsigned char mLastOrder; + + PRUint32 mTotalSeqs; + PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT]; + + PRUint32 mTotalChar; + PRUint32 mCtrlChar; + //characters that fall in our sampling range + PRUint32 mFreqChar; + + // Optional auxiliary prober for name decision. 
created and destroyed by the GroupProber + nsCharSetProber* mNameProber; + +}; + +extern const SequenceModel Windows_1256ArabicModel; +extern const SequenceModel Iso_8859_6ArabicModel; + +extern const SequenceModel Koi8rRussianModel; +extern const SequenceModel Win1251RussianModel; +extern const SequenceModel Latin5RussianModel; +extern const SequenceModel MacCyrillicRussianModel; +extern const SequenceModel Ibm866RussianModel; +extern const SequenceModel Ibm855RussianModel; + +extern const SequenceModel Iso_8859_7GreekModel; +extern const SequenceModel Windows_1253GreekModel; + +extern const SequenceModel Latin5BulgarianModel; +extern const SequenceModel Win1251BulgarianModel; + +extern const SequenceModel Iso_8859_2HungarianModel; +extern const SequenceModel Windows_1250HungarianModel; + +extern const SequenceModel Win1255Model; + +extern const SequenceModel Tis_620ThaiModel; +extern const SequenceModel Iso_8859_11ThaiModel; + +extern const SequenceModel Iso_8859_15FrenchModel; +extern const SequenceModel Iso_8859_1FrenchModel; +extern const SequenceModel Windows_1252FrenchModel; + +extern const SequenceModel Iso_8859_15SpanishModel; +extern const SequenceModel Iso_8859_1SpanishModel; +extern const SequenceModel Windows_1252SpanishModel; + +extern const SequenceModel Iso_8859_1GermanModel; +extern const SequenceModel Windows_1252GermanModel; + +extern const SequenceModel Iso_8859_3EsperantoModel; + +extern const SequenceModel Iso_8859_3TurkishModel; +extern const SequenceModel Iso_8859_9TurkishModel; + +extern const SequenceModel VisciiVietnameseModel; +extern const SequenceModel Windows_1258VietnameseModel; + +extern const SequenceModel Iso_8859_15DanishModel; +extern const SequenceModel Iso_8859_1DanishModel; +extern const SequenceModel Windows_1252DanishModel; + +#endif /* nsSingleByteCharSetProber_h__ */ + diff --git a/PowerEditor/src/uchardet/nsSJISProber.cpp b/PowerEditor/src/uchardet/nsSJISProber.cpp index 0b59e399..c7842f6a 100644 --- 
a/PowerEditor/src/uchardet/nsSJISProber.cpp +++ b/PowerEditor/src/uchardet/nsSJISProber.cpp @@ -40,7 +40,6 @@ // 2, kana character often exist in group // 3, certain combination of kana is never used in japanese language - #include "nsSJISProber.h" void nsSJISProber::Reset(void) diff --git a/PowerEditor/src/uchardet/nsSJISProber.h b/PowerEditor/src/uchardet/nsSJISProber.h index 1efb6e3d..f326ded2 100644 --- a/PowerEditor/src/uchardet/nsSJISProber.h +++ b/PowerEditor/src/uchardet/nsSJISProber.h @@ -57,7 +57,7 @@ public: Reset();} virtual ~nsSJISProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "Shift_JIS";} + const char* GetCharSetName() {return "SHIFT_JIS";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/PowerEditor/src/uchardet/nsUniversalDetector.cpp b/PowerEditor/src/uchardet/nsUniversalDetector.cpp index dd74243c..ff06b9d6 100644 --- a/PowerEditor/src/uchardet/nsUniversalDetector.cpp +++ b/PowerEditor/src/uchardet/nsUniversalDetector.cpp @@ -47,6 +47,7 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) { + mNbspFound = PR_FALSE; mDone = PR_FALSE; mBestGuess = -1; //illegal value as signal mInTag = PR_FALSE; @@ -64,7 +65,7 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) mCharSetProbers[i] = nsnull; } -nsUniversalDetector::~nsUniversalDetector() +nsUniversalDetector::~nsUniversalDetector() { for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) delete mCharSetProbers[i]; @@ -72,9 +73,10 @@ nsUniversalDetector::~nsUniversalDetector() delete mEscCharSetProber; } -void +void nsUniversalDetector::Reset() { + mNbspFound = PR_FALSE; mDone = PR_FALSE; mBestGuess = -1; //illegal value as signal mInTag = PR_FALSE; @@ -96,17 +98,17 @@ nsUniversalDetector::Reset() //--------------------------------------------------------------------- #define SHORTCUT_THRESHOLD (float)0.95 -#define 
MINIMUM_THRESHOLD (float)0.60 +#define MINIMUM_THRESHOLD (float)0.20 nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) { - if(mDone) + if(mDone) return NS_OK; if (aLen > 0) mGotData = PR_TRUE; - //If the data starts with BOM, we know it is UTF + /* If the data starts with BOM, we know it is UTF. */ if (mStart) { mStart = PR_FALSE; @@ -115,20 +117,42 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) { case '\xEF': if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) - // EF BB BF UTF-8 encoded BOM + /* EF BB BF: UTF-8 encoded BOM. */ mDetectedCharset = "UTF-8"; break; case '\xFE': if ('\xFF' == aBuf[1]) - // FE FF UTF-16, big endian BOM + /* FE FF: UTF-16, big endian BOM. */ mDetectedCharset = "UTF-16"; break; case '\xFF': if ('\xFE' == aBuf[1]) - // FF FE UTF-16, little endian BOM - mDetectedCharset = "UTF-16"; - break; - } // switch + { + if (aLen > 3 && + aBuf[2] == '\x00' && + aBuf[3] == '\x00') + { + /* FF FE 00 00: UTF-32 (LE). */ + mDetectedCharset = "UTF-32"; + } + else + { + /* FF FE: UTF-16, little endian BOM. */ + mDetectedCharset = "UTF-16"; + } + } + break; + case '\x00': + if (aLen > 3 && + aBuf[1] == '\x00' && + aBuf[2] == '\xFE' && + aBuf[3] == '\xFF') + { + /* 00 00 FE FF: UTF-32 (BE). */ + mDetectedCharset = "UTF-32"; + } + break; + } if (mDetectedCharset) { @@ -136,14 +160,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) return NS_OK; } } - + PRUint32 i; for (i = 0; i < aLen; i++) { - //other than 0xa0, if every othe character is ascii, the page is ascii - if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP + /* If every other character is ASCII or 0xA0, we don't run charset + * probers. + * 0xA0 (NBSP in a few charset) is apparently a rare exception + * of non-ASCII character often contained in nearly-ASCII text. 
*/ + if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') { - //we got a non-ascii byte (high-byte) + /* We got a non-ASCII byte (high-byte) */ if (mInputState != eHighbyte) { //adjust state @@ -171,7 +198,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) } if (nsnull == mCharSetProbers[2]) { - mCharSetProbers[2] = new nsLatin1Prober; + mCharSetProbers[2] = new nsLatin1Prober; if (nsnull == mCharSetProbers[2]) return NS_ERROR_OUT_OF_MEMORY; } @@ -179,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) } else { - //ok, just pure ascii so far - if ( ePureAscii == mInputState && - (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) + /* Just pure ASCII or NBSP so far. */ + if (aBuf[i] == '\xA0') { - //found escape character or HZ "~{" + /* ASCII with the only exception of NBSP seems quite common. + * I doubt it is really necessary to train a model here, so let's + * just make an exception. + */ + mNbspFound = PR_TRUE; + } + else if (mInputState == ePureAscii && + (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) + { + /* We found an escape character or HZ "~{". */ mInputState = eEscAscii; } mLastChar = aBuf[i]; @@ -205,6 +240,16 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) mDone = PR_TRUE; mDetectedCharset = mEscCharSetProber->GetCharSetName(); } + else if (mNbspFound) + { + mDetectedCharset = "ISO-8859-1"; + } + else + { + /* ASCII with the ESC character (or the sequence "~{") is still + * ASCII until proven otherwise. 
*/ + mDetectedCharset = "ASCII"; + } break; case eHighbyte: for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) @@ -212,18 +257,29 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) if (mCharSetProbers[i]) { st = mCharSetProbers[i]->HandleData(aBuf, aLen); - if (st == eFoundIt) + if (st == eFoundIt) { mDone = PR_TRUE; mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); return NS_OK; } - } + } } break; - default: //pure ascii - ;//do nothing here + default: + if (mNbspFound) + { + /* ISO-8859-1 is a good result candidate for ASCII + NBSP. + * (though it could have been any ISO-8859 encoding). */ + mDetectedCharset = "ISO-8859-1"; + } + else + { + /* Pure ASCII */ + mDetectedCharset = "ASCII"; + } + break; } return NS_OK; } @@ -234,7 +290,7 @@ void nsUniversalDetector::DataEnd() { if (!mGotData) { - // we haven't got any data yet, return immediately + // we haven't got any data yet, return immediately // caller program sometimes call DataEnd before anything has been sent to detector return; } @@ -245,7 +301,7 @@ void nsUniversalDetector::DataEnd() Report(mDetectedCharset); return; } - + switch (mInputState) { case eHighbyte: diff --git a/PowerEditor/src/uchardet/nsUniversalDetector.h b/PowerEditor/src/uchardet/nsUniversalDetector.h index 525f7220..9f0a4b18 100644 --- a/PowerEditor/src/uchardet/nsUniversalDetector.h +++ b/PowerEditor/src/uchardet/nsUniversalDetector.h @@ -72,6 +72,7 @@ protected: virtual void Report(const char* aCharset) = 0; virtual void Reset(); nsInputState mInputState; + PRBool mNbspFound; PRBool mDone; PRBool mInTag; PRBool mStart; @@ -86,4 +87,3 @@ protected: }; #endif - diff --git a/PowerEditor/src/uchardet/uchardet.cpp b/PowerEditor/src/uchardet/uchardet.cpp index 35b84092..f1951d1a 100644 --- a/PowerEditor/src/uchardet/uchardet.cpp +++ b/PowerEditor/src/uchardet/uchardet.cpp @@ -34,47 +34,52 @@ * the terms of any one of the MPL, the GPL or the LGPL. 
* * ***** END LICENSE BLOCK ***** */ - #include "uchardet.h" +#include +#include #include "nscore.h" #include "nsUniversalDetector.h" -#include - -using std::string; class HandleUniversalDetector : public nsUniversalDetector { protected: - string m_charset; + char *m_charset; public: HandleUniversalDetector() : nsUniversalDetector(NS_FILTER_ALL) + , m_charset(0) { - m_charset = ""; } virtual ~HandleUniversalDetector() - {} + { + if (m_charset) + free(m_charset); + } virtual void Report(const char* charset) { - m_charset = charset; + if (m_charset) + free(m_charset); + m_charset = strdup(charset); } virtual void Reset() { nsUniversalDetector::Reset(); - m_charset = ""; + if (m_charset) + free(m_charset); + m_charset = strdup(""); } const char* GetCharset() const { - return m_charset.c_str(); + return m_charset? m_charset : ""; } }; -uchardet_t uchardet_new() +uchardet_t uchardet_new(void) { return reinterpret_cast (new HandleUniversalDetector()); } diff --git a/PowerEditor/src/uchardet/uchardet.h b/PowerEditor/src/uchardet/uchardet.h index 533666aa..c1593eb2 100644 --- a/PowerEditor/src/uchardet/uchardet.h +++ b/PowerEditor/src/uchardet/uchardet.h @@ -34,8 +34,8 @@ * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ -#ifndef ___UCHARDET_H___ -#define ___UCHARDET_H___ +#ifndef UCHARDET_H___ +#define UCHARDET_H___ #ifdef __cplusplus extern "C" { @@ -43,13 +43,13 @@ extern "C" { #include -typedef void * uchardet_t; +typedef struct uchardet * uchardet_t; /** * Create an encoding detector. * @return a handle of a instance of uchardet */ -uchardet_t uchardet_new(); +uchardet_t uchardet_new(void); /** * Delete an encoding detector. @@ -79,9 +79,9 @@ void uchardet_data_end(uchardet_t ud); void uchardet_reset(uchardet_t ud); /** - * Get the name of encoding that was detected. + * Get an iconv-compatible name of the encoding that was detected. 
* @param ud [in] handle of a instance of uchardet - * @return name of charset on success and "" on failure or pure ascii. + * @return name of charset on success and "" on failure. */ const char * uchardet_get_charset(uchardet_t ud); diff --git a/PowerEditor/visual.net/notepadPlus.vcxproj b/PowerEditor/visual.net/notepadPlus.vcxproj index 13f8b663..3a94d179 100755 --- a/PowerEditor/visual.net/notepadPlus.vcxproj +++ b/PowerEditor/visual.net/notepadPlus.vcxproj @@ -272,6 +272,20 @@ copy ..\src\contextMenu.xml ..\bin64\contextMenu.xml + + + + + + + + + + + + + + @@ -306,12 +320,6 @@ copy ..\src\contextMenu.xml ..\bin64\contextMenu.xml - - - - - -