notepad-plus-plus-legacy/scintilla/src/CaseConvert.cxx
Don Ho cb7806e94c [UPDATE] Update Scintilla to 3.3.4
[BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document.

git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3
2013-08-28 00:44:27 +00:00

631 lines
15 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Scintilla source code edit control
// Encoding: UTF-8
/** @file CaseConvert.cxx
** Case fold characters and convert them to upper or lower case.
** Tables automatically regenerated by scripts/GenerateCharacterCategory.py
** Should only be rarely regenerated for new versions of Unicode.
**/
// Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
#include <cstring>
#include <vector>
#include <algorithm>
#include "CaseConvert.h"
#include "UniConversion.h"
#include "UnicodeFromUTF8.h"
#ifdef SCI_NAMESPACE
using namespace Scintilla;
#endif
namespace {
// Use an unnamed namespace to protect the declarations from name conflicts
// Unicode code points are ordered by groups and follow patterns.
// Most characters (pitch==1) are in ranges for a particular alphabet and their
// upper case forms are a fixed distance away.
// Another pattern (pitch==2) is where each lower case letter is preceded by
// the upper case form. These are also grouped into ranges.
int symmetricCaseConversionRanges[] = {
//lower, upper, range length, range pitch
//++Autogenerated -- start of section automatically generated
//**\(\*\n\)
97,65,26,1,
224,192,23,1,
248,216,7,1,
257,256,24,2,
314,313,8,2,
331,330,23,2,
462,461,8,2,
479,478,9,2,
505,504,20,2,
547,546,9,2,
583,582,5,2,
945,913,17,1,
963,931,9,1,
985,984,12,2,
1072,1040,32,1,
1104,1024,16,1,
1121,1120,17,2,
1163,1162,27,2,
1218,1217,7,2,
1233,1232,44,2,
1377,1329,38,1,
7681,7680,75,2,
7841,7840,48,2,
7936,7944,8,1,
7952,7960,6,1,
7968,7976,8,1,
7984,7992,8,1,
8000,8008,6,1,
8032,8040,8,1,
8560,8544,16,1,
9424,9398,26,1,
11312,11264,47,1,
11393,11392,50,2,
11520,4256,38,1,
42561,42560,23,2,
42625,42624,12,2,
42787,42786,7,2,
42803,42802,31,2,
42879,42878,5,2,
42913,42912,5,2,
65345,65313,26,1,
66600,66560,40,1,
//--Autogenerated -- end of section automatically generated
};
// Code points that are symmetric but don't fit into a range of similar characters
// are listed here.
int symmetricCaseConversions[] = {
//lower, upper
//++Autogenerated -- start of section automatically generated
//**1 \(\*\n\)
255,376,
307,306,
309,308,
311,310,
378,377,
380,379,
382,381,
384,579,
387,386,
389,388,
392,391,
396,395,
402,401,
405,502,
409,408,
410,573,
414,544,
417,416,
419,418,
421,420,
424,423,
429,428,
432,431,
436,435,
438,437,
441,440,
445,444,
447,503,
454,452,
457,455,
460,458,
477,398,
499,497,
501,500,
572,571,
575,11390,
576,11391,
578,577,
592,11375,
593,11373,
594,11376,
595,385,
596,390,
598,393,
599,394,
601,399,
603,400,
608,403,
611,404,
613,42893,
614,42922,
616,407,
617,406,
619,11362,
623,412,
625,11374,
626,413,
629,415,
637,11364,
640,422,
643,425,
648,430,
649,580,
650,433,
651,434,
652,581,
658,439,
881,880,
883,882,
887,886,
891,1021,
892,1022,
893,1023,
940,902,
941,904,
942,905,
943,906,
972,908,
973,910,
974,911,
983,975,
1010,1017,
1016,1015,
1019,1018,
1231,1216,
7545,42877,
7549,11363,
8017,8025,
8019,8027,
8021,8029,
8023,8031,
8048,8122,
8049,8123,
8050,8136,
8051,8137,
8052,8138,
8053,8139,
8054,8154,
8055,8155,
8056,8184,
8057,8185,
8058,8170,
8059,8171,
8060,8186,
8061,8187,
8112,8120,
8113,8121,
8144,8152,
8145,8153,
8160,8168,
8161,8169,
8165,8172,
8526,8498,
8580,8579,
11361,11360,
11365,570,
11366,574,
11368,11367,
11370,11369,
11372,11371,
11379,11378,
11382,11381,
11500,11499,
11502,11501,
11507,11506,
11559,4295,
11565,4301,
42874,42873,
42876,42875,
42892,42891,
42897,42896,
42899,42898,
//--Autogenerated -- end of section automatically generated
};
// Characters that have complex case conversions are listed here.
// This includes cases where more than one character is needed for a conversion,
// folding is different to lowering, or (as appropriate) upper(lower(x)) != x or
// lower(upper(x)) != x.
const char *complexCaseConversions =
// Original | Folded | Upper | Lower |
//++Autogenerated -- start of section automatically generated
//**2 \(\*\n\)
"µ|μ|Μ||"
"ß|ss|SS||"
"İ|i̇||i̇|"
"ı||I||"
"ʼn|ʼn|ʼN||"
"ſ|s|S||"
"Dž|dž|DŽ|dž|"
"Lj|lj|LJ|lj|"
"Nj|nj|NJ|nj|"
"ǰ|ǰ|J̌||"
"Dz|dz|DZ|dz|"
"ͅ|ι|Ι||"
"ΐ|ΐ|Ϊ́||"
"ΰ|ΰ|Ϋ́||"
"ς|σ|Σ||"
"ϐ|β|Β||"
"ϑ|θ|Θ||"
"ϕ|φ|Φ||"
"ϖ|π|Π||"
"ϰ|κ|Κ||"
"ϱ|ρ|Ρ||"
"ϴ|θ||θ|"
"ϵ|ε|Ε||"
"և|եւ|ԵՒ||"
"ẖ|ẖ|H̱||"
"ẗ|ẗ|T̈||"
"ẘ|ẘ|W̊||"
"ẙ|ẙ|Y̊||"
"ẚ|aʾ|Aʾ||"
"ẛ|ṡ|Ṡ||"
"ẞ|ss||ß|"
"ὐ|ὐ|Υ̓||"
"ὒ|ὒ|Υ̓̀||"
"ὔ|ὔ|Υ̓́||"
"ὖ|ὖ|Υ̓͂||"
"ᾀ|ἀι|ἈΙ||"
"ᾁ|ἁι|ἉΙ||"
"ᾂ|ἂι|ἊΙ||"
"ᾃ|ἃι|ἋΙ||"
"ᾄ|ἄι|ἌΙ||"
"ᾅ|ἅι|ἍΙ||"
"ᾆ|ἆι|ἎΙ||"
"ᾇ|ἇι|ἏΙ||"
"ᾈ|ἀι|ἈΙ|ᾀ|"
"ᾉ|ἁι|ἉΙ|ᾁ|"
"ᾊ|ἂι|ἊΙ|ᾂ|"
"ᾋ|ἃι|ἋΙ|ᾃ|"
"ᾌ|ἄι|ἌΙ|ᾄ|"
"ᾍ|ἅι|ἍΙ|ᾅ|"
"ᾎ|ἆι|ἎΙ|ᾆ|"
"ᾏ|ἇι|ἏΙ|ᾇ|"
"ᾐ|ἠι|ἨΙ||"
"ᾑ|ἡι|ἩΙ||"
"ᾒ|ἢι|ἪΙ||"
"ᾓ|ἣι|ἫΙ||"
"ᾔ|ἤι|ἬΙ||"
"ᾕ|ἥι|ἭΙ||"
"ᾖ|ἦι|ἮΙ||"
"ᾗ|ἧι|ἯΙ||"
"ᾘ|ἠι|ἨΙ|ᾐ|"
"ᾙ|ἡι|ἩΙ|ᾑ|"
"ᾚ|ἢι|ἪΙ|ᾒ|"
"ᾛ|ἣι|ἫΙ|ᾓ|"
"ᾜ|ἤι|ἬΙ|ᾔ|"
"ᾝ|ἥι|ἭΙ|ᾕ|"
"ᾞ|ἦι|ἮΙ|ᾖ|"
"ᾟ|ἧι|ἯΙ|ᾗ|"
"ᾠ|ὠι|ὨΙ||"
"ᾡ|ὡι|ὩΙ||"
"ᾢ|ὢι|ὪΙ||"
"ᾣ|ὣι|ὫΙ||"
"ᾤ|ὤι|ὬΙ||"
"ᾥ|ὥι|ὭΙ||"
"ᾦ|ὦι|ὮΙ||"
"ᾧ|ὧι|ὯΙ||"
"ᾨ|ὠι|ὨΙ|ᾠ|"
"ᾩ|ὡι|ὩΙ|ᾡ|"
"ᾪ|ὢι|ὪΙ|ᾢ|"
"ᾫ|ὣι|ὫΙ|ᾣ|"
"ᾬ|ὤι|ὬΙ|ᾤ|"
"ᾭ|ὥι|ὭΙ|ᾥ|"
"ᾮ|ὦι|ὮΙ|ᾦ|"
"ᾯ|ὧι|ὯΙ|ᾧ|"
"ᾲ|ὰι|ᾺΙ||"
"ᾳ|αι|ΑΙ||"
"ᾴ|άι|ΆΙ||"
"ᾶ|ᾶ|Α͂||"
"ᾷ|ᾶι|Α͂Ι||"
"ᾼ|αι|ΑΙ|ᾳ|"
"|ι|Ι||"
"ῂ|ὴι|ῊΙ||"
"ῃ|ηι|ΗΙ||"
"ῄ|ήι|ΉΙ||"
"ῆ|ῆ|Η͂||"
"ῇ|ῆι|Η͂Ι||"
"ῌ|ηι|ΗΙ|ῃ|"
"ῒ|ῒ|Ϊ̀||"
"ΐ|ΐ|Ϊ́||"
"ῖ|ῖ|Ι͂||"
"ῗ|ῗ|Ϊ͂||"
"ῢ|ῢ|Ϋ̀||"
"ΰ|ΰ|Ϋ́||"
"ῤ|ῤ|Ρ̓||"
"ῦ|ῦ|Υ͂||"
"ῧ|ῧ|Ϋ͂||"
"ῲ|ὼι|ῺΙ||"
"ῳ|ωι|ΩΙ||"
"ῴ|ώι|ΏΙ||"
"ῶ|ῶ|Ω͂||"
"ῷ|ῶι|Ω͂Ι||"
"ῼ|ωι|ΩΙ|ῳ|"
"Ω|ω||ω|"
"|k||k|"
"Å|å||å|"
"ff|ff|FF||"
"fi|fi|FI||"
"fl|fl|FL||"
"ffi|ffi|FFI||"
"ffl|ffl|FFL||"
"ſt|st|ST||"
"st|st|ST||"
"ﬓ|մն|ՄՆ||"
"ﬔ|մե|ՄԵ||"
"ﬕ|մի|ՄԻ||"
"ﬖ|վն|ՎՆ||"
"ﬗ|մխ|ՄԽ||"
//--Autogenerated -- end of section automatically generated
;
class CaseConverter : public ICaseConverter {
// Maximum length of a case conversion result is 6 bytes in UTF-8
enum { maxConversionLength=6 };
struct ConversionString {
char conversion[maxConversionLength+1];
};
// Conversions are initially store in a vector of structs but then decomposed into
// parallel arrays as that is about 10% faster to search.
struct CharacterConversion {
int character;
ConversionString conversion;
CharacterConversion(int character_=0, const char *conversion_="") : character(character_) {
strcpy(conversion.conversion, conversion_);
}
bool operator<(const CharacterConversion &other) const {
return character < other.character;
}
};
typedef std::vector<CharacterConversion> CharacterToConversion;
CharacterToConversion characterToConversion;
// The parallel arrays
std::vector<int> characters;
std::vector<ConversionString> conversions;
public:
CaseConverter() {
}
bool Initialised() const {
return characters.size() > 0;
}
void Add(int character, const char *conversion) {
characterToConversion.push_back(CharacterConversion(character, conversion));
}
const char *Find(int character) {
const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character);
if (it == characters.end())
return 0;
else if (*it == character)
return conversions[it - characters.begin()].conversion;
else
return 0;
}
size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) {
size_t lenConverted = 0;
size_t mixedPos = 0;
unsigned char bytes[UTF8MaxBytes + 1];
while (mixedPos < lenMixed) {
const unsigned char leadByte = static_cast<unsigned char>(mixed[mixedPos]);
const char *caseConverted = 0;
size_t lenMixedChar = 1;
if (UTF8IsAscii(leadByte)) {
caseConverted = Find(leadByte);
} else {
bytes[0] = leadByte;
const int widthCharBytes = UTF8BytesOfLead[leadByte];
for (int b=1; b<widthCharBytes; b++) {
bytes[b] = (mixedPos+b < lenMixed) ? mixed[mixedPos+b] : 0;
}
int classified = UTF8Classify(bytes, widthCharBytes);
if (!(classified & UTF8MaskInvalid)) {
// valid UTF-8
lenMixedChar = classified & UTF8MaskWidth;
int character = UnicodeFromUTF8(bytes);
caseConverted = Find(character);
}
}
if (caseConverted) {
// Character has a conversion so copy that conversion in
while (*caseConverted) {
converted[lenConverted++] = *caseConverted++;
if (lenConverted >= sizeConverted)
return 0;
}
} else {
// Character has no conversion so copy the input to output
for (size_t i=0; i<lenMixedChar; i++) {
converted[lenConverted++] = mixed[mixedPos+i];
if (lenConverted >= sizeConverted)
return 0;
}
}
mixedPos += lenMixedChar;
}
return lenConverted;
}
void FinishedAdding() {
std::sort(characterToConversion.begin(), characterToConversion.end());
characters.reserve(characterToConversion.size());
conversions.reserve(characterToConversion.size());
for (CharacterToConversion::iterator it = characterToConversion.begin(); it != characterToConversion.end(); ++it) {
characters.push_back(it->character);
conversions.push_back(it->conversion);
}
// Empty the original calculated data completely
CharacterToConversion().swap(characterToConversion);
}
};
CaseConverter caseConvFold;
CaseConverter caseConvUp;
CaseConverter caseConvLow;
void UTF8FromUTF32Character(int uch, char *putf) {
size_t k = 0;
if (uch < 0x80) {
putf[k++] = static_cast<char>(uch);
} else if (uch < 0x800) {
putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
} else if (uch < 0x10000) {
putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
} else {
putf[k++] = static_cast<char>(0xF0 | (uch >> 18));
putf[k++] = static_cast<char>(0x80 | ((uch >> 12) & 0x3f));
putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
}
putf[k] = 0;
}
void AddSymmetric(enum CaseConversion conversion, int lower,int upper) {
char lowerUTF8[UTF8MaxBytes+1];
UTF8FromUTF32Character(lower, lowerUTF8);
char upperUTF8[UTF8MaxBytes+1];
UTF8FromUTF32Character(upper, upperUTF8);
switch (conversion) {
case CaseConversionFold:
caseConvFold.Add(upper, lowerUTF8);
break;
case CaseConversionUpper:
caseConvUp.Add(lower, upperUTF8);
break;
case CaseConversionLower:
caseConvLow.Add(upper, lowerUTF8);
break;
}
}
void SetupConversions(enum CaseConversion conversion) {
// First initialize for the symmetric ranges
for (size_t i=0; i<sizeof(symmetricCaseConversionRanges)/sizeof(symmetricCaseConversionRanges[0]);) {
int lower = symmetricCaseConversionRanges[i++];
int upper = symmetricCaseConversionRanges[i++];
int length = symmetricCaseConversionRanges[i++];
int pitch = symmetricCaseConversionRanges[i++];
for (int j=0;j<length*pitch;j+=pitch) {
AddSymmetric(conversion, lower+j, upper+j);
}
}
// Add the symmetric singletons
for (size_t i=0; i<sizeof(symmetricCaseConversions)/sizeof(symmetricCaseConversions[0]);) {
int lower = symmetricCaseConversions[i++];
int upper = symmetricCaseConversions[i++];
AddSymmetric(conversion, lower, upper);
}
// Add the complex cases
const char *sComplex = complexCaseConversions;
while (*sComplex) {
// Longest ligature is 3 character so 5 for safety
const size_t lenUTF8 = 5*UTF8MaxBytes+1;
char originUTF8[lenUTF8];
char foldedUTF8[lenUTF8];
char lowerUTF8[lenUTF8];
char upperUTF8[lenUTF8];
size_t i = 0;
while (*sComplex && *sComplex != '|') {
originUTF8[i++] = *sComplex;
sComplex++;
}
sComplex++;
originUTF8[i] = 0;
i = 0;
while (*sComplex && *sComplex != '|') {
foldedUTF8[i++] = *sComplex;
sComplex++;
}
sComplex++;
foldedUTF8[i] = 0;
i = 0;
while (*sComplex && *sComplex != '|') {
upperUTF8[i++] = *sComplex;
sComplex++;
}
sComplex++;
upperUTF8[i] = 0;
i = 0;
while (*sComplex && *sComplex != '|') {
lowerUTF8[i++] = *sComplex;
sComplex++;
}
sComplex++;
lowerUTF8[i] = 0;
int character = UnicodeFromUTF8(reinterpret_cast<unsigned char *>(originUTF8));
if (conversion == CaseConversionFold && foldedUTF8[0]) {
caseConvFold.Add(character, foldedUTF8);
}
if (conversion == CaseConversionUpper && upperUTF8[0]) {
caseConvUp.Add(character, upperUTF8);
}
if (conversion == CaseConversionLower && lowerUTF8[0]) {
caseConvLow.Add(character, lowerUTF8);
}
}
switch (conversion) {
case CaseConversionFold:
caseConvFold.FinishedAdding();
break;
case CaseConversionUpper:
caseConvUp.FinishedAdding();
break;
case CaseConversionLower:
caseConvLow.FinishedAdding();
break;
}
}
CaseConverter *ConverterForConversion(enum CaseConversion conversion) {
switch (conversion) {
case CaseConversionFold:
return &caseConvFold;
case CaseConversionUpper:
return &caseConvUp;
case CaseConversionLower:
return &caseConvLow;
}
return 0;
}
}
#ifdef SCI_NAMESPACE
namespace Scintilla {
#endif
ICaseConverter *ConverterFor(enum CaseConversion conversion) {
CaseConverter *pCaseConv = ConverterForConversion(conversion);
if (!pCaseConv->Initialised())
SetupConversions(conversion);
return pCaseConv;
}
const char *CaseConvert(int character, enum CaseConversion conversion) {
CaseConverter *pCaseConv = ConverterForConversion(conversion);
if (!pCaseConv->Initialised())
SetupConversions(conversion);
return pCaseConv->Find(character);
}
size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) {
CaseConverter *pCaseConv = ConverterForConversion(conversion);
if (!pCaseConv->Initialised())
SetupConversions(conversion);
return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed);
}
#ifdef SCI_NAMESPACE
}
#endif