// Scintilla source code edit control // Encoding: UTF-8 /** @file CaseConvert.cxx ** Case fold characters and convert them to upper or lower case. ** Tables automatically regenerated by scripts/GenerateCharacterCategory.py ** Should only be rarely regenerated for new versions of Unicode. **/ // Copyright 2013 by Neil Hodgson // The License.txt file describes the conditions under which this software may be distributed. #include #include #include #include "CaseConvert.h" #include "UniConversion.h" #include "UnicodeFromUTF8.h" #ifdef SCI_NAMESPACE using namespace Scintilla; #endif namespace { // Use an unnamed namespace to protect the declarations from name conflicts // Unicode code points are ordered by groups and follow patterns. // Most characters (pitch==1) are in ranges for a particular alphabet and their // upper case forms are a fixed distance away. // Another pattern (pitch==2) is where each lower case letter is preceded by // the upper case form. These are also grouped into ranges. int symmetricCaseConversionRanges[] = { //lower, upper, range length, range pitch //++Autogenerated -- start of section automatically generated //**\(\*\n\) 97,65,26,1, 224,192,23,1, 248,216,7,1, 257,256,24,2, 314,313,8,2, 331,330,23,2, 462,461,8,2, 479,478,9,2, 505,504,20,2, 547,546,9,2, 583,582,5,2, 945,913,17,1, 963,931,9,1, 985,984,12,2, 1072,1040,32,1, 1104,1024,16,1, 1121,1120,17,2, 1163,1162,27,2, 1218,1217,7,2, 1233,1232,44,2, 1377,1329,38,1, 7681,7680,75,2, 7841,7840,48,2, 7936,7944,8,1, 7952,7960,6,1, 7968,7976,8,1, 7984,7992,8,1, 8000,8008,6,1, 8032,8040,8,1, 8560,8544,16,1, 9424,9398,26,1, 11312,11264,47,1, 11393,11392,50,2, 11520,4256,38,1, 42561,42560,23,2, 42625,42624,12,2, 42787,42786,7,2, 42803,42802,31,2, 42879,42878,5,2, 42913,42912,5,2, 65345,65313,26,1, 66600,66560,40,1, //--Autogenerated -- end of section automatically generated }; // Code points that are symmetric but don't fit into a range of similar characters // are listed here. 
// Flat list of (lower, upper) code point pairs, ordered by the lower case
// code point. The table is only ever read, so it is declared const.
const int symmetricCaseConversions[] = {
//lower, upper
//++Autogenerated -- start of section automatically generated
//**1 \(\*\n\)
255,376,
307,306,
309,308,
311,310,
378,377,
380,379,
382,381,
384,579,
387,386,
389,388,
392,391,
396,395,
402,401,
405,502,
409,408,
410,573,
414,544,
417,416,
419,418,
421,420,
424,423,
429,428,
432,431,
436,435,
438,437,
441,440,
445,444,
447,503,
454,452,
457,455,
460,458,
477,398,
499,497,
501,500,
572,571,
575,11390,
576,11391,
578,577,
592,11375,
593,11373,
594,11376,
595,385,
596,390,
598,393,
599,394,
601,399,
603,400,
608,403,
611,404,
613,42893,
614,42922,
616,407,
617,406,
619,11362,
623,412,
625,11374,
626,413,
629,415,
637,11364,
640,422,
643,425,
648,430,
649,580,
650,433,
651,434,
652,581,
658,439,
881,880,
883,882,
887,886,
891,1021,
892,1022,
893,1023,
940,902,
941,904,
942,905,
943,906,
972,908,
973,910,
974,911,
983,975,
1010,1017,
1016,1015,
1019,1018,
1231,1216,
7545,42877,
7549,11363,
8017,8025,
8019,8027,
8021,8029,
8023,8031,
8048,8122,
8049,8123,
8050,8136,
8051,8137,
8052,8138,
8053,8139,
8054,8154,
8055,8155,
8056,8184,
8057,8185,
8058,8170,
8059,8171,
8060,8186,
8061,8187,
8112,8120,
8113,8121,
8144,8152,
8145,8153,
8160,8168,
8161,8169,
8165,8172,
8526,8498,
8580,8579,
11361,11360,
11365,570,
11366,574,
11368,11367,
11370,11369,
11372,11371,
11379,11378,
11382,11381,
11500,11499,
11502,11501,
11507,11506,
11559,4295,
11565,4301,
42874,42873,
42876,42875,
42892,42891,
42897,42896,
42899,42898,
//--Autogenerated -- end of section automatically generated
};

// Characters that have complex case conversions are listed here.
// This includes cases where more than one character is needed for a conversion,
// folding is different to lowering, or (as appropriate) upper(lower(x)) != x or
// lower(upper(x)) != x.
const char *complexCaseConversions = // Original | Folded | Upper | Lower | //++Autogenerated -- start of section automatically generated //**2 \(\*\n\) "µ|μ|Μ||" "ß|ss|SS||" "İ|i̇||i̇|" "ı||I||" "ʼn|ʼn|ʼN||" "ſ|s|S||" "Dž|dž|DŽ|dž|" "Lj|lj|LJ|lj|" "Nj|nj|NJ|nj|" "ǰ|ǰ|J̌||" "Dz|dz|DZ|dz|" "ͅ|ι|Ι||" "ΐ|ΐ|Ϊ́||" "ΰ|ΰ|Ϋ́||" "ς|σ|Σ||" "ϐ|β|Β||" "ϑ|θ|Θ||" "ϕ|φ|Φ||" "ϖ|π|Π||" "ϰ|κ|Κ||" "ϱ|ρ|Ρ||" "ϴ|θ||θ|" "ϵ|ε|Ε||" "և|եւ|ԵՒ||" "ẖ|ẖ|H̱||" "ẗ|ẗ|T̈||" "ẘ|ẘ|W̊||" "ẙ|ẙ|Y̊||" "ẚ|aʾ|Aʾ||" "ẛ|ṡ|Ṡ||" "ẞ|ss||ß|" "ὐ|ὐ|Υ̓||" "ὒ|ὒ|Υ̓̀||" "ὔ|ὔ|Υ̓́||" "ὖ|ὖ|Υ̓͂||" "ᾀ|ἀι|ἈΙ||" "ᾁ|ἁι|ἉΙ||" "ᾂ|ἂι|ἊΙ||" "ᾃ|ἃι|ἋΙ||" "ᾄ|ἄι|ἌΙ||" "ᾅ|ἅι|ἍΙ||" "ᾆ|ἆι|ἎΙ||" "ᾇ|ἇι|ἏΙ||" "ᾈ|ἀι|ἈΙ|ᾀ|" "ᾉ|ἁι|ἉΙ|ᾁ|" "ᾊ|ἂι|ἊΙ|ᾂ|" "ᾋ|ἃι|ἋΙ|ᾃ|" "ᾌ|ἄι|ἌΙ|ᾄ|" "ᾍ|ἅι|ἍΙ|ᾅ|" "ᾎ|ἆι|ἎΙ|ᾆ|" "ᾏ|ἇι|ἏΙ|ᾇ|" "ᾐ|ἠι|ἨΙ||" "ᾑ|ἡι|ἩΙ||" "ᾒ|ἢι|ἪΙ||" "ᾓ|ἣι|ἫΙ||" "ᾔ|ἤι|ἬΙ||" "ᾕ|ἥι|ἭΙ||" "ᾖ|ἦι|ἮΙ||" "ᾗ|ἧι|ἯΙ||" "ᾘ|ἠι|ἨΙ|ᾐ|" "ᾙ|ἡι|ἩΙ|ᾑ|" "ᾚ|ἢι|ἪΙ|ᾒ|" "ᾛ|ἣι|ἫΙ|ᾓ|" "ᾜ|ἤι|ἬΙ|ᾔ|" "ᾝ|ἥι|ἭΙ|ᾕ|" "ᾞ|ἦι|ἮΙ|ᾖ|" "ᾟ|ἧι|ἯΙ|ᾗ|" "ᾠ|ὠι|ὨΙ||" "ᾡ|ὡι|ὩΙ||" "ᾢ|ὢι|ὪΙ||" "ᾣ|ὣι|ὫΙ||" "ᾤ|ὤι|ὬΙ||" "ᾥ|ὥι|ὭΙ||" "ᾦ|ὦι|ὮΙ||" "ᾧ|ὧι|ὯΙ||" "ᾨ|ὠι|ὨΙ|ᾠ|" "ᾩ|ὡι|ὩΙ|ᾡ|" "ᾪ|ὢι|ὪΙ|ᾢ|" "ᾫ|ὣι|ὫΙ|ᾣ|" "ᾬ|ὤι|ὬΙ|ᾤ|" "ᾭ|ὥι|ὭΙ|ᾥ|" "ᾮ|ὦι|ὮΙ|ᾦ|" "ᾯ|ὧι|ὯΙ|ᾧ|" "ᾲ|ὰι|ᾺΙ||" "ᾳ|αι|ΑΙ||" "ᾴ|άι|ΆΙ||" "ᾶ|ᾶ|Α͂||" "ᾷ|ᾶι|Α͂Ι||" "ᾼ|αι|ΑΙ|ᾳ|" "ι|ι|Ι||" "ῂ|ὴι|ῊΙ||" "ῃ|ηι|ΗΙ||" "ῄ|ήι|ΉΙ||" "ῆ|ῆ|Η͂||" "ῇ|ῆι|Η͂Ι||" "ῌ|ηι|ΗΙ|ῃ|" "ῒ|ῒ|Ϊ̀||" "ΐ|ΐ|Ϊ́||" "ῖ|ῖ|Ι͂||" "ῗ|ῗ|Ϊ͂||" "ῢ|ῢ|Ϋ̀||" "ΰ|ΰ|Ϋ́||" "ῤ|ῤ|Ρ̓||" "ῦ|ῦ|Υ͂||" "ῧ|ῧ|Ϋ͂||" "ῲ|ὼι|ῺΙ||" "ῳ|ωι|ΩΙ||" "ῴ|ώι|ΏΙ||" "ῶ|ῶ|Ω͂||" "ῷ|ῶι|Ω͂Ι||" "ῼ|ωι|ΩΙ|ῳ|" "Ω|ω||ω|" "K|k||k|" "Å|å||å|" "ff|ff|FF||" "fi|fi|FI||" "fl|fl|FL||" "ffi|ffi|FFI||" "ffl|ffl|FFL||" "ſt|st|ST||" "st|st|ST||" "ﬓ|մն|ՄՆ||" "ﬔ|մե|ՄԵ||" "ﬕ|մի|ՄԻ||" "ﬖ|վն|ՎՆ||" "ﬗ|մխ|ՄԽ||" //--Autogenerated -- end of section automatically generated ; class CaseConverter : public ICaseConverter { // Maximum length of a case conversion result is 6 bytes in UTF-8 enum { maxConversionLength=6 }; struct ConversionString { char conversion[maxConversionLength+1]; }; // Conversions are initially store in a vector of structs but 
then decomposed into // parallel arrays as that is about 10% faster to search. struct CharacterConversion { int character; ConversionString conversion; CharacterConversion(int character_=0, const char *conversion_="") : character(character_) { strcpy(conversion.conversion, conversion_); } bool operator<(const CharacterConversion &other) const { return character < other.character; } }; typedef std::vector CharacterToConversion; CharacterToConversion characterToConversion; // The parallel arrays std::vector characters; std::vector conversions; public: CaseConverter() { } bool Initialised() const { return characters.size() > 0; } void Add(int character, const char *conversion) { characterToConversion.push_back(CharacterConversion(character, conversion)); } const char *Find(int character) { const std::vector::iterator it = std::lower_bound(characters.begin(), characters.end(), character); if (it == characters.end()) return 0; else if (*it == character) return conversions[it - characters.begin()].conversion; else return 0; } size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) { size_t lenConverted = 0; size_t mixedPos = 0; unsigned char bytes[UTF8MaxBytes + 1]; while (mixedPos < lenMixed) { const unsigned char leadByte = static_cast(mixed[mixedPos]); const char *caseConverted = 0; size_t lenMixedChar = 1; if (UTF8IsAscii(leadByte)) { caseConverted = Find(leadByte); } else { bytes[0] = leadByte; const int widthCharBytes = UTF8BytesOfLead[leadByte]; for (int b=1; b= sizeConverted) return 0; } } else { // Character has no conversion so copy the input to output for (size_t i=0; i= sizeConverted) return 0; } } mixedPos += lenMixedChar; } return lenConverted; } void FinishedAdding() { std::sort(characterToConversion.begin(), characterToConversion.end()); characters.reserve(characterToConversion.size()); conversions.reserve(characterToConversion.size()); for (CharacterToConversion::iterator it = characterToConversion.begin(); it 
!= characterToConversion.end(); ++it) { characters.push_back(it->character); conversions.push_back(it->conversion); } // Empty the original calculated data completely CharacterToConversion().swap(characterToConversion); } }; CaseConverter caseConvFold; CaseConverter caseConvUp; CaseConverter caseConvLow; void UTF8FromUTF32Character(int uch, char *putf) { size_t k = 0; if (uch < 0x80) { putf[k++] = static_cast(uch); } else if (uch < 0x800) { putf[k++] = static_cast(0xC0 | (uch >> 6)); putf[k++] = static_cast(0x80 | (uch & 0x3f)); } else if (uch < 0x10000) { putf[k++] = static_cast(0xE0 | (uch >> 12)); putf[k++] = static_cast(0x80 | ((uch >> 6) & 0x3f)); putf[k++] = static_cast(0x80 | (uch & 0x3f)); } else { putf[k++] = static_cast(0xF0 | (uch >> 18)); putf[k++] = static_cast(0x80 | ((uch >> 12) & 0x3f)); putf[k++] = static_cast(0x80 | ((uch >> 6) & 0x3f)); putf[k++] = static_cast(0x80 | (uch & 0x3f)); } putf[k] = 0; } void AddSymmetric(enum CaseConversion conversion, int lower,int upper) { char lowerUTF8[UTF8MaxBytes+1]; UTF8FromUTF32Character(lower, lowerUTF8); char upperUTF8[UTF8MaxBytes+1]; UTF8FromUTF32Character(upper, upperUTF8); switch (conversion) { case CaseConversionFold: caseConvFold.Add(upper, lowerUTF8); break; case CaseConversionUpper: caseConvUp.Add(lower, upperUTF8); break; case CaseConversionLower: caseConvLow.Add(upper, lowerUTF8); break; } } void SetupConversions(enum CaseConversion conversion) { // First initialize for the symmetric ranges for (size_t i=0; i(originUTF8)); if (conversion == CaseConversionFold && foldedUTF8[0]) { caseConvFold.Add(character, foldedUTF8); } if (conversion == CaseConversionUpper && upperUTF8[0]) { caseConvUp.Add(character, upperUTF8); } if (conversion == CaseConversionLower && lowerUTF8[0]) { caseConvLow.Add(character, lowerUTF8); } } switch (conversion) { case CaseConversionFold: caseConvFold.FinishedAdding(); break; case CaseConversionUpper: caseConvUp.FinishedAdding(); break; case CaseConversionLower: 
caseConvLow.FinishedAdding(); break; } } CaseConverter *ConverterForConversion(enum CaseConversion conversion) { switch (conversion) { case CaseConversionFold: return &caseConvFold; case CaseConversionUpper: return &caseConvUp; case CaseConversionLower: return &caseConvLow; } return 0; } } #ifdef SCI_NAMESPACE namespace Scintilla { #endif ICaseConverter *ConverterFor(enum CaseConversion conversion) { CaseConverter *pCaseConv = ConverterForConversion(conversion); if (!pCaseConv->Initialised()) SetupConversions(conversion); return pCaseConv; } const char *CaseConvert(int character, enum CaseConversion conversion) { CaseConverter *pCaseConv = ConverterForConversion(conversion); if (!pCaseConv->Initialised()) SetupConversions(conversion); return pCaseConv->Find(character); } size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) { CaseConverter *pCaseConv = ConverterForConversion(conversion); if (!pCaseConv->Initialised()) SetupConversions(conversion); return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed); } #ifdef SCI_NAMESPACE } #endif