From 948f281eb05cbcbe40084f8a59fcdd1e0cc8a6f8 Mon Sep 17 00:00:00 2001 From: Don Ho Date: Mon, 16 Aug 2010 16:52:03 +0000 Subject: [PATCH] =?UTF-8?q?[BUG=5FFIXED]=20(Author:=20Fran=C3=A7ois-R=20Bo?= =?UTF-8?q?yer)=20Fix=20DBCS=20encodings=20file=20saving=20corruption=20bu?= =?UTF-8?q?g.=20[BUG=5FFIXED]=20(Author:=20Fran=C3=A7ois-R=20Boyer)=20Fix?= =?UTF-8?q?=20file=20containing=20NULL=20character=20loading=20bug.=20[ENH?= =?UTF-8?q?ANCEMENT]=20(Author:=20Fran=C3=A7ois-R=20Boyer)=20Improve=20get?= =?UTF-8?q?CurrentDocCharCount()=20method=20performance.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@651 f5eea248-9336-0410-98b8-ebc06183d4e3 --- PowerEditor/src/MISC/Common/Common.cpp | 123 ++++++++++--------- PowerEditor/src/MISC/Common/Common.h | 54 +++++--- PowerEditor/src/Notepad_plus.cpp | 66 +++++++--- PowerEditor/src/Notepad_plus.h | 2 +- PowerEditor/src/NppBigSwitch.cpp | 3 +- PowerEditor/src/ScitillaComponent/Buffer.cpp | 47 ++++--- PowerEditor/src/Utf8.h | 55 +++++++++ PowerEditor/src/resource.h | 6 +- 8 files changed, 235 insertions(+), 121 deletions(-) create mode 100644 PowerEditor/src/Utf8.h diff --git a/PowerEditor/src/MISC/Common/Common.cpp b/PowerEditor/src/MISC/Common/Common.cpp index e9eb3240..79ed12ad 100644 --- a/PowerEditor/src/MISC/Common/Common.cpp +++ b/PowerEditor/src/MISC/Common/Common.cpp @@ -16,6 +16,7 @@ //Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. #include "precompiledHeaders.h" +#include "../Utf8.h" WcharMbcsConvertor * WcharMbcsConvertor::_pSelf = new WcharMbcsConvertor; @@ -217,28 +218,60 @@ generic_string purgeMenuItemString(const TCHAR * menuItemStr, bool keepAmpersand return cleanedName; }; -const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage) +const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage, int lenMbcs, int *pLenWc, int *pBytesNotProcessed) { - if (!_wideCharStr) + // Do not process empty strings + if (lenMbcs == 0 || lenMbcs == -1 && mbcs2Convert[0] == 0) { _wideCharStr.empty(); return _wideCharStr; } + + int bytesNotProcessed = 0; + int lenWc = 0; + + // If length not specified, simply convert without checking + if (lenMbcs == -1) { - _wideCharStr = new wchar_t[initSize]; - _wideCharAllocLen = initSize; + lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs, NULL, 0); + } + // Otherwise, test if we are cutting a multi-byte character at end of buffer + else if(lenMbcs != -1 && codepage == CP_UTF8) // For UTF-8, we know how to test it + { + int indexOfLastChar = Utf8::characterStart(mbcs2Convert, lenMbcs-1); // get index of last character + if (indexOfLastChar != 0 && !Utf8::isValid(mbcs2Convert+indexOfLastChar, lenMbcs-indexOfLastChar)) // if it is not valid we do not process it right now (unless its the only character in string, to ensure that we always progress, e.g. that bytesNotProcessed < lenMbcs) + { + bytesNotProcessed = lenMbcs-indexOfLastChar; + } + lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs-bytesNotProcessed, NULL, 0); + } + else // For other encodings, ask system if there are any invalid characters; note that it will not correctly know if last character is cut when there are invalid characters inside the text + { + lenWc = MultiByteToWideChar(codepage, (lenMbcs == -1) ? 0 : MB_ERR_INVALID_CHARS, mbcs2Convert, lenMbcs, NULL, 0); + if (lenWc == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION) + { + // Test without last byte + if (lenMbcs > 1) lenWc = MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, mbcs2Convert, lenMbcs-1, NULL, 0); + if (lenWc == 0) // don't have to check that the error is still ERROR_NO_UNICODE_TRANSLATION, since only the length parameter changed + { + // TODO: should warn user about incorrect loading due to invalid characters + // We still load the file, but the system will either strip or replace invalid characters (including the last character, if cut in half) + lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs, NULL, 0); + } + else + { + // We found a valid text by removing one byte. + bytesNotProcessed = 1; + } + } } - int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, 0); - if (len > 0) + if (lenWc > 0) { - if ((size_t)len > _wideCharAllocLen) - { - delete [] _wideCharStr; - _wideCharAllocLen = len; - _wideCharStr = new wchar_t[_wideCharAllocLen]; - } - MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, len); + _wideCharStr.sizeTo(lenWc); + MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs-bytesNotProcessed, _wideCharStr, lenWc); } else - _wideCharStr[0] = 0; + _wideCharStr.empty(); + if(pLenWc) *pLenWc = lenWc; + if(pBytesNotProcessed) *pBytesNotProcessed = bytesNotProcessed; return _wideCharStr; } @@ -246,21 +279,10 @@ const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT c // which are converted to the corresponding indexes in the returned wchar_t string. const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage, int *mstart, int *mend) { - if (!_wideCharStr) - { - _wideCharStr = new wchar_t[initSize]; - _wideCharAllocLen = initSize; - } - - int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, 0); + int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, NULL, 0); if (len > 0) { - if (len > int(_wideCharAllocLen)) - { - delete [] _wideCharStr; - _wideCharAllocLen = len; - _wideCharStr = new wchar_t[_wideCharAllocLen]; - } + _wideCharStr.sizeTo(len); len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, len); if ((size_t)*mstart < strlen(mbcs2Convert) && (size_t)*mend <= strlen(mbcs2Convert)) @@ -276,61 +298,40 @@ const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT c } else { - _wideCharStr[0] = 0; + _wideCharStr.empty(); *mstart = 0; *mend = 0; } return _wideCharStr; } -const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage) +const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage, int lenWc, int *pLenMbcs) { - if (!_multiByteStr) + int lenMbcs = WideCharToMultiByte(codepage, 0, wcharStr2Convert, lenWc, NULL, 0, NULL, NULL); + if (lenMbcs > 0) { - _multiByteStr = new char[initSize]; - _multiByteAllocLen = initSize; - } - - int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, 0, NULL, NULL); - if (len > 0) - { - if ((size_t)len > _multiByteAllocLen) - { - delete [] _multiByteStr; - _multiByteAllocLen = len; - _multiByteStr = new char[_multiByteAllocLen]; - } - WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, len, NULL, NULL); + _multiByteStr.sizeTo(lenMbcs); + WideCharToMultiByte(codepage, 0, wcharStr2Convert, lenWc, _multiByteStr, lenMbcs, NULL, NULL); } else - _multiByteStr[0] = 0; + _multiByteStr.empty(); + if(pLenMbcs) *pLenMbcs = lenMbcs; return _multiByteStr; } const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage, long *mstart, long *mend) { - if (!_multiByteStr) - { - _multiByteStr = new char[initSize]; - _multiByteAllocLen = initSize; - } - - int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, 0, NULL, NULL); + int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, NULL, 0, NULL, NULL); if (len > 0) { - if ((size_t)len > _multiByteAllocLen) - { - delete [] _multiByteStr; - _multiByteAllocLen = len; - _multiByteStr = new char[_multiByteAllocLen]; - } + _multiByteStr.sizeTo(len); len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, len, NULL, NULL); // not needed? if ((int)*mstart < lstrlenW(wcharStr2Convert) && (int)*mend < lstrlenW(wcharStr2Convert)) { - *mstart = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mstart, _multiByteStr, 0, NULL, NULL); - *mend = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mend, _multiByteStr, 0, NULL, NULL); + *mstart = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mstart, NULL, 0, NULL, NULL); + *mend = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mend, NULL, 0, NULL, NULL); if (*mstart >= len || *mend >= len) { *mstart = 0; @@ -339,7 +340,7 @@ const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UI } } else - _multiByteStr[0] = 0; + _multiByteStr.empty(); return _multiByteStr; } diff --git a/PowerEditor/src/MISC/Common/Common.h b/PowerEditor/src/MISC/Common/Common.h index d3b4ca70..88834967 100644 --- a/PowerEditor/src/MISC/Common/Common.h +++ b/PowerEditor/src/MISC/Common/Common.h @@ -94,32 +94,56 @@ public: static WcharMbcsConvertor * getInstance() {return _pSelf;}; static void destroyInstance() {delete _pSelf;}; - const wchar_t * char2wchar(const char *mbStr, UINT codepage); + const wchar_t * char2wchar(const char *mbStr, UINT codepage, int lenIn=-1, int *pLenOut=NULL, int *pBytesNotProcessed=NULL); const wchar_t * char2wchar(const char *mbcs2Convert, UINT codepage, int *mstart, int *mend); - const char * wchar2char(const wchar_t *wcStr, UINT codepage); + const char * wchar2char(const wchar_t *wcStr, UINT codepage, int lenIn=-1, int *pLenOut=NULL); const char * wchar2char(const wchar_t *wcStr, UINT codepage, long *mstart, long *mend); - const char * encode(UINT fromCodepage, UINT toCodepage, const char *txt2Encode) { - const wchar_t * strW = char2wchar(txt2Encode, fromCodepage); - return wchar2char(strW, toCodepage); + const char * encode(UINT fromCodepage, UINT toCodepage, const char *txt2Encode, int lenIn=-1, int *pLenOut=NULL, int *pBytesNotProcessed=NULL) { + int lenWc = 0; + const wchar_t * strW = char2wchar(txt2Encode, fromCodepage, lenIn, &lenWc, pBytesNotProcessed); + return wchar2char(strW, toCodepage, lenWc, pLenOut); }; protected: - WcharMbcsConvertor() : _multiByteStr(NULL), _wideCharStr(NULL), _multiByteAllocLen(0), _wideCharAllocLen(0), initSize(1024) { + WcharMbcsConvertor() { }; ~WcharMbcsConvertor() { - if (_multiByteStr) - delete [] _multiByteStr; - if (_wideCharStr) - delete [] _wideCharStr; }; static WcharMbcsConvertor * _pSelf; - const int initSize; - char *_multiByteStr; - size_t _multiByteAllocLen; - wchar_t *_wideCharStr; - size_t _wideCharAllocLen; + template + class StringBuffer { + public: + StringBuffer() : _str(0), _allocLen(0) { } + ~StringBuffer() { if(_str) delete [] _str; } + + void sizeTo(size_t size) { + if(_allocLen < size) + { + if(_allocLen) delete[] _str; + _allocLen = max(size, initSize); + _str = new T[_allocLen]; + } + } + void empty() { + static T nullStr = 0; // routines may return an empty string, with null terminator, without allocating memory; a pointer to this null character will be returned in that case + if(_allocLen == 0) + _str = &nullStr; + else + _str[0] = 0; + } + + operator T*() { return _str; } + + protected: + static const int initSize = 1024; + size_t _allocLen; + T* _str; + }; + + StringBuffer _multiByteStr; + StringBuffer _wideCharStr; private: // Since there's no public ctor, we need to void the default assignment operator. diff --git a/PowerEditor/src/Notepad_plus.cpp b/PowerEditor/src/Notepad_plus.cpp index ae189465..66dd2484 100644 --- a/PowerEditor/src/Notepad_plus.cpp +++ b/PowerEditor/src/Notepad_plus.cpp @@ -31,16 +31,15 @@ #include "xmlMatchedTagsHighlighter.h" #include "EncodingMapper.h" - - enum tb_stat {tb_saved, tb_unsaved, tb_ro}; #define DIR_LEFT true #define DIR_RIGHT false int docTabIconIDs[] = {IDI_SAVED_ICON, IDI_UNSAVED_ICON, IDI_READONLY_ICON}; + ToolBarButtonUnit toolBarIcons[] = { {IDM_FILE_NEW, IDI_NEW_OFF_ICON, IDI_NEW_ON_ICON, IDI_NEW_OFF_ICON, IDR_FILENEW}, - {IDM_FILE_OPEN, IDI_OPEN_OFF_ICON, IDI_OPEN_ON_ICON, IDI_NEW_OFF_ICON, IDR_FILEOPEN}, + {IDM_FILE_OPEN, IDI_OPEN_OFF_ICON, IDI_OPEN_ON_ICON, IDI_OPEN_OFF_ICON, IDR_FILEOPEN}, {IDM_FILE_SAVE, IDI_SAVE_OFF_ICON, IDI_SAVE_ON_ICON, IDI_SAVE_DISABLE_ICON, IDR_FILESAVE}, {IDM_FILE_SAVEALL, IDI_SAVEALL_OFF_ICON, IDI_SAVEALL_ON_ICON, IDI_SAVEALL_DISABLE_ICON, IDR_SAVEALL}, {IDM_FILE_CLOSE, IDI_CLOSE_OFF_ICON, IDI_CLOSE_ON_ICON, IDI_CLOSE_OFF_ICON, IDR_CLOSEFILE}, @@ -2311,7 +2310,26 @@ size_t Notepad_plus::getSelectedCharNumber(UniMode u) } return result; } -/* + + +#ifdef _OPENMP +#include +#endif +static inline size_t countUtf8Characters(unsigned char *buf, int pos, int endpos) +{ + size_t result = 0; + while(pos < endpos) + { + unsigned char c = buf[pos++]; + if ((c&0xc0) == 0x80 // do not count unexpected continuation bytes (this handles the case where an UTF-8 character is split in the middle) + || c == '\n' || c == '\r') continue; // do not count end of lines + if (c >= 0xc0) pos += utflen[(c & 0x30) >> 4]; + result++; + } + return result; +} + + size_t Notepad_plus::getCurrentDocCharCount(size_t numLines, UniMode u) { if (u != uniUTF8 && u != uniCookie) @@ -2322,23 +2340,39 @@ size_t Notepad_plus::getCurrentDocCharCount(size_t numLines, UniMode u) result -= lines; return ((int)result < 0)?0:result; } - else - { + else + { + // Note that counting is not well defined for invalid UTF-8 characters. + // This method is O(filelength) regardless of the number of characters we count (due to SCI_GETCHARACTERPOINTER); + // it would not be appropriate for counting characters in a small selection. size_t result = 0; - for (size_t line=0; lineexecute(SCI_GETLENGTH); + unsigned char* buf = (unsigned char*)_pEditView->execute(SCI_GETCHARACTERPOINTER); // Scintilla doc sais the pointer can be invalidated by any other "execute" + +#ifdef _OPENMP // parallel counting of characters with OpenMP + if(endpos > 50000) // starting threads takes time; for small files it is better to simply count in one thread { - size_t endpos = _pEditView->execute(SCI_GETLINEENDPOSITION, line); - for (size_t pos = _pEditView->execute(SCI_POSITIONFROMLINE, line); pos < endpos; pos++) + #pragma omp parallel reduction(+: result) { - unsigned char c = 0xf0 & (unsigned char)_pEditView->execute(SCI_GETCHARAT, pos); - if (c >= 0xc0) pos += utflen[(c & 0x30) >> 4]; - result++; + // split in chunks of same size (except last chunk if it's not evenly divisible) + unsigned int num_threads = omp_get_num_threads(); + unsigned int thread_num = omp_get_thread_num(); + size_t chunk_size = endpos/num_threads; + size_t pos = chunk_size*thread_num; + size_t endpos_local = (thread_num == num_threads-1) ? endpos : pos+chunk_size; + result = countUtf8Characters(buf, pos, endpos_local); } } - return result; - } + else +#endif + { + result = countUtf8Characters(buf, 0, endpos); + } + return result; + } } -*/ + bool Notepad_plus::isFormatUnicode(UniMode u) { @@ -2377,6 +2411,8 @@ size_t Notepad_plus::getSelectedBytes() void Notepad_plus::updateStatusBar() { + if(!NppParameters::getInstance()->getNppGUI()._statusBarShow) return; // do not update if status bar not shown + UniMode u = _pEditView->getCurrentBuffer()->getUnicodeMode(); TCHAR strLnCol[64]; diff --git a/PowerEditor/src/Notepad_plus.h b/PowerEditor/src/Notepad_plus.h index 3f3b97c2..1b9b3681 100644 --- a/PowerEditor/src/Notepad_plus.h +++ b/PowerEditor/src/Notepad_plus.h @@ -541,7 +541,7 @@ private: void updateStatusBar(); size_t getSelectedCharNumber(UniMode); - //size_t getCurrentDocCharCount(size_t numLines, UniMode u); + size_t getCurrentDocCharCount(size_t numLines, UniMode u); int getSelectedAreas(); int _numSel; size_t getSelectedBytes(); diff --git a/PowerEditor/src/NppBigSwitch.cpp b/PowerEditor/src/NppBigSwitch.cpp index 26070507..b092ca37 100644 --- a/PowerEditor/src/NppBigSwitch.cpp +++ b/PowerEditor/src/NppBigSwitch.cpp @@ -1666,7 +1666,8 @@ LRESULT Notepad_plus::process(HWND hwnd, UINT Message, WPARAM wParam, LPARAM lPa _pPublicInterface->getClientRect(rc); nppGUI._statusBarShow = show; - _statusBar.display(nppGUI._statusBarShow); + if(show) + _statusBar.display(nppGUI._statusBarShow); ::SendMessage(_pPublicInterface->getHSelf(), WM_SIZE, SIZE_RESTORED, MAKELONG(rc.bottom, rc.right)); return oldVal; } diff --git a/PowerEditor/src/ScitillaComponent/Buffer.cpp b/PowerEditor/src/ScitillaComponent/Buffer.cpp index c608adf8..ccb971b8 100644 --- a/PowerEditor/src/ScitillaComponent/Buffer.cpp +++ b/PowerEditor/src/ScitillaComponent/Buffer.cpp @@ -564,9 +564,10 @@ bool FileManager::saveBuffer(BufferID id, const TCHAR * filename, bool isCopy) { char data[blockSize + 1]; int lengthDoc = _pscratchTilla->getCurrentDocLen(); - for (int i = 0; i < lengthDoc; i += blockSize) + int grabSize; + for (int i = 0; i < lengthDoc; i += grabSize) { - int grabSize = lengthDoc - i; + grabSize = lengthDoc - i; if (grabSize > blockSize) grabSize = blockSize; @@ -574,8 +575,11 @@ bool FileManager::saveBuffer(BufferID id, const TCHAR * filename, bool isCopy) { if (encoding != -1) { WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance(); - const char *newData = wmc->encode(SC_CP_UTF8, encoding, data); - UnicodeConvertor.fwrite(newData, strlen(newData)); + int newDataLen = 0; + int incompleteMultibyteChar = 0; + const char *newData = wmc->encode(SC_CP_UTF8, encoding, data, grabSize, &newDataLen, &incompleteMultibyteChar); + grabSize -= incompleteMultibyteChar; + UnicodeConvertor.fwrite(newData, newDataLen); } else { @@ -692,26 +696,11 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea size_t lenFile = 0; size_t lenConvert = 0; //just in case conversion results in 0, but file not empty bool isFirstTime = true; - int incompleteMultibyteChar = 0; //we do not want to call SCI_APPENDTEXT with an incomplete character if the buffer ends in the middle of one - char incompleteMultibyteChar_first = 0; + int incompleteMultibyteChar = 0; do { lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar; - // we might not know yet the encoding; we ensure that valid UTF-8 characters will not be cut in the middle, without causing problems if it's not UTF-8 - // TODO: all expressions for testing UTF chars should be put in inline functions, not directly in the code - if(lenFile == blockSize && (data[blockSize-1]&0x80) != 0) // possible multi-byte character that could be cut due to blockSize - { - incompleteMultibyteChar = 1; - while(incompleteMultibyteChar < 6 // longest "defined" UTF-8 code (including restricted codes not yet defined by Unicode) - && (data[blockSize-incompleteMultibyteChar]&0xC0) == 0x80) // is possibly a continuation byte in a multi-byte character - ++incompleteMultibyteChar; - // leave for the next buffer all bytes that could potentially be multi-byte UTF-8 at the end of current buffer - lenFile -= incompleteMultibyteChar; - incompleteMultibyteChar_first = data[lenFile]; // this byte can be erased by following code to put a null terminator - } - else incompleteMultibyteChar = 0; - // check if file contain any BOM if (isFirstTime) { @@ -726,10 +715,19 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea if (encoding != -1) { - data[lenFile] = '\0'; - WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance(); - const char *newData = wmc->encode(encoding, SC_CP_UTF8, data); - _pscratchTilla->execute(SCI_APPENDTEXT, strlen(newData), (LPARAM)newData); + if (encoding == SC_CP_UTF8) + { + // Pass through UTF-8 (this does not check validity of characters, thus inserting a multi-byte character in two halfs is working) + _pscratchTilla->execute(SCI_APPENDTEXT, lenFile, (LPARAM)data); + } + else + { + WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance(); + int newDataLen = 0; + const char *newData = wmc->encode(encoding, SC_CP_UTF8, data, lenFile, &newDataLen, &incompleteMultibyteChar); + _pscratchTilla->execute(SCI_APPENDTEXT, newDataLen, (LPARAM)newData); + } + if (format == -1) format = getEOLFormatForm(data); } @@ -743,7 +741,6 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea { // copy bytes to next buffer memcpy(data, data+blockSize-incompleteMultibyteChar, incompleteMultibyteChar); - data[0] = incompleteMultibyteChar_first; } } while (lenFile > 0); diff --git a/PowerEditor/src/Utf8.h b/PowerEditor/src/Utf8.h new file mode 100644 index 00000000..178ef0a6 --- /dev/null +++ b/PowerEditor/src/Utf8.h @@ -0,0 +1,55 @@ +// Simple functions to test UTF-8 characters. +// Copyright (C)2010 Francois-R.Boyer@PolyMtl.ca +// First version 2010-08 +// +// Written for notepad++, and distributed under same license: +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either +// version 2 of the License, or (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +namespace Utf8 { // could be a static class, instead of a namespace, if it needs private members + // basic classification of UTF-8 bytes + inline static bool isSingleByte(UCHAR c) { return c < 0x80; } + inline static bool isPartOfMultibyte(UCHAR c) { return c >= 0x80; } + inline static bool isFirstOfMultibyte(UCHAR c) { return c >= 0xC2 && c < 0xF5; } // 0xF5 to 0xFD are defined by UTF-8, but are not currently valid Unicode + inline static bool isContinuation(UCHAR c) { return (c & 0xC0) == 0x80; } + inline static bool isValid(UCHAR c) { return c < 0xC0 || isFirstOfMultibyte(c); } // validates a byte, out of context + + // number of continuation bytes for a given valid first character (0 for single byte characters) + inline static int continuationBytes(UCHAR c) { + static const char _len[] = { 1,1,2,3 }; + return (c < 0xC0) ? 0 : _len[(c & 0x30) >> 4]; + } + + // validates a full character + inline static bool isValid(const char* buf, int buflen) { + if(isSingleByte(buf[0])) return true; // single byte is valid + if(!isFirstOfMultibyte(buf[0])) return false; // not single byte, nor valid multi-byte first byte + int charContinuationBytes = continuationBytes(buf[0]); + if(buflen < charContinuationBytes+1) return false; // character does not fit in buffer + for(int i = charContinuationBytes; i>0; --i) + if(!isContinuation(*(++buf))) return false; // not enough continuation bytes + return true; // the character is valid (if there are too many continuation bytes, it is the next character that will be invalid) + } + + // rewinds to the first byte of a multi-byte character for any valid UTF-8 (and will not rewind too much on any other input) + inline static int characterStart(const char* buf, int startingIndex) { + int charContinuationBytes = 0; + while(charContinuationBytes < startingIndex // rewind past start of buffer? + && charContinuationBytes < 5 // UTF-8 support up to 5 continuation bytes (but valid sequences currently do not have more than 3) + && isContinuation(buf[startingIndex-charContinuationBytes]) + ) + ++charContinuationBytes; + return startingIndex-charContinuationBytes; + } +}; diff --git a/PowerEditor/src/resource.h b/PowerEditor/src/resource.h index 49d27c72..e5b18252 100644 --- a/PowerEditor/src/resource.h +++ b/PowerEditor/src/resource.h @@ -18,12 +18,12 @@ #ifndef RESOURCE_H #define RESOURCE_H -#define NOTEPAD_PLUS_VERSION TEXT("Notepad++ v5.7") +#define NOTEPAD_PLUS_VERSION TEXT("Notepad++ v5.7.1") // should be X.Y : ie. if VERSION_DIGITALVALUE == 4, 7, 1, 0 , then X = 4, Y = 71 // ex : #define VERSION_VALUE TEXT("5.63\0") -#define VERSION_VALUE TEXT("5.7\0") -#define VERSION_DIGITALVALUE 5, 7, 0, 0 +#define VERSION_VALUE TEXT("5.71\0") +#define VERSION_DIGITALVALUE 5, 7, 1, 0 #ifdef UNICODE #define UNICODE_ANSI_MODE TEXT("(UNICODE)")