[BUG_FIXED] (Author: François-R Boyer) Fix DBCS encodings file saving corruption bug.

[BUG_FIXED] (Author: François-R Boyer) Fix file containing NULL character loading bug.
[ENHANCEMENT] (Author: François-R Boyer) Improve getCurrentDocCharCount() method performance.

git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@651 f5eea248-9336-0410-98b8-ebc06183d4e3
This commit is contained in:
Don Ho 2010-08-16 16:52:03 +00:00
parent 33de57fe1d
commit 948f281eb0
8 changed files with 235 additions and 121 deletions

View File

@ -16,6 +16,7 @@
//Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. //Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#include "precompiledHeaders.h" #include "precompiledHeaders.h"
#include "../Utf8.h"
WcharMbcsConvertor * WcharMbcsConvertor::_pSelf = new WcharMbcsConvertor; WcharMbcsConvertor * WcharMbcsConvertor::_pSelf = new WcharMbcsConvertor;
@ -217,28 +218,60 @@ generic_string purgeMenuItemString(const TCHAR * menuItemStr, bool keepAmpersand
return cleanedName; return cleanedName;
}; };
const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage) const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage, int lenMbcs, int *pLenWc, int *pBytesNotProcessed)
{ {
if (!_wideCharStr) // Do not process empty strings
if (lenMbcs == 0 || lenMbcs == -1 && mbcs2Convert[0] == 0) { _wideCharStr.empty(); return _wideCharStr; }
int bytesNotProcessed = 0;
int lenWc = 0;
// If length not specified, simply convert without checking
if (lenMbcs == -1)
{ {
_wideCharStr = new wchar_t[initSize]; lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs, NULL, 0);
_wideCharAllocLen = initSize; }
// Otherwise, test if we are cutting a multi-byte character at end of buffer
else if(lenMbcs != -1 && codepage == CP_UTF8) // For UTF-8, we know how to test it
{
int indexOfLastChar = Utf8::characterStart(mbcs2Convert, lenMbcs-1); // get index of last character
if (indexOfLastChar != 0 && !Utf8::isValid(mbcs2Convert+indexOfLastChar, lenMbcs-indexOfLastChar)) // if it is not valid we do not process it right now (unless its the only character in string, to ensure that we always progress, e.g. that bytesNotProcessed < lenMbcs)
{
bytesNotProcessed = lenMbcs-indexOfLastChar;
}
lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs-bytesNotProcessed, NULL, 0);
}
else // For other encodings, ask system if there are any invalid characters; note that it will not correctly know if last character is cut when there are invalid characters inside the text
{
lenWc = MultiByteToWideChar(codepage, (lenMbcs == -1) ? 0 : MB_ERR_INVALID_CHARS, mbcs2Convert, lenMbcs, NULL, 0);
if (lenWc == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
{
// Test without last byte
if (lenMbcs > 1) lenWc = MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, mbcs2Convert, lenMbcs-1, NULL, 0);
if (lenWc == 0) // don't have to check that the error is still ERROR_NO_UNICODE_TRANSLATION, since only the length parameter changed
{
// TODO: should warn user about incorrect loading due to invalid characters
// We still load the file, but the system will either strip or replace invalid characters (including the last character, if cut in half)
lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs, NULL, 0);
}
else
{
// We found a valid text by removing one byte.
bytesNotProcessed = 1;
}
}
} }
int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, 0); if (lenWc > 0)
if (len > 0)
{ {
if ((size_t)len > _wideCharAllocLen) _wideCharStr.sizeTo(lenWc);
{ MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs-bytesNotProcessed, _wideCharStr, lenWc);
delete [] _wideCharStr;
_wideCharAllocLen = len;
_wideCharStr = new wchar_t[_wideCharAllocLen];
}
MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, len);
} }
else else
_wideCharStr[0] = 0; _wideCharStr.empty();
if(pLenWc) *pLenWc = lenWc;
if(pBytesNotProcessed) *pBytesNotProcessed = bytesNotProcessed;
return _wideCharStr; return _wideCharStr;
} }
@ -246,21 +279,10 @@ const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT c
// which are converted to the corresponding indexes in the returned wchar_t string. // which are converted to the corresponding indexes in the returned wchar_t string.
const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage, int *mstart, int *mend) const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage, int *mstart, int *mend)
{ {
if (!_wideCharStr) int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, NULL, 0);
{
_wideCharStr = new wchar_t[initSize];
_wideCharAllocLen = initSize;
}
int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, 0);
if (len > 0) if (len > 0)
{ {
if (len > int(_wideCharAllocLen)) _wideCharStr.sizeTo(len);
{
delete [] _wideCharStr;
_wideCharAllocLen = len;
_wideCharStr = new wchar_t[_wideCharAllocLen];
}
len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, len); len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, len);
if ((size_t)*mstart < strlen(mbcs2Convert) && (size_t)*mend <= strlen(mbcs2Convert)) if ((size_t)*mstart < strlen(mbcs2Convert) && (size_t)*mend <= strlen(mbcs2Convert))
@ -276,61 +298,40 @@ const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT c
} }
else else
{ {
_wideCharStr[0] = 0; _wideCharStr.empty();
*mstart = 0; *mstart = 0;
*mend = 0; *mend = 0;
} }
return _wideCharStr; return _wideCharStr;
} }
const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage) const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage, int lenWc, int *pLenMbcs)
{ {
if (!_multiByteStr) int lenMbcs = WideCharToMultiByte(codepage, 0, wcharStr2Convert, lenWc, NULL, 0, NULL, NULL);
if (lenMbcs > 0)
{ {
_multiByteStr = new char[initSize]; _multiByteStr.sizeTo(lenMbcs);
_multiByteAllocLen = initSize; WideCharToMultiByte(codepage, 0, wcharStr2Convert, lenWc, _multiByteStr, lenMbcs, NULL, NULL);
}
int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, 0, NULL, NULL);
if (len > 0)
{
if ((size_t)len > _multiByteAllocLen)
{
delete [] _multiByteStr;
_multiByteAllocLen = len;
_multiByteStr = new char[_multiByteAllocLen];
}
WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, len, NULL, NULL);
} }
else else
_multiByteStr[0] = 0; _multiByteStr.empty();
if(pLenMbcs) *pLenMbcs = lenMbcs;
return _multiByteStr; return _multiByteStr;
} }
const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage, long *mstart, long *mend) const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage, long *mstart, long *mend)
{ {
if (!_multiByteStr) int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, NULL, 0, NULL, NULL);
{
_multiByteStr = new char[initSize];
_multiByteAllocLen = initSize;
}
int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, 0, NULL, NULL);
if (len > 0) if (len > 0)
{ {
if ((size_t)len > _multiByteAllocLen) _multiByteStr.sizeTo(len);
{
delete [] _multiByteStr;
_multiByteAllocLen = len;
_multiByteStr = new char[_multiByteAllocLen];
}
len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, len, NULL, NULL); // not needed? len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, len, NULL, NULL); // not needed?
if ((int)*mstart < lstrlenW(wcharStr2Convert) && (int)*mend < lstrlenW(wcharStr2Convert)) if ((int)*mstart < lstrlenW(wcharStr2Convert) && (int)*mend < lstrlenW(wcharStr2Convert))
{ {
*mstart = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mstart, _multiByteStr, 0, NULL, NULL); *mstart = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mstart, NULL, 0, NULL, NULL);
*mend = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mend, _multiByteStr, 0, NULL, NULL); *mend = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mend, NULL, 0, NULL, NULL);
if (*mstart >= len || *mend >= len) if (*mstart >= len || *mend >= len)
{ {
*mstart = 0; *mstart = 0;
@ -339,7 +340,7 @@ const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UI
} }
} }
else else
_multiByteStr[0] = 0; _multiByteStr.empty();
return _multiByteStr; return _multiByteStr;
} }

View File

@ -94,32 +94,56 @@ public:
static WcharMbcsConvertor * getInstance() {return _pSelf;}; static WcharMbcsConvertor * getInstance() {return _pSelf;};
static void destroyInstance() {delete _pSelf;}; static void destroyInstance() {delete _pSelf;};
const wchar_t * char2wchar(const char *mbStr, UINT codepage); const wchar_t * char2wchar(const char *mbStr, UINT codepage, int lenIn=-1, int *pLenOut=NULL, int *pBytesNotProcessed=NULL);
const wchar_t * char2wchar(const char *mbcs2Convert, UINT codepage, int *mstart, int *mend); const wchar_t * char2wchar(const char *mbcs2Convert, UINT codepage, int *mstart, int *mend);
const char * wchar2char(const wchar_t *wcStr, UINT codepage); const char * wchar2char(const wchar_t *wcStr, UINT codepage, int lenIn=-1, int *pLenOut=NULL);
const char * wchar2char(const wchar_t *wcStr, UINT codepage, long *mstart, long *mend); const char * wchar2char(const wchar_t *wcStr, UINT codepage, long *mstart, long *mend);
const char * encode(UINT fromCodepage, UINT toCodepage, const char *txt2Encode) { const char * encode(UINT fromCodepage, UINT toCodepage, const char *txt2Encode, int lenIn=-1, int *pLenOut=NULL, int *pBytesNotProcessed=NULL) {
const wchar_t * strW = char2wchar(txt2Encode, fromCodepage); int lenWc = 0;
return wchar2char(strW, toCodepage); const wchar_t * strW = char2wchar(txt2Encode, fromCodepage, lenIn, &lenWc, pBytesNotProcessed);
return wchar2char(strW, toCodepage, lenWc, pLenOut);
}; };
protected: protected:
WcharMbcsConvertor() : _multiByteStr(NULL), _wideCharStr(NULL), _multiByteAllocLen(0), _wideCharAllocLen(0), initSize(1024) { WcharMbcsConvertor() {
}; };
~WcharMbcsConvertor() { ~WcharMbcsConvertor() {
if (_multiByteStr)
delete [] _multiByteStr;
if (_wideCharStr)
delete [] _wideCharStr;
}; };
static WcharMbcsConvertor * _pSelf; static WcharMbcsConvertor * _pSelf;
const int initSize; template <class T>
char *_multiByteStr; class StringBuffer {
size_t _multiByteAllocLen; public:
wchar_t *_wideCharStr; StringBuffer() : _str(0), _allocLen(0) { }
size_t _wideCharAllocLen; ~StringBuffer() { if(_str) delete [] _str; }
void sizeTo(size_t size) {
if(_allocLen < size)
{
if(_allocLen) delete[] _str;
_allocLen = max(size, initSize);
_str = new T[_allocLen];
}
}
void empty() {
static T nullStr = 0; // routines may return an empty string, with null terminator, without allocating memory; a pointer to this null character will be returned in that case
if(_allocLen == 0)
_str = &nullStr;
else
_str[0] = 0;
}
operator T*() { return _str; }
protected:
static const int initSize = 1024;
size_t _allocLen;
T* _str;
};
StringBuffer<char> _multiByteStr;
StringBuffer<wchar_t> _wideCharStr;
private: private:
// Since there's no public ctor, we need to void the default assignment operator. // Since there's no public ctor, we need to void the default assignment operator.

View File

@ -31,16 +31,15 @@
#include "xmlMatchedTagsHighlighter.h" #include "xmlMatchedTagsHighlighter.h"
#include "EncodingMapper.h" #include "EncodingMapper.h"
enum tb_stat {tb_saved, tb_unsaved, tb_ro}; enum tb_stat {tb_saved, tb_unsaved, tb_ro};
#define DIR_LEFT true #define DIR_LEFT true
#define DIR_RIGHT false #define DIR_RIGHT false
int docTabIconIDs[] = {IDI_SAVED_ICON, IDI_UNSAVED_ICON, IDI_READONLY_ICON}; int docTabIconIDs[] = {IDI_SAVED_ICON, IDI_UNSAVED_ICON, IDI_READONLY_ICON};
ToolBarButtonUnit toolBarIcons[] = { ToolBarButtonUnit toolBarIcons[] = {
{IDM_FILE_NEW, IDI_NEW_OFF_ICON, IDI_NEW_ON_ICON, IDI_NEW_OFF_ICON, IDR_FILENEW}, {IDM_FILE_NEW, IDI_NEW_OFF_ICON, IDI_NEW_ON_ICON, IDI_NEW_OFF_ICON, IDR_FILENEW},
{IDM_FILE_OPEN, IDI_OPEN_OFF_ICON, IDI_OPEN_ON_ICON, IDI_NEW_OFF_ICON, IDR_FILEOPEN}, {IDM_FILE_OPEN, IDI_OPEN_OFF_ICON, IDI_OPEN_ON_ICON, IDI_OPEN_OFF_ICON, IDR_FILEOPEN},
{IDM_FILE_SAVE, IDI_SAVE_OFF_ICON, IDI_SAVE_ON_ICON, IDI_SAVE_DISABLE_ICON, IDR_FILESAVE}, {IDM_FILE_SAVE, IDI_SAVE_OFF_ICON, IDI_SAVE_ON_ICON, IDI_SAVE_DISABLE_ICON, IDR_FILESAVE},
{IDM_FILE_SAVEALL, IDI_SAVEALL_OFF_ICON, IDI_SAVEALL_ON_ICON, IDI_SAVEALL_DISABLE_ICON, IDR_SAVEALL}, {IDM_FILE_SAVEALL, IDI_SAVEALL_OFF_ICON, IDI_SAVEALL_ON_ICON, IDI_SAVEALL_DISABLE_ICON, IDR_SAVEALL},
{IDM_FILE_CLOSE, IDI_CLOSE_OFF_ICON, IDI_CLOSE_ON_ICON, IDI_CLOSE_OFF_ICON, IDR_CLOSEFILE}, {IDM_FILE_CLOSE, IDI_CLOSE_OFF_ICON, IDI_CLOSE_ON_ICON, IDI_CLOSE_OFF_ICON, IDR_CLOSEFILE},
@ -2311,7 +2310,26 @@ size_t Notepad_plus::getSelectedCharNumber(UniMode u)
} }
return result; return result;
} }
/*
#ifdef _OPENMP
#include <omp.h>
#endif
// Counts the UTF-8 characters stored in buf[pos .. endpos), ignoring line-break bytes.
//   buf    : byte buffer holding the text
//   pos    : index of the first byte to examine
//   endpos : index one past the last byte to examine
// Returns the number of characters counted.
static inline size_t countUtf8Characters(unsigned char *buf, int pos, int endpos)
{
	size_t charCount = 0;
	for (int cursor = pos; cursor < endpos; )
	{
		const unsigned char lead = buf[cursor];
		++cursor;

		// A continuation byte met here was not consumed as part of a preceding
		// multi-byte character (e.g. the range starts in the middle of one): skip it.
		const bool isStrayContinuation = (lead & 0xc0) == 0x80;
		// End-of-line bytes are not counted as characters.
		const bool isLineBreak = (lead == '\n' || lead == '\r');

		if (!isStrayContinuation && !isLineBreak)
		{
			// For a multi-byte lead byte, jump over the bytes that follow it;
			// the amount comes from the utflen table (declared elsewhere in this file).
			if (lead >= 0xc0)
				cursor += utflen[(lead & 0x30) >> 4];
			++charCount;
		}
	}
	return charCount;
}
size_t Notepad_plus::getCurrentDocCharCount(size_t numLines, UniMode u) size_t Notepad_plus::getCurrentDocCharCount(size_t numLines, UniMode u)
{ {
if (u != uniUTF8 && u != uniCookie) if (u != uniUTF8 && u != uniCookie)
@ -2322,23 +2340,39 @@ size_t Notepad_plus::getCurrentDocCharCount(size_t numLines, UniMode u)
result -= lines; result -= lines;
return ((int)result < 0)?0:result; return ((int)result < 0)?0:result;
} }
else else
{ {
// Note that counting is not well defined for invalid UTF-8 characters.
// This method is O(filelength) regardless of the number of characters we count (due to SCI_GETCHARACTERPOINTER);
// it would not be appropriate for counting characters in a small selection.
size_t result = 0; size_t result = 0;
for (size_t line=0; line<numLines; line++)
size_t endpos = _pEditView->execute(SCI_GETLENGTH);
unsigned char* buf = (unsigned char*)_pEditView->execute(SCI_GETCHARACTERPOINTER); // Scintilla doc sais the pointer can be invalidated by any other "execute"
#ifdef _OPENMP // parallel counting of characters with OpenMP
if(endpos > 50000) // starting threads takes time; for small files it is better to simply count in one thread
{ {
size_t endpos = _pEditView->execute(SCI_GETLINEENDPOSITION, line); #pragma omp parallel reduction(+: result)
for (size_t pos = _pEditView->execute(SCI_POSITIONFROMLINE, line); pos < endpos; pos++)
{ {
unsigned char c = 0xf0 & (unsigned char)_pEditView->execute(SCI_GETCHARAT, pos); // split in chunks of same size (except last chunk if it's not evenly divisible)
if (c >= 0xc0) pos += utflen[(c & 0x30) >> 4]; unsigned int num_threads = omp_get_num_threads();
result++; unsigned int thread_num = omp_get_thread_num();
size_t chunk_size = endpos/num_threads;
size_t pos = chunk_size*thread_num;
size_t endpos_local = (thread_num == num_threads-1) ? endpos : pos+chunk_size;
result = countUtf8Characters(buf, pos, endpos_local);
} }
} }
return result; else
} #endif
{
result = countUtf8Characters(buf, 0, endpos);
}
return result;
}
} }
*/
bool Notepad_plus::isFormatUnicode(UniMode u) bool Notepad_plus::isFormatUnicode(UniMode u)
{ {
@ -2377,6 +2411,8 @@ size_t Notepad_plus::getSelectedBytes()
void Notepad_plus::updateStatusBar() void Notepad_plus::updateStatusBar()
{ {
if(!NppParameters::getInstance()->getNppGUI()._statusBarShow) return; // do not update if status bar not shown
UniMode u = _pEditView->getCurrentBuffer()->getUnicodeMode(); UniMode u = _pEditView->getCurrentBuffer()->getUnicodeMode();
TCHAR strLnCol[64]; TCHAR strLnCol[64];

View File

@ -541,7 +541,7 @@ private:
void updateStatusBar(); void updateStatusBar();
size_t getSelectedCharNumber(UniMode); size_t getSelectedCharNumber(UniMode);
//size_t getCurrentDocCharCount(size_t numLines, UniMode u); size_t getCurrentDocCharCount(size_t numLines, UniMode u);
int getSelectedAreas(); int getSelectedAreas();
int _numSel; int _numSel;
size_t getSelectedBytes(); size_t getSelectedBytes();

View File

@ -1666,7 +1666,8 @@ LRESULT Notepad_plus::process(HWND hwnd, UINT Message, WPARAM wParam, LPARAM lPa
_pPublicInterface->getClientRect(rc); _pPublicInterface->getClientRect(rc);
nppGUI._statusBarShow = show; nppGUI._statusBarShow = show;
_statusBar.display(nppGUI._statusBarShow); if(show)
_statusBar.display(nppGUI._statusBarShow);
::SendMessage(_pPublicInterface->getHSelf(), WM_SIZE, SIZE_RESTORED, MAKELONG(rc.bottom, rc.right)); ::SendMessage(_pPublicInterface->getHSelf(), WM_SIZE, SIZE_RESTORED, MAKELONG(rc.bottom, rc.right));
return oldVal; return oldVal;
} }

View File

@ -564,9 +564,10 @@ bool FileManager::saveBuffer(BufferID id, const TCHAR * filename, bool isCopy) {
char data[blockSize + 1]; char data[blockSize + 1];
int lengthDoc = _pscratchTilla->getCurrentDocLen(); int lengthDoc = _pscratchTilla->getCurrentDocLen();
for (int i = 0; i < lengthDoc; i += blockSize) int grabSize;
for (int i = 0; i < lengthDoc; i += grabSize)
{ {
int grabSize = lengthDoc - i; grabSize = lengthDoc - i;
if (grabSize > blockSize) if (grabSize > blockSize)
grabSize = blockSize; grabSize = blockSize;
@ -574,8 +575,11 @@ bool FileManager::saveBuffer(BufferID id, const TCHAR * filename, bool isCopy) {
if (encoding != -1) if (encoding != -1)
{ {
WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance(); WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance();
const char *newData = wmc->encode(SC_CP_UTF8, encoding, data); int newDataLen = 0;
UnicodeConvertor.fwrite(newData, strlen(newData)); int incompleteMultibyteChar = 0;
const char *newData = wmc->encode(SC_CP_UTF8, encoding, data, grabSize, &newDataLen, &incompleteMultibyteChar);
grabSize -= incompleteMultibyteChar;
UnicodeConvertor.fwrite(newData, newDataLen);
} }
else else
{ {
@ -692,26 +696,11 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea
size_t lenFile = 0; size_t lenFile = 0;
size_t lenConvert = 0; //just in case conversion results in 0, but file not empty size_t lenConvert = 0; //just in case conversion results in 0, but file not empty
bool isFirstTime = true; bool isFirstTime = true;
int incompleteMultibyteChar = 0; //we do not want to call SCI_APPENDTEXT with an incomplete character if the buffer ends in the middle of one int incompleteMultibyteChar = 0;
char incompleteMultibyteChar_first = 0;
do { do {
lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar; lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar;
// we might not know yet the encoding; we ensure that valid UTF-8 characters will not be cut in the middle, without causing problems if it's not UTF-8
// TODO: all expressions for testing UTF chars should be put in inline functions, not directly in the code
if(lenFile == blockSize && (data[blockSize-1]&0x80) != 0) // possible multi-byte character that could be cut due to blockSize
{
incompleteMultibyteChar = 1;
while(incompleteMultibyteChar < 6 // longest "defined" UTF-8 code (including restricted codes not yet defined by Unicode)
&& (data[blockSize-incompleteMultibyteChar]&0xC0) == 0x80) // is possibly a continuation byte in a multi-byte character
++incompleteMultibyteChar;
// leave for the next buffer all bytes that could potentially be multi-byte UTF-8 at the end of current buffer
lenFile -= incompleteMultibyteChar;
incompleteMultibyteChar_first = data[lenFile]; // this byte can be erased by following code to put a null terminator
}
else incompleteMultibyteChar = 0;
// check if file contain any BOM // check if file contain any BOM
if (isFirstTime) if (isFirstTime)
{ {
@ -726,10 +715,19 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea
if (encoding != -1) if (encoding != -1)
{ {
data[lenFile] = '\0'; if (encoding == SC_CP_UTF8)
WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance(); {
const char *newData = wmc->encode(encoding, SC_CP_UTF8, data); // Pass through UTF-8 (this does not check validity of characters, thus inserting a multi-byte character in two halfs is working)
_pscratchTilla->execute(SCI_APPENDTEXT, strlen(newData), (LPARAM)newData); _pscratchTilla->execute(SCI_APPENDTEXT, lenFile, (LPARAM)data);
}
else
{
WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance();
int newDataLen = 0;
const char *newData = wmc->encode(encoding, SC_CP_UTF8, data, lenFile, &newDataLen, &incompleteMultibyteChar);
_pscratchTilla->execute(SCI_APPENDTEXT, newDataLen, (LPARAM)newData);
}
if (format == -1) if (format == -1)
format = getEOLFormatForm(data); format = getEOLFormatForm(data);
} }
@ -743,7 +741,6 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea
{ {
// copy bytes to next buffer // copy bytes to next buffer
memcpy(data, data+blockSize-incompleteMultibyteChar, incompleteMultibyteChar); memcpy(data, data+blockSize-incompleteMultibyteChar, incompleteMultibyteChar);
data[0] = incompleteMultibyteChar_first;
} }
} while (lenFile > 0); } while (lenFile > 0);

55
PowerEditor/src/Utf8.h Normal file
View File

@ -0,0 +1,55 @@
// Simple functions to test UTF-8 characters.
// Copyright (C)2010 Francois-R.Boyer@PolyMtl.ca
// First version 2010-08
//
// Written for notepad++, and distributed under same license:
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either
// version 2 of the License, or (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
namespace Utf8 { // could be a static class, instead of a namespace, if it needs private members
	// Byte parameters are declared as plain `unsigned char`, which is the type the
	// Windows SDK typedef UCHAR names; spelling it out keeps this header usable
	// without <windows.h> having been included first.

	// ---- basic classification of UTF-8 bytes ----

	// True when c encodes a complete character by itself (0x00..0x7F).
	inline static bool isSingleByte(unsigned char c) { return c < 0x80; }

	// True when c can only appear inside a multi-byte sequence (lead or continuation byte).
	inline static bool isPartOfMultibyte(unsigned char c) { return c >= 0x80; }

	// True when c is an acceptable first byte of a multi-byte sequence.
	// 0xC0/0xC1 are excluded; 0xF5 to 0xFD are defined by UTF-8, but are not currently valid Unicode.
	inline static bool isFirstOfMultibyte(unsigned char c) { return c >= 0xC2 && c < 0xF5; }

	// True when c has the bit pattern 10xxxxxx of a continuation byte.
	inline static bool isContinuation(unsigned char c) { return (c & 0xC0) == 0x80; }

	// Validates a single byte, out of context: accepts single bytes, continuation
	// bytes and acceptable lead bytes.
	inline static bool isValid(unsigned char c) { return c < 0xC0 || isFirstOfMultibyte(c); }

	// Number of continuation bytes expected after a given valid first byte
	// (0 for single-byte characters and for continuation bytes).
	inline static int continuationBytes(unsigned char c) {
		// indexed by bits 4-5 of the lead byte: 0xC0-0xDF -> 1, 0xE0-0xEF -> 2, 0xF0-0xFF -> 3
		static const char kLen[] = { 1,1,2,3 };
		return (c < 0xC0) ? 0 : kLen[(c & 0x30) >> 4];
	}

	// Validates the structure of the full character starting at buf[0].
	//   buf    : points at the first byte of the character
	//   buflen : number of readable bytes starting at buf
	// Returns true when buf starts with a single byte, or with an acceptable lead
	// byte followed by the expected number of continuation bytes, all within buflen.
	// Only the byte structure is checked, not the decoded code point.
	inline static bool isValid(const char* buf, int buflen) {
		if(buflen <= 0) return false; // nothing to read: never touch buf[0] on an empty range
		if(isSingleByte(buf[0])) return true; // single byte is valid
		if(!isFirstOfMultibyte(buf[0])) return false; // not single byte, nor valid multi-byte first byte
		int charContinuationBytes = continuationBytes(buf[0]);
		if(buflen < charContinuationBytes+1) return false; // character does not fit in buffer
		for(int i = charContinuationBytes; i>0; --i)
			if(!isContinuation(*(++buf))) return false; // not enough continuation bytes
		return true; // the character is valid (if there are too many continuation bytes, it is the next character that will be invalid)
	}

	// Rewinds to the first byte of a multi-byte character for any valid UTF-8
	// (and will not rewind too much on any other input).
	//   buf           : start of the buffer
	//   startingIndex : index of a byte belonging to the character of interest
	// Returns the index reached after stepping back over continuation bytes; it
	// never goes below 0 and never steps back more than 5 bytes.
	inline static int characterStart(const char* buf, int startingIndex) {
		int charContinuationBytes = 0;
		while(charContinuationBytes < startingIndex // stop before rewinding past the start of the buffer
			&& charContinuationBytes < 5 // UTF-8 supports up to 5 continuation bytes (but valid sequences currently do not have more than 3)
			&& isContinuation(buf[startingIndex-charContinuationBytes])
			)
			++charContinuationBytes;
		return startingIndex-charContinuationBytes;
	}
} // namespace Utf8

View File

@ -18,12 +18,12 @@
#ifndef RESOURCE_H #ifndef RESOURCE_H
#define RESOURCE_H #define RESOURCE_H
#define NOTEPAD_PLUS_VERSION TEXT("Notepad++ v5.7") #define NOTEPAD_PLUS_VERSION TEXT("Notepad++ v5.7.1")
// should be X.Y : ie. if VERSION_DIGITALVALUE == 4, 7, 1, 0 , then X = 4, Y = 71 // should be X.Y : ie. if VERSION_DIGITALVALUE == 4, 7, 1, 0 , then X = 4, Y = 71
// ex : #define VERSION_VALUE TEXT("5.63\0") // ex : #define VERSION_VALUE TEXT("5.63\0")
#define VERSION_VALUE TEXT("5.7\0") #define VERSION_VALUE TEXT("5.71\0")
#define VERSION_DIGITALVALUE 5, 7, 0, 0 #define VERSION_DIGITALVALUE 5, 7, 1, 0
#ifdef UNICODE #ifdef UNICODE
#define UNICODE_ANSI_MODE TEXT("(UNICODE)") #define UNICODE_ANSI_MODE TEXT("(UNICODE)")