[BUG_FIXED] (Author: François-R Boyer) Fix file corruption when saving with DBCS encodings.
[BUG_FIXED] (Author: François-R Boyer) Fix loading of files that contain NUL characters. [ENHANCEMENT] (Author: François-R Boyer) Improve getCurrentDocCharCount() method performance. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@651 f5eea248-9336-0410-98b8-ebc06183d4e3
This commit is contained in:
parent
33de57fe1d
commit
948f281eb0
@ -16,6 +16,7 @@
|
||||
//Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
#include "precompiledHeaders.h"
|
||||
#include "../Utf8.h"
|
||||
|
||||
WcharMbcsConvertor * WcharMbcsConvertor::_pSelf = new WcharMbcsConvertor;
|
||||
|
||||
@ -217,28 +218,60 @@ generic_string purgeMenuItemString(const TCHAR * menuItemStr, bool keepAmpersand
|
||||
return cleanedName;
|
||||
};
|
||||
|
||||
const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage)
|
||||
const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage, int lenMbcs, int *pLenWc, int *pBytesNotProcessed)
|
||||
{
|
||||
if (!_wideCharStr)
|
||||
// Do not process empty strings
|
||||
if (lenMbcs == 0 || lenMbcs == -1 && mbcs2Convert[0] == 0) { _wideCharStr.empty(); return _wideCharStr; }
|
||||
|
||||
int bytesNotProcessed = 0;
|
||||
int lenWc = 0;
|
||||
|
||||
// If length not specified, simply convert without checking
|
||||
if (lenMbcs == -1)
|
||||
{
|
||||
_wideCharStr = new wchar_t[initSize];
|
||||
_wideCharAllocLen = initSize;
|
||||
lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs, NULL, 0);
|
||||
}
|
||||
// Otherwise, test if we are cutting a multi-byte character at end of buffer
|
||||
else if(lenMbcs != -1 && codepage == CP_UTF8) // For UTF-8, we know how to test it
|
||||
{
|
||||
int indexOfLastChar = Utf8::characterStart(mbcs2Convert, lenMbcs-1); // get index of last character
|
||||
if (indexOfLastChar != 0 && !Utf8::isValid(mbcs2Convert+indexOfLastChar, lenMbcs-indexOfLastChar)) // if it is not valid we do not process it right now (unless its the only character in string, to ensure that we always progress, e.g. that bytesNotProcessed < lenMbcs)
|
||||
{
|
||||
bytesNotProcessed = lenMbcs-indexOfLastChar;
|
||||
}
|
||||
lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs-bytesNotProcessed, NULL, 0);
|
||||
}
|
||||
else // For other encodings, ask system if there are any invalid characters; note that it will not correctly know if last character is cut when there are invalid characters inside the text
|
||||
{
|
||||
lenWc = MultiByteToWideChar(codepage, (lenMbcs == -1) ? 0 : MB_ERR_INVALID_CHARS, mbcs2Convert, lenMbcs, NULL, 0);
|
||||
if (lenWc == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
|
||||
{
|
||||
// Test without last byte
|
||||
if (lenMbcs > 1) lenWc = MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, mbcs2Convert, lenMbcs-1, NULL, 0);
|
||||
if (lenWc == 0) // don't have to check that the error is still ERROR_NO_UNICODE_TRANSLATION, since only the length parameter changed
|
||||
{
|
||||
// TODO: should warn user about incorrect loading due to invalid characters
|
||||
// We still load the file, but the system will either strip or replace invalid characters (including the last character, if cut in half)
|
||||
lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs, NULL, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
// We found a valid text by removing one byte.
|
||||
bytesNotProcessed = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, 0);
|
||||
if (len > 0)
|
||||
if (lenWc > 0)
|
||||
{
|
||||
if ((size_t)len > _wideCharAllocLen)
|
||||
{
|
||||
delete [] _wideCharStr;
|
||||
_wideCharAllocLen = len;
|
||||
_wideCharStr = new wchar_t[_wideCharAllocLen];
|
||||
}
|
||||
MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, len);
|
||||
_wideCharStr.sizeTo(lenWc);
|
||||
MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs-bytesNotProcessed, _wideCharStr, lenWc);
|
||||
}
|
||||
else
|
||||
_wideCharStr[0] = 0;
|
||||
_wideCharStr.empty();
|
||||
|
||||
if(pLenWc) *pLenWc = lenWc;
|
||||
if(pBytesNotProcessed) *pBytesNotProcessed = bytesNotProcessed;
|
||||
return _wideCharStr;
|
||||
}
|
||||
|
||||
@ -246,21 +279,10 @@ const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT c
|
||||
// which are converted to the corresponding indexes in the returned wchar_t string.
|
||||
const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage, int *mstart, int *mend)
|
||||
{
|
||||
if (!_wideCharStr)
|
||||
{
|
||||
_wideCharStr = new wchar_t[initSize];
|
||||
_wideCharAllocLen = initSize;
|
||||
}
|
||||
|
||||
int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, 0);
|
||||
int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, NULL, 0);
|
||||
if (len > 0)
|
||||
{
|
||||
if (len > int(_wideCharAllocLen))
|
||||
{
|
||||
delete [] _wideCharStr;
|
||||
_wideCharAllocLen = len;
|
||||
_wideCharStr = new wchar_t[_wideCharAllocLen];
|
||||
}
|
||||
_wideCharStr.sizeTo(len);
|
||||
len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, len);
|
||||
|
||||
if ((size_t)*mstart < strlen(mbcs2Convert) && (size_t)*mend <= strlen(mbcs2Convert))
|
||||
@ -276,61 +298,40 @@ const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT c
|
||||
}
|
||||
else
|
||||
{
|
||||
_wideCharStr[0] = 0;
|
||||
_wideCharStr.empty();
|
||||
*mstart = 0;
|
||||
*mend = 0;
|
||||
}
|
||||
return _wideCharStr;
|
||||
}
|
||||
|
||||
const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage)
|
||||
const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage, int lenWc, int *pLenMbcs)
|
||||
{
|
||||
if (!_multiByteStr)
|
||||
int lenMbcs = WideCharToMultiByte(codepage, 0, wcharStr2Convert, lenWc, NULL, 0, NULL, NULL);
|
||||
if (lenMbcs > 0)
|
||||
{
|
||||
_multiByteStr = new char[initSize];
|
||||
_multiByteAllocLen = initSize;
|
||||
}
|
||||
|
||||
int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, 0, NULL, NULL);
|
||||
if (len > 0)
|
||||
{
|
||||
if ((size_t)len > _multiByteAllocLen)
|
||||
{
|
||||
delete [] _multiByteStr;
|
||||
_multiByteAllocLen = len;
|
||||
_multiByteStr = new char[_multiByteAllocLen];
|
||||
}
|
||||
WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, len, NULL, NULL);
|
||||
_multiByteStr.sizeTo(lenMbcs);
|
||||
WideCharToMultiByte(codepage, 0, wcharStr2Convert, lenWc, _multiByteStr, lenMbcs, NULL, NULL);
|
||||
}
|
||||
else
|
||||
_multiByteStr[0] = 0;
|
||||
_multiByteStr.empty();
|
||||
|
||||
if(pLenMbcs) *pLenMbcs = lenMbcs;
|
||||
return _multiByteStr;
|
||||
}
|
||||
|
||||
const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage, long *mstart, long *mend)
|
||||
{
|
||||
if (!_multiByteStr)
|
||||
{
|
||||
_multiByteStr = new char[initSize];
|
||||
_multiByteAllocLen = initSize;
|
||||
}
|
||||
|
||||
int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, 0, NULL, NULL);
|
||||
int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, NULL, 0, NULL, NULL);
|
||||
if (len > 0)
|
||||
{
|
||||
if ((size_t)len > _multiByteAllocLen)
|
||||
{
|
||||
delete [] _multiByteStr;
|
||||
_multiByteAllocLen = len;
|
||||
_multiByteStr = new char[_multiByteAllocLen];
|
||||
}
|
||||
_multiByteStr.sizeTo(len);
|
||||
len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, len, NULL, NULL); // not needed?
|
||||
|
||||
if ((int)*mstart < lstrlenW(wcharStr2Convert) && (int)*mend < lstrlenW(wcharStr2Convert))
|
||||
{
|
||||
*mstart = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mstart, _multiByteStr, 0, NULL, NULL);
|
||||
*mend = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mend, _multiByteStr, 0, NULL, NULL);
|
||||
*mstart = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mstart, NULL, 0, NULL, NULL);
|
||||
*mend = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mend, NULL, 0, NULL, NULL);
|
||||
if (*mstart >= len || *mend >= len)
|
||||
{
|
||||
*mstart = 0;
|
||||
@ -339,7 +340,7 @@ const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UI
|
||||
}
|
||||
}
|
||||
else
|
||||
_multiByteStr[0] = 0;
|
||||
_multiByteStr.empty();
|
||||
|
||||
return _multiByteStr;
|
||||
}
|
||||
|
@ -94,32 +94,56 @@ public:
|
||||
static WcharMbcsConvertor * getInstance() {return _pSelf;};
|
||||
static void destroyInstance() {delete _pSelf;};
|
||||
|
||||
const wchar_t * char2wchar(const char *mbStr, UINT codepage);
|
||||
const wchar_t * char2wchar(const char *mbStr, UINT codepage, int lenIn=-1, int *pLenOut=NULL, int *pBytesNotProcessed=NULL);
|
||||
const wchar_t * char2wchar(const char *mbcs2Convert, UINT codepage, int *mstart, int *mend);
|
||||
const char * wchar2char(const wchar_t *wcStr, UINT codepage);
|
||||
const char * wchar2char(const wchar_t *wcStr, UINT codepage, int lenIn=-1, int *pLenOut=NULL);
|
||||
const char * wchar2char(const wchar_t *wcStr, UINT codepage, long *mstart, long *mend);
|
||||
|
||||
const char * encode(UINT fromCodepage, UINT toCodepage, const char *txt2Encode) {
|
||||
const wchar_t * strW = char2wchar(txt2Encode, fromCodepage);
|
||||
return wchar2char(strW, toCodepage);
|
||||
const char * encode(UINT fromCodepage, UINT toCodepage, const char *txt2Encode, int lenIn=-1, int *pLenOut=NULL, int *pBytesNotProcessed=NULL) {
|
||||
int lenWc = 0;
|
||||
const wchar_t * strW = char2wchar(txt2Encode, fromCodepage, lenIn, &lenWc, pBytesNotProcessed);
|
||||
return wchar2char(strW, toCodepage, lenWc, pLenOut);
|
||||
};
|
||||
|
||||
protected:
|
||||
WcharMbcsConvertor() : _multiByteStr(NULL), _wideCharStr(NULL), _multiByteAllocLen(0), _wideCharAllocLen(0), initSize(1024) {
|
||||
WcharMbcsConvertor() {
|
||||
};
|
||||
~WcharMbcsConvertor() {
|
||||
if (_multiByteStr)
|
||||
delete [] _multiByteStr;
|
||||
if (_wideCharStr)
|
||||
delete [] _wideCharStr;
|
||||
};
|
||||
static WcharMbcsConvertor * _pSelf;
|
||||
|
||||
const int initSize;
|
||||
char *_multiByteStr;
|
||||
size_t _multiByteAllocLen;
|
||||
wchar_t *_wideCharStr;
|
||||
size_t _wideCharAllocLen;
|
||||
// Reusable, grow-only character buffer used to hold conversion results.
//
// The buffer never shrinks: sizeTo() reallocates only when the requested size
// exceeds the current capacity, and the previous contents are NOT preserved
// across a reallocation. The buffer converts implicitly to T* so it can be
// handed directly to APIs that fill a caller-supplied array.
//
// T is expected to be a character type (the visible instantiations are char
// and wchar_t); empty() relies on T being assignable from 0.
template <class T>
class StringBuffer {
public:
	// Starts with no allocation; operator T*() yields a null pointer until
	// sizeTo() or empty() has been called.
	StringBuffer() : _allocLen(0), _str(0) { }

	// Release only memory this object allocated itself. When _allocLen is 0,
	// _str is either null or points at the static empty string installed by
	// empty(); neither may be passed to delete[].
	~StringBuffer() { if(_allocLen) delete [] _str; }

	// Ensures the buffer can hold at least `size` elements (never fewer than
	// initSize once anything is allocated). The caller must include room for a
	// terminator in `size` if it needs one. Existing contents are discarded
	// when a reallocation happens.
	void sizeTo(size_t size) {
		if(_allocLen < size)
		{
			const size_t minLen = static_cast<size_t>(initSize);
			const size_t newLen = (size > minLen) ? size : minLen;
			// Allocate first so that a failed allocation leaves the buffer
			// in its previous, still valid, state.
			T* newStr = new T[newLen];
			if(_allocLen) delete[] _str;
			_str = newStr;
			_allocLen = newLen;
		}
	}

	// Makes the buffer hold an empty, null-terminated string without forcing
	// an allocation.
	void empty() {
		static T nullStr = 0; // routines may return an empty string, with null terminator, without allocating memory; a pointer to this null character will be returned in that case
		if(_allocLen == 0)
			_str = &nullStr;
		else
			_str[0] = 0;
	}

	// Raw access to the underlying storage (may be null before first use).
	operator T*() { return _str; }

protected:
	static const int initSize = 1024; // minimum number of elements allocated by sizeTo()
	size_t _allocLen;                 // number of elements owned by _str; 0 means nothing to free
	T* _str;                          // owned array, the static empty string, or null

private:
	// The buffer owns a raw array, so a member-wise copy would lead to a double
	// delete. Copying is disabled (declared, intentionally not defined).
	StringBuffer(const StringBuffer&);
	StringBuffer& operator=(const StringBuffer&);
};
|
||||
|
||||
StringBuffer<char> _multiByteStr;
|
||||
StringBuffer<wchar_t> _wideCharStr;
|
||||
|
||||
private:
|
||||
// Since there's no public ctor, we need to void the default assignment operator.
|
||||
|
@ -31,16 +31,15 @@
|
||||
#include "xmlMatchedTagsHighlighter.h"
|
||||
#include "EncodingMapper.h"
|
||||
|
||||
|
||||
|
||||
enum tb_stat {tb_saved, tb_unsaved, tb_ro};
|
||||
#define DIR_LEFT true
|
||||
#define DIR_RIGHT false
|
||||
|
||||
int docTabIconIDs[] = {IDI_SAVED_ICON, IDI_UNSAVED_ICON, IDI_READONLY_ICON};
|
||||
|
||||
ToolBarButtonUnit toolBarIcons[] = {
|
||||
{IDM_FILE_NEW, IDI_NEW_OFF_ICON, IDI_NEW_ON_ICON, IDI_NEW_OFF_ICON, IDR_FILENEW},
|
||||
{IDM_FILE_OPEN, IDI_OPEN_OFF_ICON, IDI_OPEN_ON_ICON, IDI_NEW_OFF_ICON, IDR_FILEOPEN},
|
||||
{IDM_FILE_OPEN, IDI_OPEN_OFF_ICON, IDI_OPEN_ON_ICON, IDI_OPEN_OFF_ICON, IDR_FILEOPEN},
|
||||
{IDM_FILE_SAVE, IDI_SAVE_OFF_ICON, IDI_SAVE_ON_ICON, IDI_SAVE_DISABLE_ICON, IDR_FILESAVE},
|
||||
{IDM_FILE_SAVEALL, IDI_SAVEALL_OFF_ICON, IDI_SAVEALL_ON_ICON, IDI_SAVEALL_DISABLE_ICON, IDR_SAVEALL},
|
||||
{IDM_FILE_CLOSE, IDI_CLOSE_OFF_ICON, IDI_CLOSE_ON_ICON, IDI_CLOSE_OFF_ICON, IDR_CLOSEFILE},
|
||||
@ -2311,7 +2310,26 @@ size_t Notepad_plus::getSelectedCharNumber(UniMode u)
|
||||
}
|
||||
return result;
|
||||
}
|
||||
/*
|
||||
|
||||
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
static inline size_t countUtf8Characters(unsigned char *buf, int pos, int endpos)
|
||||
{
|
||||
size_t result = 0;
|
||||
while(pos < endpos)
|
||||
{
|
||||
unsigned char c = buf[pos++];
|
||||
if ((c&0xc0) == 0x80 // do not count unexpected continuation bytes (this handles the case where an UTF-8 character is split in the middle)
|
||||
|| c == '\n' || c == '\r') continue; // do not count end of lines
|
||||
if (c >= 0xc0) pos += utflen[(c & 0x30) >> 4];
|
||||
result++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
size_t Notepad_plus::getCurrentDocCharCount(size_t numLines, UniMode u)
|
||||
{
|
||||
if (u != uniUTF8 && u != uniCookie)
|
||||
@ -2322,23 +2340,39 @@ size_t Notepad_plus::getCurrentDocCharCount(size_t numLines, UniMode u)
|
||||
result -= lines;
|
||||
return ((int)result < 0)?0:result;
|
||||
}
|
||||
else
|
||||
{
|
||||
else
|
||||
{
|
||||
// Note that counting is not well defined for invalid UTF-8 characters.
|
||||
// This method is O(filelength) regardless of the number of characters we count (due to SCI_GETCHARACTERPOINTER);
|
||||
// it would not be appropriate for counting characters in a small selection.
|
||||
size_t result = 0;
|
||||
for (size_t line=0; line<numLines; line++)
|
||||
|
||||
size_t endpos = _pEditView->execute(SCI_GETLENGTH);
|
||||
unsigned char* buf = (unsigned char*)_pEditView->execute(SCI_GETCHARACTERPOINTER); // Scintilla doc sais the pointer can be invalidated by any other "execute"
|
||||
|
||||
#ifdef _OPENMP // parallel counting of characters with OpenMP
|
||||
if(endpos > 50000) // starting threads takes time; for small files it is better to simply count in one thread
|
||||
{
|
||||
size_t endpos = _pEditView->execute(SCI_GETLINEENDPOSITION, line);
|
||||
for (size_t pos = _pEditView->execute(SCI_POSITIONFROMLINE, line); pos < endpos; pos++)
|
||||
#pragma omp parallel reduction(+: result)
|
||||
{
|
||||
unsigned char c = 0xf0 & (unsigned char)_pEditView->execute(SCI_GETCHARAT, pos);
|
||||
if (c >= 0xc0) pos += utflen[(c & 0x30) >> 4];
|
||||
result++;
|
||||
// split in chunks of same size (except last chunk if it's not evenly divisible)
|
||||
unsigned int num_threads = omp_get_num_threads();
|
||||
unsigned int thread_num = omp_get_thread_num();
|
||||
size_t chunk_size = endpos/num_threads;
|
||||
size_t pos = chunk_size*thread_num;
|
||||
size_t endpos_local = (thread_num == num_threads-1) ? endpos : pos+chunk_size;
|
||||
result = countUtf8Characters(buf, pos, endpos_local);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
result = countUtf8Characters(buf, 0, endpos);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
bool Notepad_plus::isFormatUnicode(UniMode u)
|
||||
{
|
||||
@ -2377,6 +2411,8 @@ size_t Notepad_plus::getSelectedBytes()
|
||||
|
||||
void Notepad_plus::updateStatusBar()
|
||||
{
|
||||
if(!NppParameters::getInstance()->getNppGUI()._statusBarShow) return; // do not update if status bar not shown
|
||||
|
||||
UniMode u = _pEditView->getCurrentBuffer()->getUnicodeMode();
|
||||
TCHAR strLnCol[64];
|
||||
|
||||
|
@ -541,7 +541,7 @@ private:
|
||||
|
||||
void updateStatusBar();
|
||||
size_t getSelectedCharNumber(UniMode);
|
||||
//size_t getCurrentDocCharCount(size_t numLines, UniMode u);
|
||||
size_t getCurrentDocCharCount(size_t numLines, UniMode u);
|
||||
int getSelectedAreas();
|
||||
int _numSel;
|
||||
size_t getSelectedBytes();
|
||||
|
@ -1666,7 +1666,8 @@ LRESULT Notepad_plus::process(HWND hwnd, UINT Message, WPARAM wParam, LPARAM lPa
|
||||
_pPublicInterface->getClientRect(rc);
|
||||
|
||||
nppGUI._statusBarShow = show;
|
||||
_statusBar.display(nppGUI._statusBarShow);
|
||||
if(show)
|
||||
_statusBar.display(nppGUI._statusBarShow);
|
||||
::SendMessage(_pPublicInterface->getHSelf(), WM_SIZE, SIZE_RESTORED, MAKELONG(rc.bottom, rc.right));
|
||||
return oldVal;
|
||||
}
|
||||
|
@ -564,9 +564,10 @@ bool FileManager::saveBuffer(BufferID id, const TCHAR * filename, bool isCopy) {
|
||||
|
||||
char data[blockSize + 1];
|
||||
int lengthDoc = _pscratchTilla->getCurrentDocLen();
|
||||
for (int i = 0; i < lengthDoc; i += blockSize)
|
||||
int grabSize;
|
||||
for (int i = 0; i < lengthDoc; i += grabSize)
|
||||
{
|
||||
int grabSize = lengthDoc - i;
|
||||
grabSize = lengthDoc - i;
|
||||
if (grabSize > blockSize)
|
||||
grabSize = blockSize;
|
||||
|
||||
@ -574,8 +575,11 @@ bool FileManager::saveBuffer(BufferID id, const TCHAR * filename, bool isCopy) {
|
||||
if (encoding != -1)
|
||||
{
|
||||
WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance();
|
||||
const char *newData = wmc->encode(SC_CP_UTF8, encoding, data);
|
||||
UnicodeConvertor.fwrite(newData, strlen(newData));
|
||||
int newDataLen = 0;
|
||||
int incompleteMultibyteChar = 0;
|
||||
const char *newData = wmc->encode(SC_CP_UTF8, encoding, data, grabSize, &newDataLen, &incompleteMultibyteChar);
|
||||
grabSize -= incompleteMultibyteChar;
|
||||
UnicodeConvertor.fwrite(newData, newDataLen);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -692,26 +696,11 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea
|
||||
size_t lenFile = 0;
|
||||
size_t lenConvert = 0; //just in case conversion results in 0, but file not empty
|
||||
bool isFirstTime = true;
|
||||
int incompleteMultibyteChar = 0; //we do not want to call SCI_APPENDTEXT with an incomplete character if the buffer ends in the middle of one
|
||||
char incompleteMultibyteChar_first = 0;
|
||||
int incompleteMultibyteChar = 0;
|
||||
|
||||
do {
|
||||
lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar;
|
||||
|
||||
// we might not know yet the encoding; we ensure that valid UTF-8 characters will not be cut in the middle, without causing problems if it's not UTF-8
|
||||
// TODO: all expressions for testing UTF chars should be put in inline functions, not directly in the code
|
||||
if(lenFile == blockSize && (data[blockSize-1]&0x80) != 0) // possible multi-byte character that could be cut due to blockSize
|
||||
{
|
||||
incompleteMultibyteChar = 1;
|
||||
while(incompleteMultibyteChar < 6 // longest "defined" UTF-8 code (including restricted codes not yet defined by Unicode)
|
||||
&& (data[blockSize-incompleteMultibyteChar]&0xC0) == 0x80) // is possibly a continuation byte in a multi-byte character
|
||||
++incompleteMultibyteChar;
|
||||
// leave for the next buffer all bytes that could potentially be multi-byte UTF-8 at the end of current buffer
|
||||
lenFile -= incompleteMultibyteChar;
|
||||
incompleteMultibyteChar_first = data[lenFile]; // this byte can be erased by following code to put a null terminator
|
||||
}
|
||||
else incompleteMultibyteChar = 0;
|
||||
|
||||
// check if file contain any BOM
|
||||
if (isFirstTime)
|
||||
{
|
||||
@ -726,10 +715,19 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea
|
||||
|
||||
if (encoding != -1)
|
||||
{
|
||||
data[lenFile] = '\0';
|
||||
WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance();
|
||||
const char *newData = wmc->encode(encoding, SC_CP_UTF8, data);
|
||||
_pscratchTilla->execute(SCI_APPENDTEXT, strlen(newData), (LPARAM)newData);
|
||||
if (encoding == SC_CP_UTF8)
|
||||
{
|
||||
// Pass through UTF-8 (this does not check validity of characters, thus inserting a multi-byte character in two halfs is working)
|
||||
_pscratchTilla->execute(SCI_APPENDTEXT, lenFile, (LPARAM)data);
|
||||
}
|
||||
else
|
||||
{
|
||||
WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance();
|
||||
int newDataLen = 0;
|
||||
const char *newData = wmc->encode(encoding, SC_CP_UTF8, data, lenFile, &newDataLen, &incompleteMultibyteChar);
|
||||
_pscratchTilla->execute(SCI_APPENDTEXT, newDataLen, (LPARAM)newData);
|
||||
}
|
||||
|
||||
if (format == -1)
|
||||
format = getEOLFormatForm(data);
|
||||
}
|
||||
@ -743,7 +741,6 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea
|
||||
{
|
||||
// copy bytes to next buffer
|
||||
memcpy(data, data+blockSize-incompleteMultibyteChar, incompleteMultibyteChar);
|
||||
data[0] = incompleteMultibyteChar_first;
|
||||
}
|
||||
|
||||
} while (lenFile > 0);
|
||||
|
55
PowerEditor/src/Utf8.h
Normal file
55
PowerEditor/src/Utf8.h
Normal file
@ -0,0 +1,55 @@
|
||||
// Simple functions to test UTF-8 characters.
|
||||
// Copyright (C)2010 Francois-R.Boyer@PolyMtl.ca
|
||||
// First version 2010-08
|
||||
//
|
||||
// Written for notepad++, and distributed under same license:
|
||||
// This program is free software; you can redistribute it and/or
|
||||
// modify it under the terms of the GNU General Public License
|
||||
// as published by the Free Software Foundation; either
|
||||
// version 2 of the License, or (at your option) any later version.
|
||||
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
#ifndef UTF8_H
#define UTF8_H

// Small helpers to classify UTF-8 bytes and to locate/validate the character
// at the end of a buffer. Validation is structural only (lead byte followed by
// the right number of continuation bytes); overlong encodings and surrogate
// code points are not rejected.
namespace Utf8 { // could be a static class, instead of a namespace, if it needs private members

	// --- basic classification of UTF-8 bytes ---

	// True for an ASCII byte, i.e. a complete one-byte character.
	inline bool isSingleByte(unsigned char c) { return c < 0x80; }

	// True for any byte that can only appear inside a multi-byte sequence.
	inline bool isPartOfMultibyte(unsigned char c) { return c >= 0x80; }

	// True for a byte accepted as the first byte of a multi-byte character.
	// 0xC0/0xC1 are excluded, as is everything from 0xF5 upwards (0xF5 to 0xFD
	// are defined by UTF-8, but are not currently valid Unicode).
	inline bool isFirstOfMultibyte(unsigned char c) { return c >= 0xC2 && c < 0xF5; }

	// True for a continuation byte (bit pattern 10xxxxxx).
	inline bool isContinuation(unsigned char c) { return (c & 0xC0) == 0x80; }

	// Validates a single byte, out of context: ASCII, a continuation byte, or
	// an accepted lead byte.
	inline bool isValid(unsigned char c) { return c < 0xC0 || isFirstOfMultibyte(c); }

	// Number of continuation bytes for a given valid first byte
	// (0 for single-byte characters and for continuation bytes).
	inline int continuationBytes(unsigned char c) {
		static const char lenByLeadBits[] = { 1,1,2,3 }; // indexed by bits 5..4 of the lead byte
		return (c < 0xC0) ? 0 : lenByLeadBits[(c & 0x30) >> 4];
	}

	// Validates the full character starting at buf[0].
	//   buf    : first byte of the character to test
	//   buflen : number of readable bytes starting at buf
	// Returns false when the buffer is null or empty, when buf[0] cannot start
	// a character, when the character does not fit in buflen bytes, or when a
	// required continuation byte is missing. Bytes after the character are not
	// examined (if there are too many continuation bytes, it is the next
	// character that will be invalid).
	inline bool isValid(const char* buf, int buflen) {
		if(!buf || buflen <= 0) return false; // nothing readable: do not touch buf[0]
		const unsigned char first = static_cast<unsigned char>(buf[0]);
		if(isSingleByte(first)) return true; // single byte is valid
		if(!isFirstOfMultibyte(first)) return false; // not single byte, nor valid multi-byte first byte
		const int charContinuationBytes = continuationBytes(first);
		if(buflen < charContinuationBytes+1) return false; // character does not fit in buffer
		for(int i = 1; i <= charContinuationBytes; ++i)
			if(!isContinuation(static_cast<unsigned char>(buf[i]))) return false; // not enough continuation bytes
		return true;
	}

	// Rewinds from buf[startingIndex] to the first byte of the character it
	// belongs to, for any valid UTF-8, and returns that index. On other input
	// it rewinds at most 5 bytes (UTF-8 supports up to 5 continuation bytes,
	// though valid sequences currently have no more than 3) and never goes
	// before index 0.
	inline int characterStart(const char* buf, int startingIndex) {
		int charContinuationBytes = 0;
		while(charContinuationBytes < startingIndex // stop before rewinding past the start of the buffer
			&& charContinuationBytes < 5
			&& isContinuation(static_cast<unsigned char>(buf[startingIndex-charContinuationBytes]))
			)
			++charContinuationBytes;
		return startingIndex-charContinuationBytes;
	}
}

#endif // UTF8_H
|
@ -18,12 +18,12 @@
|
||||
#ifndef RESOURCE_H
|
||||
#define RESOURCE_H
|
||||
|
||||
#define NOTEPAD_PLUS_VERSION TEXT("Notepad++ v5.7")
|
||||
#define NOTEPAD_PLUS_VERSION TEXT("Notepad++ v5.7.1")
|
||||
|
||||
// should be X.Y : ie. if VERSION_DIGITALVALUE == 4, 7, 1, 0 , then X = 4, Y = 71
|
||||
// ex : #define VERSION_VALUE TEXT("5.63\0")
|
||||
#define VERSION_VALUE TEXT("5.7\0")
|
||||
#define VERSION_DIGITALVALUE 5, 7, 0, 0
|
||||
#define VERSION_VALUE TEXT("5.71\0")
|
||||
#define VERSION_DIGITALVALUE 5, 7, 1, 0
|
||||
|
||||
#ifdef UNICODE
|
||||
#define UNICODE_ANSI_MODE TEXT("(UNICODE)")
|
||||
|
Loading…
Reference in New Issue
Block a user