From 948f281eb05cbcbe40084f8a59fcdd1e0cc8a6f8 Mon Sep 17 00:00:00 2001
From: Don Ho <don.h@free.fr>
Date: Mon, 16 Aug 2010 16:52:03 +0000
Subject: [PATCH] =?UTF-8?q?[BUG=5FFIXED]=20(Author:=20Fran=C3=A7ois-R=20Bo?=
 =?UTF-8?q?yer)=20Fix=20DBCS=20encodings=20file=20saving=20corruption=20bu?=
 =?UTF-8?q?g.=20[BUG=5FFIXED]=20(Author:=20Fran=C3=A7ois-R=20Boyer)=20Fix?=
 =?UTF-8?q?=20file=20containing=20NULL=20character=20loading=20bug.=20[ENH?=
 =?UTF-8?q?ANCEMENT]=20(Author:=20Fran=C3=A7ois-R=20Boyer)=20Improve=20get?=
 =?UTF-8?q?CurrentDocCharCount()=20method=20performance.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@651 f5eea248-9336-0410-98b8-ebc06183d4e3
---
 PowerEditor/src/MISC/Common/Common.cpp       | 123 ++++++++++---------
 PowerEditor/src/MISC/Common/Common.h         |  54 +++++---
 PowerEditor/src/Notepad_plus.cpp             |  66 +++++++---
 PowerEditor/src/Notepad_plus.h               |   2 +-
 PowerEditor/src/NppBigSwitch.cpp             |   3 +-
 PowerEditor/src/ScitillaComponent/Buffer.cpp |  47 ++++---
 PowerEditor/src/Utf8.h                       |  55 +++++++++
 PowerEditor/src/resource.h                   |   6 +-
 8 files changed, 235 insertions(+), 121 deletions(-)
 create mode 100644 PowerEditor/src/Utf8.h

diff --git a/PowerEditor/src/MISC/Common/Common.cpp b/PowerEditor/src/MISC/Common/Common.cpp
index e9eb3240..79ed12ad 100644
--- a/PowerEditor/src/MISC/Common/Common.cpp
+++ b/PowerEditor/src/MISC/Common/Common.cpp
@@ -16,6 +16,7 @@
 //Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 
 #include "precompiledHeaders.h"
+#include "../Utf8.h"
 
 WcharMbcsConvertor * WcharMbcsConvertor::_pSelf = new WcharMbcsConvertor;
 
@@ -217,28 +218,60 @@ generic_string purgeMenuItemString(const TCHAR * menuItemStr, bool keepAmpersand
 	return cleanedName;
 };
 
-const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage)
+const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage, int lenMbcs, int *pLenWc, int *pBytesNotProcessed)
 {
-	if (!_wideCharStr)
+	// Converts lenMbcs bytes of mbcs2Convert (or a null-terminated string when lenMbcs == -1) from 'codepage' to wide characters in the shared _wideCharStr buffer; *pLenWc receives the wchar_t count and *pBytesNotProcessed the trailing bytes of a cut character that were left unconverted
+	// Empty input: return an empty string, but still report zero lengths so callers never read stale or uninitialized out-parameters
+	if (lenMbcs == 0 || (lenMbcs == -1 && mbcs2Convert[0] == 0)) { _wideCharStr.empty(); if(pLenWc) *pLenWc = 0; if(pBytesNotProcessed) *pBytesNotProcessed = 0; return _wideCharStr; }
+	int bytesNotProcessed = 0; // bytes at the end of the input that belong to an incomplete multi-byte character
+	int lenWc = 0; // number of wide characters the conversion produces
+
+	// If length not specified, simply convert without checking
+	if (lenMbcs == -1)
 	{
-		_wideCharStr = new wchar_t[initSize];
-		_wideCharAllocLen = initSize;
+		lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs, NULL, 0);
+	}
+	// Otherwise, test if we are cutting a multi-byte character at end of buffer
+	else if(codepage == CP_UTF8) // For UTF-8, we know how to test it (lenMbcs is never -1 here, that case is handled above)
+	{
+		int indexOfLastChar = Utf8::characterStart(mbcs2Convert, lenMbcs-1); // get index of last character
+		if (indexOfLastChar != 0 && !Utf8::isValid(mbcs2Convert+indexOfLastChar, lenMbcs-indexOfLastChar)) // if it is not valid we do not process it right now (unless it is the only character in the string, to ensure that we always progress, i.e. that bytesNotProcessed < lenMbcs)
+		{
+			bytesNotProcessed = lenMbcs-indexOfLastChar;
+		}
+		lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs-bytesNotProcessed, NULL, 0);
+	}
+	else // For other encodings, ask system if there are any invalid characters; note that it will not correctly know if last character is cut when there are invalid characters inside the text
+	{
+		lenWc = MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, mbcs2Convert, lenMbcs, NULL, 0); // an explicit length is always given on this branch, so invalid characters are always reported as an error
+		if (lenWc == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
+		{
+			// Test without last byte
+			if (lenMbcs > 1) lenWc = MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, mbcs2Convert, lenMbcs-1, NULL, 0);
+			if (lenWc == 0) // don't have to check that the error is still ERROR_NO_UNICODE_TRANSLATION, since only the length parameter changed
+			{
+				// TODO: should warn user about incorrect loading due to invalid characters
+				// We still load the file, but the system will either strip or replace invalid characters (including the last character, if cut in half)
+				lenWc = MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs, NULL, 0);
+			}
+			else
+			{
+				// We found a valid text by removing one byte.
+				bytesNotProcessed = 1;
+			}
+		}
 	}
 
-	int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, 0);
-	if (len > 0)
+	if (lenWc > 0)
 	{
-		if ((size_t)len > _wideCharAllocLen)
-		{
-			delete [] _wideCharStr;
-			_wideCharAllocLen = len;
-			_wideCharStr = new wchar_t[_wideCharAllocLen];
-		}
-		MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, len);
+		_wideCharStr.sizeTo(lenWc);
+		MultiByteToWideChar(codepage, 0, mbcs2Convert, lenMbcs-bytesNotProcessed, _wideCharStr, lenWc); // NOTE(review): with an explicit length the API is documented not to append a terminator, so callers should rely on *pLenWc - confirm
 	}
 	else
-		_wideCharStr[0] = 0;
+		_wideCharStr.empty();
 
+	if(pLenWc) *pLenWc = lenWc;
+	if(pBytesNotProcessed) *pBytesNotProcessed = bytesNotProcessed;
 	return _wideCharStr;
 }
 
@@ -246,21 +279,10 @@ const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT c
 // which are converted to the corresponding indexes in the returned wchar_t string.
 const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT codepage, int *mstart, int *mend)
 {
-	if (!_wideCharStr)
-	{
-		_wideCharStr = new wchar_t[initSize];
-		_wideCharAllocLen = initSize;
-	}
-
-	int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, 0);
+	int len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, NULL, 0);
 	if (len > 0)
 	{
-		if (len > int(_wideCharAllocLen))
-		{
-			delete [] _wideCharStr;
-			_wideCharAllocLen = len;
-			_wideCharStr = new wchar_t[_wideCharAllocLen];
-		}
+		_wideCharStr.sizeTo(len);
 		len = MultiByteToWideChar(codepage, 0, mbcs2Convert, -1, _wideCharStr, len);
 
 		if ((size_t)*mstart < strlen(mbcs2Convert) && (size_t)*mend <= strlen(mbcs2Convert))
@@ -276,61 +298,40 @@ const wchar_t * WcharMbcsConvertor::char2wchar(const char * mbcs2Convert, UINT c
 	}
 	else
 	{
-		_wideCharStr[0] = 0;
+		_wideCharStr.empty();
 		*mstart = 0;
 		*mend = 0;
 	}
 	return _wideCharStr;
 } 
 
-const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage) 
+const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage, int lenWc, int *pLenMbcs) // Converts wcharStr2Convert to 'codepage' into the shared _multiByteStr buffer; lenWc is forwarded to WideCharToMultiByte as the input length and *pLenMbcs, if given, receives the resulting byte count
 {
-	if (!_multiByteStr)
+	int lenMbcs = WideCharToMultiByte(codepage, 0, wcharStr2Convert, lenWc, NULL, 0, NULL, NULL); // size query only: no output buffer is passed (NULL, 0)
+	if (lenMbcs > 0)
 	{
-		_multiByteStr = new char[initSize];
-		_multiByteAllocLen = initSize;
-	}
-
-	int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, 0, NULL, NULL);
-	if (len > 0)
-	{
-		if ((size_t)len > _multiByteAllocLen)
-		{
-			delete [] _multiByteStr;
-			_multiByteAllocLen = len;
-			_multiByteStr = new char[_multiByteAllocLen];
-		}
-		WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, len, NULL, NULL);
+		_multiByteStr.sizeTo(lenMbcs); // make sure the shared buffer can hold lenMbcs bytes
+		WideCharToMultiByte(codepage, 0, wcharStr2Convert, lenWc, _multiByteStr, lenMbcs, NULL, NULL); // actual conversion, written into the shared buffer
 	}
 	else
-		_multiByteStr[0] = 0;
+		_multiByteStr.empty(); // nothing was produced: hand back an empty, null-terminated string
 
+	if(pLenMbcs) *pLenMbcs = lenMbcs; // the out-parameter is optional
 	return _multiByteStr;
 }
 
 const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UINT codepage, long *mstart, long *mend) 
 {
-	if (!_multiByteStr)
-	{
-		_multiByteStr = new char[initSize];
-		_multiByteAllocLen = initSize;
-	}
-
-	int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, 0, NULL, NULL);
+	int len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, NULL, 0, NULL, NULL);
 	if (len > 0)
 	{
-		if ((size_t)len > _multiByteAllocLen)
-		{
-			delete [] _multiByteStr;
-			_multiByteAllocLen = len;
-			_multiByteStr = new char[_multiByteAllocLen];
-		}
+		_multiByteStr.sizeTo(len);
 		len = WideCharToMultiByte(codepage, 0, wcharStr2Convert, -1, _multiByteStr, len, NULL, NULL); // not needed?
 
         if ((int)*mstart < lstrlenW(wcharStr2Convert) && (int)*mend < lstrlenW(wcharStr2Convert))
         {
-			*mstart = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mstart, _multiByteStr, 0, NULL, NULL);
-			*mend = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mend, _multiByteStr, 0, NULL, NULL);
+			*mstart = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mstart, NULL, 0, NULL, NULL);
+			*mend = WideCharToMultiByte(codepage, 0, wcharStr2Convert, *mend, NULL, 0, NULL, NULL);
 			if (*mstart >= len || *mend >= len)
 			{
 				*mstart = 0;
@@ -339,7 +340,7 @@ const char * WcharMbcsConvertor::wchar2char(const wchar_t * wcharStr2Convert, UI
 		}
 	}
 	else
-		_multiByteStr[0] = 0;
+		_multiByteStr.empty();
 
 	return _multiByteStr;
 }
diff --git a/PowerEditor/src/MISC/Common/Common.h b/PowerEditor/src/MISC/Common/Common.h
index d3b4ca70..88834967 100644
--- a/PowerEditor/src/MISC/Common/Common.h
+++ b/PowerEditor/src/MISC/Common/Common.h
@@ -94,32 +94,56 @@ public:
 	static WcharMbcsConvertor * getInstance() {return _pSelf;};
 	static void destroyInstance() {delete _pSelf;};
 
-	const wchar_t * char2wchar(const char *mbStr, UINT codepage);
+	const wchar_t * char2wchar(const char *mbStr, UINT codepage, int lenIn=-1, int *pLenOut=NULL, int *pBytesNotProcessed=NULL);
 	const wchar_t * char2wchar(const char *mbcs2Convert, UINT codepage, int *mstart, int *mend);
-	const char * wchar2char(const wchar_t *wcStr, UINT codepage);
+	const char * wchar2char(const wchar_t *wcStr, UINT codepage, int lenIn=-1, int *pLenOut=NULL);
 	const char * wchar2char(const wchar_t *wcStr, UINT codepage, long *mstart, long *mend);
 	
-	const char * encode(UINT fromCodepage, UINT toCodepage, const char *txt2Encode) {
-        const wchar_t * strW = char2wchar(txt2Encode, fromCodepage);
-        return wchar2char(strW, toCodepage);
+	const char * encode(UINT fromCodepage, UINT toCodepage, const char *txt2Encode, int lenIn=-1, int *pLenOut=NULL, int *pBytesNotProcessed=NULL) { // Re-encodes txt2Encode from fromCodepage to toCodepage by going through wide characters; returns whatever wchar2char returns
+		int lenWc = 0; // wide-character count reported by the first step and handed to the second one as its input length
+        const wchar_t * strW = char2wchar(txt2Encode, fromCodepage, lenIn, &lenWc, pBytesNotProcessed); // step 1: source code page -> wide characters; pBytesNotProcessed is passed straight through
+        return wchar2char(strW, toCodepage, lenWc, pLenOut); // step 2: wide characters -> target code page; pLenOut is passed straight through
     };
 
 protected:
-	WcharMbcsConvertor() : _multiByteStr(NULL), _wideCharStr(NULL), _multiByteAllocLen(0), _wideCharAllocLen(0), initSize(1024) {
+	WcharMbcsConvertor() {
 	};
 	~WcharMbcsConvertor() {
-		if (_multiByteStr)
-			delete [] _multiByteStr;
-		if (_wideCharStr)
-			delete [] _wideCharStr;
 	};
 	static WcharMbcsConvertor * _pSelf;
 
-	const int initSize;
-	char *_multiByteStr;
-	size_t _multiByteAllocLen;
-	wchar_t *_wideCharStr;
-	size_t _wideCharAllocLen;
+	template <class T>
+	class StringBuffer {	// scratch buffer that only ever grows; sizeTo() does not preserve the previous contents
+	public:
+		StringBuffer() : _allocLen(0), _str(0) { }	// initializers listed in member declaration order
+		~StringBuffer() { if(_allocLen) delete [] _str; }	// free only what sizeTo() allocated: after empty() on a never-allocated buffer, _str points at a static terminator that must not be deleted
+		// Makes sure the buffer can hold at least 'size' elements (never fewer than initSize); when growing, the old block is released without copying
+		void sizeTo(size_t size) {
+			if(_allocLen < size)
+			{
+				if(_allocLen) delete[] _str;
+				_allocLen = max(size, initSize);
+				_str = new T[_allocLen];
+			}
+		}
+		void empty() {	// makes the buffer read as an empty, null-terminated string
+			static T nullStr = 0; // routines may return an empty string, with null terminator, without allocating memory; a pointer to this null character will be returned in that case
+			if(_allocLen == 0)
+				_str = &nullStr;
+			else
+				_str[0] = 0;
+		}
+		// Implicit conversion, so the buffer can be passed or returned directly where a T* is expected
+		operator T*() { return _str; }
+
+	protected:
+		static const int initSize = 1024;	// minimum number of elements allocated by sizeTo()
+		size_t _allocLen;	// number of elements obtained through new[]; 0 means _str is not owned
+		T* _str;
+	};
+
+	StringBuffer<char> _multiByteStr;
+	StringBuffer<wchar_t> _wideCharStr;
 
 private:
 	// Since there's no public ctor, we need to void the default assignment operator.
diff --git a/PowerEditor/src/Notepad_plus.cpp b/PowerEditor/src/Notepad_plus.cpp
index ae189465..66dd2484 100644
--- a/PowerEditor/src/Notepad_plus.cpp
+++ b/PowerEditor/src/Notepad_plus.cpp
@@ -31,16 +31,15 @@
 #include "xmlMatchedTagsHighlighter.h"
 #include "EncodingMapper.h"
 
-
-
 enum tb_stat {tb_saved, tb_unsaved, tb_ro};
 #define DIR_LEFT true
 #define DIR_RIGHT false
 
 int docTabIconIDs[] = {IDI_SAVED_ICON, IDI_UNSAVED_ICON, IDI_READONLY_ICON};
+
 ToolBarButtonUnit toolBarIcons[] = {
 	{IDM_FILE_NEW,		IDI_NEW_OFF_ICON,		IDI_NEW_ON_ICON,		IDI_NEW_OFF_ICON, IDR_FILENEW},
-	{IDM_FILE_OPEN,		IDI_OPEN_OFF_ICON,		IDI_OPEN_ON_ICON,		IDI_NEW_OFF_ICON, IDR_FILEOPEN},
+	{IDM_FILE_OPEN,		IDI_OPEN_OFF_ICON,		IDI_OPEN_ON_ICON,		IDI_OPEN_OFF_ICON, IDR_FILEOPEN},
 	{IDM_FILE_SAVE,		IDI_SAVE_OFF_ICON,		IDI_SAVE_ON_ICON,		IDI_SAVE_DISABLE_ICON, IDR_FILESAVE},
 	{IDM_FILE_SAVEALL,	IDI_SAVEALL_OFF_ICON,	IDI_SAVEALL_ON_ICON,	IDI_SAVEALL_DISABLE_ICON, IDR_SAVEALL},
 	{IDM_FILE_CLOSE,	IDI_CLOSE_OFF_ICON,		IDI_CLOSE_ON_ICON,		IDI_CLOSE_OFF_ICON, IDR_CLOSEFILE},
@@ -2311,7 +2310,26 @@ size_t Notepad_plus::getSelectedCharNumber(UniMode u)
 	}
 	return result;
 }
-/*
+
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+// Counts UTF-8 characters in buf[pos, endpos), leaving out '\n' / '\r' bytes and stray continuation bytes.
+static inline size_t countUtf8Characters(unsigned char *buf, int pos, int endpos)
+{
+	size_t count = 0;
+	for (int i = pos; i < endpos; )
+	{
+		const unsigned char ch = buf[i++];
+		if (ch == '\n' || ch == '\r' || (ch & 0xc0) == 0x80) continue; // end of lines are not counted, nor are unexpected continuation bytes (e.g. when the range starts in the middle of a UTF-8 character)
+		if (ch >= 0xc0) i += utflen[(ch & 0x30) >>  4]; // lead byte: jump ahead by the utflen entry for it (table declared outside this hunk; presumably the number of continuation bytes)
+		++count;
+	}
+	return count;
+}
+
+
 size_t Notepad_plus::getCurrentDocCharCount(size_t numLines, UniMode u)
 {
 	if (u != uniUTF8 && u != uniCookie)
@@ -2322,23 +2340,39 @@ size_t Notepad_plus::getCurrentDocCharCount(size_t numLines, UniMode u)
 		result -= lines;
 		return ((int)result < 0)?0:result;
 	}
-	else
-	{
+ 	else
+ 	{
+		// Note that counting is not well defined for invalid UTF-8 characters.
+		// This method is O(filelength) regardless of the number of characters we count (due to SCI_GETCHARACTERPOINTER);
+		// it would not be appropriate for counting characters in a small selection.
 		size_t result = 0;
-		for (size_t line=0; line<numLines; line++)
+
+		size_t endpos = _pEditView->execute(SCI_GETLENGTH);
+		unsigned char* buf = (unsigned char*)_pEditView->execute(SCI_GETCHARACTERPOINTER); // Scintilla doc sais the pointer can be invalidated by any other "execute"
+
+#ifdef _OPENMP // parallel counting of characters with OpenMP
+		if(endpos > 50000) // starting threads takes time; for small files it is better to simply count in one thread
 		{
-			size_t endpos = _pEditView->execute(SCI_GETLINEENDPOSITION, line);
-			for (size_t pos = _pEditView->execute(SCI_POSITIONFROMLINE, line); pos < endpos; pos++)
+			#pragma omp parallel reduction(+: result)
 			{
-				unsigned char c = 0xf0 & (unsigned char)_pEditView->execute(SCI_GETCHARAT, pos);
-				if (c >= 0xc0) pos += utflen[(c & 0x30) >>  4];
-				result++;
+				// split in chunks of same size (except last chunk if it's not evenly divisible)
+				unsigned int num_threads = omp_get_num_threads();
+				unsigned int thread_num = omp_get_thread_num();
+				size_t chunk_size = endpos/num_threads;
+				size_t pos = chunk_size*thread_num;
+				size_t endpos_local = (thread_num == num_threads-1) ? endpos : pos+chunk_size;
+				result = countUtf8Characters(buf, pos, endpos_local);
 			}
 		}
-		return result;
-	}
+		else
+#endif
+		{
+			result = countUtf8Characters(buf, 0, endpos);
+		}
+ 		return result;
+ 	}
 }
-*/
+
 
 bool Notepad_plus::isFormatUnicode(UniMode u)
 {
@@ -2377,6 +2411,8 @@ size_t Notepad_plus::getSelectedBytes()
 
 void Notepad_plus::updateStatusBar() 
 {
+	if(!NppParameters::getInstance()->getNppGUI()._statusBarShow) return; // do not update if status bar not shown
+
 	UniMode u = _pEditView->getCurrentBuffer()->getUnicodeMode();
     TCHAR strLnCol[64];
 
diff --git a/PowerEditor/src/Notepad_plus.h b/PowerEditor/src/Notepad_plus.h
index 3f3b97c2..1b9b3681 100644
--- a/PowerEditor/src/Notepad_plus.h
+++ b/PowerEditor/src/Notepad_plus.h
@@ -541,7 +541,7 @@ private:
 
 	void updateStatusBar();
 	size_t getSelectedCharNumber(UniMode);
-	//size_t getCurrentDocCharCount(size_t numLines, UniMode u);
+	size_t getCurrentDocCharCount(size_t numLines, UniMode u);
 	int getSelectedAreas();
 	int _numSel;
 	size_t getSelectedBytes();
diff --git a/PowerEditor/src/NppBigSwitch.cpp b/PowerEditor/src/NppBigSwitch.cpp
index 26070507..b092ca37 100644
--- a/PowerEditor/src/NppBigSwitch.cpp
+++ b/PowerEditor/src/NppBigSwitch.cpp
@@ -1666,7 +1666,8 @@ LRESULT Notepad_plus::process(HWND hwnd, UINT Message, WPARAM wParam, LPARAM lPa
 			_pPublicInterface->getClientRect(rc);
 			
 			nppGUI._statusBarShow = show;
-            _statusBar.display(nppGUI._statusBarShow);
+			if(show)
+				_statusBar.display(nppGUI._statusBarShow);
             ::SendMessage(_pPublicInterface->getHSelf(), WM_SIZE, SIZE_RESTORED, MAKELONG(rc.bottom, rc.right));
             return oldVal;
         }
diff --git a/PowerEditor/src/ScitillaComponent/Buffer.cpp b/PowerEditor/src/ScitillaComponent/Buffer.cpp
index c608adf8..ccb971b8 100644
--- a/PowerEditor/src/ScitillaComponent/Buffer.cpp
+++ b/PowerEditor/src/ScitillaComponent/Buffer.cpp
@@ -564,9 +564,10 @@ bool FileManager::saveBuffer(BufferID id, const TCHAR * filename, bool isCopy) {
 
 		char data[blockSize + 1];
 		int lengthDoc = _pscratchTilla->getCurrentDocLen();
-		for (int i = 0; i < lengthDoc; i += blockSize)
+		int grabSize;
+		for (int i = 0; i < lengthDoc; i += grabSize)
 		{
-			int grabSize = lengthDoc - i;
+			grabSize = lengthDoc - i;
 			if (grabSize > blockSize) 
 				grabSize = blockSize;
 			
@@ -574,8 +575,11 @@ bool FileManager::saveBuffer(BufferID id, const TCHAR * filename, bool isCopy) {
 			if (encoding != -1)
 			{
 				WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance();
-				const char *newData = wmc->encode(SC_CP_UTF8, encoding, data);
-				UnicodeConvertor.fwrite(newData, strlen(newData));
+				int newDataLen = 0;
+				int incompleteMultibyteChar = 0;
+				const char *newData = wmc->encode(SC_CP_UTF8, encoding, data, grabSize, &newDataLen, &incompleteMultibyteChar);
+				grabSize -= incompleteMultibyteChar;
+				UnicodeConvertor.fwrite(newData, newDataLen);
 			}
 			else
 			{
@@ -692,26 +696,11 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea
 		size_t lenFile = 0;
 		size_t lenConvert = 0;	//just in case conversion results in 0, but file not empty
 		bool isFirstTime = true;
-		int incompleteMultibyteChar = 0; //we do not want to call SCI_APPENDTEXT with an incomplete character if the buffer ends in the middle of one
-		char incompleteMultibyteChar_first = 0;
+		int incompleteMultibyteChar = 0;
 
 		do {
 			lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar;
 
-			// we might not know yet the encoding; we ensure that valid UTF-8 characters will not be cut in the middle, without causing problems if it's not UTF-8
-			// TODO: all expressions for testing UTF chars should be put in inline functions, not directly in the code
-			if(lenFile == blockSize && (data[blockSize-1]&0x80) != 0) // possible multi-byte character that could be cut due to blockSize
-			{
-				incompleteMultibyteChar = 1;
-				while(incompleteMultibyteChar < 6 // longest "defined" UTF-8 code (including restricted codes not yet defined by Unicode)
-					&& (data[blockSize-incompleteMultibyteChar]&0xC0) == 0x80) // is possibly a continuation byte in a multi-byte character
-					++incompleteMultibyteChar;
-				// leave for the next buffer all bytes that could potentially be multi-byte UTF-8 at the end of current buffer
-				lenFile -= incompleteMultibyteChar;
-				incompleteMultibyteChar_first = data[lenFile]; // this byte can be erased by following code to put a null terminator
-			}
-			else incompleteMultibyteChar = 0;
-
             // check if file contain any BOM
             if (isFirstTime) 
             {
@@ -726,10 +715,19 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea
 
 			if (encoding != -1)
 			{
-				data[lenFile] = '\0';
-				WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance();
-				const char *newData = wmc->encode(encoding, SC_CP_UTF8, data);
-				_pscratchTilla->execute(SCI_APPENDTEXT, strlen(newData), (LPARAM)newData);
+				if (encoding == SC_CP_UTF8)
+				{
+					// Pass through UTF-8 (this does not check the validity of characters, so appending a multi-byte character in two halves works)
+					_pscratchTilla->execute(SCI_APPENDTEXT, lenFile, (LPARAM)data);
+				}
+				else
+				{
+					WcharMbcsConvertor *wmc = WcharMbcsConvertor::getInstance();
+					int newDataLen = 0;
+					const char *newData = wmc->encode(encoding, SC_CP_UTF8, data, lenFile, &newDataLen, &incompleteMultibyteChar);
+					_pscratchTilla->execute(SCI_APPENDTEXT, newDataLen, (LPARAM)newData);
+				}
+
 				if (format == -1)
 					format = getEOLFormatForm(data);
 			}
@@ -743,7 +741,6 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea
 			{
 				// copy bytes to next buffer
 				memcpy(data, data+blockSize-incompleteMultibyteChar, incompleteMultibyteChar);
-				data[0] = incompleteMultibyteChar_first;
 			}
 			
 		} while (lenFile > 0);
diff --git a/PowerEditor/src/Utf8.h b/PowerEditor/src/Utf8.h
new file mode 100644
index 00000000..178ef0a6
--- /dev/null
+++ b/PowerEditor/src/Utf8.h
@@ -0,0 +1,55 @@
+// Simple functions to test UTF-8 characters.
+// Copyright (C)2010 Francois-R.Boyer@PolyMtl.ca
+// First version 2010-08
+//
+// Written for notepad++, and distributed under same license:
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either
+// version 2 of the License, or (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#pragma once
+namespace Utf8 { // could be a static class, instead of a namespace, if it needs private members
+	// basic classification of UTF-8 bytes
+	inline static bool isSingleByte(UCHAR c)       { return c < 0x80; }
+	inline static bool isPartOfMultibyte(UCHAR c)  { return c >= 0x80; }
+	inline static bool isFirstOfMultibyte(UCHAR c) { return c >= 0xC2 && c < 0xF5; } // 0xF5 to 0xFD are defined by UTF-8, but are not currently valid Unicode
+	inline static bool isContinuation(UCHAR c)     { return (c & 0xC0) == 0x80; }
+	inline static bool isValid(UCHAR c)            { return c < 0xC0 || isFirstOfMultibyte(c); }	// validates a byte, out of context
+
+	// number of continuation bytes for a given valid first character (0 for single byte characters)
+	inline static int  continuationBytes(UCHAR c)  {
+		static const char _len[] = { 1,1,2,3 }; // indexed by bits 4-5 of the first byte: 0xC0-0xDF -> 1, 0xE0-0xEF -> 2, 0xF0-0xFF -> 3
+		return (c < 0xC0) ? 0 : _len[(c & 0x30) >>  4];
+	}
+
+	// validates the full character starting at buf[0]; buflen is the number of bytes available from buf and must be at least 1, since buf[0] is always read
+	inline static bool isValid(const char* buf, int buflen) {
+		if(isSingleByte(buf[0])) return true; // single byte is valid
+		if(!isFirstOfMultibyte(buf[0])) return false; // not single byte, nor valid multi-byte first byte
+		int charContinuationBytes = continuationBytes(buf[0]);
+		if(buflen < charContinuationBytes+1) return false; // character does not fit in buffer
+		for(int i = charContinuationBytes; i>0; --i)
+			if(!isContinuation(*(++buf))) return false; // not enough continuation bytes
+		return true;  // the character is valid (if there are too many continuation bytes, it is the next character that will be invalid)
+	}
+
+	// starting at buf[startingIndex], rewinds to the first byte of a multi-byte character for any valid UTF-8 (and will not rewind too much on any other input); returns the index of that byte
+	inline static int characterStart(const char* buf, int startingIndex) {
+		int charContinuationBytes = 0;
+		while(charContinuationBytes < startingIndex	// stop before rewinding past the start of the buffer
+			&& charContinuationBytes < 5	// UTF-8 supports up to 5 continuation bytes (but valid sequences currently do not have more than 3)
+			&& isContinuation(buf[startingIndex-charContinuationBytes])
+			)
+			++charContinuationBytes;
+		return startingIndex-charContinuationBytes;
+	}
+} // namespace Utf8
diff --git a/PowerEditor/src/resource.h b/PowerEditor/src/resource.h
index 49d27c72..e5b18252 100644
--- a/PowerEditor/src/resource.h
+++ b/PowerEditor/src/resource.h
@@ -18,12 +18,12 @@
 #ifndef RESOURCE_H
 #define RESOURCE_H
 
-#define NOTEPAD_PLUS_VERSION TEXT("Notepad++ v5.7")
+#define NOTEPAD_PLUS_VERSION TEXT("Notepad++ v5.7.1")
 
 // should be X.Y : ie. if VERSION_DIGITALVALUE == 4, 7, 1, 0 , then X = 4, Y = 71 
 // ex : #define VERSION_VALUE TEXT("5.63\0")
-#define VERSION_VALUE TEXT("5.7\0")
-#define VERSION_DIGITALVALUE 5, 7, 0, 0
+#define VERSION_VALUE TEXT("5.71\0")
+#define VERSION_DIGITALVALUE 5, 7, 1, 0
 
 #ifdef UNICODE
 #define UNICODE_ANSI_MODE TEXT("(UNICODE)")