notepad-plus-plus-legacy/scintilla/src/UniConversion.cxx

// Scintilla source code edit control
/** @file UniConversion.cxx
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.

#include <stdlib.h>

#include "UniConversion.h"

// Boundaries of the UTF-16 surrogate code unit range used by the converters below.
enum {
	SURROGATE_LEAD_FIRST = 0xD800,
	SURROGATE_TRAIL_FIRST = 0xDC00,
	SURROGATE_TRAIL_LAST = 0xDFFF
};

// Count the UTF-8 bytes needed to encode the UTF-16 code units in uptr.
// Scanning stops after tlen units or at the first zero unit, whichever comes first.
// Any unit inside the surrogate range is taken to start a pair: it accounts for
// 4 bytes and the unit after it is skipped without being examined.
// The terminating NUL is not included in the returned count.
unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
	unsigned int bytes = 0;
	unsigned int pos = 0;
	while (pos < tlen && uptr[pos]) {
		const unsigned int unit = uptr[pos];
		const bool inSurrogateRange =
			(unit >= SURROGATE_LEAD_FIRST) && (unit <= SURROGATE_TRAIL_LAST);
		if (unit < 0x80) {
			bytes += 1;
			pos += 1;
		} else if (unit < 0x800) {
			bytes += 2;
			pos += 1;
		} else if (inSurrogateRange) {
			// Two code units collapse into one 4 byte sequence
			bytes += 4;
			pos += 2;
		} else {
			bytes += 3;
			pos += 1;
		}
	}
	return bytes;
}

// Encode the UTF-16 code units in uptr as UTF-8 bytes stored in putf.
//  uptr, tlen: source units; conversion stops after tlen units or at the first
//              zero unit, whichever comes first.
//  putf, len:  destination; at most len encoded bytes are stored and a NUL is
//              always written at putf[len], so putf must have room for len + 1 bytes.
// Any unit inside the surrogate range is taken to start a pair and is combined
// with the following unit into a 4 byte sequence, matching the way UTF8Length
// counts. If the input ends straight after such a unit, the missing trail unit
// is treated as contributing zero bits rather than being read from beyond tlen.
// Conversion stops early instead of writing encoded bytes at or beyond putf[len].
void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
	unsigned int k = 0;
	for (unsigned int i = 0; i < tlen && uptr[i];) {
		const unsigned int uch = uptr[i];
		const bool surrogate =
			(uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_TRAIL_LAST);
		const unsigned int needed =
			(uch < 0x80) ? 1 : ((uch < 0x800) ? 2 : (surrogate ? 4 : 3));
		// k never exceeds len, so len - k cannot wrap
		if (needed > len - k) {
			// Not enough room left for this character: stop rather than overrun putf
			break;
		}
		if (uch < 0x80) {
			putf[k++] = static_cast<char>(uch);
		} else if (uch < 0x800) {
			putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
		} else if (surrogate) {
			// Half a surrogate pair
			i++;
			// Only read the second half when it lies inside the input
			const unsigned int trail = (i < tlen) ? static_cast<unsigned int>(uptr[i]) : 0;
			const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (trail & 0x3ff);
			putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
			putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
			putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
			putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
		} else {
			putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
			putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
		}
		i++;
	}
	putf[len] = '\0';
}

// Report the length in bytes of a UTF-8 sequence judged only from its first byte ch.
// No validation is performed: every byte from 0x80 up to 0xDF reports 2 and
// every byte from 0xF0 upwards reports 4.
unsigned int UTF8CharLength(unsigned char ch) {
	if (ch >= 0xF0)
		return 4;
	if (ch >= 0xE0)
		return 3;
	if (ch >= 0x80)
		return 2;
	return 1;
}

// Count the UTF-16 code units needed to hold the len bytes of UTF-8 text at s.
// Only the first byte of each sequence is examined: a byte of 0xF0 or above
// stands for a 4 byte sequence that needs two code units (a surrogate pair),
// every other sequence needs one. Bytes skipped as part of a sequence are not read.
unsigned int UTF16Length(const char *s, unsigned int len) {
	unsigned int units = 0;
	unsigned int pos = 0;
	while (pos < len) {
		const unsigned char lead = static_cast<unsigned char>(s[pos]);
		if (lead >= 0xF0) {
			pos += 4;
			units += 2;
		} else {
			pos += (lead >= 0xE0) ? 3 : ((lead >= 0x80) ? 2 : 1);
			units += 1;
		}
	}
	return units;
}

// Decode the len bytes of UTF-8 text at s into UTF-16 code units stored in tbuf.
//  s, len:     source bytes; no validation of the encoding is performed.
//  tbuf, tlen: destination with room for tlen code units; no terminator is written.
// Returns the number of code units stored, which never exceeds tlen.
// Sequences of 4 bytes produce a surrogate pair. When only one slot remains in
// tbuf for such a pair, conversion stops instead of writing past the buffer.
// When the input ends in the middle of a sequence, the missing trail bytes are
// treated as zero rather than being read from beyond len, so the sequence still
// yields the same number of code units that UTF16Length counts for it.
unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) {
	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
	unsigned int ui = 0;
	unsigned int i = 0;
	while ((i < len) && (ui < tlen)) {
		const unsigned char lead = us[i++];
		// Number of trail bytes implied by the lead byte
		const unsigned int trailCount =
			(lead < 0x80) ? 0 : ((lead < 0x80 + 0x40 + 0x20) ? 1 : ((lead < 0x80 + 0x40 + 0x20 + 0x10) ? 2 : 3));
		// Collect the trail bytes that are really present; absent ones stay 0
		unsigned char trail[3] = { 0, 0, 0 };
		for (unsigned int t = 0; (t < trailCount) && (i < len); t++) {
			trail[t] = us[i++];
		}
		if (trailCount == 0) {
			tbuf[ui] = lead;
		} else if (trailCount == 1) {
			tbuf[ui] = static_cast<wchar_t>(((lead & 0x1F) << 6) + (trail[0] & 0x7F));
		} else if (trailCount == 2) {
			tbuf[ui] = static_cast<wchar_t>(((lead & 0xF) << 12) +
				((trail[0] & 0x7F) << 6) + (trail[1] & 0x7F));
		} else {
			// Outside the BMP so need two surrogates
			if (tlen - ui < 2) {
				// Only one slot left: a pair does not fit
				break;
			}
			int val = (lead & 0x7) << 18;
			val += (trail[0] & 0x3F) << 12;
			val += (trail[1] & 0x3F) << 6;
			val += (trail[2] & 0x3F);
			tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
			ui++;
			tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
		}
		ui++;
	}
	return ui;
}

// Table indexed by byte value. UTF8BytesOfLeadInitialise() fills each entry
// with the result of BytesFromLead() for that byte; until then it holds zeros.
int UTF8BytesOfLead[256];
// Becomes true once UTF8BytesOfLeadInitialise() has filled the table, so that
// repeated calls do no further work.
static bool initialisedBytesOfLead = false;

// Give the length in bytes of the UTF-8 sequence introduced by leadByte.
// Values that cannot start a multi-byte sequence yield 1: everything below 0xC2
// (single bytes or invalid) and everything from 0xF5 upwards (characters longer
// than 4 bytes are not possible in current UTF-8).
static int BytesFromLead(int leadByte) {
	if ((leadByte >= 0xC2) && (leadByte < 0xE0))
		return 2;
	if ((leadByte >= 0xE0) && (leadByte < 0xF0))
		return 3;
	if ((leadByte >= 0xF0) && (leadByte < 0xF5))
		return 4;
	return 1;
}

// Fill UTF8BytesOfLead with the sequence length for each of the 256 byte values.
// Only the first call does the work; later calls return immediately.
void UTF8BytesOfLeadInitialise() {
	if (initialisedBytesOfLead)
		return;
	int lead = 0;
	while (lead < 256) {
		UTF8BytesOfLead[lead] = BytesFromLead(lead);
		lead++;
	}
	initialisedBytesOfLead = true;
}

// Return both the width of the first character in the string and a status
// saying whether it is valid or invalid.
// Most invalid sequences return a width of 1 so are treated as isolated bytes but
// the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
// reasonably treated as code points in some circumstances. They will, however,
// not have associated glyphs.
//  us:  points at the first byte to classify. *us is read before len is
//       consulted, so at least one byte must be readable.
//  len: number of bytes available at us; a sequence that would need more bytes
//       than this is reported as invalid with width 1.
// The result is the width (1 to 4) for a valid sequence, or the width combined
// with UTF8MaskInvalid using bitwise OR for an invalid one. UTF8MaskInvalid and
// UTF8IsTrailByte are not defined in this file (presumably UniConversion.h).
int UTF8Classify(const unsigned char *us, int len) {
	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
	if (*us < 0x80) {
		// Single bytes easy
		return 1;
	} else if (*us > 0xf4) {
		// Characters longer than 4 bytes not possible in current UTF-8
		return UTF8MaskInvalid | 1;
	} else if (*us >= 0xf0) {
		// 4 bytes
		if (len < 4)
			// Sequence is cut short by the end of the available bytes
			return UTF8MaskInvalid | 1;
		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
			// NOTE(review): this test runs before the range and overlong tests
			// below, so a sequence matching it reports width 4 even when it
			// would also fail one of those tests.
			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
				// *FFFE or *FFFF non-character
				return UTF8MaskInvalid | 4;
			}
			if (*us == 0xf4) {
				// Check if encoding a value beyond the last Unicode character 10FFFF
				// NOTE(review): if UTF8IsTrailByte only accepts 0x80 .. 0xBF then
				// the "> 0xbf" comparisons below can never be true - confirm.
				if (us[1] > 0x8f) {
					return UTF8MaskInvalid | 1;
				} else if (us[1] == 0x8f) {
					if (us[2] > 0xbf) {
						return UTF8MaskInvalid | 1;
					} else if (us[2] == 0xbf) {
						if (us[3] > 0xbf) {
							return UTF8MaskInvalid | 1;
						}
					}
				}
			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
				// Overlong
				// (lead F0 with second byte 0x80 .. 0x8F encodes a value below 0x10000)
				return UTF8MaskInvalid | 1;
			}
			return 4;
		} else {
			// At least one of the three following bytes is not a trail byte
			return UTF8MaskInvalid | 1;
		}
	} else if (*us >= 0xe0) {
		// 3 bytes
		if (len < 3)
			// Sequence is cut short by the end of the available bytes
			return UTF8MaskInvalid | 1;
		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
				// Overlong
				// (lead E0 with second byte 0x80 .. 0x9F encodes a value below 0x800)
				return UTF8MaskInvalid | 1;
			}
			if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
				// Surrogate
				// (lead ED with second byte 0xA0 .. 0xBF encodes D800 .. DFFF)
				return UTF8MaskInvalid | 1;
			}
			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
				// U+FFFE non-character - 3 bytes long
				return UTF8MaskInvalid | 3;
			}
			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
				// U+FFFF non-character - 3 bytes long
				return UTF8MaskInvalid | 3;
			}
			if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
				// U+FDD0 .. U+FDEF
				return UTF8MaskInvalid | 3;
			}
			return 3;
		} else {
			// At least one of the two following bytes is not a trail byte
			return UTF8MaskInvalid | 1;
		}
	} else if (*us >= 0xc2) {
		// 2 bytes
		if (len < 2)
			// Sequence is cut short by the end of the available bytes
			return UTF8MaskInvalid | 1;
		if (UTF8IsTrailByte(us[1])) {
			return 2;
		} else {
			return UTF8MaskInvalid | 1;
		}
	} else {
		// 0xc0 .. 0xc1 is overlong encoding
		// 0x80 .. 0xbf is trail byte
		return UTF8MaskInvalid | 1;
	}
}
Moved scintilla in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@461 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:35:41 +00:00			`// Scintilla source code edit control`
			`/** @file UniConversion.cxx`
[UPDATE] Update Scintilla from 2.0.1 to 2.1.2 git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@640 f5eea248-9336-0410-98b8-ebc06183d4e3 2010-07-12 22:19:51 +00:00			`** Functions to handle UTF-8 and UTF-16 strings.`
Moved scintilla in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@461 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:35:41 +00:00			`**/`
			`// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>`
			`// The License.txt file describes the conditions under which this software may be distributed.`

			`#include <stdlib.h>`

			`#include "UniConversion.h"`

			`enum { SURROGATE_LEAD_FIRST = 0xD800 };`
			`enum { SURROGATE_TRAIL_FIRST = 0xDC00 };`
			`enum { SURROGATE_TRAIL_LAST = 0xDFFF };`

			`unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {`
			`unsigned int len = 0;`
			`for (unsigned int i = 0; i < tlen && uptr[i];) {`
			`unsigned int uch = uptr[i];`
			`if (uch < 0x80) {`
			`len++;`
			`} else if (uch < 0x800) {`
			`len += 2;`
			`} else if ((uch >= SURROGATE_LEAD_FIRST) &&`
			`(uch <= SURROGATE_TRAIL_LAST)) {`
			`len += 4;`
			`i++;`
			`} else {`
			`len += 3;`
			`}`
			`i++;`
			`}`
			`return len;`
			`}`

			`void UTF8FromUTF16(const wchar_t uptr, unsigned int tlen, char putf, unsigned int len) {`
			`int k = 0;`
			`for (unsigned int i = 0; i < tlen && uptr[i];) {`
			`unsigned int uch = uptr[i];`
			`if (uch < 0x80) {`
			`putf[k++] = static_cast<char>(uch);`
			`} else if (uch < 0x800) {`
			`putf[k++] = static_cast<char>(0xC0 \| (uch >> 6));`
			`putf[k++] = static_cast<char>(0x80 \| (uch & 0x3f));`
			`} else if ((uch >= SURROGATE_LEAD_FIRST) &&`
			`(uch <= SURROGATE_TRAIL_LAST)) {`
			`// Half a surrogate pair`
			`i++;`
			`unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);`
			`putf[k++] = static_cast<char>(0xF0 \| (xch >> 18));`
[NEW] Upgrade Scintilla from 1.77 to 1.78. Signed-off-by: Don HO <don.h@free.fr> git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@499 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-06-24 19:09:31 +00:00			`putf[k++] = static_cast<char>(0x80 \| ((xch >> 12) & 0x3f));`
Moved scintilla in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@461 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:35:41 +00:00			`putf[k++] = static_cast<char>(0x80 \| ((xch >> 6) & 0x3f));`
			`putf[k++] = static_cast<char>(0x80 \| (xch & 0x3f));`
			`} else {`
			`putf[k++] = static_cast<char>(0xE0 \| (uch >> 12));`
			`putf[k++] = static_cast<char>(0x80 \| ((uch >> 6) & 0x3f));`
			`putf[k++] = static_cast<char>(0x80 \| (uch & 0x3f));`
			`}`
			`i++;`
			`}`
			`putf[len] = '\0';`
			`}`

[UPDATE] Update Scintilla from 2.0.1 to 2.1.2 git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@640 f5eea248-9336-0410-98b8-ebc06183d4e3 2010-07-12 22:19:51 +00:00			`unsigned int UTF8CharLength(unsigned char ch) {`
			`if (ch < 0x80) {`
			`return 1;`
			`} else if (ch < 0x80 + 0x40 + 0x20) {`
			`return 2;`
			`} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {`
			`return 3;`
			`} else {`
			`return 4;`
			`}`
			`}`

Moved scintilla in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@461 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:35:41 +00:00			`unsigned int UTF16Length(const char *s, unsigned int len) {`
			`unsigned int ulen = 0;`
			`unsigned int charLen;`
[UPDATE] Update Scintilla from 2.0.1 to 2.1.2 git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@640 f5eea248-9336-0410-98b8-ebc06183d4e3 2010-07-12 22:19:51 +00:00			`for (unsigned int i=0; i<len;) {`
Moved scintilla in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@461 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:35:41 +00:00			`unsigned char ch = static_cast<unsigned char>(s[i]);`
			`if (ch < 0x80) {`
			`charLen = 1;`
			`} else if (ch < 0x80 + 0x40 + 0x20) {`
			`charLen = 2;`
			`} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {`
			`charLen = 3;`
			`} else {`
			`charLen = 4;`
			`ulen++;`
			`}`
			`i += charLen;`
			`ulen++;`
			`}`
			`return ulen;`
			`}`

			`unsigned int UTF16FromUTF8(const char s, unsigned int len, wchar_t tbuf, unsigned int tlen) {`
			`unsigned int ui=0;`
			`const unsigned char us = reinterpret_cast<const unsigned char >(s);`
			`unsigned int i=0;`
			`while ((i<len) && (ui<tlen)) {`
			`unsigned char ch = us[i++];`
			`if (ch < 0x80) {`
			`tbuf[ui] = ch;`
			`} else if (ch < 0x80 + 0x40 + 0x20) {`
			`tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);`
			`ch = us[i++];`
			`tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));`
			`} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {`
			`tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);`
			`ch = us[i++];`
			`tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));`
			`ch = us[i++];`
			`tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));`
			`} else {`
			`// Outside the BMP so need two surrogates`
			`int val = (ch & 0x7) << 18;`
			`ch = us[i++];`
			`val += (ch & 0x3F) << 12;`
			`ch = us[i++];`
			`val += (ch & 0x3F) << 6;`
			`ch = us[i++];`
			`val += (ch & 0x3F);`
			`tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);`
			`ui++;`
			`tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);`
			`}`
			`ui++;`
			`}`
			`return ui;`
			`}`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 2013-08-28 00:44:27 +00:00
			`int UTF8BytesOfLead[256];`
			`static bool initialisedBytesOfLead = false;`

			`static int BytesFromLead(int leadByte) {`
			`if (leadByte < 0xC2) {`
			`// Single byte or invalid`
			`return 1;`
			`} else if (leadByte < 0xE0) {`
			`return 2;`
			`} else if (leadByte < 0xF0) {`
			`return 3;`
			`} else if (leadByte < 0xF5) {`
			`return 4;`
			`} else {`
			`// Characters longer than 4 bytes not possible in current UTF-8`
			`return 1;`
			`}`
			`}`

			`void UTF8BytesOfLeadInitialise() {`
			`if (!initialisedBytesOfLead) {`
			`for (int i=0;i<256;i++) {`
			`UTF8BytesOfLead[i] = BytesFromLead(i);`
			`}`
			`initialisedBytesOfLead = true;`
			`}`
			`}`

			`// Return both the width of the first character in the string and a status`
			`// saying whether it is valid or invalid.`
			`// Most invalid sequences return a width of 1 so are treated as isolated bytes but`
			`// the non-characters FFFE, FFFF and FDD0 .. FDEF return 3 or 4 as they can be`
			`// reasonably treated as code points in some circumstances. They will, however,`
			`// not have associated glyphs.`
			`int UTF8Classify(const unsigned char *us, int len) {`
			`// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8`
			`if (*us < 0x80) {`
			`// Single bytes easy`
			`return 1;`
			`} else if (*us > 0xf4) {`
			`// Characters longer than 4 bytes not possible in current UTF-8`
			`return UTF8MaskInvalid \| 1;`
			`} else if (*us >= 0xf0) {`
			`// 4 bytes`
			`if (len < 4)`
			`return UTF8MaskInvalid \| 1;`
			`if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {`
			`if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) \|\| (us[3] == 0xbf))) {`
			`// FFFE or FFFF non-character`
			`return UTF8MaskInvalid \| 4;`
			`}`
			`if (*us == 0xf4) {`
			`// Check if encoding a value beyond the last Unicode character 10FFFF`
			`if (us[1] > 0x8f) {`
			`return UTF8MaskInvalid \| 1;`
			`} else if (us[1] == 0x8f) {`
			`if (us[2] > 0xbf) {`
			`return UTF8MaskInvalid \| 1;`
			`} else if (us[2] == 0xbf) {`
			`if (us[3] > 0xbf) {`
			`return UTF8MaskInvalid \| 1;`
			`}`
			`}`
			`}`
			`} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {`
			`// Overlong`
			`return UTF8MaskInvalid \| 1;`
			`}`
			`return 4;`
			`} else {`
			`return UTF8MaskInvalid \| 1;`
			`}`
			`} else if (*us >= 0xe0) {`
			`// 3 bytes`
			`if (len < 3)`
			`return UTF8MaskInvalid \| 1;`
			`if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {`
			`if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {`
			`// Overlong`
			`return UTF8MaskInvalid \| 1;`
			`}`
			`if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {`
			`// Surrogate`
			`return UTF8MaskInvalid \| 1;`
			`}`
			`if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {`
			`// U+FFFE non-character - 3 bytes long`
			`return UTF8MaskInvalid \| 3;`
			`}`
			`if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {`
			`// U+FFFF non-character - 3 bytes long`
			`return UTF8MaskInvalid \| 3;`
			`}`
			`if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) \|\| ((us[2] & 0xf0) == 0xa0))) {`
			`// U+FDD0 .. U+FDEF`
			`return UTF8MaskInvalid \| 3;`
			`}`
			`return 3;`
			`} else {`
			`return UTF8MaskInvalid \| 1;`
			`}`
			`} else if (*us >= 0xc2) {`
			`// 2 bytes`
			`if (len < 2)`
			`return UTF8MaskInvalid \| 1;`
			`if (UTF8IsTrailByte(us[1])) {`
			`return 2;`
			`} else {`
			`return UTF8MaskInvalid \| 1;`
			`}`
			`} else {`
			`// 0xc0 .. 0xc1 is overlong encoding`
			`// 0x80 .. 0xbf is trail byte`
			`return UTF8MaskInvalid \| 1;`
			`}`
			`}`