2009-04-24 23:35:41 +00:00
|
|
|
// Scintilla source code edit control
|
|
|
|
/** @file UniConversion.cxx
|
2010-07-12 22:19:51 +00:00
|
|
|
** Functions to handle UTF-8 and UTF-16 strings.
|
2009-04-24 23:35:41 +00:00
|
|
|
**/
|
|
|
|
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
|
|
|
|
// The License.txt file describes the conditions under which this software may be distributed.
|
|
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
|
|
|
#include "UniConversion.h"
|
|
|
|
|
|
|
|
// Boundaries of the UTF-16 surrogate range: lead surrogates start at 0xD800,
// trail surrogates occupy 0xDC00 through 0xDFFF.
enum {
	SURROGATE_LEAD_FIRST = 0xD800,
	SURROGATE_TRAIL_FIRST = 0xDC00,
	SURROGATE_TRAIL_LAST = 0xDFFF
};
|
|
|
|
|
|
|
|
unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
|
|
|
|
unsigned int len = 0;
|
|
|
|
for (unsigned int i = 0; i < tlen && uptr[i];) {
|
|
|
|
unsigned int uch = uptr[i];
|
|
|
|
if (uch < 0x80) {
|
|
|
|
len++;
|
|
|
|
} else if (uch < 0x800) {
|
|
|
|
len += 2;
|
|
|
|
} else if ((uch >= SURROGATE_LEAD_FIRST) &&
|
|
|
|
(uch <= SURROGATE_TRAIL_LAST)) {
|
|
|
|
len += 4;
|
|
|
|
i++;
|
|
|
|
} else {
|
|
|
|
len += 3;
|
|
|
|
}
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
|
|
|
|
void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
|
|
|
|
int k = 0;
|
|
|
|
for (unsigned int i = 0; i < tlen && uptr[i];) {
|
|
|
|
unsigned int uch = uptr[i];
|
|
|
|
if (uch < 0x80) {
|
|
|
|
putf[k++] = static_cast<char>(uch);
|
|
|
|
} else if (uch < 0x800) {
|
|
|
|
putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
|
|
|
|
putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
|
|
|
|
} else if ((uch >= SURROGATE_LEAD_FIRST) &&
|
|
|
|
(uch <= SURROGATE_TRAIL_LAST)) {
|
|
|
|
// Half a surrogate pair
|
|
|
|
i++;
|
|
|
|
unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
|
|
|
|
putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
|
2009-06-24 19:09:31 +00:00
|
|
|
putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
|
2009-04-24 23:35:41 +00:00
|
|
|
putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
|
|
|
|
putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
|
|
|
|
} else {
|
|
|
|
putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
|
|
|
|
putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
|
|
|
|
putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
|
|
|
|
}
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
putf[len] = '\0';
|
|
|
|
}
|
|
|
|
|
2010-07-12 22:19:51 +00:00
|
|
|
// Return the number of bytes in a UTF-8 sequence judged purely from its first byte.
// Bytes below 0x80 give 1, below 0xE0 give 2, below 0xF0 give 3 and everything
// else gives 4. No validation is performed, so bytes in 0x80..0xBF also give 2.
unsigned int UTF8CharLength(unsigned char ch) {
	if (ch >= 0xF0) {
		return 4;
	}
	if (ch >= 0xE0) {
		return 3;
	}
	if (ch >= 0x80) {
		return 2;
	}
	return 1;
}
|
|
|
|
|
2009-04-24 23:35:41 +00:00
|
|
|
// Count the UTF-16 code units needed to express UTF-8 text.
// s:   UTF-8 bytes.
// len: number of bytes in s.
// Each sequence is sized from its first byte alone: 1, 2 and 3 byte sequences
// count as one code unit and 4 byte sequences count as two (a surrogate pair).
// Continuation bytes are skipped without being read or validated.
unsigned int UTF16Length(const char *s, unsigned int len) {
	unsigned int units = 0;
	unsigned int pos = 0;
	while (pos < len) {
		const unsigned char lead = static_cast<unsigned char>(s[pos]);
		if (lead < 0x80) {
			pos += 1;
			units += 1;
		} else if (lead < 0xE0) {
			pos += 2;
			units += 1;
		} else if (lead < 0xF0) {
			pos += 3;
			units += 1;
		} else {
			// Outside the BMP so two code units are required
			pos += 4;
			units += 2;
		}
	}
	return units;
}
|
|
|
|
|
|
|
|
unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) {
|
|
|
|
unsigned int ui=0;
|
|
|
|
const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
|
|
|
|
unsigned int i=0;
|
|
|
|
while ((i<len) && (ui<tlen)) {
|
|
|
|
unsigned char ch = us[i++];
|
|
|
|
if (ch < 0x80) {
|
|
|
|
tbuf[ui] = ch;
|
|
|
|
} else if (ch < 0x80 + 0x40 + 0x20) {
|
|
|
|
tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
|
|
|
|
ch = us[i++];
|
|
|
|
tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
|
|
|
|
} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
|
|
|
|
tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
|
|
|
|
ch = us[i++];
|
|
|
|
tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));
|
|
|
|
ch = us[i++];
|
|
|
|
tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
|
|
|
|
} else {
|
|
|
|
// Outside the BMP so need two surrogates
|
|
|
|
int val = (ch & 0x7) << 18;
|
|
|
|
ch = us[i++];
|
|
|
|
val += (ch & 0x3F) << 12;
|
|
|
|
ch = us[i++];
|
|
|
|
val += (ch & 0x3F) << 6;
|
|
|
|
ch = us[i++];
|
|
|
|
val += (ch & 0x3F);
|
|
|
|
tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
|
|
|
|
ui++;
|
|
|
|
tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
|
|
|
|
}
|
|
|
|
ui++;
|
|
|
|
}
|
|
|
|
return ui;
|
|
|
|
}
|
2013-08-28 00:44:27 +00:00
|
|
|
|
|
|
|
// Table indexed by byte value. UTF8BytesOfLeadInitialise() fills entry i with
// BytesFromLead(i); before that call every entry holds zero.
int UTF8BytesOfLead[256];
|
|
|
|
// Set to true by UTF8BytesOfLeadInitialise() once UTF8BytesOfLead has been
// filled, so that later calls skip the work.
static bool initialisedBytesOfLead = false;
|
|
|
|
|
|
|
|
// Return the length in bytes of the UTF-8 sequence introduced by leadByte.
// Values 0xC2..0xDF give 2, 0xE0..0xEF give 3 and 0xF0..0xF4 give 4.
// Everything else gives 1: single byte characters, bytes that cannot start a
// sequence, and leads of sequences longer than 4 bytes, which are not possible
// in current UTF-8.
static int BytesFromLead(int leadByte) {
	const bool multiByteLead = (leadByte >= 0xC2) && (leadByte <= 0xF4);
	if (!multiByteLead) {
		return 1;
	}
	if (leadByte >= 0xF0) {
		return 4;
	}
	if (leadByte >= 0xE0) {
		return 3;
	}
	return 2;
}
|
|
|
|
|
|
|
|
void UTF8BytesOfLeadInitialise() {
|
|
|
|
if (!initialisedBytesOfLead) {
|
|
|
|
for (int i=0;i<256;i++) {
|
|
|
|
UTF8BytesOfLead[i] = BytesFromLead(i);
|
|
|
|
}
|
|
|
|
initialisedBytesOfLead = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return both the width of the first character in the string and a status
|
|
|
|
// saying whether it is valid or invalid.
|
|
|
|
// Most invalid sequences return a width of 1 so are treated as isolated bytes but
|
|
|
|
// the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
|
|
|
|
// reasonably treated as code points in some circumstances. They will, however,
|
|
|
|
// not have associated glyphs.
|
|
|
|
int UTF8Classify(const unsigned char *us, int len) {
|
|
|
|
// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
|
|
|
|
if (*us < 0x80) {
|
|
|
|
// Single bytes easy
|
|
|
|
return 1;
|
|
|
|
} else if (*us > 0xf4) {
|
|
|
|
// Characters longer than 4 bytes not possible in current UTF-8
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
} else if (*us >= 0xf0) {
|
|
|
|
// 4 bytes
|
|
|
|
if (len < 4)
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
|
|
|
|
if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
|
|
|
|
// *FFFE or *FFFF non-character
|
|
|
|
return UTF8MaskInvalid | 4;
|
|
|
|
}
|
|
|
|
if (*us == 0xf4) {
|
|
|
|
// Check if encoding a value beyond the last Unicode character 10FFFF
|
|
|
|
if (us[1] > 0x8f) {
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
} else if (us[1] == 0x8f) {
|
|
|
|
if (us[2] > 0xbf) {
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
} else if (us[2] == 0xbf) {
|
|
|
|
if (us[3] > 0xbf) {
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
|
|
|
|
// Overlong
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
}
|
|
|
|
return 4;
|
|
|
|
} else {
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
}
|
|
|
|
} else if (*us >= 0xe0) {
|
|
|
|
// 3 bytes
|
|
|
|
if (len < 3)
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
|
|
|
|
if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
|
|
|
|
// Overlong
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
}
|
|
|
|
if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
|
|
|
|
// Surrogate
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
}
|
|
|
|
if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
|
|
|
|
// U+FFFE non-character - 3 bytes long
|
|
|
|
return UTF8MaskInvalid | 3;
|
|
|
|
}
|
|
|
|
if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
|
|
|
|
// U+FFFF non-character - 3 bytes long
|
|
|
|
return UTF8MaskInvalid | 3;
|
|
|
|
}
|
|
|
|
if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
|
|
|
|
// U+FDD0 .. U+FDEF
|
|
|
|
return UTF8MaskInvalid | 3;
|
|
|
|
}
|
|
|
|
return 3;
|
|
|
|
} else {
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
}
|
|
|
|
} else if (*us >= 0xc2) {
|
|
|
|
// 2 bytes
|
|
|
|
if (len < 2)
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
if (UTF8IsTrailByte(us[1])) {
|
|
|
|
return 2;
|
|
|
|
} else {
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// 0xc0 .. 0xc1 is overlong encoding
|
|
|
|
// 0x80 .. 0xbf is trail byte
|
|
|
|
return UTF8MaskInvalid | 1;
|
|
|
|
}
|
|
|
|
}
|