係電腦發展初時.定義左套『ASCII碼』,得128字符,英文加數字用單字節BYTE. 後蒞各國皆自定『字符編碼』,『BIG5/GB2312』皆占两字節WORD,结果係編碼重叠.所以先有亂碼.
UNICODE『萬國碼』,各國各自有獨立編碼段,吾重叠,同『ASCII碼』兼容.
『UNICODE』係設計之初每字符占『2 BYTE』即『UCS2』字符集. 但係『2 BYTE』只够支持65536字符.所以後蒞有『UCS4』占『4 BYTE』.
係同壹字符串UCS2同UCS4會混合出現.
但係『ASCII碼』只需單字節『1 BYTE』. 所以發明左『UTF-8』以節約地方.
『UTF-8』同『UNICODE』按照下表互為轉换.
Unicode『UCS2』字符集HEX | 『UTF-8』bin |
0x0000~0x007F | 0xxxxxxx |
0x0080~0x07FF | 110xxxxx 10xxxxxx |
0x0800~0xFFFF | 1110xxxx 10xxxxxx 10xxxxxx |
為左係同壹字符串『UCS2』同『UCS4』混合出現.係『UCS4』字符『低16bit』同『高16bit』分別加前缀標記.
『低16bit』加『0xD800』,『高16bit』加『0xDC00』,再加壹區域0x10000.
前缀標記『0xD800』『0xDC00』各占6bit,各净低10bit加埋有『20bit』.够支持 『1048576』字符
UCS4-低16bit前缀標記 | 0xD800 | BIN:110110 00000 00000 |
UCS4-高16bit前缀標記 | 0xDC00 | BIN:110111 00000 00000 |
UCS4 | 前缀標記 | UCS4=前缀標記+字符 |
低16bit | 0xD800 | BIN:110110 00000 00000 + BIN:xxxxxxxxxx |
高16bit | 0xDC00 | BIN:110111 00000 00000 + BIN:xxxxxxxxxx |
utf8 轉 ucs4
首字節 | value = utf8[sour] & (0xFF >> (bytes + 1));
++sour; |
尾随字節 | for (int i = 1; i < bytes; ++i) {
value = value << 6; value = value | (utf8[sour] & 0x3f);// 提低6bit ++sour; } |
减壹區域 | value = value - 0x10000 |
低16bit | unicode[dest] = 0xD800 | ((value >> 10) & 0x3ff ); |
高16bit | unicode[dest+1] = 0xDC00 | ((value) & 0x3ff);
dest = dest + 2; |
utf8 轉 ucs2
首字節 | value = utf8[sour] & (0xFF >> (bytes + 1));
++sour; |
尾随字節 | for (int i = 1; i < bytes; ++i) {
value = value << 6; value = value | (utf8[sour] & 0x3f); ++sour; } |
反轉字節 | v = (value >> 24) & 0xFF;
unicode[dest] = v; v = (value >> 16) & 0xFF; if (v != 0) { unicode[dest] = (unicode[dest] << 8) + v; ++dest; } |
反轉字節 | v = (value >> 8) & 0xFF;
unicode[dest] = v; v = value & 0xFF; if (v != 0) { unicode[dest] = (unicode[dest] << 8) + v; ++dest; } |
Ucs4轉utf8
提取字符 | value = (unicode[sour] - 0xD800) << 10 | (unicode[sour + 1] - 0xDC00); |
加壹區域 | value = value + 0x10000; |
Ucs2轉utf8
提取字符 | value = unicode[sour]; |
首字節 | utf8[dest] = (0xFF << (8 - bytes)) | (value >> ((bytes - 1) * 6));
++dest; |
尾随字節 | for (int i = 1; i < bytes; ++i) {
utf8[dest] = 0x80 | (value >> ((bytes - i - 1) * 6) & 0x3F); ++dest; } ++sour; |
『UTF-8』首byte,前缀標記字符長度. 前缀0長度1, 前缀110長度2, 前缀1110長度3. 以此类推.尾随byte前缀皆標記10.
『UTF-8』bin | 長度 |
bin:0xxxxxxx | 1 |
bin:110xxxxx 10xxxxxx | 2 |
bin:1110xxxx 10xxxxxx 10xxxxxx | 3 |
bin:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 4 |
bin:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | 5 |
bin:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | 6 |
按首BIT符號,計算UTF8字符長度,返回0非UTF8字符.
『UTF-8』字符 | |
if ((utf8[0] & 0x80) == 0x00)
return 1; |
0xxxxxxx |
if ((utf8[0] & 0xE0) == 0xC0 &&
(utf8[1] & 0xC0) == 0x80) return 2; |
110xxxxx 10xxxxxx |
if ((utf8[0] & 0xF0) == 0xE0 &&
(utf8[1] & 0xC0) == 0x80 && (utf8[2] & 0xC0) == 0x80) return 3; |
1110xxxx
10xxxxxx 10xxxxxx |
if ((utf8[0] & 0xF8) == 0xF0 &&
(utf8[1] & 0xC0) == 0x80 && (utf8[2] & 0xC0) == 0x80 && (utf8[3] & 0xC0) == 0x80) return 4; |
11110xxx
10xxxxxx 10xxxxxx 10xxxxxx |
if ((utf8[0] & 0xFC) == 0xF8 &&
(utf8[1] & 0xC0) == 0x80 && (utf8[2] & 0xC0) == 0x80 && (utf8[3] & 0xC0) == 0x80 && (utf8[4] & 0xC0) == 0x80) return 5; |
111110xx
10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
if ((utf8[0] & 0xFE) == 0xFC &&
(utf8[1] & 0xC0) == 0x80 && (utf8[2] & 0xC0) == 0x80 && (utf8[3] & 0xC0) == 0x80 && (utf8[4] & 0xC0) == 0x80 && (utf8[5] & 0xC0) == 0x80) return 6; |
1111110x
10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
unicode轉utf8 , ASCII碼相等.
/*
 * Convert a 0-terminated wide string to a NUL-terminated UTF-8 string.
 *
 * utf8    - destination buffer. No size is passed in, so the caller must
 *           provide room for the worst case: 4 bytes per input element
 *           plus 1 byte for the terminator.
 * unicode - source string, read up to (not including) its first 0 element.
 *
 * Each input element is handled as follows:
 *   - 0xD800..0xDBFF followed by 0xDC00..0xDFFF is a UTF-16 surrogate pair
 *     and is combined into one code point (4 UTF-8 bytes);
 *   - any other value up to 0x10FFFF is taken as the code point itself, so
 *     platforms with a 32-bit wchar_t can pass code points directly;
 *   - an unpaired surrogate or a value above 0x10FFFF cannot be expressed in
 *     UTF-8 and is replaced by U+FFFD.
 *
 * Returns the number of bytes written, not counting the terminator.
 * Returns 0 when either pointer is null (an empty string is written if
 * only `unicode` is null).
 */
int UnicodeToUTF8(char * utf8, const wchar_t * unicode)
{
    int dest = 0;
    int sour = 0;

    if (!utf8)
        return 0;
    if (!unicode) {
        utf8[0] = '\0';
        return 0;
    }

    while (unicode[sour] != 0) {
        unsigned long value = (unsigned long)unicode[sour];
        int bytes;

        ++sour;

        if (value >= 0xD800UL && value <= 0xDBFFUL) {
            /* Leading surrogate: valid only when a trailing one follows.
             * Reading unicode[sour] is safe, the previous element was
             * non-zero so this is at worst the terminator. */
            unsigned long trail = (unsigned long)unicode[sour];

            if (trail >= 0xDC00UL && trail <= 0xDFFFUL) {
                value = 0x10000UL + ((value - 0xD800UL) << 10)
                                  + (trail - 0xDC00UL);
                ++sour;
            } else {
                value = 0xFFFDUL;
            }
        } else if ((value >= 0xDC00UL && value <= 0xDFFFUL)
                   || value > 0x10FFFFUL) {
            /* Trailing surrogate without a leader, or out of range
             * (includes negative values of a signed wchar_t). */
            value = 0xFFFDUL;
        }

        if (value <= 0x7FUL)
            bytes = 1;
        else if (value <= 0x7FFUL)
            bytes = 2;
        else if (value <= 0xFFFFUL)
            bytes = 3;
        else
            bytes = 4;

        if (bytes == 1) {
            /* ASCII is stored unchanged. */
            utf8[dest] = (char)value;
            ++dest;
            continue;
        }

        /* Lead byte: `bytes` one-bits, a zero bit, then the top payload bits. */
        utf8[dest] = (char)(unsigned char)(((0xFF00U >> bytes) & 0xFFU)
                                           | (value >> ((bytes - 1) * 6)));
        ++dest;

        /* Continuation bytes: prefix 10 followed by 6 payload bits each. */
        for (int i = 1; i < bytes; ++i) {
            utf8[dest] = (char)(unsigned char)(0x80U
                                               | ((value >> ((bytes - i - 1) * 6)) & 0x3FU));
            ++dest;
        }
    }

    utf8[dest] = '\0';
    return dest;
}
utf8 轉 unicode
/*
 * Convert a NUL-terminated UTF-8 string to a 0-terminated wide string.
 *
 * unicode - destination buffer. No size is passed in, so the caller must
 *           provide room for the worst case: one element per input byte
 *           plus 1 element for the terminator.
 * utf8    - source string, read up to (not including) its first NUL byte.
 *
 * Decoding rules:
 *   - 1-, 2- and 3-byte sequences produce one element holding the code point;
 *   - 4-byte sequences (code points 0x10000..0x10FFFF) are always written as
 *     a UTF-16 surrogate pair (0xD800-based element first, 0xDC00-based
 *     element second), whatever the width of wchar_t;
 *   - malformed input produces one U+FFFD per offending sequence: a stray
 *     continuation byte, a 5/6-byte style lead byte, a truncated sequence,
 *     an overlong encoding, an encoded surrogate, or a value above 0x10FFFF.
 *
 * Returns the number of elements written, not counting the terminator.
 * Returns 0 when either pointer is null (an empty string is written if
 * only `utf8` is null).
 */
int UTF8ToUnicode(wchar_t * unicode, const char* utf8)
{
    /* Work on unsigned bytes so values >= 0x80 never sign-extend. */
    const unsigned char *src = (const unsigned char *)utf8;
    int sour = 0;
    int dest = 0;

    if (!unicode)
        return 0;
    if (!utf8) {
        unicode[0] = 0;
        return 0;
    }

    while (src[sour] != 0) {
        unsigned char lead = src[sour];
        unsigned long value;
        unsigned long minimum;   /* smallest code point legal for this length */
        int bytes;
        int got;

        if (lead < 0x80) {
            /* ASCII is stored unchanged. */
            unicode[dest] = (wchar_t)lead;
            ++dest;
            ++sour;
            continue;
        }

        if ((lead & 0xE0) == 0xC0) {
            bytes = 2;
            minimum = 0x80UL;
        } else if ((lead & 0xF0) == 0xE0) {
            bytes = 3;
            minimum = 0x800UL;
        } else if ((lead & 0xF8) == 0xF0) {
            bytes = 4;
            minimum = 0x10000UL;
        } else {
            /* Stray continuation byte or a lead byte UTF-8 no longer allows. */
            unicode[dest] = (wchar_t)0xFFFD;
            ++dest;
            ++sour;
            continue;
        }

        /* Payload bits of the lead byte. */
        value = (unsigned long)(lead & (0xFF >> (bytes + 1)));

        /* Append 6 bits per continuation byte. The loop stops at the first
         * byte that is not 10xxxxxx, which includes the terminating NUL, so
         * it never reads past the end of the string. */
        got = 1;
        while (got < bytes && (src[sour + got] & 0xC0) == 0x80) {
            value = (value << 6) | (unsigned long)(src[sour + got] & 0x3F);
            ++got;
        }
        sour += got;

        if (got < bytes || value < minimum || value > 0x10FFFFUL
            || (value >= 0xD800UL && value <= 0xDFFFUL)) {
            unicode[dest] = (wchar_t)0xFFFD;
            ++dest;
        } else if (value >= 0x10000UL) {
            /* Split the 20-bit offset into two 10-bit halves. */
            value -= 0x10000UL;
            unicode[dest] = (wchar_t)(0xD800UL | ((value >> 10) & 0x3FFUL));
            unicode[dest + 1] = (wchar_t)(0xDC00UL | (value & 0x3FFUL));
            dest += 2;
        } else {
            /* Store the code point whole; a zero low byte (e.g. U+4E00)
             * is perfectly valid and must not be skipped. */
            unicode[dest] = (wchar_t)value;
            ++dest;
        }
    }

    unicode[dest] = 0;
    return dest;
}