UNICODE-UTF8轉換

UNICODE-UTF8轉換
UNICODE-UTF8轉換

係電腦發展初時.定義左套『ASCII碼』,得128字符,英文加數字用單字節BYTE. 後蒞各國皆自定『字符編碼』,『BIG5/GB2312』皆占两字節WORD,结果係編碼重叠.所以先有亂碼.

UNICODE『萬國碼』,各國各自有獨立編碼段,吾重叠,同『ASCII碼』兼容.

『UNICODE』係設計之初每字符占『2 BYTE』即『USC2』字符集(正式名稱『UCS-2』). 但係『2 BYTE』只够支持65536字符.所以後蒞有『USC4』(正式名稱『UCS-4』)占『4 BYTE』.

係同壹字符串USC2同USC4會混合出現.

但係『ASCII碼』只需單字節『1 BYTE』. 所以發明左『UTF-8』以節約地方.

『UTF-8』同『UNICODE』按照下表互為轉换.

Unicode『USC2』字符集HEX 『UTF-8』bin
0x0000~0x007F 0xxxxxxx
0x0080~0x07FF 110xxxxx 10xxxxxx
0x0800~0xFFFF 1110xxxx 10xxxxxx 10xxxxxx

為左係同壹字符串『USC2』同『USC4』混合出現.係『USC4』字符『低16bit』同『高16bit』分別加前缀標記.

編碼時先减壹區域0x10000,得『20bit』. 其高10bit加『0xD800』成首WORD(下文稱『低16bit』),低10bit加『0xDC00』成次WORD(下文稱『高16bit』). 解碼時反之,再加返壹區域0x10000.

前缀標記『0xD800』『0xDC00』各占6bit,各净低10bit加埋有『20bit』.够支持 『1048576』字符

USC4-低16bit前缀標記 0xD800 BIN:110110 00000 00000
USC4-高16bit前缀標記 0xDC00 BIN:110111 00000 00000

 

USC4 前缀標記 USC4=前缀標記+字符
低16bit 0xD800 BIN:110110 00000 00000 + BIN:xxxxxxxxxx
高16bit 0xDC00 BIN:110111 00000 00000 + BIN:xxxxxxxxxx

utf8 轉 usc4

首字節 value = utf8[sour] & (0xFF >> (bytes + 1));

++sour;

尾随字節 for (int i = 1; i < bytes; ++i) {

value = value << 6;

value = value | (utf8[sour] & 0x3f);// 提低6bit

++sour;

}

减壹區域 value = value - 0x10000
低16bit unicode[dest] = 0xD800 | ((value >> 10) & 0x3ff );
高16bit unicode[dest+1] = 0xDC00 | ((value) & 0x3ff);

dest = dest + 2;

utf8 轉 usc2

首字節 value = utf8[sour] & (0xFF >> (bytes + 1));

++sour;

尾随字節 for (int i = 1; i < bytes; ++i) {

value = value << 6;

value = value | (utf8[sour] & 0x3f);

++sour;

}

反轉字節 v = (value >> 24) & 0xFF;

unicode[dest] = v;

v = (value >> 16) & 0xFF;

if (v != 0) {

unicode[dest] = (unicode[dest] << 8) + v;

++dest;

}

反轉字節 v = (value >> 8) & 0xFF;

unicode[dest] = v;

v = value & 0xFF;

if (v != 0) {

unicode[dest] = (unicode[dest] << 8) + v;

++dest;

}

Usc4轉utf8

提取字符 value = (unicode[sour] - 0xD800) << 10 | (unicode[sour + 1] - 0xDC00);
加壹區域 value = value + 0x10000;

Usc2轉utf8

提取字符 value = unicode[sour];
首字節 utf8[dest] = (0xFF << (8 - bytes)) | (value >> ((bytes - 1) * 6));

++dest;

尾随字節 for (int i = 1; i < bytes; ++i) {

utf8[dest] = 0x80 | (value >> ((bytes - i - 1) * 6) & 0x3F);

++dest;

}

++sour;

『UTF-8』首byte,前缀標記字符長度. 前缀0長度1, 前缀110長度2, 前缀1110長度3. 以此类推.尾随byte前缀皆標記10.

『UTF-8』bin 長度
bin:0xxxxxxx 1
bin:110xxxxx 10xxxxxx 2
bin:1110xxxx 10xxxxxx 10xxxxxx 3
bin:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4
bin:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 5
bin:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 6

按首BIT符號,計算UTF8字符長度,返回0非UTF8字符.

『UTF-8』字符
if ((utf8[0] & 0x80) == 0x00)

return 1;

0xxxxxxx
if ((utf8[0] & 0xE0) == 0xC0  &&

(utf8[1] & 0xC0) == 0x80)

return 2;

110xxxxx 10xxxxxx
if ((utf8[0] & 0xF0) == 0xE0 &&

(utf8[1] & 0xC0) == 0x80 &&

(utf8[2] & 0xC0) == 0x80)

return 3;

1110xxxx

10xxxxxx

10xxxxxx

if ((utf8[0] & 0xF8) == 0xF0 &&

(utf8[1] & 0xC0) == 0x80 &&

(utf8[2] & 0xC0) == 0x80 &&

(utf8[3] & 0xC0) == 0x80)

return 4;

11110xxx

10xxxxxx

10xxxxxx

10xxxxxx

if ((utf8[0] & 0xFC) == 0xF8 &&

(utf8[1] & 0xC0) == 0x80 &&

(utf8[2] & 0xC0) == 0x80 &&

(utf8[3] & 0xC0) == 0x80 &&

(utf8[4] & 0xC0) == 0x80)

return 5;

111110xx

10xxxxxx

10xxxxxx

10xxxxxx

10xxxxxx

if ((utf8[0] & 0xFE) == 0xFC &&

(utf8[1] & 0xC0) == 0x80 &&

(utf8[2] & 0xC0) == 0x80 &&

(utf8[3] & 0xC0) == 0x80 &&

(utf8[4] & 0xC0) == 0x80 &&

(utf8[5] & 0xC0) == 0x80)

return 6;

1111110x

10xxxxxx

10xxxxxx

10xxxxxx

10xxxxxx

10xxxxxx

unicode轉utf8 , ASCII碼相等.

 int UnicodeToUTF8(char * utf8, const wchar_t * unicode)

{

int unicodeLength = 0;

int bytes;

int dest, sour;

DWORD value;

unicodeLength = Unicode_Length(unicode) ;

sour = dest = 0;

while (sour < unicodeLength)

{

bytes = 1;

if (unicode[sour] >= 0xD800 && unicode[sour + 1] >= 0xDC00)

bytes = 4;

else

if (unicode[sour] >= 0x00 && unicode[sour] <= 0x7F)

bytes = 1;

else

if (unicode[sour] >= 0x80 && unicode[sour] <= 0x7FF)

bytes = 2;

else

if (unicode[sour] >= 0x800 && unicode[sour] <= 0xFFFF)

bytes = 3;

else

if (((unicode[sour + 1] << 16) | unicode[sour]) >= 0x10000 &&

((unicode[sour + 1] << 16) | unicode[sour]) <= 0x1FFFFF)

bytes = 4;

else

if (((unicode[sour + 1] << 16) | unicode[sour]) >= 0x200000 &&

((unicode[sour + 1] << 16) | unicode[sour]) <= 0x3FFFFFF)

bytes = 5;

else

if (((unicode[sour + 1] << 16) | unicode[sour]) >= 0x4000000 &&

((unicode[sour + 1] << 16) | unicode[sour]) <= 0x7FFFFFFF)

bytes = 6;

else

if (((unicode[sour + 1] << 16) | unicode[sour]) >= 0x80000000)

bytes = 7;

 

if (bytes == 1)

{

utf8[dest] = unicode[sour];

++dest;

++sour;

}

else

if (unicode[sour] >= 0xD800 && unicode[sour + 1] >= 0xDC00)

{

value = (unicode[sour] – 0xD800) << 10   |  (unicode[sour + 1] – 0xDC00);

value = value + 0x10000;

utf8[dest] = (0xFF << (8 – bytes)) | (value >> ((bytes – 1) * 6));

++dest;

for (int i = 1; i < bytes; ++i) {

utf8[dest] = 0x80 | (value >> ((bytes – i – 1) * 6) & 0x3F);

++dest;

}

sour = sour + 2;

}

else

if (bytes == 2 || bytes == 3)

{

value = unicode[sour];

utf8[dest] = (0xFF << (8 – bytes)) | (value >> ((bytes – 1) * 6));

++dest;

for (int i = 1; i < bytes; ++i) {

utf8[dest] = 0x80 | (value >> ((bytes – i – 1) * 6) & 0x3F);

++dest;

}

++sour;

}

else

if (bytes >= 4)

{

value = (unicode[sour + 1] << 16) | unicode[sour];

utf8[dest] = (0xFF << (8 – bytes)) | (value >> ((bytes – 1) * 6));

++dest;

for (int i = 1; i < bytes; ++i) {

utf8[dest] = 0x80 | (value >> ((bytes – i – 1) * 6) & 0x3F);

++dest;

}

sour = sour + 2;

}

}

utf8[dest] = NULL;

return dest;

}

utf8 轉 unicode

 int UTF8ToUnicode(wchar_t * unicode, const char* utf8)

{

int utf8Length;

int sour, dest;

int bytes;

dest = sour = 0;

DWORD value;

BYTE v;

utf8Length = strlen(utf8);

while (sour < utf8Length)

{

if ((utf8[sour] & 0x80) == 0x00)

bytes = 1;

else

if ((utf8[sour] & 0xE0) == 0xC0)

bytes = 2;

else

if ((utf8[sour] & 0xF0) == 0xE0)

bytes = 3;

else

if ((utf8[sour] & 0xF8) == 0xF0)

bytes = 4;

else

if ((utf8[sour] & 0xFC) == 0xF8)

bytes = 5;

else

if ((utf8[sour] & 0xFE) == 0xFC)

bytes = 6;

else

bytes = 7;

if (bytes == 1)

{

unicode[dest] = utf8[sour];

++dest;

++sour;

}

else

if (bytes == 2 || bytes == 3)

{

value = utf8[sour] & (0xFF >> (bytes + 1));

++sour;

for (int i = 1; i < bytes; ++i) {

value = value << 6;

value = value | (utf8[sour] & 0x3f);

++sour;

}

 

v = (value >> 24) & 0xFF;

unicode[dest] = v;

v = (value >> 16) & 0xFF;

if (v != 0) {

unicode[dest] = (unicode[dest] << 8) + v;

++dest;

}

 

v = (value >> 8) & 0xFF;

unicode[dest] = v;

v = value & 0xFF;

if (v != 0) {

unicode[dest] = (unicode[dest] << 8) + v;

++dest;

}

}

else

if (bytes >= 4 )

{

value = utf8[sour] & (0xFF >> (bytes + 1));

++sour;

for (int i = 1; i < bytes; ++i) {

value = value << 6;

value = value | (utf8[sour] & 0x3f);

++sour;

}

value = value – 0x10000;

unicode[dest] = 0xD800 | ((value >> 10) & 0x3ff );

unicode[dest+1] = 0xDC00 | ((value) & 0x3ff);

dest = dest + 2;

}

}

 

unicode[dest] = NULL;

return dest;

}

 

評論