字符编码之UCS-2与Utf-8

时间:2023-01-11 13:05:50

很多操作系统都直接支持UTF-8字符串操作,只有MS这个异类用的是它所说的“Unicode”,也就是所谓的UCS-2

如果写关于跨平台的代码,那么避免不了要做编码转化

这里贴一下今天写的把Unicode转化为Utf-8的代码

Ucs2BeToUcs2Le负责将大端转化为小端
Ucs2ToUtf8负责将Unicode转化为Utf-8
Utf8ToUcs2负责将Utf-8转化为Unicode

本转化函数只考虑了3个字节及以下的UTF-8编码,需要处理3个字节以上编码(即BMP之外的字符)的同学请自行google了啊
字符编码之UCS-2与Utf-8
  1 // Convert Unicode big endian to Unicode little endian
/**
 * Swap the two bytes of every UCS-2 code unit in a buffer, in place
 * (big endian -> little endian; the same operation converts back).
 *
 * Processing stops at the first zero code unit or after `size` code units,
 * whichever comes first. The zero code unit itself is left untouched.
 *
 * @param ucs2bige  Buffer of 16-bit code units to swap in place; may be NULL.
 * @param size      Maximum number of code units to process.
 * @return Number of code units that were swapped (0 when ucs2bige is NULL).
 */
unsigned Ucs2BeToUcs2Le(unsigned short *ucs2bige, unsigned int size)
{
    if (!ucs2bige) {
        return 0;
    }

    unsigned int swapped = 0;

    /* Check the bound before dereferencing: a buffer that holds exactly
     * `size` code units without a terminator must not be read past its end. */
    while (swapped < size && ucs2bige[swapped] != 0) {
        unsigned short unit = ucs2bige[swapped];
        unsigned short low_byte = (unsigned short)(unit & 0x00FFu);
        unsigned short high_byte = (unsigned short)((unit >> 8) & 0x00FFu);

        ucs2bige[swapped] = (unsigned short)((low_byte << 8) | high_byte);
        swapped++;
    }

    return swapped;
}
26
27 // Convert Ucs-2 to Utf-8
/* Number of UTF-8 bytes needed to encode one UCS-2 code unit (1, 2 or 3). */
static unsigned int ucs2_unit_utf8_len(unsigned short unit)
{
    if (unit < 0x0080) {
        return 1;
    }
    if (unit < 0x0800) {
        return 2;
    }
    return 3;
}

/**
 * Convert UCS-2 text to UTF-8.
 *
 * Input is consumed until a zero code unit is met or `ucs2_size` code units
 * have been read. Every code unit is encoded on its own as 1 to 3 bytes;
 * surrogate pairs are not combined.
 *
 * Byte order: when the first code unit is 0xFFFE (a byte-swapped byte-order
 * mark) the input buffer is byte-swapped IN PLACE with Ucs2BeToUcs2Le()
 * before conversion. A 0xFEFF byte-order mark is not stripped; it is encoded
 * like any other code unit. Any remaining 0xFFFE code unit is skipped, both
 * when measuring and when converting, so the two modes always agree.
 *
 * @param ucs2       Input code units; may be NULL. May be modified (see above).
 * @param ucs2_size  Maximum number of code units to read from `ucs2`.
 * @param utf8       Output buffer, or NULL to only compute the required size.
 * @param utf8_size  Capacity of `utf8` in bytes; ignored when `utf8` is NULL.
 * @return Number of bytes written to `utf8`, or the number of bytes required
 *         when `utf8` is NULL. No terminating zero byte is written or counted.
 *         Conversion stops before the first character that does not fit
 *         completely into the remaining output space.
 */
unsigned int Ucs2ToUtf8(unsigned short *ucs2, unsigned int ucs2_size,
                        unsigned char *utf8, unsigned int utf8_size)
{
    /* Nothing may be read from an absent or empty input buffer. */
    if (!ucs2 || ucs2_size == 0) {
        return 0;
    }

    if (ucs2[0] == 0xFFFE) {
        Ucs2BeToUcs2Le(ucs2, ucs2_size);
    }

    unsigned int length = 0;
    unsigned char *out = utf8;

    /* Bound check first, dereference second. */
    for (unsigned int i = 0; i < ucs2_size && ucs2[i] != 0; i++) {
        unsigned short unit = ucs2[i];

        if (unit == 0xFFFE) {
            continue;
        }

        unsigned int need = ucs2_unit_utf8_len(unit);

        if (!utf8) {
            /* Size query only. */
            length += need;
            continue;
        }

        /* Never emit a partial character: the whole sequence must fit. */
        if (need > utf8_size - length) {
            break;
        }

        if (need == 1) {
            *out++ = (unsigned char)unit;
        } else if (need == 2) {
            *out++ = (unsigned char)(0xC0 | (unit >> 6));
            *out++ = (unsigned char)(0x80 | (unit & 0x3F));
        } else {
            *out++ = (unsigned char)(0xE0 | (unit >> 12));
            *out++ = (unsigned char)(0x80 | ((unit >> 6) & 0x3F));
            *out++ = (unsigned char)(0x80 | (unit & 0x3F));
        }
        length += need;
    }

    return length;
}
102
103 // Convert Utf-8 to Ucs-2
/* Code unit substituted for input that cannot be decoded into UCS-2. */
enum { UTF8_TO_UCS2_REPLACEMENT = 0xFFFD };

/* Non-zero when `byte` has the UTF-8 continuation form 10xxxxxx. */
static int utf8_is_continuation(unsigned char byte)
{
    return (byte & 0xC0) == 0x80;
}

/*
 * Decode one character starting at `p`, where `avail` (>= 1) bytes may be read.
 * Stores the resulting UCS-2 code unit in `*unit` and returns the number of
 * input bytes consumed (always >= 1, never more than `avail`).
 * Malformed or truncated sequences, and 4-byte sequences (characters outside
 * the BMP, not representable in UCS-2), yield UTF8_TO_UCS2_REPLACEMENT.
 */
static unsigned int utf8_decode_unit(const unsigned char *p, unsigned int avail,
                                     unsigned short *unit)
{
    unsigned char lead = p[0];
    unsigned int extra;  /* continuation bytes announced by the lead byte */
    unsigned int value;

    if ((lead & 0x80) == 0) {
        *unit = lead;
        return 1;
    } else if ((lead & 0xE0) == 0xC0) {
        extra = 1;
        value = lead & 0x1Fu;
    } else if ((lead & 0xF0) == 0xE0) {
        extra = 2;
        value = lead & 0x0Fu;
    } else if ((lead & 0xF8) == 0xF0) {
        extra = 3;
        value = lead & 0x07u;
    } else {
        /* Stray continuation byte or invalid lead byte. */
        *unit = UTF8_TO_UCS2_REPLACEMENT;
        return 1;
    }

    for (unsigned int k = 1; k <= extra; k++) {
        if (k >= avail || !utf8_is_continuation(p[k])) {
            /* Truncated or broken sequence: consume only the bytes that
             * were valid so the next byte is examined on its own. A zero
             * terminator is never a continuation byte, so it is not skipped. */
            *unit = UTF8_TO_UCS2_REPLACEMENT;
            return k;
        }
        value = (value << 6) | (p[k] & 0x3Fu);
    }

    if (extra == 3) {
        *unit = UTF8_TO_UCS2_REPLACEMENT;
    } else {
        *unit = (unsigned short)value;
    }
    return extra + 1;
}

/**
 * Convert UTF-8 text to UCS-2.
 *
 * Input is consumed until a zero byte is met or `utf8_size` bytes have been
 * read. Sequences of 1 to 3 bytes are decoded to one code unit each. Input
 * that cannot be decoded (stray continuation bytes, invalid lead bytes,
 * truncated sequences, 4-byte sequences) produces the replacement code unit
 * 0xFFFD instead. Overlong forms and encoded surrogates are not rejected.
 *
 * @param utf8       Input bytes; may be NULL.
 * @param utf8_size  Maximum number of bytes to read from `utf8`.
 * @param ucs2       Output buffer, or NULL to only compute the required size.
 * @param ucs2_size  Capacity of `ucs2` in code units; ignored when `ucs2` is
 *                   NULL.
 * @return Number of code units written to `ucs2`, or the number required when
 *         `ucs2` is NULL. No terminating zero code unit is written or counted.
 */
unsigned int Utf8ToUcs2(unsigned char *utf8, unsigned int utf8_size,
                        unsigned short *ucs2, unsigned int ucs2_size)
{
    if (!utf8) {
        return 0;
    }

    unsigned int consumed = 0;
    unsigned int length = 0;

    /* Bound check first, dereference second. */
    while (consumed < utf8_size && utf8[consumed] != 0) {
        if (ucs2 && length >= ucs2_size) {
            break;
        }

        unsigned short unit;

        consumed += utf8_decode_unit(utf8 + consumed, utf8_size - consumed, &unit);
        if (ucs2) {
            ucs2[length] = unit;
        }
        length++;
    }

    return length;
}
字符编码之UCS-2与Utf-8


当前,Unicode深入人心,且UTF-8大行其道,UCS编码基本被等同于UTF-16,UTF-32了,所以目前UCS基本淡出了人们的视野。(Windows NT用的就是UCS-2)




from: http://www.cnblogs.com/jojodru/archive/2012/07/03/2574616.html