UTF-8 转 gb2312 的代码

function UTF2GB(UTFStr)
for Dig=1 to len(UTFStr)
if mid(UTFStr,Dig,1)="%" then
if len(UTFStr) >= Dig+8 then
GBStr=GBStr & ConvChinese(mid(UTFStr,Dig,9))
Dig=Dig+8
else
GBStr=GBStr & mid(UTFStr,Dig,1)
end if
else
GBStr=GBStr & mid(UTFStr,Dig,1)
end if
next
UTF2GB=GBStr
end function

function ConvChinese(x)
A=split(mid(x,2),"%")
i=0
j=0

for i=0 to ubound(A)
A(i)=c16to2(A(i))
next

for i=0 to ubound(A)-1
DigS=instr(A(i),"0")
Unicode=""
for j=1 to DigS-1
if j=1 then
A(i)=right(A(i),len(A(i))-DigS)
Unicode=Unicode & A(i)
else
i=i+1
A(i)=right(A(i),len(A(i))-2)
Unicode=Unicode & A(i)
end if
next

if len(c2to16(Unicode))=4 then
ConvChinese=ConvChinese & chrw(int("&H" & c2to16(Unicode)))
else
ConvChinese=ConvChinese & chr(int("&H" & c2to16(Unicode)))
end if
next
end function

function c2to16(x)
i=1
for i=1 to len(x) step 4
c2to16=c2to16 & hex(c2to10(mid(x,i,4)))
next
end function

function c2to10(x)
c2to10=0
if x="0" then exit function
i=0
for i= 0 to len(x) -1
if mid(x,len(x)-i,1)="1" then c2to10=c2to10+2^(i)
next
end function

function c16to2(x)
i=0
for i=1 to len(trim(x))
tempstr= c10to2(cint(int("&h" & mid(x,i,1))))
do while len(tempstr)<4
tempstr="0" & tempstr
loop
c16to2=c16to2 & tempstr
next
end function

function c10to2(x)
mysign=sgn(x)
x=abs(x)
DigS=1
do
if x<2^DigS then
exit do
else
DigS=DigS+1
end if
loop
tempnum=x

i=0
for i=DigS to 1 step-1
if tempnum>=2^(i-1) then
tempnum=tempnum-2^(i-1)
c10to2=c10to2 & "1"
else
c10to2=c10to2 & "0"
end if
next
if mysign=-1 then c10to2="-" & c10to2
end function

这段码是用于UTF-8转GB2312的，大多数可以正常转换，但少部分转的不正确，会出现乱码，不知问题出在哪？请高手赐教。

14 个解决方案

#1

没有细看
不过 GB 比 UTF-8 字符范围要小的多，从 UTF-8 向 GB2312 转换，会有字符转换不正确也是理所当然

#2

这个我明白，但是字符是常规汉字，并非生僻的文字。

#3

抚顺青松药业有限公司
%E6%8A%9A%E9%A1%BA%E9%9D%92%E6%9D%BE%E8%8D%AF%E4%B8%9A%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%09%09%09%09%0D%0A%09%09
这个应该没有超过GB2312的字符范围吧，转换后即成乱码。

#4

既然是 URI 编码，为什么不利用 JScript 的 decodeURIComponent

<script language="JScript" runat="Server">
function decodeURL(s) { return decodeURIComponent(s) }
</script>

<%
Response.Write decodeURL("%E6%8A%9A%E9%A1%BA%E9%9D%92%E6%9D%BE%E8%8D%AF%E4%B8%9A%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%09%09%09%09%0D%0A%09%09")
%>

#5

http://blog.csdn.net/wtogether/archive/2008/05/23/2474436.aspx

#6

利用一下 ADODB.Stream 也可以


Function UTF2GB(s)
	Dim sm
	Set sm = Server.CreateObject("ADODB.Stream")
	With sm
		.Type = 2
		.Mode = 3
		.Open
		.CharSet = "iso-8859-1"
		.WriteText UnEscape(s)
		.Position = 0
		.CharSet = "UTF-8"
		UTF2GB = .ReadText(-1)
		.Close
	End With
	Set sm = Nothing
End Function


Response.Write UTF2GB("%E6%8A%9A%E9%A1%BA%E9%9D%92%E6%9D%BE%E8%8D%AF%E4%B8%9A%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%09%09%09%09%0D%0A%09%09")

#7

问题已经得到解决。
不过我想做得更好些，抚顺青松药业有限公司这个词条，UTF-8的URI是
%E6%8A%9A%E9%A1%BA%E9%9D%92%E6%9D%BE%E8%8D%AF%E4%B8%9A%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%09%09%09%09%0D%0A%09%09
GB2312的URI是
%B8%A7%CB%B3%C7%E0%CB%C9%D2%A9%D2%B5%D3%D0%CF%DE%B9%AB%CB%BE%09%09%09%09%09%09
能否通过程序自动判断出是哪种编码，而不用加多一个URI的参数进行传递如&code=utf-8 或者 &code=gb2312
google的搜索就能自动识别，而不管你传递的哪种ＵＲＩ，如下：
http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&q=%E6%8A%9A%E9%A1%BA%E9%9D%92%E6%9D%BE%E8%8D%AF%E4%B8%9A%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&btnG=Google+%E6%90%9C%E7%B4%A2&meta=&aq=f
或者
http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&q=%B8%A7%CB%B3%C7%E0%CB%C9%D2%A9%D2%B5%D3%D0%CF%DE%B9%AB%CB%BE%09%09%09%09%09%09&btnG=Google+%E6%90%9C%E7%B4%A2&meta=&aq=f
它都能自动识别出来

#8


' 不肯定正确，自己再测测
Function UTF2GB(s)
	s = UnEscape(s)
	Dim reg, cs
	cs = "GBK"
	Set reg = New RegExp
	reg.IgnoreCase = True
	reg.Pattern = "^(?:[\x00-\x7f]|[\xfc-\xff][\x80-\xbf]{5}|[\xf8-\xfb][\x80-\xbf]{4}|[\xf0-\xf7][\x80-\xbf]{3}|[\xe0-\xef][\x80-\xbf]{2}|[\xc0-\xdf][\x80-\xbf])+$"
	If reg.Test(s) Then cs = "UTF-8"
	Set reg = Nothing
	Dim sm
	Set sm = CreateObject("ADODB.Stream")
	With sm
		.Type = 2
		.Mode = 3
		.Open
		.CharSet = "iso-8859-1"
		.WriteText s
		.Position = 0
		.CharSet = cs
		UTF2GB = .ReadText(-1)
		.Close
	End With
	Set sm = Nothing
End Function


Response.Write UTF2GB("%B8%A7%CB%B3%C7%E0%CB%C9%D2%A9%D2%B5%D3%D0%CF%DE%B9%AB%CB%BE%09%09%09%09%09%09")
Response.Write UTF2GB("%E6%8A%9A%E9%A1%BA%E9%9D%92%E6%9D%BE%E8%8D%AF%E4%B8%9A%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%09%09%09%09%0D%0A%09%09")

自动识别要搜索字符串，可能对于提交很长数据的话性能就恐怕....

识别也不一定准确，其实 Google 都很难做的准确, 提交 "%C5%A3%C6%A4%D1%A2" (牛皮癣GB2312) 给 Google 就乱码了

#9

上面的代码很有用，不胜感激，CSDN真是高手如云啊。

#10

问题补充：UTF2GB
str = "%D5%AC%C5%AE" '“宅女”的gb2312编码
Response.Write UTF2GB(str)
出现??

#11

是编码误认
这个问题不知如何解决啊

把 "%D5%AC%C5%AE" 提交给 Google 都会
http://www.google.cn/search?q=%D5%AC%C5%AE

#12

http://www.baidu.com/s?wd=%D4%BD%D2%B0
越野

response.write DecodeURI("%D4%BD%D2%B0")
得到的是乱码??

#13

http://www.baidu.com/s?wd=%D4%BD%D2%B0
越野

response.write UTF2GB("%D4%BD%D2%B0")
得到的是乱码??

#14

将

reg.Pattern = "^(?:[\x00-\x7f]|[\xfc-\xff][\x80-\xbf]{5}|[\xf8-\xfb][\x80-\xbf]{4}|[\xf0-\xf7][\x80-\xbf]{3}|[\xe0-\xef][\x80-\xbf]{2}|[\xc0-\xdf][\x80-\xbf])+$"

改成:

reg.Pattern = "^(?:[\x00-\x7f]|[\xe0-\xef][\x80-\xbf]{2})+$"

可放弃对外文UTF-8字符的识别,而获得更多的GBK支持

#1