今天在使用搜索的时候,发现网站对中文的编码不是常见的%AB%形式,而是%u4e92%这种形式,即采用的是Unicode编码,于是决定对其解码。
package com.bc.codec; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; /** * <pre> * 简单的字符编解码 * </pre> * * @author badu * */ public class DecodeTest { final String ENCODE_UTF8 = "UTF-8"; final String ENCODE_GBK = "GBK"; public static void main(String[] args) { DecodeTest test = new DecodeTest(); // encoding String msg = "http://weibo.com/?=纳木措"; System.out.println(test.encode(msg)); System.out.println(test.decode(test.encode(msg))); // decoding // 汉字"互联网"的unicode编码 String dsg = "\u4e92%\u8054%\u7f51"; String regex = "%"; String[] str = dsg.split(regex); for (String s : str) { System.out.println(test.unicodeToUtf8(s)); } } public String decode(String str) { try { return URLDecoder.decode(str, ENCODE_UTF8); } catch (UnsupportedEncodingException e) { e.printStackTrace(); return null; } } public String encode(String str) { try { return URLEncoder.encode(str, ENCODE_UTF8); } catch (UnsupportedEncodingException e) { e.printStackTrace(); return null; } } public String unicodeToUtf8(String theString) { char aChar; int len = theString.length(); StringBuffer outBuffer = new StringBuffer(len); for (int x = 0; x < len;) { aChar = theString.charAt(x++); if (aChar == '\\') { aChar = theString.charAt(x++); if (aChar == 'u') { // Read the xxxx int value = 0; for (int i = 0; i < 4; i++) { aChar = theString.charAt(x++); switch (aChar) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': value = (value << 4) + aChar - '0'; break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': value = (value << 4) + 10 + aChar - 'a'; break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': value = (value << 4) + 10 + aChar - 'A'; break; default: throw new IllegalArgumentException( "Malformed <a><font color=#4563b9>\\uxxxx</font></a> encoding."); } } outBuffer.append((char) value); } else { if (aChar == 't') aChar = '\t'; else if (aChar == 'r') aChar = '\r'; else if (aChar == 'n') 
aChar = '\n'; else if (aChar == 'f') aChar = '\f'; outBuffer.append(aChar); } } else outBuffer.append(aChar); } return outBuffer.toString(); } }
%AB%这种形式的编码比较常见,直接用URLEncoder和URLDecoder即可实现编码和解码。%u4e92%这种格式则多了一步转换:需要把u后面的4位十六进制数还原成对应的Unicode字符。
顺便比较了各个搜索引擎对中文的处理,其中baidu.com、soso.com、sogou.com采用的是%AB%式编码;
google.com.hk、youdao.com则直接是中文,采用UTF-8编码。
参照:http://zhaobohao.iteye.com/blog/591650