事情起源于一次读取网络上带有中文的 XML 信息:使用 JDK 1.5 时转换一切正常,换成 JDK 1.6 后转换出现问题,XML 文件中只有中文能正常显示,其他符号全部显示为无法识别的乱码。经过多次测试发现,JDK 1.5 与 JDK 1.6 的编码处理不一样:在 1.5 中直接将 GBK 转为 UTF-8 是可行的,而在 1.6 中只能通过 Unicode 转义进行转换,并且以下程序只能在控制台编码为 UTF-8 的条件下运行。代码如下:
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import org.jdom.Document;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
public class ParserTestCase {
public static void main(String[] args) {
try {
ParserTestCase convert = new ParserTestCase();
URL url = new URL(
"http://www.google.com/ig/api?hl=zh-cn&weather=beijing");
// InputStream is = convert.getClass().getResourceAsStream("test.xml");
InputStream is = url.openStream();
byte[] b = new byte[is.available()];
System.out.println("len = " + b.length);
for (int i = 0; i < b.length; i++) {
b[i] = (byte) is.read();
}
EncodingUtils eu = new EncodingUtils();
System.out.println("origin " +eu.toHexString(b));
System.out.println(unicode2string(new String(b)));
String gbk = new String(b, "GBK");
System.out.println(gbk);
FileHandler file = new FileHandler("1.6.txt");
// byte[] fullByte = convert.gbk2utf8("锟斤拷锟斤拷sdfhskldfhs8673**667323");
byte[] fullByte = convert.gbk2utf8(gbk);
String fullStr = new String(fullByte, "UTF-8");
System.out.println("exchanged " +eu.toHexString(fullByte));
System.out.println("string from GBK to UTF-8 byte: " + fullStr);
String unicodes = string2unicode(gbk);
System.out.println("GBK 2 unicode :" + unicodes);
System.out.println("unicode 2 str :" + unicode2string(unicodes));
// file1_5.writeFile(s + "/n" + fullStr);
file.writeFile(unicode2string(unicodes));
SAXBuilder builder = new SAXBuilder();
Document doc = builder.build(file.getFile());
XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
outputter.output(doc, System.out);
} catch (Exception e) {
e.printStackTrace();
}
}
public byte[] gbk2utf8(String chenese) {
char c[] = chenese.toCharArray();
byte[] fullByte = new byte[3 * c.length];
for (int i = 0; i < c.length; i++) {
int m = (int) c[i];
String word = Integer.toBinaryString(m);
// System.out.println(word);
StringBuffer sb = new StringBuffer();
int len = 16 - word.length();
//
for (int j = 0; j < len; j++) {
sb.append("0");
}
sb.append(word);
sb.insert(0, "1110");
sb.insert(8, "10");
sb.insert(16, "10");
// System.out.println(sb.toString());
String s1 = sb.substring(0, 8);
String s2 = sb.substring(8, 16);
String s3 = sb.substring(16);
byte b0 = Integer.valueOf(s1, 2).byteValue();
byte b1 = Integer.valueOf(s2, 2).byteValue();
byte b2 = Integer.valueOf(s3, 2).byteValue();
byte[] bf = new byte[3];
bf[0] = b0;
fullByte[i * 3] = bf[0];
bf[1] = b1;
fullByte[i * 3 + 1] = bf[1];
bf[2] = b2;
fullByte[i * 3 + 2] = bf[2];
}
return fullByte;
}
public static String string2unicode(String s) {
if (s == null)
return null;
StringBuffer result = new StringBuffer();
int i, tempI, j, ch;
for (i = 0; i < s.length(); i++) {
if (s.charAt(i) >= 0x2018) {
result.append('//');
result.append('u');
String hex = Integer.toHexString(s.charAt(i));
result.append(hex);
} else {
result.append(s.charAt(i));
}
}
return result.toString();
}
public static String unicode2string(String s) {
if (s == null)
return null;
StringBuffer result = new StringBuffer();
int i, tempI, j, ch;
for (i = 0; i < s.length(); i++) {
if ((ch = s.charAt(i)) == '//') {
tempI = i;
i += 2;
while (s.length() > i && s.charAt(i) == 'u') {
i++;
}
if (s.length() >= i + 4) {
ch = Integer.parseInt(s.substring(i, i + 4), 16);
i += 3;
} else {
i = tempI;
}
}
result.append((char) ch);
}
return result.toString();
}
}
本文介绍了一位开发者在使用不同版本的JDK(从1.5升级到1.6)读取含有中文字符的XML文件时遇到的问题。主要讨论了在JDK1.6中如何通过Unicode转换来正确解析XML文件中的中文字符,并提供了具体的代码实现。
1817

被折叠的 条评论
为什么被折叠?



