JDK1.5和1.6的GBK转UTF-8问题

最新推荐文章于 2021-02-19 15:17:47 发布

原创最新推荐文章于 2021-02-19 15:17:47 发布 · 617 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#jdk #string #byte #null #exception #hex

JAVA 专栏收录该内容

11 篇文章

订阅专栏

本文介绍了一位开发者在使用不同版本的JDK（从1.5升级到1.6）读取含有中文字符的XML文件时遇到的问题。主要讨论了在JDK1.6中如何通过Unicode转换来正确解析XML文件中的中文字符，并提供了具体的代码实现。

事情起源于一次读取网络上带有中文的 XML 信息：使用 JDK 1.5 时转换一切正常，换成 JDK 1.6 后转换出现问题——XML 文件中只有中文能正常显示，其他字符全部变成乱码。经过多次测试发现，JDK 1.5 与 JDK 1.6 的编码处理不一样：在 1.5 下直接将 GBK 转为 UTF-8 是可行的，而在 1.6 下只能借助 Unicode 转义进行转换，并且程序只能在控制台编码为 UTF-8 的条件下运行。代码如下：

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;

import org.jdom.Document;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;

public class ParserTestCase {
    public static void main(String[] args) {

        try {

            ParserTestCase convert = new ParserTestCase();
            URL url = new URL(
                    "http://www.google.com/ig/api?hl=zh-cn&weather=beijing");

//            InputStream is = convert.getClass().getResourceAsStream("test.xml");
            InputStream is = url.openStream();
            byte[] b = new byte[is.available()];
            System.out.println("len = " + b.length);
            for (int i = 0; i < b.length; i++) {
                b[i] = (byte) is.read();
            }

            EncodingUtils eu = new EncodingUtils();
            System.out.println("origin " +eu.toHexString(b));

            System.out.println(unicode2string(new String(b)));
            String gbk = new String(b, "GBK");
            System.out.println(gbk);

            FileHandler file = new FileHandler("1.6.txt");

//            byte[] fullByte = convert.gbk2utf8("锟斤拷锟斤拷sdfhskldfhs8673**667323");
            byte[] fullByte = convert.gbk2utf8(gbk);
            String fullStr = new String(fullByte, "UTF-8");
            System.out.println("exchanged " +eu.toHexString(fullByte));
            System.out.println("string from GBK to UTF-8 byte: " + fullStr);
            String unicodes = string2unicode(gbk);
            System.out.println("GBK 2 unicode :" + unicodes);
            System.out.println("unicode 2 str :" + unicode2string(unicodes));

//            file1_5.writeFile(s + "/n" + fullStr);
            file.writeFile(unicode2string(unicodes));


            SAXBuilder builder = new SAXBuilder();
            Document doc = builder.build(file.getFile());
            XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
            outputter.output(doc, System.out);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public byte[] gbk2utf8(String chenese) {
        char c[] = chenese.toCharArray();
        byte[] fullByte = new byte[3 * c.length];
        for (int i = 0; i < c.length; i++) {
            int m = (int) c[i];
            String word = Integer.toBinaryString(m);
            // System.out.println(word);

            StringBuffer sb = new StringBuffer();
            int len = 16 - word.length();
            //
            for (int j = 0; j < len; j++) {
                sb.append("0");
            }
            sb.append(word);
            sb.insert(0, "1110");
            sb.insert(8, "10");
            sb.insert(16, "10");

            // System.out.println(sb.toString());

            String s1 = sb.substring(0, 8);
            String s2 = sb.substring(8, 16);
            String s3 = sb.substring(16);

            byte b0 = Integer.valueOf(s1, 2).byteValue();
            byte b1 = Integer.valueOf(s2, 2).byteValue();
            byte b2 = Integer.valueOf(s3, 2).byteValue();
            byte[] bf = new byte[3];
            bf[0] = b0;
            fullByte[i * 3] = bf[0];
            bf[1] = b1;
            fullByte[i * 3 + 1] = bf[1];
            bf[2] = b2;
            fullByte[i * 3 + 2] = bf[2];

        }
        return fullByte;
    }

    public static String string2unicode(String s) {

        if (s == null)
            return null;

        StringBuffer result = new StringBuffer();
        int i, tempI, j, ch;
        for (i = 0; i < s.length(); i++) {
            if (s.charAt(i) >= 0x2018) {
                result.append('//');
                result.append('u');
                String hex = Integer.toHexString(s.charAt(i));
                result.append(hex);
            } else {
                result.append(s.charAt(i));
            }
        }
        return result.toString();
    }

    public static String unicode2string(String s) {

        if (s == null)
            return null;

        StringBuffer result = new StringBuffer();
        int i, tempI, j, ch;
        for (i = 0; i < s.length(); i++) {
            if ((ch = s.charAt(i)) == '//') {
                tempI = i;
                i += 2;
                while (s.length() > i && s.charAt(i) == 'u') {
                    i++;
                }
                if (s.length() >= i + 4) {
                    ch = Integer.parseInt(s.substring(i, i + 4), 16);
                    i += 3;
                } else {
                    i = tempI;
                }
            }
            result.append((char) ch);
        }
        return result.toString();
    }
}