GBK编码字符串转UTF-8

最新推荐文章于 2025-11-07 12:26:29 发布

原创最新推荐文章于 2025-11-07 12:26:29 发布 · 1.6k 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#数据库 #java

学习笔记专栏收录该内容

38 篇文章

订阅专栏

本文介绍了一种从GBK编码转换到UTF-8编码的方法。通过Java代码实现，详细展示了如何利用位操作来完成这一过程。对于需要处理中文字符编码的开发者来说，这种方法提供了一种高效且实用的选择。

由于项目需要，要把 GBK 编码的字符串转成 UTF-8 编码。网上随手搜了一下，找到一些样例代码，虽然能用，但实现很别扭：竟然把字符数值转成二进制的字符串形式再进行操作。于是自己动手，根据 UTF-8 的编码规则用位操作写了一个……其实也没几行，有时还真不能太懒。

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;

/**
 * Encodes Java strings as UTF-8 with explicit bit operations.
 *
 * <p>A Java {@code String} is always held as UTF-16 in memory, whatever charset
 * (GBK in the demo below) its bytes were originally decoded from. The conversion
 * implemented here is therefore UTF-16 code units to UTF-8 bytes.
 *
 * @author GChan
 */
public class CharacterEncodeConverter {

	/** Byte written in place of an unpaired surrogate, which has no valid UTF-8 form. */
	private static final int REPLACEMENT = '?';

	/**
	 * Demo entry point: decodes the GBK bytes {@code D6 D0 32 CE C4} into a string,
	 * re-encodes it as UTF-8 and prints the round-tripped text.
	 *
	 * @param args
	 *            unused command-line arguments
	 */
	public static void main(String[] args) {
		CharacterEncodeConverter convert = new CharacterEncodeConverter();
		byte[] src = { (byte) 0xD6, (byte) 0xD0, (byte) 0x32, (byte) 0xCE, (byte) 0xC4 };

		try {
			byte[] fullByte = convert.gbk2utf8byte(new String(src, "GBK"));
			String fullStr = new String(fullByte, StandardCharsets.UTF_8);
			System.out.println("string from GBK to UTF-8 byte:  " + fullStr);
		} catch (UnsupportedEncodingException e) {
			// GBK is an optional charset; a stripped-down runtime may not ship it.
			System.err.println("GBK charset is not available on this JVM: " + e.getMessage());
		}
	}

	/**
	 * Encodes the given string as UTF-8.
	 *
	 * <p>Each code point is written with the shortest valid form (1 to 4 bytes).
	 * A surrogate pair is combined into one 4-byte sequence; an unpaired surrogate
	 * is written as {@code '?'}. For well-formed input the result is the same as
	 * {@code chinese.getBytes(StandardCharsets.UTF_8)}.
	 *
	 * @param chinese
	 *            the text to encode; must not be {@code null}
	 *
	 * @return a newly allocated array holding exactly the UTF-8 bytes
	 * @throws NullPointerException
	 *             if {@code chinese} is {@code null}
	 */
	public byte[] gbk2utf8byte(String chinese) {
		if (chinese == null) {
			throw new NullPointerException("chinese must not be null");
		}

		final int length = chinese.length();

		// Worst case is 3 bytes per UTF-16 code unit: a BMP character needs at most
		// 3 bytes, and a surrogate pair (2 code units) needs only 4.
		byte[] buffer = new byte[3 * length];
		int counter = 0;

		for (int i = 0; i < length; i++) {
			char unit = chinese.charAt(i);
			int codePoint = unit;

			if (Character.isHighSurrogate(unit) && i + 1 < length
					&& Character.isLowSurrogate(chinese.charAt(i + 1))) {
				// Consume both halves of the pair as one supplementary code point.
				codePoint = Character.toCodePoint(unit, chinese.charAt(++i));
			} else if (Character.isSurrogate(unit)) {
				codePoint = REPLACEMENT;
			}

			counter = encodeCodePoint(codePoint, buffer, counter);
		}

		// Trim the scratch buffer to the number of bytes actually written.
		byte[] result = new byte[counter];
		System.arraycopy(buffer, 0, result, 0, counter);
		return result;
	}

	/**
	 * Writes the UTF-8 form of one code point into {@code out} starting at {@code pos}.
	 *
	 * @param codePoint
	 *            a Unicode scalar value (never a surrogate)
	 * @param out
	 *            destination buffer with room for up to 4 more bytes
	 * @param pos
	 *            index of the first byte to write
	 * @return the index just past the last byte written
	 */
	private static int encodeCodePoint(int codePoint, byte[] out, int pos) {
		if (codePoint < 0x80) {
			// 0xxxxxxx
			out[pos++] = (byte) codePoint;
		} else if (codePoint < 0x800) {
			// 110xxxxx 10xxxxxx
			out[pos++] = (byte) (0xC0 | (codePoint >>> 6));
			out[pos++] = (byte) (0x80 | (codePoint & 0x3F));
		} else if (codePoint < 0x10000) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			out[pos++] = (byte) (0xE0 | (codePoint >>> 12));
			out[pos++] = (byte) (0x80 | ((codePoint >>> 6) & 0x3F));
			out[pos++] = (byte) (0x80 | (codePoint & 0x3F));
		} else {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			out[pos++] = (byte) (0xF0 | (codePoint >>> 18));
			out[pos++] = (byte) (0x80 | ((codePoint >>> 12) & 0x3F));
			out[pos++] = (byte) (0x80 | ((codePoint >>> 6) & 0x3F));
			out[pos++] = (byte) (0x80 | (codePoint & 0x3F));
		}
		return pos;
	}

	/**
	 * Encodes the given string as UTF-8 and decodes the bytes back into a string.
	 *
	 * @param chinese
	 *            the text to round-trip; must not be {@code null}
	 * @return the decoded string; never {@code null}
	 * @throws NullPointerException
	 *             if {@code chinese} is {@code null}
	 */
	public String gbk2utf8(String chinese) {
		return new String(gbk2utf8byte(chinese), StandardCharsets.UTF_8);
	}
}