解决FileInputStream读取ANSI格式txt中文乱码问题
GBK中文转为byte后以负数开头,正常来说为连续两个负数,生僻字可能为一个负数和一个正数,所以需要特殊处理一下
注:UTF-8编码的txt中一个中文字符通常占三个byte,故按负数字节个数判断是否截断的方法不适用于UTF-8
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Arrays;
/**
 * Demo: read a GBK-encoded text file through a {@link FileInputStream}
 * without corrupting multi-byte characters.
 *
 * <p>Decoding each fixed-size byte buffer on its own breaks any character
 * whose bytes straddle two reads. Guessing the split from the number of
 * negative bytes is not reliable, because the second byte of a GBK
 * character may be positive (see the byte dump printed for the sample
 * string). The bytes are therefore decoded through an
 * {@link InputStreamReader}, which keeps an incomplete sequence until the
 * following bytes arrive.
 */
public class FileInputStreamTest03 {

    /** Charset name used for both the round-trip sample and the file. */
    private static final String GBK = "GBK";

    /** File that is read when no path is supplied on the command line. */
    private static final String DEFAULT_PATH =
            "C:\\Users\\dell\\Documents\\a代码备份\\python\\爬虫\\bilibili\\list.txt";

    /** Number of chars requested from the reader per call. */
    private static final int CHUNK_CHARS = 1024;

    /**
     * Prints the GBK bytes of a sample string, prints the string decoded
     * back from those bytes, then prints the decoded content of a file.
     *
     * @param args optional; {@code args[0]} overrides the file to read,
     *             otherwise {@link #DEFAULT_PATH} is used
     * @throws java.nio.charset.UnsupportedCharsetException if this JVM has
     *         no GBK charset
     */
    public static void main(String[] args) {
        Charset gbk = Charset.forName(GBK);

        // test: encode/decode round trip of a string mixing ASCII and Chinese
        String s = "中c国Dh丄";
        byte[] bs = s.getBytes(gbk);
        System.out.println(Arrays.toString(bs));
        System.out.println(new String(bs, gbk));

        // main: dump the file content
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        try (InputStream fp = new FileInputStream(path)) {
            System.out.print(decode(fp, gbk));
        } catch (IOException e) {
            System.err.println("Failed to read " + path + ": " + e);
        }
    }

    /**
     * Reads {@code in} to its end and decodes the bytes with {@code charset}.
     *
     * <p>The stream is not closed; that stays the caller's responsibility.
     *
     * @param in      source of encoded bytes
     * @param charset charset the bytes are encoded in
     * @return the decoded text, empty if the stream has no bytes
     * @throws IOException if reading from {@code in} fails
     */
    static String decode(InputStream in, Charset charset) throws IOException {
        // The reader is deliberately left open: closing it would close `in`.
        Reader reader = new InputStreamReader(in, charset);
        StringBuilder text = new StringBuilder();
        char[] chunk = new char[CHUNK_CHARS];
        int count;
        while ((count = reader.read(chunk)) != -1) {
            text.append(chunk, 0, count);
        }
        return text.toString();
    }
}