Java读取txt文档,自动识别txt文件编码

本博客介绍了一个Java程序,用于读取TXT文件的内容并打印到控制台。此外,还提供了一种方法来自动识别文件的编码格式,包括Unicode、UTF-16、UTF-8和GB2312等,确保了文件内容的正确解析。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

代码

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Reads a text file line by line, choosing the decoder from the file's byte-order mark (BOM).
 *
 * <p>Recognised BOMs are UTF-16 little endian ({@code FF FE}), UTF-16 big endian
 * ({@code FE FF}) and UTF-8 ({@code EF BB BF}). A file without one of these BOMs is decoded
 * as {@code gb2312}.
 *
 * <p>Originally written Nov 26, 2018.
 *
 * @author Leemeea
 */
public class ReadTxt {

    /**
     * Prints every line of a text file to standard output.
     *
     * @param args optional; {@code args[0]} is the path of the file to read. When no argument
     *             is given the original sample path is used.
     */
    public static void main(String[] args) {
        String filePath = args.length > 0
                ? args[0]
                : "C:\\Users\\Administrator\\Desktop\\新建文本文档 (2).txt";

        for (String line : readFileByLines(filePath)) {
            System.out.println(line);
        }
    }

    /**
     * Reads all lines of a text file, decoding it according to its BOM.
     *
     * <p>The BOM itself is never part of the returned text. I/O failures are reported on
     * standard error and the lines read up to that point (possibly none) are returned, so
     * this method does not throw {@link IOException}.
     *
     * @param fileName path of the file to read
     * @return the lines of the file in order, without line terminators; never {@code null}
     */
    public static List<String> readFileByLines(String fileName) {
        List<String> list = new ArrayList<String>();
        File file = new File(fileName);

        // Both resources are closed automatically, also when opening the reader fails.
        try (FileInputStream in = new FileInputStream(file);
             BufferedReader reader = openReaderAfterBom(in)) {
            String line;
            while ((line = reader.readLine()) != null) {
                list.add(line);
            }
        } catch (IOException e) {
            // Best effort: report the problem and return whatever was read so far.
            e.printStackTrace();
        }
        return list;
    }

    /**
     * Inspects the first bytes of the stream, repositions it just behind the BOM (or at the
     * start when there is none) and wraps it in a reader using the matching charset.
     */
    private static BufferedReader openReaderAfterBom(FileInputStream in) throws IOException {
        byte[] head = new byte[3];
        int headLength = readHead(in, head);

        String charsetName;
        int bomLength;
        if (headLength >= 2 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
            // UTF-16 little endian ("Unicode" in Windows Notepad).
            charsetName = "UTF-16LE";
            bomLength = 2;
        } else if (headLength >= 2 && head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
            // UTF-16 big endian.
            charsetName = "UTF-16BE";
            bomLength = 2;
        } else if (headLength == 3
                && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
            // The UTF-8 decoder would turn the BOM into a leading U+FEFF, so skip it here.
            charsetName = "UTF-8";
            bomLength = 3;
        } else {
            // No BOM: treated as an ANSI (GB2312) file.
            charsetName = "gb2312";
            bomLength = 0;
        }

        // Moving the channel position also moves the stream's read position, so the file
        // does not have to be opened a second time.
        in.getChannel().position(bomLength);
        return new BufferedReader(new InputStreamReader(in, charsetName));
    }

    /** Fills {@code head} as far as the stream allows and returns the number of bytes read. */
    private static int readHead(InputStream in, byte[] head) throws IOException {
        int total = 0;
        while (total < head.length) {
            int n = in.read(head, total, head.length - total);
            if (n < 0) {
                break;
            }
            total += n;
        }
        return total;
    }
}

2022-04-25 补充:上面的代码只能识别带 BOM 的文件,判断字符编码的方式有问题;下面是改进后的获取字符编码的代码

/**
 * Guesses the character encoding of a text file.
 *
 * <p>A byte-order mark decides the result directly: {@code FF FE} gives {@code "UTF-16LE"},
 * {@code FE FF} gives {@code "UTF-16BE"} and {@code EF BB BF} gives {@code "UTF-8"}. Without
 * a BOM the content is scanned with a heuristic: the file is reported as {@code "UTF-8"} when
 * the first three-byte lead byte ({@code E0-EF}) is followed by two continuation bytes, and
 * as {@code "GBK"} in every other case, including an empty or pure-ASCII file.
 *
 * <p>Failures are reported on standard error and the value determined so far (by default
 * {@code "GBK"}) is returned.
 *
 * @param sourceFile the file to inspect
 * @return one of {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"} or {@code "GBK"}
 */
private static  String getFilecharset(File sourceFile) {
    String charset = "GBK";
    // try-with-resources closes the stream on every path, including early exit and errors.
    try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile))) {
        byte[] first3Bytes = new byte[3];
        // The read limit must cover the three bytes read before reset() is called.
        bis.mark(first3Bytes.length);
        int read = bis.read(first3Bytes, 0, first3Bytes.length);
        if (read == -1) {
            // Empty file: keep the ANSI default.
            charset = "GBK";
        } else if (first3Bytes[0] == (byte) 0xFF
                && first3Bytes[1] == (byte) 0xFE) {
            // Unicode (UTF-16 little endian)
            charset = "UTF-16LE";
        } else if (first3Bytes[0] == (byte) 0xFE
                && first3Bytes[1] == (byte) 0xFF) {
            // Unicode big endian
            charset = "UTF-16BE";
        } else if (first3Bytes[0] == (byte) 0xEF
                && first3Bytes[1] == (byte) 0xBB
                && first3Bytes[2] == (byte) 0xBF) {
            // UTF-8 with BOM
            charset = "UTF-8";
        } else {
            // No BOM: rewind and scan the content from the first byte.
            bis.reset();
            if (looksLikeUtf8(bis)) {
                charset = "UTF-8";
            }
        }
    } catch (IOException | RuntimeException e) {
        // Deliberately best effort: report and fall through to the value found so far.
        e.printStackTrace();
    }
    return charset;
}

/**
 * Scans a BOM-less stream and returns {@code true} when the first three-byte lead byte is
 * followed by two UTF-8 continuation bytes; any other decisive byte pattern, or reaching the
 * end of the stream, returns {@code false}.
 */
private static boolean looksLikeUtf8(BufferedInputStream in) throws IOException {
    int b;
    while ((b = in.read()) != -1) {
        if (b >= 0xF0) {
            return false;
        }
        if (0x80 <= b && b <= 0xBF) {
            // A continuation byte on its own is also counted as GBK.
            return false;
        }
        if (0xC0 <= b && b <= 0xDF) {
            b = in.read();
            if (0x80 <= b && b <= 0xBF) {
                // A two-byte pattern (C0-DF, 80-BF) may also occur in GB encodings: keep scanning.
                continue;
            }
            return false;
        } else if (0xE0 <= b && b <= 0xEF) {
            // Can still be wrong, but the probability is small.
            b = in.read();
            if (0x80 <= b && b <= 0xBF) {
                b = in.read();
                return 0x80 <= b && b <= 0xBF;
            }
            return false;
        }
    }
    return false;
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值