import lombok.extern.slf4j.Slf4j; import java.io.*; /** * 目前版本只支持文本文件. * * @Auther: zyx. * @Date: 2018/12/27 19:08 */ @Slf4j public class EncodeUtils { /** * 检查文件编码是否是utf-8 * * @param: [file] * @return: boolean * @date: 2018/12/26 15:43 */ public static boolean utf8EncodeCheck(File file) { return utf8EncodeCheck(file.getAbsolutePath()); } /** * 检查文件编码是否是utf-8 * * @param: [filePath] * @return: boolean * @date: 2018/12/26 15:43 */ public static boolean utf8EncodeCheck(String filePath) { File file = new File(filePath); if(file == null || !file.exists()){ log.error("file no exist, path is :{}...", filePath); return false; } byte[] bytes = fileTransformByte(file); if (isUTF8(bytes)) return true; return false; } /** * 将文件转换为字节数组 * * @param: [file] * @return: byte[] * @date: 2018/12/27 19:15 */ public static byte[] fileTransformByte(File file) { byte[] buffer = null; try { FileInputStream fis = new FileInputStream(file); ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); byte[] b = new byte[1024]; int n; while ((n = fis.read(b)) != -1) { bos.write(b, 0, n); } fis.close(); bos.close(); buffer = bos.toByteArray(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return buffer; } /** * 以指定编码格式读取文件内容. * * @param: [filePath, charsetName] * @return: java.lang.String * @date: 2018/12/27 19:39 */ public static String readFile(String filePath, String charsetName) throws Exception { StringBuilder fileContent = new StringBuilder(); File f = new File(filePath); if (f.isFile() && f.exists()) { InputStreamReader read = new InputStreamReader(new FileInputStream(f), charsetName); BufferedReader reader = new BufferedReader(read); String line; while ((line = reader.readLine()) != null) { fileContent.append(line); fileContent.append("\r\n"); } read.close(); } return fileContent.toString(); } /** * 以指定编码格式写入文件. 
* * @param: [filePath, fileContent, charsetName] * @return: void * @date: 2018/12/27 19:46 */ public static void writeFile(String filePath, String fileContent, String charsetName) throws Exception { File f = new File(filePath); if (!f.exists()) { f.createNewFile(); } //利用getBytes将unicode字符串转成UTF-8格式的字节数组 // byte[] utf8Bytes = fileContent.getBytes("UTF-8"); //然后用utf-8 对这个字节数组解码成新的字符串 // String utf8Str = new String(utf8Bytes, "UTF-8"); OutputStreamWriter write = new OutputStreamWriter(new FileOutputStream(f), charsetName); BufferedWriter writer = new BufferedWriter(write); writer.write(fileContent); writer.close(); } /** * 检查字节编码是否是utf-8 * * @param: [rawtext] * @return: boolean * @date: 2018/12/27 19:18 */ public static boolean isUTF8(byte[] rawtext) { int score = 0; int i, rawtextlen = 0; int goodbytes = 0, asciibytes = 0; // Maybe also use UTF8 Byte Order Mark: EF BB BF // Check to see if characters fit into acceptable ranges rawtextlen = rawtext.length; for (i = 0; i < rawtextlen; i++) { if ((rawtext[i] & (byte) 0x7F) == rawtext[i]) { // 最高位是0的ASCII字符 asciibytes++; // Ignore ASCII, can throw off count } else if (-64 <= rawtext[i] && rawtext[i] <= -33 //-0x40~-0x21 && // Two bytes i + 1 < rawtextlen && -128 <= rawtext[i + 1] && rawtext[i + 1] <= -65) { goodbytes += 2; i++; } else if (-32 <= rawtext[i] && rawtext[i] <= -17 && // Three bytes i + 2 < rawtextlen && -128 <= rawtext[i + 1] && rawtext[i + 1] <= -65 && -128 <= rawtext[i + 2] && rawtext[i + 2] <= -65) { goodbytes += 3; i += 2; } } if (asciibytes == rawtextlen) { return false; } score = 100 * goodbytes / (rawtextlen - asciibytes); // If not above 98, reduce to zero to prevent coincidental matches // Allows for some (few) bad formed sequences if (score > 98) { return true; } else if (score > 95 && goodbytes > 30) { return true; } else { return false; } } }
// Reposted from: https://my.oschina.net/zhangyaxin/blog/2994987