乱码解析工具类优化过程分析
版本一:
import cn.hutool.core.collection.ConcurrentHashSet;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
import java.util.concurrent.RecursiveTask;
/**
* @author zlj
* @date 2021/11/1
* @version 1.0
* @func 不做优化完成乱码检测
*/
public class MessyCodeDetect {

    /**
     * Highest code unit that is always accepted as valid. The ASCII range
     * (0..127) is treated as never garbled, on the assumption that the
     * encodings of interest all share it.
     */
    private static final char MAX_ASCII = 127;

    /**
     * Reports whether the character lies in the always-valid ASCII range.
     * A plain range check replaces a hash-set lookup: the accepted characters
     * are exactly 0..127 and never change, so no boxing or hashing is needed.
     *
     * @param c character to test
     * @return {@code true} if {@code c} is in 0..127
     */
    private static boolean isAscii(char c) {
        return c <= MAX_ASCII;
    }

    /**
     * Reports whether the character belongs to one of the Unicode blocks this
     * detector accepts as Chinese text (CJK ideographs plus the punctuation and
     * full-width blocks listed below). The check is a constant-time OR-chain.
     *
     * @param c character to test
     * @return {@code true} if the character's Unicode block is one of the accepted blocks
     */
    public static boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
    }

    /**
     * Reports whether the string contains at least one garbled character,
     * i.e. a character that is neither ASCII nor accepted by {@link #isChinese(char)}.
     *
     * @param str text to inspect; may be {@code null}
     * @return {@code false} for {@code null} or empty input, otherwise {@code true}
     *         as soon as one garbled character is found
     */
    public static boolean isMessyCode(String str) {
        // Null or empty text has nothing that could be garbled.
        if (str == null || str.isEmpty()) {
            return false;
        }
        // Scan in place; stop at the first character that is neither ASCII nor Chinese.
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (!isAscii(c) && !isChinese(c)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Demo entry point: prints whether a sample string is detected as garbled.
     *
     * @param args unused
     */
    public static void main(String[] args) throws ExecutionException, InterruptedException {
        String str = "韩顺åÂ你好";
        System.out.println(str+"是否是乱码 :" + isMessyCode(str));
    }
}
版本二:
import cn.hutool.core.collection.ConcurrentHashSet;
import java.util.Set;
import java.util.concurrent.*;
/**
* @author zlj
* @date 2021/11/1
* @version 2.0
* @func 使用ForkJoinPool分治方式优化,将多个线程的时间共享平摊 完成乱码检测
*/
public class MessyCodeDetect2 {

    /**
     * Highest code unit that is always accepted as valid. The ASCII range
     * (0..127) is treated as never garbled, on the assumption that the
     * encodings of interest all share it.
     */
    private static final char MAX_ASCII = 127;

    /**
     * Reports whether the character lies in the always-valid ASCII range.
     * A plain range check replaces a hash-set lookup: the accepted characters
     * are exactly 0..127 and never change, so no boxing or hashing is needed.
     *
     * @param c character to test
     * @return {@code true} if {@code c} is in 0..127
     */
    private static boolean isAscii(char c) {
        return c <= MAX_ASCII;
    }

    /**
     * Fork/join task that counts the garbled characters in the inclusive
     * index range {@code [start, end]} of a character array.
     */
    static class StatisticsMessyCode extends RecursiveTask<Integer> {
        /**
         * Ranges whose {@code end - start} is at most this value are scanned
         * sequentially; larger ranges are split in half. Splitting does not
         * reduce the total work (every character is still inspected once);
         * it only lets the halves be processed in parallel.
         */
        private static final int THRESHOLD = 10;
        private final char[] content;
        private final int start;
        private final int end;

        /**
         * @param content characters to inspect (not copied)
         * @param start   first index of the range, inclusive
         * @param end     last index of the range, inclusive; {@code start - 1} denotes an empty range
         */
        public StatisticsMessyCode(char[] content, int start, int end) {
            this.content = content;
            this.start = start;
            this.end = end;
        }

        /**
         * Reports whether the character belongs to one of the Unicode blocks
         * this detector accepts as Chinese text (CJK ideographs plus the
         * punctuation and full-width blocks listed below). Constant-time OR-chain.
         *
         * @param c character to test
         * @return {@code true} if the character's Unicode block is one of the accepted blocks
         */
        public static boolean isChinese(char c) {
            Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
            return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                    || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
                    || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                    || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
        }

        /**
         * Counts the garbled characters in this task's range.
         *
         * @return number of characters that are neither ASCII nor Chinese
         */
        @Override
        protected Integer compute() {
            // Base case: the range is small enough to scan sequentially.
            if ((end - start) <= THRESHOLD) {
                int messCodeNum = 0;
                for (int i = start; i <= end; i++) {
                    char c = content[i];
                    // Neither ASCII nor Chinese: count it as garbled.
                    if (!isAscii(c) && !isChinese(c)) {
                        messCodeNum++;
                    }
                }
                return messCodeNum;
            }
            // Divide: midpoint computed without risking overflow of start + end.
            int mid = start + ((end - start) >>> 1);
            StatisticsMessyCode left = new StatisticsMessyCode(content, start, mid);
            StatisticsMessyCode right = new StatisticsMessyCode(content, mid + 1, end);
            // Fork only one half and compute the other in this thread, so the
            // current worker does useful work instead of blocking on a join.
            left.fork();
            int rightCount = right.compute();
            return rightCount + left.join();
        }
    }

    /**
     * Demo entry point: counts and prints the garbled characters of a sample string.
     *
     * @param args unused
     */
    public static void main(String[] args) throws ExecutionException, InterruptedException {
        String str = "ʡʈÂĹ【ƬqñǪa\u008Ao̜̊9ɌǪWEȶÇɅ]ý}ʂÃĹʼna\u008Ao̜\n" +
                "̊9]ʼnWEʔ【ǯuάɫˠ【ƬqñǪZɌǪȏĚʡʈ˫yϔǵάБŅɅЇÁ\n" +
                "ĥa\u008Ao̜̊9ĘͰ}WY9ÁĥWY9ɌÁĥWEˠŮǭcȇʞģʅ˾Y\u009E¡\n" +
                "¡【Ƭa\u008Ao̜̊9ɌǪ【ƬWEƐĘͰ}ĘͰ(\t(\n" +
                "ƲƼ̖úMɈβ˱Ɉͯάç\n" +
                "a\u008Ao̜̊9ɌɵĘͰWEqżǽ\u008EĘͰ}ˠUПʔ【РϼZɌǪɪȌ[]ý}\n" +
                "ʂÃĹʼnÁĥʔ?a\u008Ao̜̊9[Áĥʔ?WEY[]ʼn}ʂÃĹʼn]ʼn}ʂϱ\n" +
                "Ͱʔ?a\u008Ao̜̊9[}ʂϱͰʔ?WE";
        System.out.println("字符个数 :"+str.length());
        StatisticsMessyCode messyCodeCount = new StatisticsMessyCode(str.toCharArray(),0,str.length()-1);
        ForkJoinPool pool = new ForkJoinPool();
        try {
            ForkJoinTask<Integer> future = pool.submit(messyCodeCount);
            Integer aLong = future.get();
            System.out.println("乱码个数 : "+ aLong);
            System.out.println("非乱码个数 : "+ (str.length()-aLong));
        } finally {
            // Release the pool's threads even when the task fails.
            pool.shutdown();
        }
    }
}
版本三:
import cn.hutool.core.collection.ConcurrentHashSet;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
import java.util.concurrent.RecursiveTask;
/**
* @author zlj
* @date 2021/11/1
* @version 3.0
* @func 使用ForkJoinPool分治方式优化,将多个线程的时间共享平摊 完成乱码检测(最终版本)
*/
@Slf4j
public class MessyCodeDetect3 {

    /**
     * Highest code unit that is always accepted as valid. The ASCII range
     * (0..127) is treated as never garbled, on the assumption that the
     * encodings of interest all share it.
     */
    private static final char MAX_ASCII = 127;

    /**
     * Reports whether the character lies in the always-valid ASCII range.
     * A plain range check replaces a hash-set lookup: the accepted characters
     * are exactly 0..127 and never change, so no boxing or hashing is needed.
     *
     * @param c character to test
     * @return {@code true} if {@code c} is in 0..127
     */
    private static boolean isAscii(char c) {
        return c <= MAX_ASCII;
    }

    /**
     * Fork/join task that counts the garbled characters in the inclusive
     * index range {@code [start, end]} of a character array.
     */
    static class StatisticsMessyCode extends RecursiveTask<Integer> {
        /**
         * Ranges whose {@code end - start} is at most this value are scanned
         * sequentially; larger ranges are split in half. Splitting does not
         * reduce the total work (every character is still inspected once);
         * it only lets the halves be processed in parallel. Tune the value
         * by benchmarking on the target system.
         */
        private static final int THRESHOLD = 10;
        private final char[] content;
        private final int start;
        private final int end;

        /**
         * @param content characters to inspect (not copied)
         * @param start   first index of the range, inclusive
         * @param end     last index of the range, inclusive; {@code start - 1} denotes an empty range
         */
        public StatisticsMessyCode(char[] content, int start, int end) {
            this.content = content;
            this.start = start;
            this.end = end;
        }

        /**
         * Reports whether the character belongs to one of the Unicode blocks
         * this detector accepts as Chinese text (CJK ideographs plus the
         * punctuation and full-width blocks listed below). Constant-time OR-chain.
         *
         * @param c character to test
         * @return {@code true} if the character's Unicode block is one of the accepted blocks
         */
        public static boolean isChinese(char c) {
            Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
            return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                    || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
                    || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                    || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
        }

        /**
         * Counts the garbled characters in this task's range.
         *
         * @return number of characters that are neither ASCII nor Chinese
         */
        @Override
        protected Integer compute() {
            // Base case: the range is small enough to scan sequentially.
            if ((end - start) <= THRESHOLD) {
                int messCodeNum = 0;
                for (int i = start; i <= end; i++) {
                    char c = content[i];
                    // Neither ASCII nor Chinese: count it as garbled.
                    if (!isAscii(c) && !isChinese(c)) {
                        messCodeNum++;
                    }
                }
                return messCodeNum;
            }
            // Divide: midpoint computed without risking overflow of start + end.
            int mid = start + ((end - start) >>> 1);
            StatisticsMessyCode left = new StatisticsMessyCode(content, start, mid);
            StatisticsMessyCode right = new StatisticsMessyCode(content, mid + 1, end);
            // Fork only one half and compute the other in this thread, so the
            // current worker does useful work instead of blocking on a join.
            left.fork();
            int rightCount = right.compute();
            return rightCount + left.join();
        }
    }

    /**
     * Counts the garbled characters in {@code content}.
     *
     * @param content text to inspect; may be {@code null}
     * @return number of garbled characters; 0 for {@code null} or empty input
     * @throws ExecutionException   if the counting task fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public static Integer getMessyCode(String content) throws ExecutionException, InterruptedException {
        if (content == null || content.isEmpty()) {
            return 0;
        }
        StatisticsMessyCode messyCodeCount = new StatisticsMessyCode(content.toCharArray(),0,content.length()-1);
        // Run on the shared common pool: creating (and possibly leaking, when
        // get() throws) a dedicated pool on every call is needlessly expensive.
        return ForkJoinPool.commonPool().submit(messyCodeCount).get();
    }

    /**
     * Decides whether a text is garbled by comparing its actual garbled-character
     * ratio against an expected ratio: the text counts as garbled when the actual
     * ratio is strictly greater than {@code messyRate}.
     *
     * @param content   text to inspect; {@code null} or empty text is never garbled
     * @param messyRate expected garbled ratio, as a fraction of the text length
     * @return {@code true} if the garbled ratio of {@code content} exceeds {@code messyRate}
     * @throws ExecutionException   if the counting task fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public static boolean isMessyCode(String content,float messyRate) throws ExecutionException, InterruptedException {
        // Guard first: an empty text would otherwise lead to a division by zero below.
        if (content == null || content.isEmpty()) {
            return false;
        }
        log.info("messy-code detection start");
        long curTime = System.currentTimeMillis();
        Integer messyCodeSum = getMessyCode(content);
        log.info("messy-code detection finish,it costs time {}",(System.currentTimeMillis()-curTime));
        // Divide as float: integer division would truncate every ratio below 1 to 0.
        return ((float) messyCodeSum / content.length()) > messyRate;
    }
}