情景:
两个ES索引在导入时候出现数据偏差,需要找差集。
思路
由于索引是共用的,并且数据是动态变化的,无法直接对比数据。
索引字段很多,先分页将数据保存到本地再分析。只保留ID,格式为:List<String>,并且以升序排列(不排序也可以)。
由于单个 List 一次处理超过100万条数据会非常慢,故分解为每5万条一个 List 去做对比,结果保存到结果 List 内,最后输出。
ES --> search data by id order by asc --> save local data 5w/file --> file compare file --> save diff
测试数据为两个文件夹,共有22个文件,每个文件5万条,总数据量约为120万条,用时大约3分钟。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
/**
 * Compares two folders of text files ({@code new} and {@code backup}) that each hold one
 * entry per line, and reports the entries that appear on only one side.
 *
 * <p>The files are compared pair by pair (file {@code i} of one folder against file
 * {@code i} of the other) and the per-pair differences are folded into a running result,
 * so an entry that merely landed in a different file on the other side cancels out.
 * The result is printed and written to {@code result.txt} in the base directory.
 *
 * @author andy
 * @version $Revision: 1.1 $ $Date: 2020-11-18 16:09:15 $
 */
public class LoadFile2 {

    /** Base directory used when no command-line argument is supplied. */
    private static final String DEFAULT_BASE_DIR = "E:/Desktop/es/run/3/";

    /**
     * Entry point: collects the files of both folders, computes the difference and saves it.
     *
     * @param args optional; {@code args[0]} is the base directory containing the
     *             {@code new} and {@code backup} folders. Defaults to
     *             {@link #DEFAULT_BASE_DIR} when absent.
     * @throws Exception if a file cannot be read or the result file cannot be written
     */
    public static void main(String[] args) throws Exception {
        System.out.println("准备处理。。。");
        File baseDir = new File(args != null && args.length > 0 ? args[0] : DEFAULT_BASE_DIR);
        long c = System.currentTimeMillis();

        // Collect every file below the first folder.
        List<String> newPath = new ArrayList<>();
        System.out.println("开始统计文件数new");
        getFile(new File(baseDir, "new"), newPath);
        System.out.println("统计文件数 new: " + newPath.size());

        // Collect every file below the second folder.
        System.out.println("开始统计文件数backup");
        List<String> backPath = new ArrayList<>();
        getFile(new File(baseDir, "backup"), backPath);
        System.out.println("统计文件数 backup: " + backPath.size());

        // Compute the difference between the two folders.
        System.out.println("过滤数据");
        List<String> resultList = filterData(newPath, backPath);
        System.out.println("cost: " + (System.currentTimeMillis() - c));
        for (String id : resultList) {
            System.out.println(id);
        }

        System.out.println("保留数据");
        // try-with-resources closes the writer even when a write fails, and a failed
        // write now aborts the run instead of being logged and ignored.
        try (FileWriter fw = new FileWriter(new File(baseDir, "result.txt"))) {
            for (String id : resultList) {
                fw.append(id).append('\n');
            }
        }
        System.out.println("处理完成");
    }

    /**
     * Runs the comparison with the longer file list as the driving side, so that every
     * file of both folders is visited.
     *
     * @param newPath  paths of the files in the {@code new} folder
     * @param backPath paths of the files in the {@code backup} folder
     * @return the entries found on only one side
     * @throws Exception if a file cannot be read
     */
    private static List<String> filterData(List<String> newPath, List<String> backPath) throws Exception {
        if (backPath.size() > newPath.size()) {
            return removeData(backPath, newPath);
        }
        return removeData(newPath, backPath);
    }

    /**
     * Folds the pairwise differences of all files into one overall difference.
     *
     * <p>For each index the symmetric difference of the two files is computed and then
     * toggled against the running result: an entry already in the result is removed
     * (it has now been seen on the other side or in a neighbouring file), otherwise it is
     * added. Files of {@code bigPath} without a counterpart are compared against an empty
     * file, so they go through the same cancelling step as paired files.
     *
     * <p>An entry repeated within a single file is counted once.
     *
     * @param bigPath   the file list with at least as many files as {@code smallPath}
     * @param smallPath the other file list
     * @return the remaining entries, in the order they were first recorded
     * @throws Exception if a file cannot be read
     */
    private static List<String> removeData(List<String> bigPath, List<String> smallPath) throws Exception {
        // LinkedHashSet gives constant-time lookups while keeping insertion order.
        Set<String> result = new LinkedHashSet<>();
        List<String> firstData = new ArrayList<>();
        List<String> secondData = new ArrayList<>();
        for (int i = 0; i < bigPath.size(); i++) {
            firstData.clear();
            secondData.clear();
            txt2List(bigPath.get(i), firstData);
            if (i < smallPath.size()) {
                txt2List(smallPath.get(i), secondData);
                System.out.println("file compare: " + bigPath.get(i) + " - " + smallPath.get(i));
            }
            for (String id : symmetricDifference(firstData, secondData)) {
                // Toggle: cancel an entry recorded earlier, otherwise record it.
                if (!result.remove(id)) {
                    result.add(id);
                }
            }
            System.out.println("resultListSize: " + result.size());
        }
        return new ArrayList<>(result);
    }

    /**
     * Returns the entries contained in exactly one of the two lists: first the ones only
     * in {@code first}, then the ones only in {@code second}, each in list order.
     */
    private static Set<String> symmetricDifference(List<String> first, List<String> second) {
        Set<String> firstSet = new HashSet<>(first);
        Set<String> secondSet = new HashSet<>(second);
        Set<String> diff = new LinkedHashSet<>();
        for (String id : first) {
            if (!secondSet.contains(id)) {
                diff.add(id);
            }
        }
        for (String id : second) {
            if (!firstSet.contains(id)) {
                diff.add(id);
            }
        }
        return diff;
    }

    /**
     * Reads a text file and appends each of its lines to {@code list}.
     *
     * @param path path of the file to read
     * @param list receives the lines in file order; existing elements are kept
     * @throws IOException if the file cannot be opened or read; a partially read file is
     *                     reported instead of being silently accepted
     */
    public static void txt2List(String path, List<String> list) throws IOException {
        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = br.readLine()) != null) {
                list.add(line);
            }
        }
    }

    /**
     * Recursively collects the paths of all files below {@code file}.
     *
     * <p>Directory entries are visited in sorted order because {@link File#listFiles()}
     * gives no ordering guarantee. A path for which {@code listFiles()} returns
     * {@code null} (a regular file, or a path that cannot be listed) is added as-is.
     *
     * @param file      the file or directory to start from; {@code null} is ignored
     * @param listLocal receives the collected paths
     */
    public static void getFile(File file, List<String> listLocal) {
        if (file == null) {
            return;
        }
        File[] children = file.listFiles();
        if (children == null) {
            listLocal.add(file.getPath());
            return;
        }
        Arrays.sort(children);
        for (File child : children) {
            getFile(child, listLocal);
        }
    }
}