4G内存挑战：50亿URL分治比对与布隆过滤器应用-优快云博客

本文链接：https://blog.youkuaiyun.com/m0_37583655/article/details/120201200

分治法对比大文件内url

1. 需求描述
2. 原理分析
- 2.1. 布隆过滤器
- 2.2. 分治法
3.实现验证

1. 需求描述

给定a、b两个文件，各存放50亿个url，每个url各占64字节，内存限制是4G，让你找出a、b文件共同的url?
本质上这不算一道算法题，更多的是考察思维能力。

2. 原理分析

2.1. 布隆过滤器

采用Bloom filter，假设布隆过滤器的错误率为0.01，则位数组大小m约为输入元素个数n的13倍，此时需要的哈希函数k约为8个。
元素个数：n = 5G
位数组大小：m = 5G * 13 = 65G = 650亿即需要650亿个bit位才能达到错误率0.01
而我们拥有的内存可容纳bit位个数：4G * 8bit = 32G bit = 320亿，按此实现错误率大于0.01。
布隆过滤器：https://blog.youkuaiyun.com/qq_41946557/article/details/102593912

2.2. 分治法

每个url大小为64字节，文件中共有50亿（5G）个url，那么可以估计每个文件的大小为5G×64B=320GB，远远大于内存限制的4G，所以不可能将其完全加载到内存中处理，可以采用分治的思想来解决。
1.文件拆分。按照文件行hash运算后的值进行拆分。
2.文件对比。由于a与b文件的url都是经过hash运算后拆分的，首先a文件相同的数据肯定经过hash运算拆分到一个文件内，hash值相同值不一定相同，值相同hash一定相同。因为a与b文件采用的相同的hash算法，所以a与b相同的值也一定在与之对应的文件里，比如，a1与b1，a2与b2。

3.实现验证

3.1 文件准备

在这里插入图片描述

3.2 代码实现

package com.zrj.unit.matchdup;

import java.io.*;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;

/**
 * 搜索重复数据
 * 需求：给定a、b两个文件，各存放50亿个url，每个url各占64字节，内存限制是4G，让你找出a、b文件共同的url?
 * 分析：假如每个url大小为10bytes，那么可以估计每个文件的大小为50G×64=320G，远远大于内存限制的4G，
 * 所以不可能将其完全加载到内存中处理，可以采用分治的思想来解决。
 *
 * @author zrj
 * @since 2021/9/9
 **/
public class MatchDuplicateDate {

    public static void main(String[] args) {
        // 文件拆分
        //fileSplit("D:\\doc\\log\\match\\match.txt");
        //fileSplit("D:\\doc\\log\\match\\duplicate.txt");
        String matchFile = "D:\\doc\\log\\match\\match.txt";
        String duplicateFile = "D:\\doc\\log\\match\\duplicate.txt";
        matchData(matchFile, duplicateFile);

    }

    /**
     * 文件对比
     * 分析：
     * 由于a与b文件的url都是经过hash运算后拆分的，首先a文件相同的数据肯定经过hash运算拆分到一个文件内，hash值相同值不一定相同，值相同hash一定相同。
     * 因为a与b文件采用的相同的hash算法，所以a与b相同的值也一定在与之对应的文件里，比如，a1与b1，a2与b2
     * 验证：
     * a1与b1对比
     * a2与b2对比
     * a3与b3对比
     */
    public static void matchData(String matchFile, String duplicateFile) {
        BufferedReader br = null;
        HashSet<String> set = new HashSet<>();
        String line;

        try {
            String fileSuffix = ".txt";
            for (int i = 1; i < 6; i++) {
                String matchFilePath = matchFile.substring(0, matchFile.indexOf(fileSuffix)) + i + fileSuffix;
                String dupliFilePath = duplicateFile.substring(0, duplicateFile.indexOf(fileSuffix)) + i + fileSuffix;
                System.out.println("第" + i + "次文件对比：matchFilePath=" + matchFilePath + "，dupliFilePath=" + dupliFilePath);

                br = new BufferedReader(new FileReader(matchFilePath));
                while ((line = br.readLine()) != null) {
                    set.add(line);
                }

                br = new BufferedReader(new FileReader(dupliFilePath));
                while ((line = br.readLine()) != null) {
                    if (set.contains(line)) {
                        System.out.println("重复值：" + line);
                    }
                }
            }
        } catch (Exception e) {
            System.out.println("匹配异常：" + e);
        } finally {
            bufferedReaderClose(br);
        }
    }

    /**
     * 文件拆分
     * 按照文件行hash运算后的值进行拆分
     *
     * @param filePath d://matcha.txt
     * @return void
     */
    public static void fileSplit(String filePath) {
        // 初始化io流
        BufferedReader obr = null;
        BufferedWriter obw1 = null;
        BufferedWriter obw2 = null;
        BufferedWriter obw3 = null;
        BufferedWriter obw4 = null;
        BufferedWriter obw5 = null;

        try {
            // 路径拆分，d://matcha1.txt，d://matcha2.txt
            String fileSuffix = ".txt";
            String filePre = filePath.substring(0, filePath.indexOf(fileSuffix));
            List<String> filePathList = new LinkedList<>();
            for (int i = 1; i < 6; i++) {
                filePathList.add(filePre + i + fileSuffix);
            }

            //默认拆分5个文件
            obr = new BufferedReader(new FileReader(filePath));
            obw1 = new BufferedWriter(new FileWriter(filePathList.get(0), true));
            obw2 = new BufferedWriter(new FileWriter(filePathList.get(1), true));
            obw3 = new BufferedWriter(new FileWriter(filePathList.get(2), true));
            obw4 = new BufferedWriter(new FileWriter(filePathList.get(3), true));
            obw5 = new BufferedWriter(new FileWriter(filePathList.get(4), true));

            String oline;
            while ((oline = obr.readLine()) != null) {
                int x = toHash(oline);
                if (x == 0) {
                    obw1.write(oline);
                    obw1.write("\r\n");
                } else if (x == -1) {
                    obw2.write(oline);
                    obw2.write("\r\n");
                } else if (x == -2) {
                    obw3.write(oline);
                    obw3.write("\r\n");
                } else if (x == -3) {
                    obw4.write(oline);
                    obw4.write("\r\n");
                } else {
                    obw5.write(oline);
                    obw5.write("\r\n");
                }
            }
        } catch (Exception e) {
            System.out.println("文件拆分异常：" + e);
        } finally {
            // 关闭io流
            bufferedWriterClose(obw1, obw2, obw3, obw4, obw5);
            bufferedReaderClose(obr);
        }
    }

    /**
     * 关闭读取IO流
     */
    public static void bufferedReaderClose(BufferedReader... bufferedReaders) {
        try {
            for (BufferedReader bufferedReader : bufferedReaders) {
                bufferedReader.close();
            }
        } catch (IOException e) {
            System.err.println("关闭读取IO流异常：" + e);
        }
    }

    /**
     * 关闭写取IO流
     */
    public static void bufferedWriterClose(BufferedWriter... bufferedWriters) {
        try {
            for (BufferedWriter bufferedWriter : bufferedWriters) {
                bufferedWriter.close();
            }
        } catch (IOException e) {
            System.err.println("关闭写取IO流异常：" + e);
        }
    }

    /**
     * 将字符串转成hash值
     */
    public static int toHash(String key) {
        int arraySize = 5; // 数组大小一般取质数
        int hashCode = 0;
        for (int i = 0; i < key.length(); i++) { // 从字符串的左边开始计算
            int letterValue = key.charAt(i) - 96;// 将获取到的字符串转换成数字，比如a的码值是97，则97-96=1
            // 就代表a的值，同理b=2；
            hashCode = ((hashCode << 5) + letterValue) % arraySize;// 防止编码溢出，对每步结果都进行取模运算
        }
        System.out.println("hash前：" + key + "，hash后：" + hashCode);
        return hashCode;
    }
}

3.3 结果验证

第1次文件对比：matchFilePath=D:\doc\log\match\match1.txt，dupliFilePath=D:\doc\log\match\duplicate1.txt
重复值："202109091008050"	"张50" 	"http://helloworld50.com" 
第2次文件对比：matchFilePath=D:\doc\log\match\match2.txt，dupliFilePath=D:\doc\log\match\duplicate2.txt
第3次文件对比：matchFilePath=D:\doc\log\match\match3.txt，dupliFilePath=D:\doc\log\match\duplicate3.txt
重复值："202109091008010"	"张10" 	"http://helloworld10.com" 
重复值："202109091008060"	"张60" 	"http://helloworld60.com" 
第4次文件对比：matchFilePath=D:\doc\log\match\match4.txt，dupliFilePath=D:\doc\log\match\duplicate4.txt
重复值："202109091008040"	"张40" 	"http://helloworld40.com" 
重复值："202109091008090"	"张90" 	"http://helloworld90.com" 
第5次文件对比：matchFilePath=D:\doc\log\match\match5.txt，dupliFilePath=D:\doc\log\match\duplicate5.txt
重复值："202109091008020"	"张20" 	"http://helloworld20.com" 
重复值："202109091008030"	"张30" 	"http://helloworld30.com" 
重复值："202109091008070"	"张70" 	"http://helloworld70.com" 
重复值："202109091008080"	"张80" 	"http://helloworld80.com" 
重复值："202109091008100"	"张100" "http://helloworld100.com"