Bloom Filter布隆过滤器的使用

布隆过滤器实战

最新推荐文章于 2025-09-08 18:57:50 发布

原创最新推荐文章于 2025-09-08 18:57:50 发布 · 2.4k 阅读

5 ·

CC 4.0 BY-SA版权

文章标签：

#布隆过滤器 #bloom filter

java框架同时被 2 个专栏收录

6 篇文章

订阅专栏

java算法

1 篇文章

订阅专栏

本文介绍了一种高效的数据去重方法——布隆过滤器，并通过实际案例演示了其在大量数据处理中的应用。展示了布隆过滤器如何节省内存并快速进行数据比对。

对大批量数据去重时，常规做法特别占用内存，而使用布隆过滤器（Bloom Filter）则非常节省内存。亲测了一遍，效果确实不错。现将测试代码发出来，一来给自己做个笔记，二来希望和大家一起学习。

一：布隆过滤器介绍

介绍：布隆过滤器主要由一个很长的二进制向量和若干个（k个）散列映射函数组成。因为每个元素占用的存储信息固定，而且二进制向量的总长度固定，所以在内存占用和查询时间上都远远优于一般的算法。当然，它存在一定的误判率（可以控制），并且不容易删除已添加的样本数据。

1：二进制的向量初始状态（JAVA中由BitSet实现）

2：添加一个样本数据

样本数据经过函数组后获得位置数组，对应改变二进制向量的值为1。继续添加样本数据，重复上述过程。

3：得到最终二进制向量

4：新数据比对

获取到位置数组，判断二进制向量上对应位置是否为1，只要有一个不为1（为0），那么就能肯定不存在。如果都为1，那么就很可能存在。

二：测试代码

package com.java.base;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

/**
 * A simple Bloom filter backed by a {@link BitSet}, plus a self-contained demo
 * ({@link #runFilter()}) that fills the filter with random strings and then checks
 * lookups for strings that were added and strings that were not.
 *
 * <p>A Bloom filter answers "definitely not present" or "probably present": every
 * added value sets one bit per hash function, and a lookup reports a hit only when
 * all of those bits are set.
 */
public class TestBloomFilter {

	/**
	 * Default number of bits in the filter (m). Equal to 2^31 - 1; written as
	 * {@code Integer.MAX_VALUE} rather than {@code (1 << 31) - 1}, which only produced
	 * this value through int overflow.
	 */
	private static final int DEFAULT_SIZE = Integer.MAX_VALUE;

	/** Multipliers of the polynomial hash functions; one hash function per seed (k = 6). */
	private static final int[] seeds = new int[] { 9, 11, 13, 31, 37, 57 };

	/** Number of strings collected for each of the "present" and "absent" comparison lists. */
	private static final int COMPARE_SAMPLE_SIZE = 1000;

	/** Alphabet from which {@link #getStr()} draws random characters. */
	private static final String words = "abcdefghijklmnopqrstuvwxyz1234567890_";

	/** The bit vector of the filter. */
	private final BitSet bits;

	/** The hash functions, one per entry of {@link #seeds}. */
	private final HashFunc[] func = new HashFunc[seeds.length];

	public static void main(String[] args) {
		runFilter();
	}

	/**
	 * Demo run: adds 100,000,000 random strings to a default-sized filter, remembers the
	 * first {@value #COMPARE_SAMPLE_SIZE} strings that were added, collects the same number
	 * of strings the filter reports as absent, and then prints how many lookups in each
	 * list give the expected answer together with the elapsed time.
	 *
	 * <p>The default-sized filter allocates a bit vector of 2^31 - 1 bits (about 256 MiB),
	 * so the JVM needs a correspondingly large heap.
	 */
	public static void runFilter() {
		TestBloomFilter filter = new TestBloomFilter();
		List<String> existList = new ArrayList<String>();
		List<String> noExistList = new ArrayList<String>();
		int countExist = 0;
		System.out.println("开始添加数据");
		int sampleCount = 100000000;
		for (int i = 0; i < sampleCount; i++) {
			String value = getStr();
			if (!filter.contains(value)) {
				// Keep only the first few added values for the later "present" check.
				if (existList.size() < COMPARE_SAMPLE_SIZE) {
					existList.add(value);
				}
				filter.add(value);
			} else {
				// Either a genuinely repeated random string or a false positive.
				countExist++;
			}
			if (i % 1000000 == 0) {
				System.out.println("已经添加:" + i);
			}
		}
		System.out.println("随机保存值重复了" + countExist);
		System.out.println(sampleCount + "比对样本值保存完毕");

		// Collect random strings that the filter reports as absent.
		while (noExistList.size() < COMPARE_SAMPLE_SIZE) {
			String str = getStr();
			if (!filter.contains(str)) {
				noExistList.add(str);
			}
		}
		System.out.println("1千的存在和不存在的待比对数据准备完毕");

		long start = System.currentTimeMillis();
		System.out.println("开始比对存在字符串");
		int existCount = 0;
		for (String value : existList) {
			if (filter.contains(value)) {
				existCount++;
			}
		}
		System.out.println("比对正确率:" + existCount + "/" + existList.size());

		System.out.println("开始比对不存在字符串");
		int noExistCount = 0;
		for (String value : noExistList) {
			if (!filter.contains(value)) {
				noExistCount++;
			}
		}
		System.out.println("比对正确率:" + noExistCount + "/" + noExistList.size());
		System.out.println("比对2千数据耗时：" + (System.currentTimeMillis() - start) + "毫秒");
		System.out.println("over");
	}

	/**
	 * Builds a random test string: 30 characters drawn from {@link #words}, followed by
	 * the decimal text of a random double in [0, 100000).
	 *
	 * @return the generated string
	 */
	public static String getStr() {
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < 30; i++) {
			// Index range follows the alphabet length instead of a hard-coded 37.
			sb.append(words.charAt((int) (Math.random() * words.length())));
		}
		sb.append(Math.random() * 100000);
		return sb.toString();
	}

	/**
	 * Creates a filter with the default size of 2^31 - 1 bits.
	 */
	public TestBloomFilter() {
		this(DEFAULT_SIZE);
	}

	/**
	 * Creates a filter with the given number of bits.
	 *
	 * @param size number of bits in the filter; must be positive
	 * @throws IllegalArgumentException if {@code size} is not positive
	 */
	public TestBloomFilter(int size) {
		if (size <= 0) {
			throw new IllegalArgumentException("size must be positive: " + size);
		}
		bits = new BitSet(size);
		for (int i = 0; i < seeds.length; i++) {
			func[i] = new HashFunc(size, seeds[i]);
		}
	}

	/**
	 * Adds a value to the filter by setting the bit chosen by each hash function.
	 *
	 * @param value the value to add; must not be null
	 * @throws NullPointerException if {@code value} is null
	 */
	public void add(String value) {
		if (value == null) {
			throw new NullPointerException("value must not be null");
		}
		for (HashFunc f : func) {
			bits.set(f.hash(value), true);
		}
	}

	/**
	 * Tests whether a value may have been added.
	 *
	 * @param value the value to look up; null is never contained
	 * @return {@code false} if the value was definitely not added; {@code true} if every
	 *         bit chosen by the hash functions is set (the value was probably added)
	 */
	public boolean contains(String value) {
		if (value == null) {
			return false;
		}
		for (HashFunc f : func) {
			// A single clear bit proves absence, so the remaining hashes can be skipped.
			if (!bits.get(f.hash(value))) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Polynomial string hash that maps a string to a bit position in
	 * {@code [0, maxCount)}.
	 */
	public static class HashFunc {
		private final int maxCount;
		private final int seed;

		/**
		 * @param maxCount number of bit positions to map into; must be positive
		 * @param seed     multiplier of the polynomial hash
		 * @throws IllegalArgumentException if {@code maxCount} is not positive
		 */
		public HashFunc(int maxCount, int seed) {
			if (maxCount <= 0) {
				throw new IllegalArgumentException("maxCount must be positive: " + maxCount);
			}
			this.maxCount = maxCount;
			this.seed = seed;
		}

		/**
		 * Hashes the value to a bit position.
		 *
		 * @param value the string to hash; must not be null
		 * @return a position in {@code [0, maxCount)}
		 */
		public int hash(String value) {
			int result = 0;
			int len = value.length();
			for (int i = 0; i < len; i++) {
				// int overflow is intentional: the accumulator simply wraps around.
				result = seed * result + value.charAt(i);
			}
			// Reduce with a non-negative remainder. The previous "(maxCount - 1) & result"
			// is only correct when maxCount is a power of two; with maxCount = 2^31 - 1 the
			// mask was 0x7FFFFFFE, so bit 0 was always cleared and only even positions
			// (half of the bit vector) were ever used.
			int position = result % maxCount;
			return position < 0 ? position + maxCount : position;
		}
	}

}

测试时内存占用 500MB 左右。