原书第7章。
package com.foxbill.redisinaction;
import redis.clients.jedis.Jedis;
/**
 * Demonstrates the Redis HyperLogLog commands (PFADD, PFCOUNT, PFMERGE) through Jedis.
 *
 * <ol>
 *   <li>HyperLogLog is the probabilistic data structure Redis provides for cardinality
 *       estimation.</li>
 *   <li>Main feature: a small memory footprint (about 12KB), even when counting one billion
 *       distinct elements.</li>
 *   <li>Low error rate: the standard error is about 0.81%.</li>
 *   <li>Suitable for unique-visitor (UV) statistics, user-behaviour statistics and similar
 *       scenarios that need large-scale de-duplicated counting but not an exact result.</li>
 * </ol>
 */
public class Chapter7 {
    static String hyperLogLogKey1 = "hyperloglog:example:visitors:day1";
    static String hyperLogLogKey2 = "hyperloglog:example:visitors:day2";
    static String hyperLogLogKeyMerged = "hyperloglog:example:visitors:twodays";

    /**
     * Runs the whole demo: clears old keys, fills the two daily HyperLogLogs, prints their
     * estimated cardinalities and finally merges them into a third key.
     *
     * @param jedis connection used for every Redis command issued by the demo
     */
    public static void start(Jedis jedis) {
        init(jedis);
        addVisitors(jedis);
        countVisitors(jedis);
        mergeVisitors(jedis);
    }

    /** Removes the three demo keys so that each run starts from empty HyperLogLogs. */
    private static void init(Jedis jedis) {
        // DEL is variadic: one command removes all three keys.
        jedis.del(hyperLogLogKey1, hyperLogLogKey2, hyperLogLogKeyMerged);
    }

    /** Example 1: adds elements to the two daily HyperLogLogs with PFADD. */
    private static void addVisitors(Jedis jedis) {
        // PFADD is variadic, so each day is sent as a single command instead of one
        // network round trip per visitor ID.
        // Day 1 visitors: user:1 .. user:1000.
        jedis.pfadd(hyperLogLogKey1, userIds(1, 1000));
        // Day 2 visitors: user:500 .. user:1500 (user:500 .. user:1000 repeat day 1).
        jedis.pfadd(hyperLogLogKey2, userIds(500, 1500));

        System.out.println("- 已向" + hyperLogLogKey1 + "添加1000个用户ID");
        System.out.println("- 已向" + hyperLogLogKey2 + "添加1001个用户ID(包含与第1天重叠的501个)");
    }

    /** Builds the IDs {@code "user:<firstId>"} through {@code "user:<lastId>"}, both inclusive. */
    private static String[] userIds(int firstId, int lastId) {
        String[] ids = new String[lastId - firstId + 1];
        for (int offset = 0; offset < ids.length; offset++) {
            ids[offset] = "user:" + (firstId + offset);
        }
        return ids;
    }

    /** Example 2: prints the PFCOUNT estimate of each day and of the union of both days. */
    private static void countVisitors(Jedis jedis) {
        // Estimate each HyperLogLog on its own.
        long count1 = jedis.pfcount(hyperLogLogKey1);
        long count2 = jedis.pfcount(hyperLogLogKey2);
        System.out.println("- " + hyperLogLogKey1 + "的基数估算: " + count1);
        System.out.println("- " + hyperLogLogKey2 + "的基数估算: " + count2);

        // Passing several keys to PFCOUNT estimates the cardinality of their union.
        long unionCount = jedis.pfcount(hyperLogLogKey1, hyperLogLogKey2);
        System.out.println("- 两个HyperLogLog的并集基数估算: " + unionCount);
        // The two ranges 1..1000 and 500..1500 together cover exactly user:1 .. user:1500.
        System.out.println("- 实际唯一用户数: 1500");
        System.out.println("- 误差情况: HyperLogLog估算值与实际值的误差在0.81%左右");
    }

    /** Example 3: merges both daily HyperLogLogs into one key with PFMERGE and prints its estimate. */
    private static void mergeVisitors(Jedis jedis) {
        System.out.println("\n【示例3:使用PFMERGE合并HyperLogLog】");
        jedis.pfmerge(hyperLogLogKeyMerged, hyperLogLogKey1, hyperLogLogKey2);
        long mergedCount = jedis.pfcount(hyperLogLogKeyMerged);
        System.out.println("- 已合并两个HyperLogLog到" + hyperLogLogKeyMerged);
        System.out.println("- 合并后的基数估算: " + mergedCount);
    }
}
📌 小结
在需要对海量数据进行去重计数、但不要求结果完全精确的场景中,可以使用HyperLogLog实现高效统计,其标准误差约为0.81%。
1830

被折叠的 条评论
为什么被折叠?



