Python爬虫学习——布隆过滤器

本文介绍了两种布隆过滤器的实现方法:一种是通过自定义哈希函数和位数组实现;另一种是使用pybloom库简化开发过程。通过具体实例展示了如何添加元素到布隆过滤器中,并检查元素是否存在。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

布隆过滤器的实现方法1:自己实现

参考 http://www.cnblogs.com/naive/p/5815433.html

bllomFilter两个参数分别代表,布隆过滤器的大小和hash函数的个数

#coding:utf-8
#!/usr/bin/env python

from bitarray import bitarray
# 3rd party
import mmh3
import scrapy
from BeautifulSoup import BeautifulSoup as BS
import os
ls = os.linesep

class BloomFilter(set):

    def __init__(self, size, hash_count):
        super(BloomFilter, self).__init__()
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)
        self.size = size
        self.hash_count = hash_count

    def __len__(self):
        return self.size

    def __iter__(self):
        return iter(self.bit_array)

    def add(self, item):
        for ii in range(self.hash_count):
            index = mmh3.hash(item, ii) % self.size
            self.bit_array[index] = 1

        return self

    def __contains__(self, item):
        out = True
        for ii in range(self.hash_count):
            index = mmh3.hash(item, ii) % self.size
            if self.bit_array[index] == 0:
                out = False

        return out

class DmozSpider(scrapy.Spider):
    name = "baidu"
    allowed_domains = ["baidu.com"]
    start_urls = [
        "http://baike.baidu.com/item/%E7%BA%B3%E5%85%B0%E6%98%8E%E7%8F%A0"
    ]

    def parse(self, response):

        # fname = "/media/common/娱乐/Electronic_Design/Coding/Python/Scrapy/tutorial/tutorial/spiders/temp"
        #
        # html = response.xpath('//html').extract()[0]
        # fobj = open(fname, 'w')
        # fobj.writelines(html.encode('utf-8'))
        # fobj.close()

        bloom = BloomFilter(1000, 10)
        animals = ['dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle',
                   'bird', 'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear',
                   'chicken', 'dolphin', 'donkey', 'crow', 'crocodile']
        # First insertion of animals into the bloom filter
        for animal in animals:
            bloom.add(animal)

        # Membership existence for already inserted animals
        # There should not be any false negatives
        for animal in animals:
            if animal in bloom:
                print('{} is in bloom filter as expected'.format(animal))
            else:
                print('Something is terribly went wrong for {}'.format(animal))
                print('FALSE NEGATIVE!')

        # Membership existence for not inserted animals
        # There could be false positives
        other_animals = ['badger', 'cow', 'pig', 'sheep', 'bee', 'wolf', 'fox',
                         'whale', 'shark', 'fish', 'turkey', 'duck', 'dove',
                         'deer', 'elephant', 'frog', 'falcon', 'goat', 'gorilla',
                         'hawk']
        for other_animal in other_animals:
            if other_animal in bloom:
                print('{} is not in the bloom, but a false positive'.format(other_animal))
            else:
                print('{} is not in the bloom filter as expected'.format(other_animal))

 

布隆过滤器的实现方法2:使用pybloom

参考 http://www.jianshu.com/p/f57187e2b5b9

#coding:utf-8
#!/usr/bin/env python

from pybloom import BloomFilter

import scrapy
from BeautifulSoup import BeautifulSoup as BS
import os
ls = os.linesep

class DmozSpider(scrapy.Spider):
    name = "baidu"
    allowed_domains = ["baidu.com"]
    start_urls = [
        "http://baike.baidu.com/item/%E7%BA%B3%E5%85%B0%E6%98%8E%E7%8F%A0"
    ]

    def parse(self, response):

        # fname = "/media/common/娱乐/Electronic_Design/Coding/Python/Scrapy/tutorial/tutorial/spiders/temp"
        #
        # html = response.xpath('//html').extract()[0]
        # fobj = open(fname, 'w')
        # fobj.writelines(html.encode('utf-8'))
        # fobj.close()

        # bloom = BloomFilter(100, 10)
        bloom = BloomFilter(1000, 0.001)
        animals = ['dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle',
                   'bird', 'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear',
                   'chicken', 'dolphin', 'donkey', 'crow', 'crocodile']
        # First insertion of animals into the bloom filter
        for animal in animals:
            bloom.add(animal)

        # Membership existence for already inserted animals
        # There should not be any false negatives
        for animal in animals:
            if animal in bloom:
                print('{} is in bloom filter as expected'.format(animal))
            else:
                print('Something is terribly went wrong for {}'.format(animal))
                print('FALSE NEGATIVE!')

        # Membership existence for not inserted animals
        # There could be false positives
        other_animals = ['badger', 'cow', 'pig', 'sheep', 'bee', 'wolf', 'fox',
                         'whale', 'shark', 'fish', 'turkey', 'duck', 'dove',
                         'deer', 'elephant', 'frog', 'falcon', 'goat', 'gorilla',
                         'hawk']
        for other_animal in other_animals:
            if other_animal in bloom:
                print('{} is not in the bloom, but a false positive'.format(other_animal))
            else:
                print('{} is not in the bloom filter as expected'.format(other_animal))

 

输出

dog is in bloom filter as expected
cat is in bloom filter as expected
giraffe is in bloom filter as expected
fly is in bloom filter as expected
mosquito is in bloom filter as expected
horse is in bloom filter as expected
eagle is in bloom filter as expected
bird is in bloom filter as expected
bison is in bloom filter as expected
boar is in bloom filter as expected
butterfly is in bloom filter as expected
ant is in bloom filter as expected
anaconda is in bloom filter as expected
bear is in bloom filter as expected
chicken is in bloom filter as expected
dolphin is in bloom filter as expected
donkey is in bloom filter as expected
crow is in bloom filter as expected
crocodile is in bloom filter as expected
badger is not in the bloom filter as expected
cow is not in the bloom filter as expected
pig is not in the bloom filter as expected
sheep is not in the bloom filter as expected
bee is not in the bloom filter as expected
wolf is not in the bloom filter as expected
fox is not in the bloom filter as expected
whale is not in the bloom filter as expected
shark is not in the bloom filter as expected
fish is not in the bloom filter as expected
turkey is not in the bloom filter as expected
duck is not in the bloom filter as expected
dove is not in the bloom filter as expected
deer is not in the bloom filter as expected
elephant is not in the bloom filter as expected
frog is not in the bloom filter as expected
falcon is not in the bloom filter as expected
goat is not in the bloom filter as expected
gorilla is not in the bloom filter as expected
hawk is not in the bloom filter as expected

 

转载于:https://www.cnblogs.com/tonglin0325/p/7043886.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值