[Python] 纯文本查看 复制代码
import mmh3
from bitarray import bitarray
# Implement a simple bloom filter with murmurhash algorithm.
# Bloom filter is used to check wether an element exists in a collection,
# and it has a good performance in big data situation.
# It may has positive rate depend on hash functions and elements count.
class BloomFilter(object):
def __init__(self,BIT_SIZE, HASH_NUM):
# 初始化布隆过滤器,生成一下全0的 bitarray
bit_array = bitarray(BIT_SIZE)
bit_array.setall(0)
self.bit_array = bit_array
def add(self, url):
# 添加一个url,同时获取这个url的对应的bitarray的位置
point_list = self.get_postions(url)
for b in point_list:
self.bit_array = 1
def contains(self, url):
# 验证这个url是否存在在集合中
point_list = self.get_postions(url)
result = True
for b in point_list:
result = result and self.bit_array
return result
def get_postions(self, url):
# 一个url获取HASH_NUM个位置,之后会把这些位置变为1
return [mmh3.hash(url, i) % BIT_SIZE for i in range(HASH_NUM)]
if __name__ == '__main__':
BIT_SIZE = 5000000
HASH_NUM = 10
# 类的实例化
bloom_filter = BloomFilter(BIT_SIZE, HASH_NUM)
urls = ['www.baidu.com','mathpretty.com','sina.com', 'qaz', 'wsx', 'edc',
'rfv']
urls_check = ['mathpretty.com','zhihu.com','qaz', 'wsx']
for url in urls:
bloom_filter.add(url)
for url_check in urls_check:
result = bloom_filter.contains(url_check)
print('被检测的网址 : ',url_check,'/ 是否被包含在原集合中 : ',result)