1. 程式人生 > 布隆過濾的程式碼

布隆過濾的程式碼

# -*- coding: utf-8 -*-
import redis
from hashlib import md5

class SimpleHash(object):
    """Seeded multiplicative string hash that yields a bit offset."""

    def __init__(self, cap, seed):
        """
        :param cap: size of the bit array; results are masked with ``cap - 1``,
            so ``cap`` is expected to be a power of two
        :param seed: multiplier mixed into the running hash on every character
        """
        self.cap = cap
        self.seed = seed

    def hash(self, value):
        """Return the offset of ``value``; it is below ``cap`` for a positive cap.

        :param value: the string to hash (an empty string hashes to 0)
        """
        ret = 0
        for ch in value:
            ret += self.seed * ret + ord(ch)
        # Keep only the bits selected by ``cap - 1``.
        return (self.cap - 1) & ret


class BloomFilter(object):
    """Bloom filter whose bits are stored in Redis string keys."""

    def __init__(self, host='localhost', port=6379, db=0, password=None,
                 blockNum=1, key='bloomfilter'):
        """
        :param host: the host of Redis
        :param port: the port of Redis
        :param db: which db in Redis
        :param password: the password of Redis (default ``None``)
        :param blockNum: one blockNum for about 90,000,000; if you have more
            strings for filtering, increase it. Must be at least 1. The block
            is chosen from one md5 byte, so at most 256 blocks are ever used.
        :param key: the key's name in Redis; the block index is appended to it
        """
        self.server = redis.Redis(host=host, port=port, db=db, password=password)
        # A Redis string holds at most 512 MB; 1 << 31 bits uses 256 MB of it.
        self.bit_size = 1 << 31
        self.seeds = [5, 7, 11, 13, 31, 37, 61]
        self.key = key
        self.blockNum = blockNum
        self.hashfunc = [SimpleHash(self.bit_size, seed) for seed in self.seeds]

    def _locate(self, str_input):
        """Return ``(redis_key, offsets)`` describing where ``str_input`` lives.

        The input is reduced to its md5 hex digest first. The first two hex
        digits pick the block (and therefore the Redis key), and every hash
        function in ``self.hashfunc`` maps the digest to one bit offset.
        """
        digest = md5(str_input.encode('utf-8')).hexdigest()
        name = self.key + str(int(digest[0:2], 16) % self.blockNum)
        return name, [f.hash(digest) for f in self.hashfunc]

    def isContains(self, str_input):
        """Return True when every bit probed for ``str_input`` is set.

        Empty or ``None`` input is never reported as present. Probing stops at
        the first unset bit, so a miss does not pay for all Redis round trips.

        :param str_input: the string to look up
        :return: ``True`` or ``False``
        """
        if not str_input:
            return False
        name, offsets = self._locate(str_input)
        return all(self.server.getbit(name, loc) for loc in offsets)

    def insert(self, str_input):
        """Set every bit that represents ``str_input``.

        Empty or ``None`` input is ignored, mirroring ``isContains``, which
        never reports such input as present.

        :param str_input: the string to record
        """
        if not str_input:
            return
        name, offsets = self._locate(str_input)
        for loc in offsets:
            self.server.setbit(name, loc, 1)


if __name__ == '__main__':
    bf = BloomFilter()
    if bf.isContains('http://www.baidu.com'):  # check whether the string is present
        print('exists!')
    else:
        print('not exists!')
        # Not present yet, so add it.
        bf.insert('http://www.baidu.com')