1. 程式人生 > >布隆過濾器(Bloom Filter)(給兩個檔案,分別有100億個字串,我們只要1g的記憶體,如何找到兩個檔案的交集?分別給出精確演算法和近似演算法?)

布隆過濾器(Bloom Filter)(給兩個檔案,分別有100億個字串,我們只有1G的記憶體,如何找到兩個檔案的交集?分別給出精確演算法和近似演算法?)

  給兩個檔案,分別有100億個字串,我們只有1G的記憶體,如何找到兩個檔案的交集?分別給出精確演算法和近似演算法? 精確演算法:   我們可以建立1000個檔案,運用雜湊函式先將檔案1的字串儲存在對應的檔案中,之後再從檔案2中取元素,通過同一個雜湊函式計算出雜湊地址,去對應的檔案裡面找是否有與之相同的字串(因為使用同一個雜湊函式,相同的字串必定落在編號相同的檔案中)。 近似演算法:   我們可以使用點陣圖的方法,通過一個函式將一個元素對映成一個位矩陣中的一個點,這樣一來,我們只要看看這個點是不是1就知道集合裡有沒有它了。 但是有可能兩個字串對應的整數是一樣的,對於這種情況我們可以設定更多的雜湊函式,對應更多的地址,這樣更加精確。 這裡寫圖片描述 點陣圖相關問題

程式碼實現:

BloomFilter.h

#include"BitMap.h"


//Function-pointer type for a string hash: takes a C string and returns an int.
typedef int(*STRTOINT)(const char *);
//Bloom filter state: a bitmap plus the hash functions used to index into it.
typedef struct BloomFilter{
    BitMap _bmp;            //bit array the hash results are recorded in
    int size;               //number of strings inserted (incremented by InsertBloomFilter)
    STRTOINT HashFun[5];    //the five hash functions, copied in by InitBloomFilter
}BloomFilter;


//Initialize the filter: allocates a bitmap of 5 * total bits, copies the five
//hash functions from hashfun[0..4] and resets the element count to 0.
void InitBloomFilter(BloomFilter *BloomFilter, int total, STRTOINT *hashfun);
//插入
void InsertBloomFilter(BloomFilter *BloomFilter, char
*str); //大小 int SizeBloomFilter(BloomFilter *BloomFilter); //查詢 int FindBloomFilter(BloomFilter* BloomFilter, char *str); //刪除 void DeleteBloomFilter(BloomFilter *BloomFilter, char *str); /////////////////////////////////////////////////////////////////////////// //5種字串轉整形的方法 int HashFun1(const char *str) { unsigned int hash = 0; while (*str) { // equivalent to: hash = 65599*hash + (*str++);
hash = (*str++) + (hash << 6) + (hash << 16) - hash; } return (hash & 0x7FFFFFFF); } // RS Hash Function int HashFun2(const char *str) { unsigned int b = 378551; unsigned int a = 63689; unsigned int hash = 0; while (*str) { hash = hash * a + (*str++); a *= b; } return (hash & 0x7FFFFFFF); } // JS Hash Function int HashFun3(const char *str) { unsigned int hash = 1315423911; while (*str) { hash ^= ((hash << 5) + (*str++) + (hash >> 2)); } return (hash & 0x7FFFFFFF); } // P. J. Weinberger Hash Function int HashFun4(const char *str) { unsigned int BitsInUnignedInt = (unsigned int)(sizeof(unsigned int)* 8); unsigned int ThreeQuarters = (unsigned int)((BitsInUnignedInt * 3) / 4); unsigned int OneEighth = (unsigned int)(BitsInUnignedInt / 8); unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnignedInt - OneEighth); unsigned int hash = 0; unsigned int test = 0; while (*str) { hash = (hash << OneEighth) + (*str++); if ((test = hash & HighBits) != 0) { hash = ((hash ^ (test >> ThreeQuarters)) & (~HighBits)); } } return (hash & 0x7FFFFFFF); } // ELF Hash Function int HashFun5(const char *str) { unsigned int hash = 0; unsigned int x = 0; while (*str) { hash = (hash << 4) + (*str++); if ((x = hash & 0xF0000000L) != 0) { hash ^= (x >> 24); hash &= ~x; } } return (hash & 0x7FFFFFFF); } ///////////////////////////////////////////////////////////////////////////////////// //初始化 void InitBloomFilter(BloomFilter *BloomFilter, int total, STRTOINT *hashfun) { int i = 0; assert(BloomFilter); //初始化位元位 InitBitMap(&BloomFilter->_bmp, 5 * total); for (i = 0; i < 5; i++) { BloomFilter->HashFun[i] = hashfun[i]; } BloomFilter->size = 0; } //插入 void InsertBloomFilter(BloomFilter *BloomFilter, char *str) { int Hash1 = 0; int Hash2 = 0; int Hash3 = 0; int Hash4 = 0; int Hash5 = 0; assert(BloomFilter); Hash1 = BloomFilter->HashFun[0](str) % BloomFilter->_bmp.capacity; Hash2 = BloomFilter->HashFun[1](str) % BloomFilter->_bmp.capacity; Hash3 = BloomFilter->HashFun[2](str) % 
BloomFilter->_bmp.capacity; Hash4 = BloomFilter->HashFun[3](str) % BloomFilter->_bmp.capacity; Hash5 = BloomFilter->HashFun[4](str) % BloomFilter->_bmp.capacity; SetBitMap(&BloomFilter->_bmp, Hash1); SetBitMap(&BloomFilter->_bmp, Hash2); SetBitMap(&BloomFilter->_bmp, Hash3); SetBitMap(&BloomFilter->_bmp, Hash4); SetBitMap(&BloomFilter->_bmp, Hash5); BloomFilter->size++; } //大小 int SizeBloomFilter(BloomFilter *BloomFilter) { assert(BloomFilter); return BloomFilter->size; } //查詢 int FindBloomFilter(BloomFilter* BloomFilter, char *str) { int Hash1 = 0; int Hash2 = 0; int Hash3 = 0; int Hash4 = 0; int Hash5 = 0; assert(BloomFilter); Hash1 = BloomFilter->HashFun[0](str) % BloomFilter->_bmp.capacity; Hash2 = BloomFilter->HashFun[1](str) % BloomFilter->_bmp.capacity; Hash3 = BloomFilter->HashFun[2](str) % BloomFilter->_bmp.capacity; Hash4 = BloomFilter->HashFun[3](str) % BloomFilter->_bmp.capacity; Hash5 = BloomFilter->HashFun[4](str) % BloomFilter->_bmp.capacity; if (!TestSetMap(&BloomFilter->_bmp, Hash1)) { return 0; } if (!TestSetMap(&BloomFilter->_bmp, Hash2)) { return 0; } if (!TestSetMap(&BloomFilter->_bmp, Hash3)) { return 0; } if (!TestSetMap(&BloomFilter->_bmp, Hash4)) { return 0; } if (!TestSetMap(&BloomFilter->_bmp,Hash5)) { return 0; } return 1; }

BitMap.h

#include<stdio.h>
#include<malloc.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>


//Bit array stored in an array of int words (bit i lives in word i >> 5, at position i % 32).
typedef struct BitMap
{
    int *Bit;    //word buffer allocated by InitBitMap
    int capacity;//total number of bits
    int size;//number of bits counted as set to 1 (maintained by SetBitMap / ResetBitMap)
}BitMap;

//Initialize the bitmap with room for `total` bits, all cleared
void InitBitMap(BitMap *bit,int total);
//Set bit `which` to 1
void SetBitMap(BitMap *bit, int which);
//Number of bits counted as 1
int SizeBitMap(BitMap *bit);
//Clear bit `which` to 0
void ResetBitMap(BitMap *bit, int which);
//Test whether bit `which` is 1
int TestSetMap(BitMap *bit, int which);






/////////////////////////////////////////////////////////////////////////////////
//Initialize the bitmap with room for `total` bits, all cleared.
//  bit   - bitmap to initialize (must not be NULL)
//  total - number of bits to provide (must not be negative)
//On allocation failure the assert fires; in builds where asserts are
//compiled out the bitmap is left empty (Bit == NULL, capacity == 0,
//size == 0) instead of advertising a capacity it has no storage for.
void InitBitMap(BitMap *bit, int total)
{
    assert(bit);
    assert(total >= 0);

    bit->size = 0;
    //32 bits per word; the extra word covers a `total` that is not a multiple of 32.
    //calloc returns zeroed memory, so every bit starts out clear.
    bit->Bit = calloc((size_t)total / 32 + 1, sizeof *bit->Bit);
    if (bit->Bit == NULL)
    {
        bit->capacity = 0;
        assert(0);
        return;
    }
    bit->capacity = total;
}
//Set bit `which` to 1.
//  bit   - bitmap to modify (must not be NULL)
//  which - bit number, 0 <= which < bit->capacity
//size counts bits that are 1, so it is only incremented when the bit
//actually changes from 0 to 1; setting an already-set bit is a no-op.
void SetBitMap(BitMap *bit, int which)
{
    assert(bit);
    assert(which >= 0 && which < bit->capacity);

    //word that holds the bit (32 bits per int word)
    const int index = which >> 5;
    //unsigned mask: shifting a signed 1 into bit 31 would be undefined behavior
    const unsigned int mask = 1u << (which & 31);
    const unsigned int word = (unsigned int)bit->Bit[index];

    if ((word & mask) == 0)
    {
        bit->Bit[index] = (int)(word | mask);
        bit->size++;
    }
}
//Report how many bits the bitmap currently counts as set to 1.
int SizeBitMap(BitMap *bit)
{
    assert(bit);

    const int set_count = bit->size;
    return set_count;
}

//Test whether bit `which` is 1.
//  bit   - bitmap to inspect (must not be NULL)
//  which - bit number, 0 <= which < bit->capacity
//Returns 1 if the bit is set and 0 otherwise.
//This is a read-only probe: the stored word is never written, so testing
//one bit cannot disturb the other bits that share its word.
int TestSetMap(BitMap *bit, int which)
{
    assert(bit);
    assert(which >= 0 && which < bit->capacity);

    //word that holds the bit (32 bits per int word)
    const int index = which >> 5;
    //position of the bit inside that word
    const int pos = which & 31;

    //work on an unsigned copy so bit 31 is handled without signed-shift issues
    return (int)(((unsigned int)bit->Bit[index] >> pos) & 1u);
}
//Clear bit `which` to 0.
//  bit   - bitmap to modify (must not be NULL)
//  which - bit number, 0 <= which < bit->capacity
//size is decremented only when the bit was actually 1 beforehand.
void ResetBitMap(BitMap *bit, int which)
{
    assert(bit);
    assert(which >= 0 && which < bit->capacity);

    //word that holds the bit (32 bits per int word)
    const int index = which >> 5;
    //unsigned mask: shifting a signed 1 into bit 31 would be undefined behavior
    const unsigned int mask = 1u << (which & 31);
    const unsigned int word = (unsigned int)bit->Bit[index];

    //the bit is tested inline so the word is only ever modified by the clear below
    if ((word & mask) != 0)
    {
        bit->Bit[index] = (int)(word & ~mask);
        bit->size--;
    }
}

測試

#include"BloomFilter.h"





void Test()
{
    BloomFilter  BloomFilter;
    STRTOINT hanshfun[5] = { HashFun1, HashFun2,HashFun3,HashFun4,HashFun5 };
    //初始化
    InitBloomFilter(&BloomFilter, 100, hanshfun);
    //插入
    InsertBloomFilter(&BloomFilter, "pig");
    InsertBloomFilter(&BloomFilter, "dog");
    InsertBloomFilter(&BloomFilter, "cat");
    InsertBloomFilter(&BloomFilter, "apple");
    InsertBloomFilter(&BloomFilter, "banana");
    printf("Size = %d\n", SizeBloomFilter(&BloomFilter));
    //查詢
    if (FindBloomFilter(&BloomFilter, "pig"))
    {
        printf("\"pig\" is BloomFilter!\n");
    }
    else
    {
        printf("\"pig\" not is BloomFilter!\n");
    }

    if (FindBloomFilter(&BloomFilter, "orange"))
    {
        printf("\"orange\" is BloomFilter!\n");
    }
    else
    {
        printf("\"orange\" not is BloomFilter!\n");
    }
}
//Program entry point: runs the Bloom filter demo once.
int main(void)
{
    Test();

    //NOTE(review): "pause" is a Windows shell command, presumably used to keep
    //the console window open; on other systems this call just fails
    system("pause");

    return 0;
}

這裡寫圖片描述