1. 程式人生 > >AC自動機的一種簡單實現

AC自動機的一種簡單實現

Problem Description(本題源自ACM題庫HDU 2222)

In the modern time, Search engine came into the life of everybody like Google, Baidu, etc.

Wiskey also wants to bring this feature to his image retrieval system.

Every image have a long description, when users type some keywords to find the image, the system will match the keywords with description of image and show the image which the most keywords be matched.

To simplify the problem, giving you a description of image, and some keywords, you should tell me how many keywords will be match.

(給一個描述即目標串,和一些關鍵詞即模式串,請輸出描述中關鍵詞的出現次數。)

Input

First line will contain one integer means how many cases will follow by.

Each case will contain two integers N means the number of keywords and N keywords follow. (N <= 10000)

Each keyword will only contains characters 'a'-'z', and the length will be not longer than 50.

The last line is the description, and the length will be not longer than 1000000.

(第一行包含多少個例子將要測試,每個例子將包含兩個整數型N,代表有多少個子字串,字串只允許包含’a’-‘z’範圍的字母,且長度不超過50,最後一行為描述,長度不應該超過1000000。)

Output

Print how many keywords are contained in the description.

(列印有多少個關鍵詞在描述中)

Sample Input

1

5

she

he

say

shr

her

yasherhs

Sample Output

3

說明:上述題目旨在希望編寫一個可以查詢目標串中多個模式串的出現次數的程式,屬於典型的AC自動機的應用,利用Trie樹和KMP匹配思想構建AC自動機可以快速有效的查詢一個目標串中多個模式串的出現次數。“典型應用是用於統計和排序大量的字串(但不僅限於字串),所以經常被搜尋引擎系統用於文字詞頻統計。它的優點是:最大限度地減少無謂的字串比較,查詢效率比雜湊表高。”——摘自百度百科

本題使用了Trie樹+KMP匹配思想來構建AC自動機,對Trie樹的儲存結構採用的是動態儲存分配,結點中的孩子結點的指標儲存在陣列中,方便查詢,另外構建fail指標來進行回溯(若字串匹配失敗則回溯到上一匹配結點)。另外本課題中使用了佇列來對Trie樹進行BFS(廣度遍歷),在此不再列出。

步驟:

1.      首先通過使用者輸入模式串的個數(即所要匹配的子字串個數);

2.      再通過使用者輸入的多個子字串來構建Trie樹(Trie_insert);

3.      然後對構建好的Trie樹構建fail指標,當匹配失敗時通過fail指標回溯到前一匹配結果(KMP演算法);

4.      利用構建好的AC自動機來對目標串進行查詢並顯示查詢結果。


AC_auto.h

#ifndef _Trie_H_
#define _Trie_H_

#include<cstddef>
#include<cstdlib>
#include<iostream>
#include<string>
#include<vector>
using namespace std;

/*
MAX_CHILD  maximum number of children a single node can have; 26 is the number of letters
MAX_SIZE   maximum length of the queue (the original note also mentions a stack);
           it is meant to be larger than MAX_CHILD
*/

#define MAX_CHILD 26
#define MAX_SIZE 50

// One node of the trie / AC automaton.
struct TrieNode{
	int count;                  // number of inserted patterns whose path passes through this node
	TrieNode *next[MAX_CHILD];  // child slot per letter, indexed by (letter - 'a')
	TrieNode *fail;             // fail pointer, filled in by construct_Fail
	bool exist;                 // true when an inserted pattern ends exactly at this node

	// Builds an empty node: zero count, no fail pointer, not a pattern end, all child slots NULL.
	// The initializer list follows the member declaration order.
	TrieNode() : count(0), fail(NULL), exist(false){
		int slot = MAX_CHILD;
		while (slot-- > 0){
			next[slot] = NULL;
		}
	}
};

// Inserts the pattern str into the trie rooted at root.
bool Trie_insert(TrieNode *root, char *str);
bool Trie_search(TrieNode *root, char *str);// test helper: checks whether a pattern has already been inserted into the trie
void construct_Fail(TrieNode *root);// builds the fail pointers; a child slot next[id] that is empty is pointed directly at the root node
int query_str(TrieNode *root, char *str);// runs the AC automaton (fail pointers already built) over the target string and returns the match count

//佇列用於BFS(層次訪問)各個樹結點
class queue{
public:
	queue() :front(0), rear(0){ elem = new TrieNode*[MAX_SIZE]; }
	void makeEmpty();
	bool isEmpty();
	bool isFull();
	bool pop(TrieNode *&p);
	bool push(TrieNode *p);
private:
	TrieNode **elem;
	int front, rear;//rear指標用於指向佇列的尾元素的後一位
};

#endif

AC_func.cpp
#include"AC_auto.h"
#include<iostream>
using namespace std;

// Drops every queued element by moving the tail index back onto the head index.
void queue::makeEmpty(){
	rear = front;
}

// The queue is empty exactly when head and tail indices coincide.
bool queue::isEmpty(){
	return front == rear;
}

// The queue is full when advancing the tail by one slot (with wrap-around) would hit the head.
bool queue::isFull(){
	return front == (rear + 1) % MAX_SIZE;
}

// Appends p at the tail of the circular buffer.
// Returns false, leaving the queue untouched, when the queue is full; true otherwise.
bool queue::push(TrieNode *p){
	// Same condition isFull() evaluates: the slot after rear is the head.
	const int following = (rear + 1) % MAX_SIZE;
	if (following == front){
		return false;
	}
	elem[rear] = p;
	rear = following;
	return true;
}

// Removes the head element and hands it back through p.
// Returns false (p is left unchanged) when the queue is empty; true otherwise.
bool queue::pop(TrieNode *&p){
	const bool hasElement = (front != rear);
	if (hasElement){
		p = elem[front];
		// Step the head forward, wrapping at the end of the array.
		front = (front + 1) % MAX_SIZE;
	}
	return hasElement;
}

/*
 * Inserts one pattern into the trie rooted at `root`.
 *
 * root  root node of the trie (must not be NULL)
 * str   NUL-terminated pattern made only of the letters 'a'..'z'
 *
 * Returns true when the pattern was inserted. Returns false, without touching
 * the trie, when root or str is NULL, when str is empty, or when str contains
 * a character outside 'a'..'z' (such a character would index outside next[]).
 *
 * Every node on the pattern's path gets its count incremented, and the last
 * node is flagged with exist = true.
 *
 * Must be called before construct_Fail: that function overwrites empty child
 * slots, so the NULL test used here no longer identifies a missing child.
 */
bool Trie_insert(TrieNode *root, char *str){
	if (root == NULL || str == NULL || *str == '\0') return false;

	// Validate the whole pattern first so a rejected pattern leaves no partial path behind.
	for (const char *c = str; *c; ++c){
		if (*c < 'a' || *c > 'z') return false;
	}

	TrieNode *tail = root;
	for (const char *c = str; *c; ++c){
		const int id = *c - 'a';
		if (tail->next[id] == NULL){
			// Plain new throws std::bad_alloc on failure and never yields NULL,
			// so no NULL check is needed after it.
			tail->next[id] = new TrieNode;
		}
		tail = tail->next[id];
		tail->count++;
	}
	tail->exist = true;
	return true;
}


/*
 * Reports whether `str` was inserted into the trie as a complete pattern.
 *
 * root  root node of the trie
 * str   NUL-terminated string to look up
 *
 * Returns true only when every character of str follows an existing child
 * and the node reached is flagged exist. Returns false when root or str is
 * NULL, or when str contains a character outside 'a'..'z' (no pattern can
 * contain one, and it would index outside next[]).
 */
bool Trie_search(TrieNode *root, char *str){
	if (root == NULL || str == NULL) return false;

	TrieNode *tail = root;
	for (const char *c = str; *c; ++c){
		if (*c < 'a' || *c > 'z') return false;
		TrieNode *child = tail->next[*c - 'a'];
		// A slot is empty when it is NULL (before construct_Fail) or points
		// back at the root (construct_Fail redirects empty slots to the root).
		if (child == NULL || child == root) return false;
		tail = child;
	}
	return tail->exist;
}


/*
 * Builds the fail pointers of the trie rooted at `root`, turning it into an
 * AC automaton. Call once, after all patterns have been inserted.
 *
 * Postconditions:
 *   - root->fail is NULL; every other node's fail points at the node spelling
 *     the longest proper suffix of its own path that is also a path in the trie
 *     (the root when there is none).
 *   - every empty child slot (next[i] == NULL) is redirected to the root, so a
 *     slot holding `root` means "no child for this letter".
 *
 * Nodes are visited in BFS order, so the fail pointers of all shallower nodes
 * are final before a node's children are processed. A growable worklist is
 * used instead of the fixed-capacity `queue` class: that queue holds at most
 * MAX_SIZE - 1 entries and a failed push would silently drop a whole subtree,
 * leaving its fail pointers NULL.
 */
void construct_Fail(TrieNode *root){
	if (root == NULL) return;

	std::vector<TrieNode*> order;   // BFS worklist; order[head..] is still to be processed
	root->fail = NULL;
	order.push_back(root);

	for (std::size_t head = 0; head < order.size(); ++head){
		TrieNode *node = order[head];

		for (int i = 0; i < MAX_CHILD; i++){
			TrieNode *child = node->next[i];
			if (child == NULL || child == root){
				// No child for this letter: mark the slot with the root.
				node->next[i] = root;
				continue;
			}

			// Walk up the parent's fail chain to the first node that has a real
			// child for letter i. A single step is not enough: the parent's fail
			// node may itself lack that child while a shorter suffix has it.
			TrieNode *f = node->fail;
			while (f != NULL && (f->next[i] == NULL || f->next[i] == root)){
				f = f->fail;
			}
			// f == NULL means the chain ran past the root (this also covers the
			// root's own children, whose fail pointer is the root).
			child->fail = (f == NULL) ? root : f->next[i];
			order.push_back(child);
		}
	}
}


/*
 * Scans the target string with the AC automaton and counts pattern matches.
 *
 * root  root of a trie whose fail pointers were built by construct_Fail
 * str   NUL-terminated target string
 *
 * Returns the total number of pattern occurrences in str: every position at
 * which an inserted pattern ends is counted, including patterns that are
 * suffixes of a longer matched path (e.g. "he" inside "she"). Returns 0 when
 * root or str is NULL.
 *
 * Any character outside 'a'..'z' cannot be part of a pattern; it resets the
 * automaton to the root instead of being used as an index into next[].
 */
int query_str(TrieNode *root, char *str){
	if (root == NULL || str == NULL) return 0;

	TrieNode *state = root;
	int matches = 0;

	for (int i = 0; str[i]; i++){
		if (str[i] < 'a' || str[i] > 'z'){
			state = root;
			continue;
		}
		const int id = str[i] - 'a';

		// Fall back along the fail chain until the current state has a real
		// child for this letter. An empty slot is NULL or points at the root.
		while (state != root && (state->next[id] == NULL || state->next[id] == root)){
			state = (state->fail != NULL) ? state->fail : root;
		}
		TrieNode *advanced = state->next[id];
		if (advanced != NULL && advanced != root){
			state = advanced;
		}
		// Otherwise state is the root and the letter starts no pattern: stay there.

		// Every node on the fail chain spells a suffix of the text read so far,
		// so each one flagged exist is a pattern ending at this position.
		for (TrieNode *suffix = state; suffix != NULL && suffix != root; suffix = suffix->fail){
			if (suffix->exist) matches++;
		}
	}
	return matches;
}

AC_auto.cpp
#include"AC_auto.h"
#include<iostream>
using namespace std;


int main(){
	system("color F0");
	cout << "\n" << endl;
	cout << "\t***************************************** " << endl;
	cout << "\t*\t\t\t\t\t*" << endl;
	cout << "\t*\t   本程式通過構建AC自動機   \t*" << endl;
	cout << "\t*    統計多模式串在目標串中的出現次數   *" << endl;
	cout << "\t*\t\t\t\t\t*" << endl;
	cout << "\t***************************************** " << endl;
	cout << endl << endl;
	TrieNode *root=new TrieNode;
	int n = 0, count1 = 0;
	cout << "--請輸入單詞子串數量:" ;
	cin >> n;
	char **str=new char*[n];
	char *astring = new char[MAX_SIZE];

	cout << "--請逐個輸入單詞子串(子串字母長度應少於"<<MAX_SIZE <<"):"<< endl;//輸入n個模式串
	for (int i = 0; i < n; i++){
		str[i] = new char[MAX_SIZE];
		cin >> str[i];
	}

	for (int i = 0; i < n; i++){
		Trie_insert(root, str[i]);//執行插入結點
	}
	//搜尋模式串是否已經插入結點
	/*for (int i = 0; i < n; i++){
		if (Trie_search(root, str[i]));
	}*/

	construct_Fail(root);//構建fail指標

	cout << "--請輸入目標串(字母長度應少於"<<MAX_SIZE<<"):" << endl;//接下來是執行多模式串匹配過程
	cin >> astring;
	count1 = query_str(root, astring);
	cout <<"--目標串中含"<<count1<<"個模式子串.\n" << endl;
	return 0;
}