程式人生 > 利用AC自動機進行關鍵字的提取和過濾

利用AC自動機進行關鍵字的提取和過濾

package com.AC.domain;

import java.util.*;
import java.io.*;
import java.math.*;

/**
 * Multi-keyword matcher: a trie of keywords whose nodes carry failure links,
 * so a text can be scanned for every keyword in a single left-to-right pass
 * (Aho-Corasick style).
 *
 * <p>The trie is built once in the constructor; {@link #searchKeyword} only
 * reads it.
 */
public class Patterns {
	/** Trie root. Its failure link is set to itself in the constructor. */
	private final Node root = new Node();

	/**
	 * Every node created for this trie, root first, in creation order.
	 * It is only appended to inside this class.
	 */
	private List<Node> tree;

	/**
	 * Builds the trie for the given keywords and then wires the failure links.
	 *
	 * @param keywords keywords to index; {@code null} is treated as an empty
	 *                 list, and {@code null} entries or entries whose word is
	 *                 {@code null} are skipped
	 */
	public Patterns(List<Keyword> keywords){
		tree = new ArrayList<Node>();
		root.failureNode = root;
		tree.add(root);
		if (keywords != null) {
			for (Keyword keyword : keywords) {
				addKeyword(keyword);
			}
		}
		setFailNode();
	}

	/**
	 * Assigns failure links breadth-first and merges, into each node, the
	 * keywords already collected on the node its failure link points to.
	 *
	 * <p>Direct children of the root are not re-linked here; they keep the
	 * link given to them at construction time in {@link #addKeyword}.
	 */
	private void setFailNode() {
		Queue<Node> pending = new ArrayDeque<Node>();
		// An empty trie may leave the root without a children list.
		if (root.childrenList != null) {
			for (Node firstLevel : root.childrenList) {
				pending.offer(firstLevel);
			}
		}
		while (!pending.isEmpty()) {
			Node parent = pending.poll();
			if (parent.childrenList == null) {
				continue;
			}
			for (Node child : parent.childrenList) {
				pending.offer(child);
				// Walk the parent's failure chain until a node with a matching
				// child is found or the root is reached. The root is recognised
				// by identity, so termination does not depend on Node.state.
				Node fail = parent.failureNode;
				while (fail != null && fail != root && !fail.containsChild(child.character)) {
					fail = fail.failureNode;
				}
				if (fail != null && fail.containsChild(child.character)) {
					child.failureNode = fail.getChild(child.character);
					// Breadth-first order guarantees the target's keyword set is
					// already complete, so one merge is enough.
					if (child.failureNode.keywords != null) {
						child.addKeywords(child.failureNode.keywords);
					}
				}
			}
		}
	}

	/**
	 * Inserts one keyword into the trie, creating missing nodes along its path
	 * and attaching the keyword to the final node.
	 *
	 * @param keyword keyword to insert; ignored when it or its word is {@code null}
	 */
	private void addKeyword(Keyword keyword) {
		if (keyword == null || keyword.getWord() == null) {
			return;
		}
		Node current = root;
		for (char currentChar : keyword.getWord().toCharArray()) {
			if (current.containsChild(currentChar)) {
				current = current.getChild(currentChar);
			} else {
				// NOTE(review): the second constructor argument is presumably the
				// initial failure link - confirm against Node.
				Node node = new Node(currentChar, root);
				current.addChild(node);
				current = node;
				tree.add(node);
			}
		}
		current.addKeyword(keyword);
	}

	/**
	 * Scans {@code data} and returns every keyword occurrence found, in the
	 * order the occurrences end in the text.
	 *
	 * @param data     text to scan; {@code null} yields an empty result
	 * @param category when {@code null}, every match is reported as a fresh
	 *                 {@code Keyword} built from the matched word; otherwise
	 *                 only stored keywords whose categories contain this value
	 *                 are reported (the stored instances themselves)
	 * @return a new, mutable list of matches; never {@code null}
	 */
	public List<Keyword> searchKeyword(String data,Integer category) {
		List<Keyword> matchResult = new ArrayList<Keyword>();
		if (data == null) {
			return matchResult;
		}
		Node node = root;
		for (int i = 0; i < data.length(); i++) {
			char ch = data.charAt(i);
			// Follow failure links until some node accepts ch or the root is reached.
			while (node != null && node != root && !node.containsChild(ch)) {
				node = node.failureNode;
			}
			if (node == null) {
				// A missing failure link must not disable matching for the rest
				// of the text; restart from the root instead.
				node = root;
			}
			if (!node.containsChild(ch)) {
				continue;
			}
			node = node.getChild(ch);
			if (node.keywords == null) {
				continue;
			}
			for (Keyword pattern : node.keywords) {
				if (category == null) {
					matchResult.add(new Keyword(pattern.getWord()));
				} else if (pattern.getCategories() != null
						&& pattern.getCategories().contains(category)) {
					matchResult.add(pattern);
				}
			}
		}
		return matchResult;
	}

}
Test.java
package com.AC.domain;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Small console demo: indexes a fixed keyword set and prints every keyword
 * found in a sample text.
 */
public class Test {
	/**
	 * Builds the keyword list, runs an unfiltered search and prints each
	 * matched word on its own line after a "keys: " header.
	 *
	 * @param args unused
	 */
	public static void main(String []args){
		// Dictionary: abcd abc abe ae bc be bce cm; scanned text: kcabcmgha
		String[] words = {"abcd", "abc", "abe", "ae", "bc", "be", "bce", "cm"};

		List<Keyword> keywords = new ArrayList<Keyword>();
		for (int i = 0; i < words.length; i++) {
			Keyword entry = new Keyword();
			entry.setWord(words[i]);
			keywords.add(entry);
		}

		// A null category asks for all matches regardless of category.
		Patterns patterns = new Patterns(keywords);
		List<Keyword> result = patterns.searchKeyword("kcabcmgha", null);

		System.out.println("keys: ");
		for (Keyword match : result) {
			System.out.println(match.getWord());
		}
	}

}
附美團文章連結:http://tech.meituan.com/ac.html