1. 程式人生 > >Java過濾敏感詞彙演算法(字典樹)

Java過濾敏感詞彙演算法(字典樹)

定義節點

import java.util.ArrayList;
import java.util.List;

/**
 * One node of the sensitive-word trie.
 *
 * <p>A node stores a single character, the list of nodes that may follow it,
 * and a flag telling whether a stored word ends at this node.
 */
public class Node {
    /** Character represented by this node. */
    private final char root;
    /** Direct children, kept in the order they were added. */
    private final List<Node> childList;
    /** {@code true} when a stored word ends at this node. */
    private boolean isLeaf;

    /**
     * Creates a node with no children that does not terminate a word.
     *
     * @param root the character this node stands for
     */
    public Node(char root) {
        this.root = root;
        this.childList = new ArrayList<Node>();
        this.isLeaf = false;
    }

    /**
     * Looks up the direct child that represents the given character.
     *
     * @param c the character to look for
     * @return the first child added for {@code c}, or {@code null} when there is none
     */
    public Node subNode(char c) {
        for (int idx = 0; idx < childList.size(); idx++) {
            Node candidate = childList.get(idx);
            if (candidate.root == c) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * @return {@code true} when a stored word ends at this node
     */
    public boolean isLeaf() {
        return this.isLeaf;
    }

    /**
     * Marks or unmarks this node as the end of a stored word.
     *
     * @param leaf the new value of the end-of-word flag
     */
    public void setLeaf(boolean leaf) {
        this.isLeaf = leaf;
    }

    /**
     * Appends a child to this node. No duplicate check is performed.
     *
     * @param node the child to append
     */
    public void addChild(Node node) {
        childList.add(node);
    }
}

定義字典樹


/**
 * Sensitive-word filter backed by a character trie built from {@link Node}s.
 *
 * <p>Words are registered with {@link #insert(String)}. While matching, space
 * characters in the scanned text are ignored, so a registered word is still
 * recognised when its characters are separated by spaces. Any other separator
 * character breaks the match.
 */
public class WordsTree {
    /** Text written in place of every matched sensitive word. */
    private static final String MASK = "**";

    /** Character that is ignored between the characters of a word. */
    private static final char SEPARATOR = ' ';

    private Node head;

    /** Creates an empty filter. */
    public WordsTree() {
        this.head = new Node(' '); // sentinel head node; it is never looked up as a child
    }

    /**
     * Registers a sensitive word. Inserting a word that is already present is a no-op.
     *
     * <p>Space characters inside {@code word} are not stored: matching skips
     * spaces in the scanned text, so a stored space node could never be reached
     * and the word would silently never match. A word that is empty or made only
     * of spaces is ignored.
     *
     * @param word the word to register; must not be {@code null}
     */
    public void insert(String word) {
        Node node = head;
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            if (c == SEPARATOR) {
                continue;
            }
            Node child = node.subNode(c);
            if (child == null) {
                child = new Node(c);
                node.addChild(child);
            }
            node = child;
        }
        // Never flag the head itself: that would make every position match the empty word.
        if (node != head) {
            node.setLeaf(true);
        }
    }

    /**
     * Checks whether {@code word} starts with a registered sensitive word.
     *
     * <p>The shortest registered word wins. Spaces met while matching are
     * skipped but are part of the returned text, including spaces that precede
     * the first matched character.
     *
     * @param word the text to test from its first character; must not be {@code null}
     * @return the matched prefix of {@code word}, or {@code ""} when there is no match
     */
    public String search(String word) {
        int end = matchEnd(word, 0);
        return end < 0 ? "" : word.substring(0, end);
    }

    /**
     * Masks every sensitive word found in {@code comment}.
     *
     * <p>The text is scanned once from left to right. At each position the
     * shortest registered word starting there (ignoring spaces) is replaced by
     * {@code "**"} and scanning resumes immediately after it, so matches never
     * overlap and adjacent sensitive words are both masked. Matched text is
     * handled literally, never as a regular expression.
     *
     * @param comment the text to filter; must not be {@code null}
     * @return the text with each match replaced by {@code "**"}
     */
    public String searchComment(String comment) {
        StringBuilder filtered = new StringBuilder(comment.length());
        int i = 0;
        while (i < comment.length()) {
            int end = matchEnd(comment, i);
            if (end > i) {
                filtered.append(MASK);
                i = end; // resume right after the match, without skipping a character
            } else {
                filtered.append(comment.charAt(i));
                i++;
            }
        }
        return filtered.toString();
    }

    /**
     * Walks the trie along {@code text} starting at index {@code start}.
     *
     * @return the exclusive end index of the shortest match, or {@code -1} when
     *         no registered word starts at {@code start}
     */
    private int matchEnd(String text, int start) {
        Node node = head;
        for (int i = start; i < text.length(); i++) {
            if (node.isLeaf()) {
                return i;
            }
            char c = text.charAt(i);
            // Skip spaces so that a word split by spaces is still recognised.
            if (c == SEPARATOR) {
                continue;
            }
            Node child = node.subNode(c);
            if (child == null) {
                return -1;
            }
            node = child;
        }
        return node.isLeaf() ? text.length() : -1;
    }
}

呼叫

/**
 * Demo entry point: registers two sensitive words and prints a filtered
 * tongue twister.
 */
public class main {

    /**
     * Builds the filter, runs it over the sample text, and prints the result
     * to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        String[] sensitiveWords = {"鯉魚", "他家"};

        WordsTree filter = new WordsTree();
        for (String sensitiveWord : sensitiveWords) {
            filter.insert(sensitiveWord);
        }

        // The sample contains one word split by spaces and one split by the digit 1.
        String sample = "紅鯉魚家有頭小綠驢叫李屢屢,綠鯉    魚家有頭小紅驢叫呂裡裡,紅鯉1魚說他家的李屢屢要比綠鯉魚家的呂裡裡綠,綠鯉魚說他家的呂裡裡要比紅鯉魚家的李屢屢紅,是紅鯉魚比綠鯉魚的驢綠,還是綠鯉魚比紅鯉魚的驢紅。";
        String censored = filter.searchComment(sample);
        System.out.println(censored);
    }
}

結果:

紅**家有頭小綠驢叫李屢屢,綠**家有頭小紅驢叫呂裡裡,紅鯉1魚說**的李屢屢要比綠**家的呂裡裡綠,綠**說**的呂裡裡要比紅**家的李屢屢紅,是紅**比綠**的驢綠,還是綠**比紅**的驢紅。

可以看出,用空格隔開的“鯉魚”二字仍然被識別出來了,而用“1”隔開的“鯉魚”二字沒有被識別:演算法在比對時只會略過空格,其他字元都會中斷比對。網路聊天中的敏感詞過濾通常也是這樣運作的,當一些詞語無法傳送時,用一些未被過濾器略過的常見字元將其隔開就可以傳送了。此外,“他家”兩個字也可以檢測出來。