1. 程式人生 > >經典的分詞方法實現(JAVA)

經典的分詞方法實現(JAVA)

基於規則的自動分詞演算法

原理

(1) 事先人工建立好分詞詞典和分詞規則庫。
(2) 原理為基於字串匹配進行分詞,這樣就要求有足夠大的詞表為依據。
(3) 通過一定的演算法來實現,如正向最大匹配法、逆向最大匹配法、雙向匹配法等。
(4) 優缺點:當分詞詞典所收容的詞較少時,顯然覆蓋度就有限,分詞的正確率就低。

正向最大匹配法

演算法描述

設MaxLen表示最大詞長,D為分詞詞典
(1) 從待切分語料中按正向取長度為MaxLen的字串str,令Len=MaxLen;
(2) 把str與D中的詞相匹配;
(3) 若匹配成功,則認為該字串為詞,指向待切分語料的指標向前移Len個漢字(位元組),返回到(1);
(4) 若不成功:如果Len>1,則將Len減1,從待切分語料中取長度為Len的字串str,返回到(2)。否則,得到長度為1的單字詞,指向待切分語料的指標向前移1個漢字,返回(1)。

演算法程式碼

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Dictionary-based word segmenter using forward maximum matching.
 *
 * <p>Starting at the beginning of the text, the longest candidate (up to the
 * configured maximum word length) is looked up in the dictionary. If it is not
 * a known word the candidate is shortened by one character and retried; a
 * single remaining character is always emitted as a word of its own.
 *
 * <p>Instances keep the scan state in fields and are therefore not safe for
 * concurrent use.
 *
 * @author quincy1994
 */
public class Nlp {

    private String m_sResult = "";      // segmentation output accumulated so far
    private int m_nPosIndex;            // cursor into the text being segmented
    private int m_MaxLen;               // candidate length currently being tried
    private int totalMaxLen;            // upper bound for the length of a word
    private Set<String> dictionary;     // known words

    /**
     * Creates a segmenter and loads the dictionary from {@code dict.txt}.
     *
     * @param maxLen maximum word length to try; values below 1 are treated as 1
     */
    public Nlp(int maxLen) {
        this.totalMaxLen = Math.max(1, maxLen);
        this.m_MaxLen = this.totalMaxLen;
        this.m_nPosIndex = 0;
        this.dictionary = loadDictionaryOrEmpty();
    }

    /** Creates a segmenter with a maximum word length of 3. */
    public Nlp() {
        this(3);
    }

    /**
     * Loads the dictionary, falling back to an empty one when the file cannot
     * be read. The failure is logged; with an empty dictionary every character
     * is emitted as its own word instead of the segmenter crashing later on a
     * null dictionary.
     */
    private Set<String> loadDictionaryOrEmpty() {
        try {
            return this.loadFile();
        } catch (IOException ex) {
            Logger.getLogger(Nlp.class.getName()).log(Level.SEVERE, null, ex);
            return new HashSet<String>();
        }
    }

    /**
     * Reads {@code dict.txt} from the working directory. Each line is split on
     * {@code ","} and the first field is taken as a word.
     *
     * @return the set of dictionary words
     * @throws FileNotFoundException if {@code dict.txt} does not exist
     * @throws IOException if reading fails
     */
    public Set<String> loadFile() throws FileNotFoundException, IOException {
        Set<String> words = new HashSet<String>();
        String filename = "dict.txt";
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] token = line.split(",");
                // split() drops trailing empty fields, so a line such as ","
                // yields an empty array; blank first fields carry no word.
                if (token.length == 0 || token[0].isEmpty()) {
                    continue;
                }
                words.add(token[0]);
            }
        }
        return words;
    }

    /**
     * Segments {@code source} from scratch.
     *
     * @param source text to segment; {@code null} is treated as empty
     * @return the words in order, each followed by {@code "/ "}
     */
    public String MMSegment(String source) {
        // Reset the scan state so that repeated calls do not see the cursor
        // or the output left behind by a previous call.
        m_sResult = "";
        m_nPosIndex = 0;
        m_MaxLen = totalMaxLen;
        if (source == null || source.isEmpty()) {
            return m_sResult;
        }
        MM(source, totalMaxLen, 0);
        return m_sResult;
    }

    /**
     * Returns up to {@code len} characters of {@code source} starting at
     * {@code m_nPosIndex}, clipped at the end of the text.
     *
     * @param source text to cut from
     * @param m_nPosIndex start index (inclusive)
     * @param len requested number of characters
     * @return the clipped substring
     */
    public String getSubString(String source, int m_nPosIndex, int len) {
        int endIndex = Math.min(m_nPosIndex + len, source.length());
        return source.substring(m_nPosIndex, endIndex);
    }

    /**
     * Continues matching from the internal cursor to the end of
     * {@code source}, appending every word found to the internal result.
     *
     * <p>Implemented as a loop rather than recursion so that long texts do not
     * exhaust the call stack.
     *
     * @param source text being segmented
     * @param len candidate length to try first; values below 1 are treated as 1
     * @param frompos not used; scanning always starts at the internal cursor
     */
    public void MM(String source, int len, int frompos) {
        m_MaxLen = Math.max(1, len);
        StringBuilder out = new StringBuilder(m_sResult);
        while (m_nPosIndex < source.length()) {
            String sub = getSubString(source, m_nPosIndex, m_MaxLen);
            if (sub.length() == 1 || dictionary.contains(sub)) {
                // Known word, or a lone character that cannot be shortened.
                out.append(sub).append("/ ");
                m_nPosIndex += sub.length();
                m_MaxLen = totalMaxLen;
            } else {
                // Shrink relative to the actual candidate: near the end of
                // the text it may already be shorter than m_MaxLen.
                m_MaxLen = sub.length() - 1;
            }
        }
        m_sResult = out.toString();
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        Nlp nlp = new Nlp();
        String source = "今天天氣不錯!";
        String result = nlp.MMSegment(source);
        System.out.println(result);
    }
}

逆向最大匹配法

演算法描述

與正向最大匹配法原理一樣,只是匹配的開始為句尾

程式碼實現

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 *
 * @author quincy1994
 */
/**
 * Dictionary-based word segmenter using reverse maximum matching: the same
 * procedure as forward maximum matching, but candidates are taken from the end
 * of the text and shortened from the left.
 *
 * <p>Instances keep the scan state in fields and are therefore not safe for
 * concurrent use.
 */
public class RMM {
    private String m_sResult = "";         // segmentation output, already in reading order
    private int m_nPosIndex;                // cursor: exclusive end of the unsegmented prefix
    private int m_MaxLen;                    // candidate length currently being tried
    private int totalMaxlen;                // upper bound for the length of a word
    private Set<String> dictionary;      // known words

    /**
     * Creates a segmenter and loads the dictionary from {@code dict.txt}.
     *
     * @param maxLen maximum word length to try; values below 1 are treated as 1
     */
    public RMM(int maxLen){
        this.totalMaxlen = Math.max(1, maxLen);
        this.m_MaxLen = this.totalMaxlen;
        this.dictionary = loadDictionaryOrEmpty();
    }

    /** Creates a segmenter with a maximum word length of 3. */
    public RMM(){
        this(3);
    }

    /**
     * Loads the dictionary, falling back to an empty one when the file cannot
     * be read. The failure is logged; with an empty dictionary every character
     * is emitted as its own word instead of the segmenter crashing later on a
     * null dictionary.
     */
    private Set<String> loadDictionaryOrEmpty(){
        try {
            return loadFile();
        } catch (IOException ex) {
            Logger.getLogger(RMM.class.getName()).log(Level.SEVERE, null, ex);
            return new HashSet<String>();
        }
    }

    /**
     * Reads {@code dict.txt} from the working directory. Each line is split on
     * {@code ","} and the first field is taken as a word.
     *
     * @return the set of dictionary words
     * @throws IOException if the file is missing or cannot be read
     */
    public Set<String> loadFile() throws IOException{
        Set<String> words = new HashSet<String>();
        String filename = "dict.txt";
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String line;
            while((line = br.readLine()) != null){
                String[] token = line.split(",");
                // split() drops trailing empty fields, so a line such as ","
                // yields an empty array; blank first fields carry no word.
                if (token.length == 0 || token[0].isEmpty()) {
                    continue;
                }
                words.add(token[0]);
            }
        }
        return words;
    }

    /**
     * Segments {@code source} from scratch.
     *
     * @param source text to segment; {@code null} is treated as empty
     * @return the words in reading order, each followed by {@code "/ "}
     */
    public String RMMSegment(String source){
        // Reset the scan state so that repeated calls do not accumulate the
        // output of earlier calls.
        m_sResult = "";
        m_MaxLen = totalMaxlen;
        if (source == null || source.isEmpty()) {
            m_nPosIndex = 0;
            return m_sResult;
        }
        m_nPosIndex = source.length();
        rmm(source, totalMaxlen, m_nPosIndex);
        return m_sResult;
    }

    /**
     * Returns up to {@code len} characters of {@code source} ending just
     * before {@code m_nPosIndex}, clipped at the start of the text.
     *
     * @param source text to cut from
     * @param m_nPosIndex end index (exclusive)
     * @param len requested number of characters
     * @return the clipped substring
     */
    public String getSubString(String source, int m_nPosIndex, int len){
        int startIndex = Math.max(0, m_nPosIndex - len);
        return source.substring(startIndex, m_nPosIndex);
    }

    /**
     * Continues matching from the internal cursor back to the start of
     * {@code source}. Each word found is inserted at the front of the internal
     * result, so the result is always in reading order and never has to be
     * re-split on a separator (which would break on text containing "/").
     *
     * <p>Implemented as a loop rather than recursion so that long texts do not
     * exhaust the call stack.
     *
     * @param source text being segmented
     * @param len candidate length to try first; values below 1 are treated as 1
     * @param frompos not used; scanning always starts at the internal cursor
     */
    public void rmm(String source, int len, int frompos){
        m_MaxLen = Math.max(1, len);
        StringBuilder out = new StringBuilder(m_sResult);
        while (m_nPosIndex > 0) {
            String sub = getSubString(source, m_nPosIndex, m_MaxLen);
            if (sub.length() == 1 || dictionary.contains(sub)) {
                // Known word, or a lone character that cannot be shortened.
                out.insert(0, sub + "/ ");
                m_nPosIndex -= sub.length();
                m_MaxLen = totalMaxlen;
            } else {
                // Shrink relative to the actual candidate: near the start of
                // the text it may already be shorter than m_MaxLen.
                m_MaxLen = sub.length() - 1;
            }
        }
        m_sResult = out.toString();
    }

    public static void main(String[] args) {
        RMM myRMM = new RMM();
        String source = "記錄最佳前候選詞列表";
        String result = myRMM.RMMSegment(source);
        System.out.println(result);
    }
}

基於統計的中文分詞演算法

基本思想

選擇概率最大的分詞路徑作為最優結果
利用動態規劃演算法來實現,即最優路徑中的第 $i$ 個詞 $w_i$ 的累計概率等於它的左相鄰詞 $w_{i-1}$ 的累計概率乘以 $w_i$ 自身的概率:$P'(w_i) = P'(w_{i-1}) \times P(w_i)$

具體演算法

(1)對一個待分詞的字串 $S$,按照從左到右的順序取出全部候選詞 $w_1, w_2, \ldots, w_i, \ldots, w_n$;
(2)計算每個候選詞的概率值 $P(w_i)$,記錄每個候選詞的全部左鄰詞;
(3)計算每個候選詞的累計概率,累計概率最大的候選詞為最佳左鄰詞;
如果當前詞 $w_n$ 是字串的尾詞,且累計概率 $P'(w_n)$ 最大,則 $w_n$ 是 $S$ 的終點詞;
(4)從 $w_n$ 開始,按照從右到左順序,依次將每個詞的最佳左鄰詞輸出,即 $S$ 的分詞結果。

字典樹

又稱單詞查詢樹,Trie樹,是一種樹形結構,是一種雜湊樹的變種。典型應用是用於統計,排序和儲存大量的字串(但不僅限於字串),所以經常被搜尋引擎系統用於文字詞頻統計。它的優點是:利用字串的公共字首來減少查詢時間,最大限度地減少無謂的字串比較,查詢效率比雜湊樹高。

字典樹的程式碼實現

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.util.HashMap;
import java.util.Map;

/**
 *
 * @author quincy1994
 */
public class TireNode {
    private String character;           // 單個漢字
    private int frequency = -1;       //     詞頻, -1來區別某條路徑上的字串是否是一個片語
    private double antilog = -1;    //      對數化的詞頻
    private Map<String, TireNode> children;  //下一個節點

    public String getCharacter(){
        return character;
    }

    public void setCharacter(String character){
        this.character = character;
    }

    public int getFrequency(){
        return frequency;
    }

    public void setFrequency(int frequency){
        this.frequency = frequency;
    }

    public double getAntilog(){
        return antilog;
    }

    public void setAntilog(double antilog){
        this.antilog = antilog;
    }

    public void addChild(TireNode node){
        if (children == null){
            children = new HashMap<String, TireNode>();
        }
        if (!children.containsKey(node.getCharacter())){
            children.put(node.getCharacter(), node);
        }
    }

    public TireNode getChild(String ch){
        if (children == null || ! children.containsKey(ch)){
            return null;
        }
        return children.get(ch);
    }

    public void removeChildren(String ch){
        if (children == null || !children.containsKey(ch)){
            return;
        }
        children.remove(ch);
    }
}

演算法實現

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 *
 * @author quincy1994
 */
/**
 * Statistical word segmenter: candidate words are looked up in a trie built
 * from a word-frequency file, and the sequence of candidates with the lowest
 * accumulated cost from the start marker to the end marker is returned.
 */
public class ChnSeq {

    // Root of the dictionary trie; null until init() has been called.
    private TireNode tire = null;

    /**
     * Reads all lines of {@code wordFre.txt} from the working directory.
     *
     * @return the lines of the file, in order
     * @throws FileNotFoundException if the file does not exist
     * @throws IOException if reading fails
     */
    public List<String> loadFile() throws FileNotFoundException, IOException {
        List<String> lines = new ArrayList<String>();
        String filename = "wordFre.txt";
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                lines.add(tmp);
            }
        }
        return lines;
    }

    /**
     * Builds the trie from the word-frequency file. Each non-blank line is
     * split on {@code ","} into word, integer frequency and a percentage; the
     * cost stored for the word is {@code log(1 + 0.01 / percentage)}.
     *
     * @throws IOException if the word-frequency file cannot be read
     */
    public void init() throws IOException {
        List<String> lines = loadFile();
        tire = new TireNode();

        for (String line : lines) {
            // A blank line (commonly the last one in the file) carries no entry.
            if (line.trim().isEmpty()) {
                continue;
            }
            String[] tokens = line.split(",");
            String word = tokens[0];
            int freq = Integer.parseInt(tokens[1].trim());
            double antilog = Math.log(1 + 0.01 / Double.parseDouble(tokens[2].replace("%", "")));
            // Walk or extend the trie one character at a time.
            TireNode root = tire;
            for (int i = 0; i < word.length(); i++) {
                String c = "" + word.charAt(i);
                TireNode node = root.getChild(c);
                if (node == null) {
                    node = new TireNode();
                    node.setCharacter(c);
                    root.addChild(node);
                }
                root = node;
            }
            root.setFrequency(freq);    // frequency of the word ending at this node
            root.setAntilog(antilog);   // cost of the word ending at this node
        }

    }

    /** @return the root of the trie, or {@code null} before {@link #init()} */
    public TireNode getTire() {
        return tire;
    }

    /**
     * Follows {@code word} character by character from the trie root.
     *
     * @param word the string to look up
     * @return the node reached after the last character, or {@code null} when
     *         the path does not exist
     */
    public TireNode getNodeByWord(String word) {
        TireNode node = tire;
        for (int i = 0; i < word.length(); i++) {
            String ch = word.charAt(i) + "";
            if (node == null) {
                break;
            } else {
                node = node.getChild(ch);
            }
        }
        return node;
    }

    /** A candidate word together with the context used to link candidates. */
    private static class Segment {

        public String word;     // the candidate word
        public String endChar;  // last character of the word
        public String lastChar; // character immediately before the word in the sentence
        public double cost;     // cost taken from the trie node of the word

        public final static String START_SIGN = "<< STARTING >>";
        public final static String END_SIGN = "<< ENDING >>";
    }

    /**
     * Collects every substring of {@code sentence} that is a dictionary word
     * with a positive frequency, framed by a start marker and an end marker.
     * Candidates are ordered by start position, then by length.
     *
     * @param sentence the text to segment; may be empty
     * @return start marker, candidates, end marker
     */
    public List<Segment> preSegment(String sentence) {
        List<Segment> segs = new ArrayList<Segment>();

        // Start-of-sentence marker.
        Segment terminal = new Segment();
        terminal.word = Segment.START_SIGN;
        terminal.endChar = Segment.START_SIGN;
        terminal.lastChar = null;
        segs.add(terminal);

        for (int i = 0; i < sentence.length(); i++) {
            for (int j = i + 1; j <= sentence.length(); j++) {
                String word = sentence.substring(i, j);
                TireNode tnode = this.getNodeByWord(word);
                if (tnode == null) {
                    // No dictionary word has this prefix; longer ones cannot match either.
                    break;
                }
                if (tnode.getFrequency() <= 0) {
                    // A prefix of some word, but not a word itself.
                    continue;
                }

                Segment seg = new Segment();
                seg.word = word;
                seg.endChar = word.substring(word.length() - 1, word.length());
                if (i == 0) {
                    seg.lastChar = Segment.START_SIGN;
                } else {
                    seg.lastChar = sentence.substring(i - 1, i);
                }
                seg.cost = tnode.getAntilog();
                System.out.println(word + " " + seg.cost +" " + tnode.getFrequency());
                segs.add(seg);
            }
        }

        // End-of-sentence marker. For an empty sentence there is no last
        // character, so the end marker follows the start marker directly.
        terminal = new Segment();
        terminal.word = Segment.END_SIGN;
        terminal.endChar = Segment.END_SIGN;
        if (sentence.isEmpty()) {
            terminal.lastChar = Segment.START_SIGN;
        } else {
            terminal.lastChar = sentence.substring(sentence.length() - 1, sentence.length());
        }
        segs.add(terminal);

        return segs;
    }

    /**
     * Picks the lowest-cost chain of candidates from the first element of
     * {@code segs} to the last one.
     *
     * <p>NOTE(review): two candidates are linked when the character before the
     * later one equals the last character of the earlier one and they are
     * fewer than 4 list positions apart. This compares characters, not
     * sentence positions, so repeated characters can link non-adjacent words
     * and dense candidate lists can miss valid neighbours. Left unchanged
     * here because fixing it changes the segmentation results.
     *
     * @param segs candidates as produced by {@link #preSegment(String)}
     * @return the chosen words, each followed by {@code "/ "}, or {@code null}
     *         when {@code segs} is empty or no chain reaches the last element
     */
    public String dynamicSegment(List<Segment> segs) {

        final double INFINITE = 9999999;

        if (segs == null || segs.size() == 0) {
            System.out.println("找不到候選詞");
            return null;
        }

        int n = segs.size();    // number of candidates including both markers

        // Default: no link between any two candidates. Only the diagonal entry
        // of a candidate whose word equals its own last character is filled.
        double[][] costs = new double[n][n];
        for (int i = 0; i < n - 1; i++) {
            for (int j = 0; j < n; j++) {
                String endChar = segs.get(i).endChar;
                if (j == i && endChar.equals(segs.get(j).word)) {
                    costs[i][j] = segs.get(j).cost;
                    continue;
                }
                costs[i][j] = INFINITE;
            }
        }

        // Link i -> j when j's preceding character equals i's last character
        // and j is fewer than 4 list positions after i.
        for (int i = 0; i < n - 1; i++) {
            String endChar = segs.get(i).endChar;
            for (int j = i + 1; j < n; j++) {
                String lastChar = segs.get(j).lastChar;
                if (lastChar != null && lastChar.equals(endChar) && (j - i < 4)) {
                    costs[i][j] = segs.get(j).cost;
                }
            }
        }

        int sp = 0;        // index of the start marker
        int fp = n - 1;    // index of the end marker

        double[] dist = new double[n];         // accumulated cost from sp to each candidate
        List<List<Integer>> sPaths = new ArrayList<List<Integer>>();
        List<Integer> list = new ArrayList<Integer>();
        for (int i = 0; i < n; i++) {
            dist[i] = costs[sp][i];    // initial cost: direct link from the start marker
            if (sp != i) {
                list.add(i);           // candidates still to be processed, in list order
            }
            if (dist[i] < INFINITE) {
                // Directly reachable from the start marker: open an empty path.
                sPaths.add(new ArrayList<Integer>());
            } else {
                sPaths.add(null);
            }
        }
        while (!list.isEmpty()) {

            // Take the next candidate in list order. minIdx is an Integer, so
            // this is remove(Object) and drops that first element.
            Integer minIdx = list.get(0);
            list.remove(minIdx);

            // Not reachable from the start marker: nothing to extend.
            if (dist[minIdx] >= INFINITE) {
                continue;
            }

            // Relax every later candidate through minIdx.
            for (int i = minIdx + 1; i < n; i++) {
                if (dist[i] > dist[minIdx] + costs[minIdx][i]) {
                    dist[i] = dist[minIdx] + costs[minIdx][i];
                    List<Integer> tmp = new ArrayList<Integer>(sPaths.get(minIdx));
                    tmp.add(minIdx);
                    sPaths.set(i, tmp);  // best predecessors of i so far
                }
            }
        }

        List<Integer> best = sPaths.get(fp);
        if (best == null) {
            // No chain of candidates reaches the end marker, e.g. because the
            // sentence contains a character that is not in the dictionary.
            return null;
        }
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < best.size(); i++) {
            result.append(segs.get(best.get(i)).word).append("/ ");
        }
        return result.toString();
    }

    /**
     * Segments {@code sentences} by collecting candidates and choosing the
     * lowest-cost chain.
     *
     * @param sentences the text to segment
     * @return the segmented text, or {@code null} when no chain was found
     */
    public String segment(String sentences) {
        return dynamicSegment(preSegment(sentences));
    }

    public static void main(String[] args) throws ClassNotFoundException, IOException {
        ChnSeq cs = new ChnSeq();
        cs.init();
        String sentence = "在這一年中,改革開放和現代化建設繼續向前邁進。經濟保持了“高增長、低通脹”的良好發展態勢。農業生產再次獲得好的收成,企業改革繼續深化,人民生活進一步改善。對外經濟技術合作與交流不斷擴大。";
        String segs = cs.segment(sentence);
        System.out.println(segs);
    }
}