1. 程式人生 > >java 中文文字分詞

java 中文文字分詞

java 中文文字分詞

本文使用 classifier4J 以及 IKAnalyzer2012_u6 實現中文分詞。可以增加自定義詞庫:將詞庫儲存為程式工作目錄下的 “exdict.dic” 檔案,每行一個詞。

// MyTokenizer.java 檔案

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import
java.util.ArrayList; import java.util.Collection; import java.util.List; import net.sf.classifier4J.ITokenizer; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.wltea.analyzer.cfg.Configuration; import org.wltea.analyzer.cfg.DefaultConfig; import org.wltea.analyzer.dic.Dictionary; import
org.wltea.analyzer.lucene.IKTokenizer; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** * 中文分詞器類 * * @author CSD * */ @SuppressWarnings("deprecation") public class MyTokenizer implements ITokenizer { private static final Logger logger = LogManager.getLogger(MyTokenizer.class); private
List<String> list; private String[] strArray; private static Collection<String> exwordc = new ArrayList<>(); private static String exdict = "exdict.dic"; // 載入新增詞庫 static { try { File file = new File(exdict); FileInputStream fin = new FileInputStream(file); BufferedReader reader = new BufferedReader(new InputStreamReader(fin)); String line = ""; while ((line = reader.readLine()) != null) { exwordc.add(line.trim()); } reader.close(); logger.info("載入詞典::" + exdict); // 增加詞庫 Configuration cfg = DefaultConfig.getInstance(); Dictionary dict = Dictionary.initial(cfg); dict.addWords(exwordc); } catch (IOException e) { logger.error(e + "------------------載入詞典出錯,請確認詞典檔案!------------------"); } } /** * 分詞,返回分詞陣列 * * @param input * 文字字串 * @return String[] */ public String[] tokenize(String input) { list = new ArrayList<String>(); IKTokenizer tokenizer = new IKTokenizer(new StringReader(input), true); try { while (tokenizer.incrementToken()) { TermAttribute termAtt = (TermAttribute) tokenizer.getAttribute(TermAttribute.class); String str = termAtt.term(); list.add(str); } } catch (IOException e) { logger.error(e + "------------------分詞出錯------------------"); } strArray = new String[list.size()]; for (int i = 0; i < list.size(); i++) { strArray[i] = (String) list.get(i); } return strArray; } }
// Segmentation.java 檔案
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import net.sf.classifier4J.ITokenizer;

/**
 * 中文語料分詞
 * 
 * @author CSD
 *
 */
public class Segmentation {

    private static final Logger logger = LogManager.getLogger(Segmentation.class);

    /** Corpus file that is segmented when no path is given on the command line. */
    private static final String DEFAULT_PATH = "1.txt";

    /**
     * Reads a text file, segments it with {@link MyTokenizer} and prints one
     * word per line to standard output.
     *
     * @param args
     *            optional; {@code args[0]} is the path of the file to segment,
     *            otherwise {@code 1.txt} is used
     * @throws IOException
     *             if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {

        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;

        String input;
        // try-with-resources: the stream is released even if reading fails.
        // getString also closes it; the second close is harmless.
        try (InputStream fin = new FileInputStream(new File(path))) {
            input = getString(fin);
        }

        logger.info("開始分詞::" + path);
        ITokenizer tokenizer = new MyTokenizer();
        String[] words = tokenizer.tokenize(input);
        for (String word : words) {
            System.out.println(word);
        }
    }

    /**
     * Reads all text from the stream, decoded as UTF-8, into a single string.
     *
     * <p>Lines are joined with a single space and the result is trimmed.
     * The stream is closed when this method returns.
     *
     * @param is
     *            the input stream to read
     * @return the text of the stream as one string
     * @throws IOException
     *             if reading fails
     */
    public static String getString(InputStream is) throws IOException {
        // Explicit charset: the result no longer depends on the platform default encoding.
        return getString(is, java.nio.charset.StandardCharsets.UTF_8);
    }

    /**
     * Reads all text from the stream, decoded with the given charset, into a
     * single string.
     *
     * <p>Lines are joined with a single space and the result is trimmed.
     * The stream is closed when this method returns, whether or not reading
     * succeeded.
     *
     * @param is
     *            the input stream to read
     * @param charset
     *            the charset used to decode the bytes
     * @return the text of the stream as one string
     * @throws IOException
     *             if reading fails
     */
    public static String getString(InputStream is, java.nio.charset.Charset charset) throws IOException {

        StringBuilder text = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line);
                text.append(" ");
            }
        }

        return text.toString().trim();
    }
}

程式需依賴 IKAnalyzer2012_u6.jar,並需在 pom.xml 檔案中新增以下依賴:

<!-- https://mvnrepository.com/artifact/classifier4j/classifier4j -->
        <!-- classifier4J: the Java sources import net.sf.classifier4J.ITokenizer, the interface MyTokenizer implements. -->
        <dependency>
            <groupId>classifier4j</groupId>
            <artifactId>classifier4j</artifactId>
            <version>0.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers -->
        <!-- Lucene analyzers 3.2.0. MyTokenizer imports org.apache.lucene.analysis.tokenattributes.TermAttribute.
             NOTE(review): that class presumably arrives through a transitive Lucene core dependency; confirm. -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers</artifactId>
            <version>3.2.0</version>
        </dependency>

        <!-- SLF4J binding for log4j. The Java sources import org.apache.log4j.Logger and LogManager directly.
             NOTE(review): log4j itself is presumably pulled in transitively by this artifact; confirm. -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.5</version>
        </dependency>