The Reverse Maximum Matching Algorithm for Chinese Word Segmentation (2016)
Posted by 阿新 · 2019-01-15
Reverse maximum matching is the most basic algorithm in mechanical (dictionary-based) Chinese word segmentation, very much an entry-level technique. Even so, it performs well as a mechanical segmenter, especially on longer texts: it matches a fairly long window of characters in each attempt, and because a longer window is far more likely to line up with dictionary words than a short one, the results come out quite good. The code below is adapted from part of the IK Analyzer source package; in 2016 I reworked it in my spare time into a reverse maximum matching implementation. Consider it an entry-level segmenter.
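Before reading the IK-based sources, a minimal standalone sketch of the idea may help. The class name ReverseMaxMatchDemo, the toy dictionary, and the window size here are all hypothetical and exist only to illustrate the right-to-left longest-match loop; the real code further down works against IK's main dictionary instead.

import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

public class ReverseMaxMatchDemo {

    // Toy dictionary; a real segmenter loads a full lexicon such as IK's main dictionary.
    private static final Set<String> DICT = new HashSet<>(Arrays.asList(
            "研究", "研究生", "生命", "命", "的", "起源"));

    // Maximum word length to try in one match attempt.
    private static final int MAX_LEN = 3;

    /** Segment the text from right to left, always keeping the longest dictionary match. */
    public static List<String> segment(String text) {
        LinkedList<String> words = new LinkedList<>();
        int end = text.length();
        while (end > 0) {
            int len = Math.min(MAX_LEN, end);
            // Shrink the window until it matches the dictionary or only one character is left.
            while (len > 1 && !DICT.contains(text.substring(end - len, end))) {
                len--;
            }
            words.addFirst(text.substring(end - len, end)); // prepend to keep the original order
            end -= len;
        }
        return words;
    }

    public static void main(String[] args) {
        System.out.println(segment("研究生命的起源"));
    }
}

On this classic example, a forward scan would greedily produce 研究生 / 命 / 的 / 起源, while the reverse scan produces the intended 研究 / 生命 / 的 / 起源, which is one reason the reverse direction is usually preferred for Chinese.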
package org.wltea.analyzer.core;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;

/**
 * Context for Chinese word segmentation.
 *
 * @author TongXueQiang
 * @date 2016/01/22
 * @since 1.7
 */
class AnalyzeContext {
    private char[] segmentBuff;
    private int[] charTypes;
    private int buffOffset;
    private int cursor;
    private int available;
    private Set<String> buffLocker;
    private QuickSortSet orgLexemes;
    private Map<Integer, LexemePath> pathMap;
    private LinkedList<Lexeme> results;
    private Configuration cfg;
    private Integer moveIndex;

    public AnalyzeContext(Configuration cfg) {
        this.cfg = cfg;
        this.segmentBuff = new char[4096];
        this.charTypes = new int[4096];
        this.buffLocker = new HashSet<String>();
        this.orgLexemes = new QuickSortSet();
        this.pathMap = new HashMap<Integer, LexemePath>();
        this.results = new LinkedList<Lexeme>();
    }

    int getCursor() {
        return this.cursor;
    }

    char[] getSegmentBuff() {
        return this.segmentBuff;
    }

    char getCurrentChar() {
        return this.segmentBuff[this.cursor];
    }

    int getCurrentCharType() {
        return this.charTypes[this.cursor];
    }

    int getBufferOffset() {
        return this.buffOffset;
    }

    /**
     * Fill the buffer with characters from the reader.
     *
     * @param reader input reader
     * @return number of characters now available in the buffer
     * @throws IOException
     */
    int fillBuffer(Reader reader) throws IOException {
        int readCount = 0;
        if (this.buffOffset == 0) {
            // First fill: read a full buffer.
            readCount = reader.read(this.segmentBuff);
        } else {
            int offset = this.available - this.cursor;
            if (offset > 0) {
                // Move the unprocessed tail of the buffer to the front.
                System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
                readCount = offset;
            }
            // Top up the free space that remains after the copied tail.
            readCount += reader.read(this.segmentBuff, offset, this.segmentBuff.length - offset);
        }
        this.available = readCount;
        this.cursor = 0;
        return readCount;
    }

    void initCursor() {
        // Reverse matching starts from the last character in the buffer.
        this.cursor = this.available - 1;
        // Regularize the character (full-width to half-width, case folding, etc.).
        this.segmentBuff[this.cursor] = CharacterUtil
                .regularize(this.segmentBuff[this.cursor]);
        // Assign the character a type, e.g. Arabic digit, English letter, CJK.
        this.charTypes[this.cursor] = CharacterUtil
                .identifyCharType(this.segmentBuff[this.cursor]);
    }

    boolean moveCursor() {
        if ((this.cursor - moveIndex) > 0) {
            this.cursor -= (moveIndex + 1);
            // After moving the cursor, regularize the current character again.
            this.segmentBuff[this.cursor] = CharacterUtil
                    .regularize(this.segmentBuff[this.cursor]);
            // And identify its type.
            this.charTypes[this.cursor] = CharacterUtil
                    .identifyCharType(this.segmentBuff[this.cursor]);
            return true;
        }
        return false;
    }

    void lockBuffer(String segmenterName) {
        this.buffLocker.add(segmenterName);
    }

    void unlockBuffer(String segmenterName) {
        this.buffLocker.remove(segmenterName);
    }

    boolean isBufferLocked() {
        return (this.buffLocker.size() > 0);
    }

    boolean isBufferConsumed() {
        return (this.cursor == this.available - 1);
    }

    boolean needRefillBuffer() {
        return ((this.available == 4096) && (this.cursor < this.available - 1)
                && (this.cursor > this.available - 100) && (!(isBufferLocked())));
    }

    void markBufferOffset() {
        this.buffOffset += this.cursor;
    }

    void addLexeme(Lexeme lexeme) {
        this.orgLexemes.addLexeme(lexeme);
    }

    void addLexemePath(LexemePath path) {
        if (path != null)
            this.pathMap.put(Integer.valueOf(path.getPathBegin()), path);
    }

    QuickSortSet getOrgLexemes() {
        return this.orgLexemes;
    }

    /**
     * Emit the result set.
     */
    void outputToResult() {
        int index = 0;
        while (index <= this.cursor) {
            LexemePath path = (LexemePath) this.pathMap.get(Integer.valueOf(index));
            if (path != null) {
                Lexeme l = path.pollFirst();
                if (l != null) {
                    this.results.add(l);
                    index = l.getBegin() + l.getLength();
                    this.cursor = index;
                }
            } else {
                // No path covers this position; emit the single CJK character.
                outputSingleCJK(index);
                ++index;
            }
        }
        this.pathMap.clear();
    }

    private void outputSingleCJK(int index) {
        Lexeme singleCharLexeme;
        if (4 == this.charTypes[index]) {
            singleCharLexeme = new Lexeme(this.buffOffset, index, 1, 64);
            this.results.add(singleCharLexeme);
        } else if (8 == this.charTypes[index]) {
            singleCharLexeme = new Lexeme(this.buffOffset, index, 1, 8);
            this.results.add(singleCharLexeme);
        }
    }

    /**
     * Take the next lexeme from the result queue and fill in its text.
     *
     * @return the next lexeme, or null if none is left
     */
    Lexeme getNextLexeme() {
        Lexeme result = (Lexeme) this.results.pollFirst();
        while (result != null) {
            // Merge numerals with measure words.
            compound(result);
            // Filter out stop words.
            if (Dictionary.getSingleton().isStopWord(this.segmentBuff,
                    result.getBegin(), result.getLength())) {
                result = (Lexeme) this.results.pollFirst();
            } else {
                // Fill in the lexeme text.
                result.setLexemeText(String.valueOf(this.segmentBuff,
                        result.getBegin(), result.getLength()));
                break;
            }
        }
        return result;
    }

    void reset() {
        this.buffLocker.clear();
        this.orgLexemes = new QuickSortSet();
        this.available = 0;
        this.buffOffset = 0;
        this.charTypes = new int[4096];
        this.cursor = 0;
        this.results.clear();
        this.segmentBuff = new char[4096];
        this.pathMap.clear();
    }

    /**
     * Merge a numeral with the following measure word.
     *
     * @param result the current lexeme
     */
    private void compound(Lexeme result) {
        if (!(this.cfg.useSmart())) {
            return;
        }
        if (this.results.isEmpty())
            return;
        Lexeme nextLexeme;
        boolean appendOk;
        if (2 == result.getLexemeType()) {
            nextLexeme = (Lexeme) this.results.peekFirst();
            appendOk = false;
            if (16 == nextLexeme.getLexemeType()) {
                appendOk = result.append(nextLexeme, 16);
            } else if (32 == nextLexeme.getLexemeType()) {
                appendOk = result.append(nextLexeme, 48);
            }
            if (appendOk) {
                this.results.pollFirst();
            }
        }
        if ((16 == result.getLexemeType()) && (!(this.results.isEmpty()))) {
            nextLexeme = (Lexeme) this.results.peekFirst();
            appendOk = false;
            if (32 == nextLexeme.getLexemeType()) {
                appendOk = result.append(nextLexeme, 48);
            }
            if (!(appendOk))
                return;
            this.results.pollFirst();
        }
    }

    public void setMoveIndex(Integer moveIndex) {
        this.moveIndex = moveIndex;
    }
}

The following is the CJK reverse maximum matching segmenter:

package org.wltea.analyzer.core;

import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;

/**
 * CJK segmenter, reverse maximum matching algorithm.
 *
 * @author TongXueQiang
 * @date 2016/01/20
 * @since 1.7
 */
class CJKSegmenter implements ISegmenter {
    static final String SEGMENTER_NAME = "CJK_SEGMENTER";
    // Maximum number of characters matched against the dictionary in one attempt.
    static Integer MATCH_LEN = 7;
    // Distance from the cursor back to the start of the matching window.
    static Integer moveIndex = MATCH_LEN - 1;

    CJKSegmenter() {
    }

    /*
     * Reverse maximum matching.
     *
     * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
     */
    public void analyze(AnalyzeContext context) {
        if (context.getCursor() < moveIndex) {
            // Near the start of the buffer the window must shrink to fit.
            moveIndex = context.getCursor();
            MATCH_LEN = context.getCursor() + 1;
        }
        Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(
                context.getSegmentBuff(), context.getCursor() - moveIndex, MATCH_LEN);
        if (singleCharHit.isMatch() || MATCH_LEN == 1) {
            // Longest dictionary match found (or only one character left): emit it.
            Lexeme newLexeme = new Lexeme(context.getBufferOffset(),
                    context.getCursor() - moveIndex, MATCH_LEN, 4);
            context.addLexeme(newLexeme);
            context.setMoveIndex(moveIndex);
            init();
        } else {
            // No match: shrink the window by one character and try again.
            --moveIndex;
            --MATCH_LEN;
            analyze(context);
        }
    }

    private void init() {
        moveIndex = 6;
        MATCH_LEN = 7;
    }

    @Override
    public void reset() {
    }
}
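Neither class runs on its own: Lexeme, QuickSortSet, LexemePath, CharacterUtil, Dictionary, and the Configuration all come from the rest of the IK Analyzer package. As a rough usage sketch, and assuming the stock IKSegmenter facade from the IK Analyzer distribution is still present with its (Reader, useSmart) constructor, driving the segmenter looks roughly like this:

import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class SegmentDemo {
    public static void main(String[] args) throws IOException {
        // useSmart = true turns on the numeral/measure-word merging done in compound().
        IKSegmenter segmenter = new IKSegmenter(
                new StringReader("中文分詞的逆向最大匹配演算法"), true);
        Lexeme lexeme;
        // next() pulls lexemes one by one, refilling the buffer and
        // calling the segmenters' analyze() internally as needed.
        while ((lexeme = segmenter.next()) != null) {
            System.out.println(lexeme.getLexemeText());
        }
    }
}

If the 2016 modifications renamed or changed that entry point, only the construction of the segmenter changes; the polling loop stays the same.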