Lucene筆記20-Lucene的分詞-實現自定義同義詞分詞器-實現分詞器(良好設計方案)
阿新 • • 發佈:2018-11-04
一、目前存在的問題
在getSameWords()方法中,我們使用map臨時存放了兩個鍵值對用來測試,實際開發中,往往需要很多的這種鍵值對來處理,比如從某個同義詞詞典裡面獲取值之類的,所以說,我們需要一個類,根據key提供近義詞。
為了能更好的適應應用場景,我們先定義一個介面,其中定義一個getSameWords()方法,再定義一個實現類,實現getSameWords()方法,當我們需要更換字典的時候,更換實現類,重新實現getSameWords()方法提供不同的同義詞即可。
二、程式碼實現
package com.wsy;

/**
 * Supplies synonyms for a given term.
 *
 * Implementations decide where the synonyms come from (an in-memory map,
 * a dictionary file, a database, ...); swapping the implementation changes
 * the synonym source without touching the token filter that consumes it.
 */
public interface SameWordContext {

    /**
     * Looks up the synonyms of {@code key}.
     *
     * @param key the original term
     * @return the synonyms of {@code key}, or {@code null} when none exist
     */
    String[] getSameWords(String key);
}
package com.wsy;

import java.util.HashMap;
import java.util.Map;

/**
 * A simple in-memory synonym dictionary backed by a {@link Map}.
 *
 * The mapping is hard-coded in the constructor for demonstration purposes;
 * a real implementation would load it from a synonym dictionary file.
 */
public class SimpleSameWordContext implements SameWordContext {

    // key: original term, value: its synonyms.
    // Was a raw, package-visible HashMap; now parameterized, private and final.
    private final Map<String, String[]> map = new HashMap<String, String[]>();

    public SimpleSameWordContext() {
        map.put("中國", new String[]{"天朝", "大陸"});
        map.put("我", new String[]{"俺", "咱"});
        // Alternatively, read a synonym dictionary and load its entries into the map.
    }

    /**
     * @return the synonyms of {@code key}, or {@code null} when the key is unknown
     */
    @Override
    public String[] getSameWords(String key) {
        return map.get(key);
    }
}
package com.wsy;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import java.io.IOException;
import java.util.Stack;

/**
 * A {@link TokenFilter} that injects synonyms into the token stream.
 *
 * After each token read from the wrapped stream, its synonyms (looked up via
 * the pluggable {@link SameWordContext}) are emitted as extra tokens at the
 * same position (position increment 0), so a query for a synonym matches the
 * original text.
 */
public class MySameTokenFilter extends TokenFilter {

    private CharTermAttribute charTermAttribute;
    private PositionIncrementAttribute positionIncrementAttribute;
    // Snapshot of the stream state captured at the original token, so each
    // synonym is emitted with that token's offsets and type.
    private State state;
    // Synonyms still to be emitted for the most recently read token.
    // Was instantiated as a raw Stack; now properly parameterized.
    private Stack<String> stack;
    // Pluggable synonym dictionary.
    private SameWordContext sameWordContext;

    protected MySameTokenFilter(TokenStream input, SameWordContext sameWordContext) {
        super(input);
        charTermAttribute = this.addAttribute(CharTermAttribute.class);
        positionIncrementAttribute = this.addAttribute(PositionIncrementAttribute.class);
        stack = new Stack<String>();
        this.sameWordContext = sameWordContext;
    }

    /**
     * Works a bit like {@code Iterator.hasNext()}:
     * returns {@code true} while there is another token (real or synonym)
     * and leaves the attributes describing it; returns {@code false} once
     * the underlying stream is exhausted.
     */
    @Override
    public boolean incrementToken() throws IOException {
        // Emit any pending synonyms before advancing the underlying stream.
        if (stack.size() > 0) {
            // Pop the next synonym.
            String string = stack.pop();
            // Restore the captured state so offsets/type match the original token.
            restoreState(state);
            // Clear the current term text and replace it with the synonym.
            charTermAttribute.setEmpty();
            charTermAttribute.append(string);
            // Position increment 0 places the synonym at the same position
            // as the previous (original) token.
            positionIncrementAttribute.setPositionIncrement(0);
            return true;
        }
        if (input.incrementToken() == false) {
            return false;
        }
        if (getSameWords(charTermAttribute.toString())) {
            // The current token has synonyms: capture its state so each
            // synonym can be emitted with identical offsets/type.
            state = captureState();
        }
        return true;
    }

    // Pushes the synonyms of key (if any) onto the stack;
    // returns whether any synonyms were found.
    private boolean getSameWords(String key) {
        String[] sameWords = sameWordContext.getSameWords(key);
        if (sameWords != null) {
            for (String sameWord : sameWords) {
                stack.push(sameWord);
            }
            return true;
        }
        return false;
    }
}
package com.wsy;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
public class MySameAnalyzer extends Analyzer {

    // Synonym provider, injected so the dictionary can be swapped
    // without changing the analyzer itself.
    private SameWordContext sameWordContext;

    public MySameAnalyzer(SameWordContext sameWordContext) {
        this.sameWordContext = sameWordContext;
    }

    /**
     * Builds the token stream: an mmseg4j tokenizer wrapped by the
     * synonym-injecting filter.
     */
    @Override
    public TokenStream tokenStream(String string, Reader reader) {
        // Point mmseg4j at its word-segmentation dictionary directory.
        Dictionary dictionary = Dictionary.getInstance("E:\\Lucene\\mmseg4j-1.8.5\\data");
        return new MySameTokenFilter(new MMSegTokenizer(new MaxWordSeg(dictionary), reader), sameWordContext);
    }

    /**
     * Prints every token produced by {@code analyzer} for {@code string},
     * together with its position increment, offsets and type — a debugging aid.
     */
    public static void displayAllToken(String string, Analyzer analyzer) {
        TokenStream tokenStream = null;
        try {
            tokenStream = analyzer.tokenStream("content", new StringReader(string));
            // Attach the attributes we want to inspect on the stream.
            // Position increment: distance from the previous token (0 = same position).
            PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
            // Character offsets of each token in the input.
            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
            // The term text of each token.
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            // The token type reported by the tokenizer.
            TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
            while (tokenStream.incrementToken()) {
                System.out.println(positionIncrementAttribute.getPositionIncrement() + ":" + charTermAttribute + "[" + offsetAttribute.startOffset() + "-" + offsetAttribute.endOffset() + "]-->" + typeAttribute.type());
            }
            System.out.println("----------------------------");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Fix: the stream was never closed, leaking the tokenizer's resources.
            if (tokenStream != null) {
                try {
                    tokenStream.close();
                } catch (IOException ignored) {
                    // best-effort cleanup only
                }
            }
        }
    }

    /**
     * Demo: index one sentence, then show that a query for the synonym
     * "天朝" finds the original text "我來自中國", and dump the token stream.
     */
    public static void main(String[] args) throws IOException {
        String string = "我來自中國";
        MySameAnalyzer analyzer = new MySameAnalyzer(new SimpleSameWordContext());
        Directory directory = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, analyzer));
        Document document = new Document();
        document.add(new Field("content", string, Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(document);
        indexWriter.close();
        // Fix: keep a handle on the reader so it can be closed explicitly —
        // in Lucene 3.x, IndexSearcher.close() does not close a reader that
        // was passed in. Previously searcher, reader and directory all leaked.
        IndexReader indexReader = IndexReader.open(directory);
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        try {
            TopDocs topDocs = indexSearcher.search(new TermQuery(new Term("content", "天朝")), 10);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            if (scoreDocs.length > 0) {
                document = indexSearcher.doc(scoreDocs[0].doc);
                System.out.println(document.get("content"));
            }
        } finally {
            indexSearcher.close();
            indexReader.close();
            directory.close();
        }
        MySameAnalyzer.displayAllToken(string, analyzer);
    }
}