Lucene基礎(三)-- 中文分詞及高亮顯示
阿新 • • 發佈:2019-02-07
Lucene分詞器及高亮
分詞器
在Lucene中,我們按照分詞方式對文件進行索引,不同的分詞器索引的效果不太一樣。之前的例子使用的都是標準分詞器,對於英文的效果很好,但是中文分詞效果就不怎麼樣:它會按照單個漢字直接切分,沒有詞語的概念。
在使用分詞器的地方,只需要把Analyzer例項化成第三方的分詞器(例如下文例項中的IKAnalyzer)即可。
高亮
匯入lucene-highlighter-xxx.jar後,即可對查詢出來的結果實現高亮顯示。
// HTML tags wrapped around every highlighted keyword; requires lucene-highlighter-xxx.jar on the classpath
Highlighter highlighter = new Highlighter(
        new SimpleHTMLFormatter("<span style='color:red'>", "</span>"),
        new QueryScorer(query));
for (ScoreDoc hit : hits) {
    Document doc = isearcher.doc(hit.doc);
    String storedText = doc.get("content");
    // Re-tokenize the stored content and print its best fragment with the query terms highlighted
    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(storedText));
    System.out.println(highlighter.getBestFragment(tokenStream, storedText));
}
Lucene中文分詞器
例項:
package lucene_demo04;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Chinese word segmentation with IKAnalyzer, plus keyword highlighting of the search results.
 *
 * <p>The demo indexes a handful of sample sentences into an in-memory {@link RAMDirectory},
 * searches them with a {@link QueryParser}-built query and prints every hit with the matched
 * terms wrapped in a red {@code <span>}.
 *
 * @author YipFun
 */
public class LuceneDemo04 {
    private static final Version LUCENE_VERSION = Version.LUCENE_4_9;

    /** HTML tags placed around each highlighted term; requires lucene-highlighter-xxx.jar. */
    private static final String HIGHLIGHT_PRE_TAG = "<span style='color:red'>";
    private static final String HIGHLIGHT_POST_TAG = "</span>";

    /** Query text used by {@link #searchByTermFilter} to restrict the result set. */
    private static final String FILTER_KEYWORDS = "全文檢索";

    private final Directory directory;
    private DirectoryReader ireader = null;
    private IKAnalyzer analyzer;

    // Sample data that gets indexed into the "content" field.
    private final String[] content = {
        "你好,我是中共人",
        "中華人民共和國",
        "中國人民從此站起來了",
        "Lucene是一個不錯的全文檢索的工具",
        "全文檢索中文分詞"
    };

    /**
     * Creates the demo backed by a fresh in-memory index directory.
     */
    public LuceneDemo04() {
        directory = new RAMDirectory();
    }

    /**
     * Returns the shared analyzer, creating it on first use.
     *
     * <p>The instance is cached so that indexing and searching use the same analyzer instead of
     * constructing a new one on every call.
     */
    private IKAnalyzer getAnalyzer() {
        if (analyzer == null) {
            analyzer = new IKAnalyzer();
        }
        return analyzer;
    }

    /**
     * Builds the index: one document per sample sentence, stored in the {@code content} field.
     *
     * <p>I/O failures are reported on stderr and otherwise ignored (best effort). Because the
     * writer is opened in {@code CREATE_OR_APPEND} mode, calling this method again appends the
     * sample documents a second time.
     */
    public void createIndex() {
        IndexWriter writer = null;
        try {
            IndexWriterConfig iwConfig = new IndexWriterConfig(LUCENE_VERSION, getAnalyzer());
            iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
            writer = new IndexWriter(directory, iwConfig);
            for (String text : content) {
                Document doc = new Document();
                doc.add(new TextField("content", text, Field.Store.YES));
                writer.addDocument(doc);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Returns a searcher over the current state of the index.
     *
     * <p>The underlying reader is opened lazily and refreshed when the index has changed since
     * the previous call.
     *
     * @return a searcher, or {@code null} if the index could not be opened (the error is printed
     *         to stderr)
     */
    public IndexSearcher getSearcher() {
        try {
            if (ireader == null) {
                ireader = DirectoryReader.open(directory);
            } else {
                DirectoryReader refreshed = DirectoryReader.openIfChanged(ireader);
                if (refreshed != null) {
                    // Swap first so the field never keeps pointing at a closed reader.
                    DirectoryReader stale = ireader;
                    ireader = refreshed;
                    stale.close();
                }
            }
            return new IndexSearcher(ireader);
        } catch (IOException e) {
            // CorruptIndexException is an IOException, so this covers both cases.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Searches {@code field} for {@code keyword} and prints each hit with the matches highlighted.
     *
     * @param field   name of the indexed (and stored) field to search and highlight
     * @param keyword query text in QueryParser syntax
     * @param num     maximum number of hits to print
     * @throws InvalidTokenOffsetsException if the highlighter meets a token whose offsets lie
     *                                      outside the stored text
     */
    public void searchByTerm(String field, String keyword, int num) throws InvalidTokenOffsetsException {
        searchAndPrint(field, keyword, null, num);
    }

    /**
     * Same as {@link #searchByTerm}, but only documents that also match the fixed filter query
     * are returned.
     *
     * @param field   name of the indexed (and stored) field to search and highlight
     * @param keyword query text in QueryParser syntax
     * @param num     maximum number of hits to print
     * @throws InvalidTokenOffsetsException if the highlighter meets a token whose offsets lie
     *                                      outside the stored text
     */
    public void searchByTermFilter(String field, String keyword, int num) throws InvalidTokenOffsetsException {
        searchAndPrint(field, keyword, FILTER_KEYWORDS, num);
    }

    /**
     * Shared implementation of both search entry points.
     *
     * @param filterKeywords query text for an additional filter, or {@code null} for no filter
     */
    private void searchAndPrint(String field, String keyword, String filterKeywords, int num)
            throws InvalidTokenOffsetsException {
        IndexSearcher isearcher = getSearcher();
        if (isearcher == null) {
            // getSearcher() has already printed the cause.
            System.err.println("Index could not be opened; search skipped.");
            return;
        }
        Analyzer queryAnalyzer = getAnalyzer();
        // Build the Query object with the classic QueryParser.
        QueryParser qp = new QueryParser(LUCENE_VERSION, field, queryAnalyzer);
        // Terms without an explicit operator are combined with OR. This is already the
        // parser's default and is set here only to make the behavior explicit.
        qp.setDefaultOperator(QueryParser.OR_OPERATOR);
        try {
            Query query = qp.parse(keyword);
            QueryWrapperFilter filter =
                    filterKeywords == null ? null : new QueryWrapperFilter(qp.parse(filterKeywords));
            ScoreDoc[] hits = isearcher.search(query, filter, num).scoreDocs;
            printHighlighted(isearcher, queryAnalyzer, query, hits, field);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the stored value of {@code field} for every hit, with the query terms highlighted.
     *
     * <p>Hits whose field value is not stored are skipped. When the highlighter finds nothing
     * to highlight, the plain stored text is printed instead of {@code null}.
     */
    private void printHighlighted(IndexSearcher isearcher, Analyzer queryAnalyzer, Query query,
            ScoreDoc[] hits, String field) throws IOException, InvalidTokenOffsetsException {
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(HIGHLIGHT_PRE_TAG, HIGHLIGHT_POST_TAG);
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        for (ScoreDoc hit : hits) {
            Document doc = isearcher.doc(hit.doc);
            String text = doc.get(field);
            if (text == null) {
                continue;
            }
            TokenStream tokenStream = queryAnalyzer.tokenStream(field, new StringReader(text));
            String fragment = highlighter.getBestFragment(tokenStream, text);
            System.out.println(fragment != null ? fragment : text);
        }
    }

    /**
     * Releases the index reader and the directory.
     *
     * <p>The instance must not be used for indexing or searching afterwards. Failures while
     * closing are printed to stderr.
     */
    public void close() {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            ireader = null;
        }
        try {
            directory.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws InvalidTokenOffsetsException {
        System.out.println("start");
        LuceneDemo04 ld = new LuceneDemo04();
        try {
            ld.createIndex();
            long start = System.currentTimeMillis();
            ld.searchByTerm("content", "人民", 500);
            System.out.println("end search use " + (System.currentTimeMillis() - start) + "ms");
        } finally {
            ld.close();
        }
    }
}
執行結果:
start
載入擴充套件詞典:ext.dic
載入擴充套件停止詞典:stopword.dic
中華<span style='color:red'>人民</span>共和國
中國<span style='color:red'>人民</span>從此站起來了
end search use 129ms