1. 程式人生 > >Lucene基礎(三)-- 中文分詞及高亮顯示

Lucene基礎(三)-- 中文分詞及高亮顯示

Lucene分詞器及高亮

分詞器

在Lucene中,我們按照分詞方式對文件建立索引,不同的分詞器索引的效果不太一樣。之前的例子使用的都是標準分詞器,對英文的效果很好,但對中文的分詞效果不理想:它會把漢字逐字切分,沒有詞語的概念。

在需要分詞的地方,只需要把Analyzer例項化成第三方的中文分詞器(例如下文使用的IKAnalyzer)即可。

高亮

匯入lucene-highlighter-xxx.jar,再對查詢出來的結果實現高亮顯示:

 // HTML tags wrapped around every matched keyword
 // (requires lucene-highlighter-xxx.jar on the classpath).
 SimpleHTMLFormatter redSpan = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
 Highlighter highlighter = new Highlighter(redSpan, new QueryScorer(query));

 for (ScoreDoc hit : hits) {
     Document doc = isearcher.doc(hit.doc);
     String stored = doc.get("content");

     // Re-analyze the stored text so the highlighter can locate the query terms in it.
     TokenStream tokens = analyzer.tokenStream("content", new StringReader(stored));
     String highlighted = highlighter.getBestFragment(tokens, stored);
     System.out.println(highlighted);
 }

Lucene中文分詞器

例項:

package lucene_demo04;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Chinese word segmentation with IKAnalyzer, printing search hits with the
 * matched keywords highlighted.
 *
 * @author YipFun
 */
public class LuceneDemo04 {

    private static final Version version = Version.LUCENE_4_9;

    private Directory directory = null;
    private DirectoryReader ireader = null;
    private IKAnalyzer analyzer;

    // Test data
    private String[] content = {
        "你好,我是中共人",
        "中華人民共和國",
        "中國人民從此站起來了",
        "Lucene是一個不錯的全文檢索的工具",
        "全文檢索中文分詞"
    };

    /**
     * Creates the demo backed by an in-memory index directory.
     */
    public LuceneDemo04() {
        directory = new RAMDirectory();
    }

    /**
     * Returns the shared IKAnalyzer, creating it on first use.
     *
     * The instance is stored in the field so that indexing, query parsing and
     * highlighting all use the same analyzer instead of building a new one on
     * every call.
     */
    private IKAnalyzer getAnalyzer() {
        if (analyzer == null) {
            analyzer = new IKAnalyzer();
        }
        return analyzer;
    }

    /**
     * Builds the index: every entry of the test data becomes one document with
     * a single stored, analyzed "content" field.
     *
     * I/O failures are reported on stderr; the writer is always closed.
     */
    public void createIndex() {
        IndexWriterConfig iwConfig = new IndexWriterConfig(version, getAnalyzer());
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);

        // try-with-resources closes the writer on both the success and failure paths.
        try (IndexWriter writer = new IndexWriter(directory, iwConfig)) {
            for (String text : content) {
                Document doc = new Document();
                doc.add(new TextField("content", text, Field.Store.YES));
                writer.addDocument(doc);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns a searcher over the current state of the index.
     *
     * The reader is opened on first use and afterwards refreshed only when the
     * index has changed; a replaced reader is closed.
     *
     * @return a searcher, or {@code null} if the index could not be opened
     *         (the failure is reported on stderr)
     */
    public IndexSearcher getSearcher() {
        try {
            if (ireader == null) {
                ireader = DirectoryReader.open(directory);
            } else {
                DirectoryReader refreshed = DirectoryReader.openIfChanged(ireader);
                if (refreshed != null) {
                    ireader.close();
                    ireader = refreshed;
                }
            }
            return new IndexSearcher(ireader);
        } catch (IOException e) {
            // CorruptIndexException is an IOException, so it is handled here as well.
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Searches {@code field} for {@code keyword} and prints each hit with the
     * matched terms wrapped in a red {@code <span>}.
     *
     * @param field   name of the field to search and highlight
     * @param keyword query text, parsed with the classic QueryParser
     * @param num     maximum number of hits to return
     * @throws InvalidTokenOffsetsException if the highlighter meets a token
     *         whose offsets lie outside the stored text
     */
    public void searchByTerm(String field, String keyword, int num)
            throws InvalidTokenOffsetsException {
        search(field, keyword, null, num);
    }

    /**
     * Search using a filter: same as {@link #searchByTerm}, but only documents
     * that also match the fixed filter query "全文檢索" are returned.
     *
     * @param field   name of the field to search and highlight
     * @param keyword query text, parsed with the classic QueryParser
     * @param num     maximum number of hits to return
     * @throws InvalidTokenOffsetsException if the highlighter meets a token
     *         whose offsets lie outside the stored text
     */
    public void searchByTermFilter(String field, String keyword, int num)
            throws InvalidTokenOffsetsException {
        search(field, keyword, "全文檢索", num);
    }

    /**
     * Shared implementation of both public search methods.
     *
     * @param filterText query text used as a filter, or {@code null} for no filter
     */
    private void search(String field, String keyword, String filterText, int num)
            throws InvalidTokenOffsetsException {
        IndexSearcher isearcher = getSearcher();
        if (isearcher == null) {
            // getSearcher() already reported why the index could not be opened.
            return;
        }

        Analyzer queryAnalyzer = getAnalyzer();
        // Build the Query object with the QueryParser.
        QueryParser qp = new QueryParser(version, field, queryAnalyzer);
        // OR is already the parser's default operator; the call only makes that explicit.
        qp.setDefaultOperator(QueryParser.OR_OPERATOR);

        try {
            Query query = qp.parse(keyword);
            QueryWrapperFilter filter = null;
            if (filterText != null) {
                filter = new QueryWrapperFilter(qp.parse(filterText));
            }
            ScoreDoc[] hits = isearcher.search(query, filter, num).scoreDocs;
            printHighlighted(isearcher, queryAnalyzer, query, field, hits);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the stored text of {@code field} for every hit, with the query
     * terms highlighted (requires lucene-highlighter-xxx.jar).
     */
    private void printHighlighted(IndexSearcher isearcher, Analyzer textAnalyzer, Query query,
            String field, ScoreDoc[] hits) throws IOException, InvalidTokenOffsetsException {
        SimpleHTMLFormatter formatter =
                new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));

        for (ScoreDoc hit : hits) {
            Document doc = isearcher.doc(hit.doc);
            String text = doc.get(field);
            if (text == null) {
                // The field is not stored for this document, so there is nothing to highlight.
                continue;
            }
            TokenStream tokenStream = textAnalyzer.tokenStream(field, new StringReader(text));
            String fragment = highlighter.getBestFragment(tokenStream, text);
            // getBestFragment yields null when no query term is found in the text;
            // print the plain text instead of the literal "null".
            System.out.println(fragment != null ? fragment : text);
        }
    }

    public static void main(String[] args) throws InvalidTokenOffsetsException {
        System.out.println("start");
        LuceneDemo04 ld = new LuceneDemo04();
        ld.createIndex();
        long start = System.currentTimeMillis();
        ld.searchByTerm("content", "人民", 500);
        System.out.println("end search use " + (System.currentTimeMillis() - start) + "ms");
    }
}

執行結果:

start
載入擴充套件詞典:ext.dic
載入擴充套件停止詞典:stopword.dic
中華<span style='color:red'>人民</span>共和國
中國<span style='color:red'>人民</span>從此站起來了
end search use 129ms