lucene給文字索引和搜尋功能的應用
阿新 • • 發佈:2019-02-14
最近一段時間由於公司需要 ,模糊搜尋出相似的關鍵詞,所以直接考慮使用了lucene。
lucene允許你往程式中新增搜尋功能,lucene能夠把你從文字中解析出來的資料進行索引和搜尋 ,lucene不關心資料來源 甚至不關心語種,不過你需要把它轉換成文字格式。也就是說你可以搜尋 html網頁,文字文件,word文件 ,pdf,或者其他一些 總之 只要能夠提取出文字資訊的即可。同樣你也可以利用lucene來索引儲存在資料庫中的資料,以給你的使用者提供一些 比如 全文搜尋功能等 ,反正lucene的功能很是強大。裡面還有很多開源的對不同語言進行分析的外掛等。
下面我介紹一個例子 ,這裡我進行對 一個txt文件的 每一行進行了 索引的新增 ,也就是說 把每一行 當作一個document物件來處理,實際上在lucene中 每一個document 相當於資料庫表中的一筆記錄(一行), 而每個field相當於記錄中的一個欄位(列) ,它能夠對文字進行自動處理去掉裡面的一些語氣詞,它能把你規定的域當作關鍵詞來進行索引 以備查詢時使用,lucene比較容易使用 ,但是不如資料庫靈活,速度很快。下面 我用一個例子來說明(這裡我用的lucene4.7.2,當時較新的版本 ,你需要注意把需要的一些jar包引入的到你的工程中,使用maven可直接引入依賴http://mvnrepository.com/artifact/org.apache.lucene需要的全部引入)我這裡寫了一個例項 你可以進行參考學習使用方法。
package lucene.home.clq;

/**
 * @author chenlongquan
 * Copyright Manning Publications Co..com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific lan
 */

// Index construction.
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Builds a Lucene index from the *.txt files in a directory.
 *
 * Each distinct, non-trivial line of every file becomes one Lucene
 * {@code Document} holding a single stored, tokenized "contents" field,
 * so individual lines can be searched independently.
 */
public class Indexer {

    private IndexWriter writer;

    public static void main(String[] args) throws Exception {
        String indexDir = "f:\\index";  // where the index is written
        String dataDir = "f:\\baidu";   // directory holding the .txt files to index

        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } finally {
            indexer.close();
        }
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    /**
     * Opens (or creates) an index in the given directory.
     *
     * @param indexDir filesystem path of the index directory
     * @throws IOException if the directory cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));
        writer = new IndexWriter(dir, indexWriterConfig());
    }

    /** Commits and releases the index writer; call once indexing is finished. */
    public void close() throws IOException {
        writer.close();
    }

    private IndexWriterConfig indexWriterConfig() {
        // SmartChineseAnalyzer performs Chinese word segmentation.
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_47);
        return new IndexWriterConfig(Version.LUCENE_47, analyzer);
    }

    /**
     * Indexes every readable, non-hidden regular file in {@code dataDir}
     * accepted by {@code filter} (not recursive).
     *
     * @param dataDir directory to scan
     * @param filter  optional file filter; {@code null} accepts every file
     * @return number of documents in the index after the run
     * @throws IOException if {@code dataDir} is not a listable directory
     */
    public int index(String dataDir, FileFilter filter) throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null for a missing/unreadable directory;
            // the original code would have thrown a NullPointerException here.
            throw new IOException("Cannot list files in directory: " + dataDir);
        }
        for (File f : files) {
            if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead()
                    && (filter == null || filter.accept(f))) {
                indexFile(f);
            }
        }
        return writer.numDocs();
    }

    /** Accepts only *.txt files (case-insensitive extension check). */
    private static class TextFilesFilter implements FileFilter {
        public boolean accept(File path) {
            return path.getName().toLowerCase().endsWith(".txt");
        }
    }

    /**
     * Reads the file's distinct lines and adds each one as its own Document,
     * stored in the "contents" field.
     *
     * @param f the text file to index
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        List<String> lines = readFileNoDup(f);
        for (String line : lines) {
            Document doc = new Document();
            doc.add(new Field("contents", line, TextField.TYPE_STORED));
            writer.addDocument(doc);
        }
    }

    /**
     * Reads a UTF-8 text file and returns its lines in file order,
     * skipping lines shorter than two characters. Duplicates are kept.
     */
    private List<String> readFile(File filePathAndName) throws IOException {
        List<String> lines = new ArrayList<String>();
        // A single BufferedReader suffices for readLine(); the original
        // LineNumberReader-over-BufferedReader chain was redundant.
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(filePathAndName), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() < 2) {
                    continue; // skip empty and single-character lines
                }
                lines.add(line);
            }
        } finally {
            reader.close(); // closes the wrapped streams too; previously leaked on error
        }
        return lines;
    }

    /**
     * Reads a UTF-8 text file and returns its distinct trimmed lines,
     * skipping lines shorter than two characters. Order is unspecified
     * (HashSet-backed, as in the original).
     */
    public static List<String> readFileNoDup(File filePathAndName) throws IOException {
        Set<String> set = new HashSet<String>();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(filePathAndName), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() < 2) {
                    continue;
                }
                set.add(line.trim());
            }
        } finally {
            reader.close(); // previously leaked when readLine() threw
        }
        return new ArrayList<String>(set);
    }
}
//對剛才已經建好的索引進行搜尋
package lucene.home.clq;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific lan
 */

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

// From chapter 1
/**
 * Searches the index built by {@code Indexer}: parses a query string with
 * the same SmartChineseAnalyzer used at index time and returns/prints the
 * stored "contents" field of the matching documents.
 */
public class Searcher {

    public static void main(String[] args)
            throws IllegalArgumentException, IOException, ParseException {
        final String indexDir = "e:\\soso\\soso";
        String q = " "; // put the keyword you indexed here to fuzzy-search
        // BUG FIX: the original read `docs = query(indexDir, q)` — the variable
        // was never declared and the statement lacked a semicolon, so the
        // class did not compile.
        List<String> docs = query(indexDir, q);
        System.out.println(docs);
    }

    /**
     * Runs the query against the index and prints the "contents" field of up
     * to 500 matching documents, with timing information on stderr.
     *
     * @param indexDir filesystem path of the index
     * @param q        raw query string
     */
    public static void search(String indexDir, String q) throws IOException, ParseException {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));
        try {
            IndexSearcher is = new IndexSearcher(reader);
            QueryParser parser = new QueryParser(Version.LUCENE_47, "contents",
                    new SmartChineseAnalyzer(Version.LUCENE_47));
            Query query = parser.parse(q);
            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, 500);
            long end = System.currentTimeMillis();
            System.err.println("Found " + hits.totalHits
                    + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + q + "':");
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                System.out.println(doc.get("contents"));
            }
        } finally {
            // Previously the reader was closed only on the happy path and
            // leaked whenever parse() or search() threw.
            reader.close();
        }
    }

    /**
     * Returns the "contents" field of up to 100 documents matching the query.
     *
     * @param indexDir filesystem path of the index
     * @param searcher raw query string; may be null or blank
     * @return matching contents, or {@code null} when the query is null/blank
     */
    private static List<String> query(String indexDir, String searcher)
            throws IOException, ParseException {
        // BUG FIX: the original guard tested `searcher.length() == -1`, which
        // is never true (String length is non-negative); blank input was only
        // caught by the post-trim check below.
        if (searcher == null) {
            return null;
        }
        searcher = searcher.trim();
        if (searcher.length() == 0) {
            return null;
        }
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));
        try {
            IndexSearcher is = new IndexSearcher(reader);
            QueryParser parser = new QueryParser(Version.LUCENE_47, "contents",
                    new SmartChineseAnalyzer(Version.LUCENE_47));
            Query query = parser.parse(searcher);
            TopFieldDocs hits = is.search(query, 100,
                    new Sort(new SortField("contents", SortField.Type.SCORE, false)));
            // Note: the original also ran a second, 200-hit search whose
            // result (`hits1`) was never used; that dead work is removed.
            List<String> list = new ArrayList<String>(hits.scoreDocs.length);
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                list.add(doc.get("contents"));
            }
            return list;
        } finally {
            reader.close(); // previously leaked on any exception after open
        }
    }
}
//這裡我主要給文件中的文字進行添加了索引 ,你也可以在Field 中給路徑 等等一些屬性進行新增索引 具體你可以搜尋lucene api
進行使用 裡面的一些方法。我這裡說的比較粗,有問題歡迎討論。