1. 程式人生 > >lucene給文字索引和搜尋功能的應用

lucene給文字索引和搜尋功能的應用

最近一段時間由於公司需要模糊搜尋出相似的關鍵詞,所以直接考慮使用了 lucene。

lucene 允許你往程式中新增搜尋功能,它能夠把你從文字中解析出來的資料進行索引和搜尋。lucene 不關心資料來源,甚至不關心語種,不過你需要先把資料轉換成文字格式。也就是說,你可以搜尋 html 網頁、文字文件、word 文件、pdf 或者其他格式,總之只要能夠提取出文字資訊即可。同樣,你也可以利用 lucene 來索引儲存在資料庫中的資料,給你的使用者提供全文搜尋等功能。反正 lucene 的功能很是強大,裡面還有很多開源的、對不同語言進行分析的外掛等。

下面我介紹一個例子:這裡我對一個 txt 文件的每一行都添加了索引,也就是說,把每一行當作一個 document 物件來處理。實際上,在 lucene 中每一個 document 相當於資料庫中的一筆記錄(一行),而每個 field 相當於記錄中的一個欄位(一列)。它能夠對文字進行自動處理,去掉裡面的一些語氣詞(停用詞),也能把你規定的域當作關鍵詞來進行索引,以備查詢時使用。lucene 比較容易使用,速度很快,但是不如資料庫靈活。下面我用一個例子來說明(這裡我用的是 lucene 4.7.2,最高版本;你需要注意把需要的一些 jar 包引入到你的工程中,使用 maven 可直接引入依賴,見 http://mvnrepository.com/artifact/org.apache.lucene ,需要的全部引入)。我這裡寫了一個例項,你可以參考學習其使用方法。

package lucene.home.clq;

/**
 * @author chenlongquan
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *     
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

//建立索引
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * This code was originally build for the index
 * 
 */
/**
 * Builds a Lucene index from the {@code .txt} files of a directory.
 *
 * <p>Every distinct line of every accepted file becomes one document with a
 * single stored, analyzed field named {@code contents}. Text is analyzed with
 * {@link SmartChineseAnalyzer}.
 */
public class Indexer {

  /**
   * Indexes a data directory and prints how long it took.
   *
   * @param args optional: {@code args[0]} is the index directory and
   *             {@code args[1]} the data directory; when absent the
   *             original hard-coded paths are used
   * @throws Exception if the index cannot be opened or a file cannot be read
   */
  public static void main(String[] args) throws Exception {

    String indexDir = args.length > 0 ? args[0] : "f:\\index";  // where the index is written
    String dataDir = args.length > 1 ? args[1] : "f:\\baidu";   // where the *.txt files live

    long start = System.currentTimeMillis();
    Indexer indexer = new Indexer(indexDir);
    int numIndexed;
    try {
      numIndexed = indexer.index(dataDir, new TextFilesFilter());
    } finally {
      // Always release the writer, even when indexing fails half-way.
      indexer.close();
    }
    long end = System.currentTimeMillis();

    // Note: the number printed is the document count reported by the writer
    // (one document per indexed line), not the number of files.
    System.out.println("Indexing " + numIndexed + " files took "
      + (end - start) + " milliseconds");
  }

  /** Directory holding the index; closed together with the writer. */
  private final Directory dir;

  /** Writer every document is added to. */
  private final IndexWriter writer;

  /**
   * Opens (or creates) the index stored in {@code indexDir}.
   *
   * @param indexDir file-system path of the index directory
   * @throws IOException if the directory or the writer cannot be opened
   */
  public Indexer(String indexDir) throws IOException {
    Directory directory = FSDirectory.open(new File(indexDir));
    IndexWriter indexWriter;
    boolean opened = false;
    try {
      indexWriter = new IndexWriter(directory, indexWriterConfig());
      opened = true;
    } finally {
      if (!opened) {
        // Do not leak the directory when the writer could not be created.
        directory.close();
      }
    }
    this.dir = directory;
    this.writer = indexWriter;
  }

  /**
   * Closes the writer and then the underlying directory.
   *
   * @throws IOException if closing either resource fails
   */
  public void close() throws IOException {
    try {
      writer.close();
    } finally {
      dir.close();
    }
  }

  /** Creates the writer configuration: Lucene 4.7 with the smart Chinese analyzer. */
  private static IndexWriterConfig indexWriterConfig() {
    Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_47);
    return new IndexWriterConfig(Version.LUCENE_47, analyzer);
  }

  /**
   * Indexes every regular, visible, readable file directly inside
   * {@code dataDir} that is accepted by {@code filter}.
   *
   * @param dataDir directory to scan (not recursive)
   * @param filter  file filter, or {@code null} to accept every file
   * @return the number of documents the writer reports after indexing
   * @throws IOException if {@code dataDir} cannot be listed
   * @throws Exception   if reading a file or adding a document fails
   */
  public int index(String dataDir, FileFilter filter)
    throws Exception {

    // listFiles() returns null (not an empty array) when the path is not a
    // directory or cannot be read; fail with a clear message instead of an NPE.
    File[] files = new File(dataDir).listFiles();
    if (files == null) {
      throw new IOException("Cannot list files in data directory: " + dataDir);
    }

    for (File f : files) {
      if (!f.isDirectory() &&
          !f.isHidden() &&
          f.exists() &&
          f.canRead() &&
          (filter == null || filter.accept(f))) {
        indexFile(f);
      }
    }

    return writer.numDocs();
  }

  /** Accepts files whose name ends with {@code .txt}, ignoring case. */
  private static class TextFilesFilter implements FileFilter {
    @Override
    public boolean accept(File path) {
      return path.getName().toLowerCase()
             .endsWith(".txt");
    }
  }

  /**
   * Reads the distinct lines of one file and adds each of them to the index
   * as its own document.
   *
   * @param f file to index
   * @throws Exception if the file cannot be read or a document cannot be added
   */
  private void indexFile(File f) throws Exception {
    System.out.println("Indexing " + f.getCanonicalPath());
    for (String line : readFileNoDup(f)) {
      Document doc = new Document();
      doc.add(new Field("contents", line, TextField.TYPE_STORED));
      writer.addDocument(doc);
    }
  }

  /**
   * Reads a UTF-8 text file and returns its lines in file order, skipping
   * lines shorter than two characters. Lines are not trimmed and duplicates
   * are kept.
   *
   * @param filePathAndName file to read
   * @return the retained lines, possibly empty
   * @throws IOException if the file cannot be opened or read
   */
  private List<String> readFile(File filePathAndName) throws IOException {
    List<String> returnValue = new ArrayList<String>();
    FileInputStream fis = new FileInputStream(filePathAndName);
    try {
      BufferedReader reader =
          new BufferedReader(new InputStreamReader(fis, "UTF-8"));
      String line;
      while ((line = reader.readLine()) != null) {
        if (line.length() < 2) {
          continue;
        }
        returnValue.add(line);
      }
    } finally {
      // Closing the stream releases the file handle even if reading failed.
      fis.close();
    }
    return returnValue;
  }

  /**
   * Reads a UTF-8 text file and returns its distinct, trimmed lines.
   *
   * <p>Lines whose trimmed text is shorter than two characters are skipped,
   * so blank or whitespace-only lines never produce an entry. The order of
   * the returned list is unspecified because duplicates are removed through
   * a hash set.
   *
   * @param filePathAndName file to read
   * @return the distinct trimmed lines, possibly empty
   * @throws IOException if the file cannot be opened or read
   */
  public static List<String> readFileNoDup(File filePathAndName)
      throws IOException {
    Set<String> unique = new HashSet<String>();
    FileInputStream fis = new FileInputStream(filePathAndName);
    try {
      BufferedReader reader =
          new BufferedReader(new InputStreamReader(fis, "UTF-8"));
      String line;
      while ((line = reader.readLine()) != null) {
        // Trim before the length check so padded short lines are skipped too.
        String trimmed = line.trim();
        if (trimmed.length() < 2) {
          continue;
        }
        unique.add(trimmed);
      }
    } finally {
      // Closing the stream releases the file handle even if reading failed.
      fis.close();
    }
    return new ArrayList<String>(unique);
  }
}


//對剛才已經建好的索引進行搜尋

 
package lucene.home.clq;


 


/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/


import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;


// From chapter 1


/**
 * This code was originally written for  searcher
 * 
 */
public class Searcher {


  public static void main(String[] args) throws IllegalArgumentException,
        IOException, ParseException {
                     
    final String indexDir = "e:\\soso\\soso";
         String q = " ";//輸入你新增的所以 進行模糊搜尋 
    docs = query(indexDir, q)
     
    }
     
  public static void search(String indexDir, String q)
    throws IOException, ParseException {
 IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));
   // Directory dir = FSDirectory.open(new File(indexDir)); //3
    IndexSearcher is = new IndexSearcher(reader);   //3   


    QueryParser parser = new QueryParser(Version.LUCENE_47,"contents",new SmartChineseAnalyzer(Version.LUCENE_47));        
    Query query = parser.parse(q);              //4   
    long start = System.currentTimeMillis();
    TopDocs hits = is.search(query, 500); //5
   
    //ScoreDoc[] hits = is.search(query, null, 10).scoreDocs;
    long end = System.currentTimeMillis();


    System.err.println("Found " + hits.totalHits +   //6  
      " document(s) (in " + (end - start) +        // 6
      " milliseconds) that matched query '" +     // 6
      q + "':");                                   // 6


    for(ScoreDoc scoreDoc : hits.scoreDocs) {
      Document doc = is.doc(scoreDoc.doc);               //7      
      System.out.println(doc.get("contents"));
    }
    reader.close();
  }
   
    private static List<String> query(String indexDir, String searcher) throws IOException, ParseException{
        if (searcher == null || searcher.length() == -1) {
            return null;
          }
    
         searcher = searcher.trim();
          if (searcher.length() == 0) {
            return null;
          }
          
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));//open the index
    //IndexReader reader = DirectoryReader.open(SimpleFSDirectory.open(new File(indexDir)));//open the index
    IndexSearcher is = new IndexSearcher(reader);//find the content
    QueryParser parser = new QueryParser(Version.LUCENE_47, "contents", new SmartChineseAnalyzer(Version.LUCENE_47));//parser the content
    Query query = parser.parse(searcher);
    TopFieldDocs hits = is.search(query, 100, new Sort(new SortField("contents", SortField.Type.SCORE, false)));
    TopDocs hits1 = is.search(query, 200);//搜尋出前200條資料  按照評分進行排序
    List<String> list = new ArrayList<String>();
    for(ScoreDoc scoreDoc : hits.scoreDocs){
    Document doc = is.doc(scoreDoc.doc);
    list.add(doc.get("contents"));
    }
    reader.close();
    return list;
    }
}


//這裡我主要給文件中的文字添加了索引,你也可以在 Field 中給路徑等其他屬性新增索引,具體可以搜尋 lucene api,使用裡面的一些方法。我這裡說得比較粗,有問題歡迎討論。