一步一步跟我學習lucene(6)---lucene索引優化之多執行緒建立索引
阿新 • • 發佈:2019-01-05
這兩天工作有點忙,部落格更新不及時,請大家見諒;
前面瞭解到lucene在索引建立的時候,一個IndexWriter會獲取到一個寫鎖,這樣勢必導致lucene在建立大資料量的索引的時候執行效率低下;影響索引建立效率的因素主要有以下幾點:
- 磁碟空間大小,這個直接影響索引的建立,甚至會造成索引寫入提示完成,但是沒有同步的問題;
- 索引合併策略的選擇,這個類似於sql裡邊的批量操作,批量操作的數量過多直接影響執行效率,對於lucene來講,索引合併前是將document放在記憶體中,因此選擇合適的合併策略也可以提升索引的效率;
- 唯一索引對應的term的選擇,lucene索引的建立過程中是先從索引中刪除包含相同term的document然後重新新增document到索引中,這裡如果term對應的document過多,會佔用磁碟IO,同時造成IndexWriter的寫鎖佔用時間延長,相應的執行效率低下;
綜上所述,索引優化要保證磁碟空間,同時在term選擇上可以以ID等標識來確保唯一性,這樣第一條和第三條的風險就規避了;
本文旨在通過調整合併策略和採用多執行緒建立索引的方式來提高索引的效率;
多執行緒建立索引,我這邊還設計了多目錄索引建立,這樣避免了同一目錄資料量過大時頻繁的索引塊合併和索引塊重新申請;
廢話不多說,這裡附上程式碼,程式碼示例是讀取從lucene官網下載並解壓的資料夾,並將其中的檔案資訊索引起來
首先定義FileBean來儲存檔案資訊
package com.lucene.bean; public class FileBean { //路徑 private String path; //修改時間 private Long modified; //內容 private String content; public String getPath() { return path; } public void setPath(String path) { this.path = path; } public Long getModified() { return modified; } public void setModified(Long modified) { this.modified = modified; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } }
接下來是一個工具類,用以將資料夾的資訊遍歷讀取並轉換成FileBean的集合
package com.lucene.index.util;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;

import com.lucene.bean.FileBean;

/**
 * Helper for turning the files below a folder into {@link FileBean} objects.
 */
public class FileUtil {

    /**
     * Reads a file, or every file found recursively below a folder, into
     * {@link FileBean} objects carrying the absolute path, the last-modified
     * time and the file content.
     *
     * <p>File content is decoded as UTF-8 so that the result does not depend
     * on the platform default charset.
     *
     * @param folder path of a folder to walk, or of a single file to read
     * @return a list with one bean per regular file found; empty if the
     *         folder contains no files
     * @throws IOException if a file cannot be read (this includes a path
     *         that does not exist)
     */
    public static List<FileBean> getFolderFiles(String folder) throws IOException {
        // ArrayList: callers slice the result by index (subList), which is
        // cheap on an array-backed list.
        List<FileBean> fileBeans = new ArrayList<>();
        collect(new File(folder), fileBeans);
        return fileBeans;
    }

    /**
     * Appends the bean(s) for {@code file} to {@code sink}, descending into
     * directories. A single accumulator is used so results are not re-copied
     * at every directory level.
     */
    private static void collect(File file, List<FileBean> sink) throws IOException {
        if (file.isDirectory()) {
            File[] children = file.listFiles();
            // listFiles() returns null when the directory cannot be listed;
            // such directories are skipped silently.
            if (children != null) {
                for (File child : children) {
                    collect(child, sink);
                }
            }
            return;
        }
        FileBean bean = new FileBean();
        bean.setPath(file.getAbsolutePath());
        bean.setModified(file.lastModified());
        bean.setContent(new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8));
        sink.add(bean);
    }
}
定義一個公共的用於處理索引的類
package com.lucene.index;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import org.apache.lucene.index.IndexWriter;
/**
 * Base class for a task that writes a list of objects into a Lucene index.
 *
 * <p>{@link #run()} waits on a start latch, indexes every element of the
 * list through {@link #indexDoc(IndexWriter, Object)}, commits and closes the
 * writer, and finally counts down a completion latch.
 *
 * <p>NOTE: the writer is always committed and closed at the end of
 * {@link #run()}, including a writer handed in through
 * {@link #BaseIndex(IndexWriter, CountDownLatch, CountDownLatch, List)};
 * one writer instance must therefore not be shared between several tasks.
 *
 * @param <T> type of the objects to index
 */
public abstract class BaseIndex<T> implements Runnable{
    /**
     * Parent index path (only set by the multi-directory constructor).
     */
    private String parentIndexPath;
    /**
     * Index writer; {@code null} if opening it failed in the constructor.
     */
    private final IndexWriter writer;
    /**
     * Number of the sub-index below the parent path (multi-directory constructor).
     */
    private int subIndex;
    /**
     * Start gate: {@link #run()} blocks on this latch before indexing.
     */
    private final CountDownLatch countDownLatch1;
    /**
     * Completion latch: counted down exactly once when {@link #run()} ends.
     */
    private final CountDownLatch countDownLatch2;
    /**
     * Objects to index.
     */
    private final List<T> list;

    /**
     * Creates a task that uses an existing writer.
     *
     * @param writer          writer to index into; it is closed when the task ends
     * @param countDownLatch1 latch the task waits on before it starts indexing
     * @param countDownLatch2 latch the task counts down when it has finished
     * @param list            objects to index
     */
    public BaseIndex(IndexWriter writer,CountDownLatch countDownLatch1, CountDownLatch countDownLatch2,
            List<T> list){
        super();
        this.writer = writer;
        this.countDownLatch1 = countDownLatch1;
        this.countDownLatch2 = countDownLatch2;
        this.list = list;
    }

    /**
     * Creates a task that writes into its own directory
     * {@code parentIndexPath + "/index" + subIndex} (multi-directory indexing).
     * If the writer cannot be opened the stack trace is printed and the task
     * will index nothing when run.
     *
     * @param parentIndexPath directory that holds the sub-index directories
     * @param subIndex        number appended to {@code "index"} to name the sub-directory
     * @param countDownLatch1 latch the task waits on before it starts indexing
     * @param countDownLatch2 latch the task counts down when it has finished
     * @param list            objects to index
     */
    public BaseIndex(String parentIndexPath, int subIndex,
            CountDownLatch countDownLatch1, CountDownLatch countDownLatch2,
            List<T> list) {
        super();
        this.parentIndexPath = parentIndexPath;
        this.subIndex = subIndex;
        this.writer = openWriter(parentIndexPath+"/index"+subIndex);
        this.countDownLatch1 = countDownLatch1;
        this.countDownLatch2 = countDownLatch2;
        this.list = list;
    }

    /**
     * Creates a task that writes into the single directory {@code path}.
     * If the writer cannot be opened the stack trace is printed and the task
     * will index nothing when run.
     *
     * @param path            index directory
     * @param countDownLatch1 latch the task waits on before it starts indexing
     * @param countDownLatch2 latch the task counts down when it has finished
     * @param list            objects to index
     */
    public BaseIndex(String path,CountDownLatch countDownLatch1, CountDownLatch countDownLatch2,
            List<T> list) {
        super();
        this.writer = openWriter(path);
        this.countDownLatch1 = countDownLatch1;
        this.countDownLatch2 = countDownLatch2;
        this.list = list;
    }

    /**
     * Ensures the index directory exists and opens a writer on it in create
     * mode. Returns {@code null} (after printing the stack trace) when the
     * writer cannot be opened.
     */
    private static IndexWriter openWriter(String path) {
        try {
            File dir = new File(path);
            if(!dir.exists()){
                // mkdirs() rather than mkdir(): the parent directory may not
                // exist yet. A failure here surfaces when the writer is opened.
                dir.mkdirs();
            }
            return IndexUtil.getIndexWriter(path, true);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Indexes a single object.
     *
     * @param writer writer to add the document to
     * @param t      object to index
     * @throws Exception if the object cannot be indexed
     */
    public abstract void indexDoc(IndexWriter writer,T t) throws Exception;

    /**
     * Indexes every object of the list, in order, through
     * {@link #indexDoc(IndexWriter, Object)}. Stops at the first failure.
     *
     * @param writer writer to add the documents to
     * @param t      objects to index
     * @throws Exception if indexing one of the objects fails
     */
    public void indexDocs(IndexWriter writer,List<T> t) throws Exception{
        for (T t2 : t) {
            indexDoc(writer,t2);
        }
    }

    /**
     * Waits for the start latch, indexes the list, then commits and closes
     * the writer. The completion latch is counted down last, so a thread
     * waiting on it only resumes after the writer has been closed.
     */
    @Override
    public void run() {
        boolean interrupted = false;
        try {
            countDownLatch1.await();
            if (writer == null) {
                System.err.println("BaseIndex: index writer is not available, nothing indexed");
                return;
            }
            System.out.println(writer);
            indexDocs(writer,list);
        } catch (InterruptedException e) {
            interrupted = true;
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }finally{
            try {
                closeWriter();
            } finally {
                countDownLatch2.countDown();
                if (interrupted) {
                    // Restored only after the writer is closed: file I/O on a
                    // thread whose interrupt flag is set may be aborted.
                    Thread.currentThread().interrupt();
                }
            }
        }
    }

    /**
     * Commits and closes the writer. {@code close()} is attempted even when
     * {@code commit()} fails, so the writer is not left open.
     */
    private void closeWriter() {
        if (writer == null) {
            return;
        }
        try {
            writer.commit();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
FileBeanIndex類用於處理FileBean的索引建立
package com.lucene.index;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import com.lucene.bean.FileBean;
/**
 * Index task that turns {@link FileBean} objects into Lucene documents with
 * the stored fields {@code path}, {@code modified} and {@code content}.
 */
public class FileBeanIndex extends BaseIndex<FileBean>{

    /**
     * Creates a task on top of an existing writer.
     */
    public FileBeanIndex(IndexWriter writer, CountDownLatch countDownLatch1,
            CountDownLatch countDownLatch2, List<FileBean> list) {
        super(writer, countDownLatch1, countDownLatch2, list);
    }

    /**
     * Creates a task that writes to sub-index number {@code subIndex} below
     * {@code parentIndexPath}.
     */
    public FileBeanIndex(String parentIndexPath, int subIndex, CountDownLatch countDownLatch1,
            CountDownLatch countDownLatch2, List<FileBean> list) {
        super(parentIndexPath, subIndex, countDownLatch1, countDownLatch2, list);
    }

    /**
     * Prints the bean's path, then writes the bean to the index: added as a
     * new document when the writer was opened in {@code CREATE} mode,
     * otherwise written with {@code updateDocument} keyed on the
     * {@code path} term.
     */
    @Override
    public void indexDoc(IndexWriter writer, FileBean t) throws Exception {
        System.out.println(t.getPath());
        final Document document = toDocument(t);

        final boolean createMode =
                IndexWriterConfig.OpenMode.CREATE == writer.getConfig().getOpenMode();
        if (!createMode) {
            final Term pathTerm = new Term("path", t.getPath());
            writer.updateDocument(pathTerm, document);
            return;
        }
        writer.addDocument(document);
    }

    /**
     * Builds the Lucene document for one bean; all three fields are stored.
     */
    private static Document toDocument(FileBean bean) {
        final Document document = new Document();
        final Field pathField = new StringField("path", bean.getPath(), Field.Store.YES);
        document.add(pathField);
        final Field modifiedField = new LongField("modified", bean.getModified(), Field.Store.YES);
        document.add(modifiedField);
        final Field contentField = new TextField("content", bean.getContent(), Field.Store.YES);
        document.add(contentField);
        return document;
    }
}
IndexUtil工具類裡邊設定索引合併的策略
package com.lucene.index;
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Factory for {@link IndexWriter} instances configured with the merge
 * policy used for bulk indexing.
 */
public class IndexUtil {
    /**
     * Opens an index writer on the given directory, using a
     * {@link StandardAnalyzer} and a {@link LogByteSizeMergePolicy}.
     *
     * @param indexPath file-system path of the index directory
     * @param create    {@code true} to open in {@code CREATE} mode,
     *                  {@code false} to open in {@code CREATE_OR_APPEND} mode
     * @return the opened writer; the caller is responsible for closing it
     * @throws IOException if the directory or the writer cannot be opened
     */
    public static IndexWriter getIndexWriter(String indexPath,boolean create) throws IOException{
        Directory dir = FSDirectory.open(Paths.get(indexPath));
        try {
            Analyzer analyzer = new StandardAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

            LogMergePolicy mergePolicy = new LogByteSizeMergePolicy();
            // Merge frequency while documents are added to segments:
            // smaller values make indexing slower, larger values make it
            // faster; values above 10 suit batch index creation.
            mergePolicy.setMergeFactor(50);
            // Maximum number of documents in a segment that may be merged:
            // smaller values favour incremental additions, larger values
            // suit batch indexing and faster searches.
            mergePolicy.setMaxMergeDocs(5000);
            // The policy has no effect unless it is registered on the config.
            iwc.setMergePolicy(mergePolicy);

            iwc.setOpenMode(create
                    ? IndexWriterConfig.OpenMode.CREATE
                    : IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

            return new IndexWriter(dir, iwc);
        } catch (IOException | RuntimeException e) {
            // Do not leak the directory when the writer could not be created.
            try {
                dir.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }
}
TestIndex類執行測試程式
package com.lucene.index.test;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.lucene.index.IndexWriter;
import com.lucene.bean.FileBean;
import com.lucene.index.FileBeanIndex;
import com.lucene.index.util.FileUtil;
/**
 * Demo program: reads every file below a source folder and indexes the
 * files with several worker threads, each worker writing to its own
 * sub-index directory.
 */
public class TestIndex {

    /** Source folder used when no command-line argument is given. */
    private static final String DEFAULT_SOURCE_FOLDER = "C:\\Users\\lenovo\\Desktop\\lucene\\lucene-5.1.0";

    /** Parent index directory used when no second argument is given. */
    private static final String DEFAULT_INDEX_ROOT = "index";

    /** Maximum number of files handed to one worker. */
    private static final int PER_THREAD_COUNT = 3000;

    /**
     * Runs the demo.
     *
     * @param args optional: {@code args[0]} is the folder to index,
     *             {@code args[1]} the parent directory for the sub-indexes;
     *             the built-in defaults are used for missing arguments
     */
    public static void main(String[] args) {
        String sourceFolder = (args != null && args.length > 0) ? args[0] : DEFAULT_SOURCE_FOLDER;
        String indexRoot = (args != null && args.length > 1) ? args[1] : DEFAULT_INDEX_ROOT;

        ExecutorService pool = null;
        try {
            List<FileBean> fileBeans = FileUtil.getFolderFiles(sourceFolder);
            int totalCount = fileBeans.size();
            int perThreadCount = PER_THREAD_COUNT;
            System.out.println("查詢到的資料總數是"+fileBeans.size());
            if (totalCount == 0) {
                // A fixed thread pool cannot be created with zero threads.
                System.out.println("沒有可建立索引的檔案");
                return;
            }
            int threadCount = totalCount/perThreadCount + (totalCount%perThreadCount == 0 ? 0 : 1);
            pool = Executors.newFixedThreadPool(threadCount);
            // Start gate released by this thread once all workers are submitted.
            CountDownLatch countDownLatch1 = new CountDownLatch(1);
            // Completion latch counted down once by every worker.
            CountDownLatch countDownLatch2 = new CountDownLatch(threadCount);
            System.out.println(fileBeans.size());
            for(int i = 0; i < threadCount; i++) {
                int start = i*perThreadCount;
                int end = Math.min((i+1) * perThreadCount, totalCount);
                List<FileBean> subList = fileBeans.subList(start, end);
                Runnable runnable = new FileBeanIndex(indexRoot,i, countDownLatch1, countDownLatch2, subList);
                // Hand the worker over to the thread pool.
                pool.execute(runnable);
            }
            countDownLatch1.countDown();
            System.out.println("開始建立索引");
            // Wait until every worker has reported completion.
            countDownLatch2.await();
            System.out.println("所有執行緒都建立索引完畢");
            // Release the thread pool.
            pool.shutdown();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // On a failure path the pool is still running and workers may be
            // parked on the start latch; interrupt them so the JVM can exit.
            if (pool != null && !pool.isShutdown()) {
                pool.shutdownNow();
            }
        }
    }
}
以上即是多執行緒多目錄索引,大家有什麼疑問的歡迎交流;
一步一步跟我學習lucene是對近期做lucene索引的總結,大家有問題的話聯絡本人的Q-Q: 891922381,同時本人新建Q-Q群:106570134(lucene,solr,netty,hadoop),如蒙加入,不勝感激,大家共同探討,本人爭取每日一博,希望大家持續關注,會帶給大家驚喜的