使用Lucene對doc、docx、pdf、txt文件進行全文檢索功能的實現

阿新 • • 發佈：2019-01-13

這裡講一下使用Lucene對doc、docx、pdf、txt文件進行全文檢索功能的實現。

涉及到的類一共有兩個：

LuceneCreateIndex，建立索引：

package com.yhd.test.poi;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

/**
 * Builds a Lucene full-text index over the doc, docx, pdf and txt files found
 * directly inside a data directory.
 *
 * <p>For every supported file one Lucene {@link Document} is written with three
 * fields: {@code contents} (the extracted text, analyzed), {@code filename}
 * and {@code indexDate} (both stored, not analyzed).
 */
public class LuceneCreateIndex {

    /**
     * Recreates the index from scratch and prints progress to standard output.
     *
     * @param args unused
     * @throws IOException if the data directory cannot be listed, or if reading
     *         a file or writing the index fails
     */
    public static void main(String[] args) throws IOException {
        // Directory holding the documents to be indexed
        String dataDirectory = "D:\\Studying\\poi\\test\\dataDirectory";
        // Directory where the Lucene index files are written
        String indexDirectory = "D:\\Studying\\poi\\test\\indexDirectory";

        // List the input files before touching the index, so that a missing or
        // unreadable data directory fails cleanly instead of with a
        // NullPointerException after the old index has already been replaced.
        File[] files = new File(dataDirectory).listFiles();
        if (files == null) {
            throw new IOException("Cannot list data directory: " + dataDirectory);
        }

        // The Directory is the storage location of the index (not the analyzer)
        Directory directory = new SimpleFSDirectory(new File(indexDirectory));
        try {
            // The analyzer splits text into the tokens that get indexed
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);

            // IndexWriter arguments:
            // 1. the Directory to write to
            // 2. the analyzer
            // 3. create flag: true overwrites an existing index, false appends
            // 4. MaxFieldLength: UNLIMITED indexes every term of a field
            IndexWriter indexWriter = new IndexWriter(directory, analyzer, true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                for (int i = 0; i < files.length; i++) {
                    File file = files[i];
                    System.out.println("這是第" + i + "個檔案----------------");
                    System.out.println("完整路徑：" + file.toString());

                    // The lower-cased file name suffix is used as the file type
                    String fileName = file.getName();
                    String fileType = fileName.substring(
                            fileName.lastIndexOf(".") + 1).toLowerCase();
                    System.out.println("檔名稱：" + fileName);
                    System.out.println("檔案型別：" + fileType);

                    // Sub-directories cannot be opened as a stream, so they are
                    // skipped exactly like files of an unsupported type.
                    Field contents = file.isFile()
                            ? createContentsField(file, fileType)
                            : null;
                    if (contents == null) {
                        System.out.println();
                        continue;
                    }
                    System.out.println("注意：已為檔案“" + fileName + "”建立了索引");

                    Document doc = new Document();
                    doc.add(contents);
                    // File name: stored so searches can display it
                    doc.add(new Field("filename", fileName, Field.Store.YES,
                            Field.Index.NOT_ANALYZED));
                    // Day on which the document was indexed
                    doc.add(new Field("indexDate", DateTools.dateToString(
                            new Date(), DateTools.Resolution.DAY),
                            Field.Store.YES, Field.Index.NOT_ANALYZED));
                    indexWriter.addDocument(doc);
                    System.out.println();
                }
                // Number of documents now held by the index
                System.out.println("numDocs=" + indexWriter.numDocs());
            } finally {
                // Always close the writer, even when a file fails to parse
                indexWriter.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Extracts the text of one file and wraps it in the {@code contents} field.
     *
     * <p>The input stream and the format-specific document objects are closed
     * before returning, whether or not extraction succeeds. As in the original
     * code, doc and docx text is stored in the index while pdf and txt text is
     * only indexed.
     *
     * @param file the regular file to read
     * @param fileType lower-cased file name suffix without the dot
     * @return the analyzed {@code contents} field, or {@code null} when
     *         {@code fileType} is not doc, docx, pdf or txt
     * @throws IOException if the file cannot be read or parsed
     */
    private static Field createContentsField(File file, String fileType)
            throws IOException {
        boolean supported = fileType.equals("doc") || fileType.equals("docx")
                || fileType.equals("pdf") || fileType.equals("txt");
        if (!supported) {
            // Do not even open a stream for files that will not be indexed
            return null;
        }

        InputStream in = new FileInputStream(file);
        try {
            if (fileType.equals("doc")) {
                WordExtractor wordExtractor = new WordExtractor(in);
                try {
                    // Field arguments: name, value, whether the value is
                    // stored for retrieval, and how it is indexed
                    return new Field("contents", wordExtractor.getText(),
                            Field.Store.YES, Field.Index.ANALYZED);
                } finally {
                    wordExtractor.close();
                }
            }

            if (fileType.equals("docx")) {
                XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(
                        new XWPFDocument(in));
                try {
                    return new Field("contents", xwpfWordExtractor.getText(),
                            Field.Store.YES, Field.Index.ANALYZED);
                } finally {
                    xwpfWordExtractor.close();
                }
            }

            if (fileType.equals("pdf")) {
                PDFParser parser = new PDFParser(in);
                parser.parse();
                PDDocument pdDocument = parser.getPDDocument();
                try {
                    PDFTextStripper stripper = new PDFTextStripper();
                    return new Field("contents", stripper.getText(pdDocument),
                            Field.Store.NO, Field.Index.ANALYZED);
                } finally {
                    pdDocument.close();
                }
            }

            // txt: decoded with the platform default charset, as before.
            // NOTE(review): files in another encoding will be mis-decoded;
            // confirm the expected encoding of the input files.
            BufferedReader br = new BufferedReader(new InputStreamReader(in));
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                // Keep a separator between lines; without it the last word of
                // one line and the first word of the next merge into one token.
                text.append(line).append('\n');
            }
            return new Field("contents", text.toString(), Field.Store.NO,
                    Field.Index.ANALYZED);
        } finally {
            in.close();
        }
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
LuceneSearch，進行搜尋：

package com.yhd.test.poi;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Runs a fixed query against the {@code contents} field of the index and
 * prints the stored {@code filename} of each of the top hits.
 */
public class LuceneSearch {

    /**
     * Searches the index and prints the results to standard output.
     *
     * @param args unused
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        // Location of the index files
        String indexDirectory = "D:\\Studying\\poi\\test\\indexDirectory";

        // The Directory is the storage location of the index (not the analyzer)
        Directory directory = new SimpleFSDirectory(new File(indexDirectory));
        try {
            // Unlike IndexWriter, the searcher only needs the index directory
            IndexSearcher indexSearch = new IndexSearcher(directory);
            try {
                // QueryParser arguments: Lucene version, the field searched by
                // default, and the analyzer applied to the query text
                QueryParser queryParser = new QueryParser(Version.LUCENE_30,
                        "contents", new StandardAnalyzer(Version.LUCENE_30));
                Query query = queryParser.parse("百度");

                // Ask for the 10 best hits; totalHits is the overall match count
                TopDocs hits = indexSearch.search(query, 10);
                System.out.println("找到了" + hits.totalHits + "個");

                // Load each hit's Document by id and print its stored file name
                for (int i = 0; i < hits.scoreDocs.length; i++) {
                    ScoreDoc sdoc = hits.scoreDocs[i];
                    Document doc = indexSearch.doc(sdoc.doc);
                    System.out.println(doc.get("filename"));
                }
            } finally {
                // Close the searcher even when parsing or searching fails
                indexSearch.close();
            }
        } finally {
            directory.close();
        }
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
詳細的解釋在程式碼註釋裡都有了，就不做過多解釋了。需要的jar包如下：

讀取Word文件所需的POI類別庫請到Apache POI官網下載，讀取pdf所需的類別庫請到Apache PDFBox官網下載。這裡用的是PDFBox 1.8.13版本；2.0版本的呼叫方式與1.x版本已經不太一樣了。

專案整體結構如下：

先執行類：

LuceneCreateIndex

會讀取目錄dataDirectory，即：

D:\Studying\poi\test\dataDirectory

下的檔案，建立索引，索引會儲存在目錄indexDirectory，即：

D:\Studying\poi\test\indexDirectory

下，然後執行：

LuceneSearch

使用索引進行查詢，就能看到效果了。

使用Lucene對doc、docx、pdf、txt文件進行全文檢索功能的實現

使用Lucene對doc、docx、pdf、txt文件進行全文檢索功能的實現

學習筆記:從0開始學習大資料-30. solr通過java匯入doc，pdf文件建立全文檢索

對服務器上所有Word文件做全文檢索的解決方案-Java

python3之對本地TXT文件進行增加，刪除，修改，查看功能。

【經驗分享】：如何將PDF格式的文件進行翻譯

Lucene實現各種常見文件的全文檢索

讀寫文件、文件方法、python2的亂碼問題、python對passwd文件進行排序

freemarker生成html、html轉pdf、pdf根據關鍵字定位、pdf簽名

java通過url線上預覽Word、excel、ppt、pdf、txt文件中的內容【只獲得其中的文字】

Oracle 12C 新特性之在線重命名、遷移活躍的數據文件

作業二：優化購物車：用戶入口：1.將商品的信息存到文件中；2.將已經購買的商品、余額記錄存到文件中。商家入口：1.可以添加商品；2.可以修改商品的價格

共享表空間與獨立表空間、frm,MYD,MYI.idb,par文件說明

二、Redis命令行和配置文件redis.windows.conf

json和xml封裝數據、數據緩存到文件中

文件後綴名、linux 和windows 互傳文件文件後綴名

java移動文件夾、慎用java file.renameTo(f)方法、 java從一個目錄復制文件到另一個目錄下、 java代碼完成刪除文件、文件夾、

FFmpeg視頻編解碼庫，無法解析的外部符號、找不到inttypes.h文件的問題

Visual Studio中根據系統區分引用64位、32位DLL動態庫文件的配置方法

復習使用for、while循環遍歷文件，數據類型轉換

九、特殊權限set_uid、set_gid、stick_bit；軟鏈接文件；硬鏈接文件

使用Lucene對doc、docx、pdf、txt文件進行全文檢索功能的實現

相關推薦