學習筆記:從0開始學習大資料-30. solr通過java匯入doc,pdf文件建立全文檢索
1. 在 eclipse 新建 maven 專案 solr,並在 pom.xml 加入 SolrJ 依賴(org.apache.solr:solr-solrj,版本需與 Solr 伺服器一致,本文為 7.5.0)
2. 在專案下新建類 updoctest
package com.linbin.solr;

import java.io.File;
import java.io.IOException;
import java.util.Locale;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest.ACTION;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

/**
 * Small SolrJ demo: queries a core, deletes a document by query, and uploads a
 * rich document (doc/pdf/...) to the {@code /update/extract} handler so that its
 * text content is indexed.
 */
public class updoctest {

    /** Base URL of the Solr core that every method in this class talks to. */
    public static String solrUrl = "http://centos7:8983/solr/mycore";

    /**
     * Runs the demo end to end: query, delete, then upload one .doc file.
     *
     * @param args unused
     * @throws Exception if any Solr request or file access fails
     */
    public static void main(String[] args) throws Exception {
        // Query
        findIndex1();
        // Delete
        deleteIndexById();
        // Upload a .doc file
        String fileName = "/home/linbin/文件/能工巧匠進校園.doc";
        String solrId = "能工巧匠進校園.doc";
        indexFilesSolrCell(solrId, solrId, fileName);
    }

    /**
     * Query test: fetches up to 10 documents matching {@code *:*} and prints the
     * total hit count followed by the id, author and text fields of each document.
     *
     * @throws IOException         on communication failure with the server
     * @throws SolrServerException if the server reports an error
     */
    public static void findIndex1() throws IOException, SolrServerException {
        // try-with-resources so the client is closed even when the query throws.
        try (HttpSolrClient solrClient = new HttpSolrClient.Builder(solrUrl).build()) {
            SolrQuery query = new SolrQuery(); // build the search object
            query.set("q", "*:*");             // search condition: match everything
            query.setRows(10);                 // number of rows per page

            QueryResponse response = solrClient.query(query); // send the search request
            SolrDocumentList docs = response.getResults();    // query results
            long cnt = docs.getNumFound();                    // total number of hits
            System.out.println("總條數為"+cnt+"條");

            for (SolrDocument doc : docs) {
                System.out.println("-------------\r\n");
                System.out.println("id:"+ doc.get("id") + ",autor:"+ doc.get("author") + ",text:"+ doc.get("text"));
            }
        }
    }

    /**
     * Delete test: removes the documents matching {@code id:solr-word.pdf} and commits.
     *
     * <p>Alternatives the original notes mention: {@code deleteByQuery("*:*")} to
     * delete everything, or {@code deleteById("1")} to delete one exact id.
     *
     * @throws IOException         on communication failure with the server
     * @throws SolrServerException if the server reports an error
     */
    public static void deleteIndexById() throws IOException, SolrServerException {
        // The client was previously never closed here; try-with-resources releases it.
        try (HttpSolrClient solrClient = new HttpSolrClient.Builder(solrUrl).build()) {
            // Delete by query (the query string goes through normal query parsing).
            solrClient.deleteByQuery("id:solr-word.pdf");
            solrClient.commit();
        }
    }

    /**
     * Upload test: sends a file to the {@code /update/extract} handler and commits.
     *
     * @param fileName file name whose extension selects the content type
     * @param solrId   value stored as the document's {@code id} field
     * @param path     file-system path of the file to upload
     * @throws IOException         if the file cannot be read or the request fails
     * @throws SolrServerException if the server reports an error
     */
    public static void indexFilesSolrCell(String fileName, String
            solrId, String path) throws IOException, SolrServerException {
        // The client was previously never closed here; try-with-resources releases it.
        try (SolrClient solr = new HttpSolrClient.Builder(solrUrl).build()) {
            ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
            String contentType = getFileContentType(fileName);
            up.addFile(new File(path), contentType);
            // Use the dedicated solrId parameter as the document id; it used to be
            // ignored in favour of fileName (the caller in main passes the same
            // value for both, so its behavior is unchanged).
            up.setParam("literal.id", solrId);
            up.setParam("uprefix", "ignored_");
            up.setParam("fmap.content", "text"); // file content goes to the "text" field
            // Commit as part of this same request (both wait flags set to true).
            up.setAction(ACTION.COMMIT, true, true);
            solr.request(up);
            System.out.println("upload ok! \r\n");
        }
    }

    /**
     * Maps a file name to a content type based on its extension.
     *
     * <p>The extension is the text after the last {@code '.'} (the whole name when
     * there is no dot) and is compared case-insensitively.
     *
     * @param filename file name or path; must not be {@code null}
     * @return the matching MIME type, or {@code "othertype"} for unknown extensions
     */
    public static String getFileContentType(String filename) {
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        String extension = filename.substring(filename.lastIndexOf(".") + 1).toLowerCase(Locale.ROOT);
        switch (extension) {
            case "xlsx":
                return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
            case "pdf":
                return "application/pdf";
            case "doc":
                return "application/msword";
            case "txt":
                return "text/plain";
            case "xls":
                return "application/vnd.ms-excel";
            case "docx":
                return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
            case "ppt":
                return "application/vnd.ms-powerpoint";
            case "pptx":
                return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
            default:
                return "othertype";
        }
    }
}
2. 在solr的core目錄下的solrconfig.xml增加如下內容:
<!-- Registers the /update/extract request handler; startup="lazy" is set on it. -->
<requestHandler name="/update/extract" startup="lazy" class="solr.extraction.ExtractingRequestHandler">
<!-- Default parameters, used when a request does not supply its own values. -->
<lst name="defaults">
<str name="lowernames">true</str>
<!-- Prefix "ignored_" pairs with the ignored_* dynamic field in managed-schema. -->
<str name="uprefix">ignored_</str>
<!-- Maps the extracted "content" field onto the schema field "text". -->
<str name="fmap.content">text</str>
</lst>
</requestHandler>
其中 <str name="uprefix">ignored_</str> 的作用是:為 schema 中沒有定義的欄位名稱加上 ignored_ 字首,讓讀取檔案時不需要對映的欄位被忽略掉
<str name="fmap.content">text</str> 是把從檔案中抽取出的 content 欄位(檔案內容)對映為 solr 的 text 欄位
3. 修改 managed-schema 檔案,增加
<!-- Dynamic field: any field whose name starts with "ignored_" uses the "ignored" type. -->
<dynamicField name="ignored_*" type="ignored" multiValued="true"/>
<!-- The "ignored" type is neither stored nor indexed, so values sent to it are dropped. -->
<fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
這個是生成一個動態欄位,型別為ignored,承接忽略的那些欄位
4. 檢查 solrconfig.xml
<!-- NOTE(review): this regex matches the data-import-handler jar, not solr-cell;
     step 5 copies solr-cell-7.5.0.jar by hand. Confirm whether a solr-cell entry was intended. -->
<lib dir="${solr.install.dir:../../..}/dist/" regex="solr-dataimporthandler-.*\.jar"/>
<!-- Loads every jar under contrib/extraction/lib. -->
<lib dir="${solr.install.dir:../../..}/contrib/extraction/lib" regex=".*\.jar"/>
檢查以上路徑是否與實際安裝位置相符;其中的相對路徑是以所建立的 mycore 目錄為基準
5. 在mycore目錄下建立lib目錄(如果沒有)
複製 solr-7.5.0/contrib/extraction/lib下的所有檔案 到mycore/lib目錄
複製 solr-7.5.0/dist/solr-cell-7.5.0.jar 到mycore/lib目錄
6. 重新啟動 solr,如正常啟動,再在 eclipse 執行第2步建立的 java 程式(updoctest)
7. 在solr網頁查詢可以檢查到已上傳doc檔案的索引