學習筆記:從0開始學習大資料-30. solr通過java匯入doc，pdf文件建立全文檢索

阿新 • • 發佈：2018-12-17

1. eclipse 新建maven專案solr，pom.xml 加入依賴

2. 在專案下新建類 updoctest

package com.linbin.solr;

import java.io.File;
import java.io.IOException;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest.ACTION;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

/**
 * SolrJ demo: query a core, delete documents from it, and upload a rich document
 * (doc, pdf, ...) through the {@code /update/extract} request handler.
 *
 * <p>Every method opens its own client against {@link #solrUrl} and closes it before
 * returning, including when the request fails.
 */
public class updoctest {
    /** Base URL of the Solr core used by every method in this class. */
    public static String solrUrl = "http://centos7:8983/solr/mycore";

    /**
     * Value returned by {@link #getFileContentType(String)} for unrecognised extensions.
     * It is not a registered MIME type; it is kept unchanged so existing callers that
     * compare against it keep working.
     */
    private static final String UNKNOWN_CONTENT_TYPE = "othertype";

    /**
     * Entry point: uploads one sample .doc file.
     *
     * @param args unused
     * @throws Exception if the upload fails
     */
    public static void main(String[] args) throws Exception {
        // To run the query demo instead, call findIndex1().
        // To run the delete demo instead, call deleteIndexById().
        // Import a .doc file; the bare file name doubles as the Solr document id.
        String fileName = "/home/linbin/文件/能工巧匠進校園.doc";
        String solrId = "能工巧匠進校園.doc";
        indexFilesSolrCell(solrId, solrId, fileName);
    }

    /**
     * Query demo: runs a match-all query limited to 10 rows and prints the total hit
     * count followed by the id, author and text fields of each returned document.
     *
     * @throws IOException if communication with Solr fails
     * @throws SolrServerException if Solr reports an error
     */
    public static void findIndex1() throws IOException, SolrServerException {
        // try-with-resources closes the client even when the query throws.
        try (HttpSolrClient solrClient = new HttpSolrClient.Builder(solrUrl).build()) {
            SolrQuery query = new SolrQuery();
            query.set("q", "*:*"); // match everything
            query.setRows(10);     // page size
            QueryResponse response = solrClient.query(query);
            SolrDocumentList docs = response.getResults();
            long cnt = docs.getNumFound(); // total hits, not just the rows returned
            System.out.println("總條數為" + cnt + "條");
            for (SolrDocument doc : docs) {
                System.out.println("-------------\r\n");
                System.out.println("id:" + doc.get("id") + ",autor:" + doc.get("author")
                        + ",text:" + doc.get("text"));
            }
        }
    }

    /**
     * Delete demo: removes the documents matching the query {@code id:solr-word.pdf}
     * and commits.
     *
     * @throws IOException if communication with Solr fails
     * @throws SolrServerException if Solr reports an error
     */
    public static void deleteIndexById() throws IOException, SolrServerException {
        // The original never closed this client; try-with-resources fixes the leak.
        try (HttpSolrClient solrClient = new HttpSolrClient.Builder(solrUrl).build()) {
            // Alternatives: solrClient.deleteByQuery("*:*") removes everything,
            // solrClient.deleteById("1") removes a single document by id.
            solrClient.deleteByQuery("id:solr-word.pdf");
            solrClient.commit();
        }
    }

    /**
     * Upload demo: sends one file to the {@code /update/extract} handler and commits
     * as part of the same request.
     *
     * @param fileName file name whose extension selects the content type; also used as
     *                 the document id when {@code solrId} is null or empty
     * @param solrId   value stored in the document's {@code id} field
     * @param path     path of the file to upload
     * @throws IOException if the file cannot be read or communication with Solr fails
     * @throws SolrServerException if Solr reports an error
     */
    public static void indexFilesSolrCell(String fileName, String solrId, String path)
            throws IOException, SolrServerException {
        // The original ignored solrId and always used fileName as the id.
        String documentId = (solrId == null || solrId.isEmpty()) ? fileName : solrId;

        // The original never closed this client; try-with-resources fixes the leak.
        try (SolrClient solr = new HttpSolrClient.Builder(solrUrl).build()) {
            ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
            up.addFile(new File(path), getFileContentType(fileName));
            up.setParam("literal.id", documentId);
            up.setParam("uprefix", "ignored_");
            up.setParam("fmap.content", "text"); // extracted file content goes to "text"
            up.setAction(ACTION.COMMIT, true, true);
            solr.request(up);
        }
        System.out.println("upload ok! \r\n");
    }

    /**
     * Maps a file name to a content type based on its extension.
     *
     * <p>The extension is compared case-insensitively, so {@code "A.PDF"} and
     * {@code "a.pdf"} give the same result.
     *
     * @param filename file name or path; may be null
     * @return the MIME type for a known extension, otherwise {@code "othertype"}
     *         (also for null, or a name without a recognised extension)
     */
    public static String getFileContentType(String filename) {
        if (filename == null) {
            return UNKNOWN_CONTENT_TYPE;
        }
        // With no '.', lastIndexOf gives -1 and the whole name is treated as the extension.
        String extension = filename.substring(filename.lastIndexOf('.') + 1)
                .toLowerCase(java.util.Locale.ROOT);
        switch (extension) {
            case "xlsx":
                return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
            case "pdf":
                return "application/pdf";
            case "doc":
                return "application/msword";
            case "txt":
                return "text/plain";
            case "xls":
                return "application/vnd.ms-excel";
            case "docx":
                return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
            case "ppt":
                return "application/vnd.ms-powerpoint";
            case "pptx":
                return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
            default:
                return UNKNOWN_CONTENT_TYPE;
        }
    }

}

2. 在solr的core目錄下的solrconfig.xml增加如下內容：

<requestHandler name="/update/extract" startup="lazy" class="solr.extraction.ExtractingRequestHandler">
<lst name="defaults">
<str name="lowernames">true</str>
<str name="uprefix">ignored_</str>
<str name="fmap.content">text</str>
</lst>
</requestHandler>

其中 <str name="uprefix">ignored_</str> 的作用是：讀取檔案時，凡是 schema 中沒有定義的欄位，都會加上 ignored_ 前綴，從而被忽略、不建立索引

<str name="fmap.content">text</str> 是把從檔案解析出的 content 欄位（檔案正文）對映為 solr 的 text 欄位

3. 修改 managed-schema 檔案，增加（例如）：

<dynamicField name="ignored_*" type="ignored" multiValued="true"/>

這個是生成一個動態欄位，型別為ignored，承接忽略的那些欄位

4. 檢查 solrconfig.xml 中載入 jar 的 <lib dir="..." regex="..." /> 設定

檢查這些 <lib> 設定的 dir 路徑是否與實際目錄相符；相對路徑是相對於所建立的 mycore 目錄

5. 在mycore目錄下建立lib目錄（如果沒有）

複製 solr-7.5.0/contrib/extraction/lib下的所有檔案到mycore/lib目錄

複製 solr-7.5.0/dist/solr-cell-7.5.0.jar 到mycore/lib目錄

6. 重新啟動 solr，如正常啟動，再在 eclipse 執行第 2 步建立的 java 程式（updoctest）

7. 在solr網頁查詢可以檢查到已上傳doc檔案的索引

學習筆記:從0開始學習大資料-30. solr通過java匯入doc，pdf文件建立全文檢索

學習筆記:從0開始學習大資料-30. solr通過java匯入doc，pdf文件建立全文檢索

學習筆記:從0開始學習大資料-20. 機器學習spark ml演算法庫應用練習

學習筆記:從0開始學習大資料-19. storm開發及執行環境部署

學習筆記:從0開始學習大資料-18.kettle安裝使用

學習筆記:從0開始學習大資料-17.Redis安裝及使用

學習筆記:從0開始學習大資料-16. kafka安裝及使用

學習筆記:從0開始學習大資料-15. Flume安裝及使用

學習筆記:從0開始學習大資料-14. java spark程式設計實踐

學習筆記:從0開始學習大資料-13. Eclipse+Scala+Maven Spark開發環境配置

學習筆記:從0開始學習大資料-12. spark安裝部署

學習筆記:從0開始學習大資料-11. sqoop安裝部署

學習筆記:從0開始學習大資料-10. hive安裝部署

學習筆記:從0開始學習大資料-9. MapReduce讀並寫Hbase資料

學習筆記:從0開始學習大資料-8.直接在Eclipse配置執行MapReduce程式

學習筆記:從0開始學習大資料-7.hbase java程式設計hello world

學習筆記:從0開始學習大資料-6.hbase安裝

學習筆記:從0開始學習大資料-5.hadoop hdfs檔案讀寫api操作

學習筆記:從0開始學習大資料-4.Eclipse配置hadoop開發環境

學習筆記:從0開始學習大資料-3.Eclipse安裝

學習筆記:從0開始學習大資料-2.hadoop安裝

學習筆記:從0開始學習大資料-30. solr通過java匯入doc，pdf文件建立全文檢索

相關推薦