1. 程式人生 > >學習筆記:從0開始學習大資料-30. solr通過java匯入doc,pdf文件建立全文檢索

學習筆記:從0開始學習大資料-30. solr通過java匯入doc,pdf文件建立全文檢索

1. 在 eclipse 新建 maven 專案 solr,並在 pom.xml 加入 SolrJ 依賴(org.apache.solr:solr-solrj,版本需與 Solr 伺服器一致,例如 7.5.0)

2. 在專案下新建類 updoctest

package com.linbin.solr;

import java.io.File;
import java.io.IOException;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest.ACTION;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

/**
 * Small SolrJ demo: query a core, delete documents from it, and index rich
 * documents (doc, pdf, ...) through the {@code /update/extract} handler.
 *
 * <p>Every helper opens its own client against {@link #solrUrl} and closes it
 * before returning, including when the request fails.
 */
public class updoctest {
    /** Base URL of the Solr core that all helpers talk to. */
    public static String solrUrl = "http://centos7:8983/solr/mycore";

    /**
     * Entry point: indexes one sample .doc file.
     *
     * @param args unused
     * @throws Exception if the upload fails
     */
    public static void main(String[] args) throws Exception {
        // Query demo:  findIndex1();
        // Delete demo: deleteIndexById();
        // Import a .doc document.
        String fileName = "/home/linbin/文件/能工巧匠進校園.doc";
        String solrId = "能工巧匠進校園.doc";
        indexFilesSolrCell(solrId, solrId, fileName);
    }

    /**
     * Query test: runs a match-all query limited to 10 rows and prints the total
     * hit count followed by the id, author and text of each returned document.
     *
     * @throws IOException         on communication errors or when closing the client
     * @throws SolrServerException if the Solr server reports an error
     */
    public static void findIndex1() throws IOException, SolrServerException {
        // try-with-resources closes the client even when the query throws.
        try (HttpSolrClient solrClient = new HttpSolrClient.Builder(solrUrl).build()) {
            SolrQuery query = new SolrQuery();
            query.set("q", "*:*"); // search condition: match everything
            query.setRows(10);     // rows per page

            QueryResponse response = solrClient.query(query);
            SolrDocumentList docs = response.getResults();
            long cnt = docs.getNumFound(); // total number of hits, not just this page
            System.out.println("總條數為"+cnt+"條");
            for (SolrDocument doc : docs) {
                System.out.println("-------------\r\n");
                System.out.println("id:"+ doc.get("id") + ",autor:"+ doc.get("author") + ",text:"+ doc.get("text"));
            }
        }
    }

    /**
     * Delete test: removes the documents matching {@code id:solr-word.pdf} and commits.
     *
     * @throws IOException         on communication errors or when closing the client
     * @throws SolrServerException if the Solr server reports an error
     */
    public static void deleteIndexById() throws IOException, SolrServerException {
        // The original never closed this client; try-with-resources releases it.
        try (HttpSolrClient solrClient = new HttpSolrClient.Builder(solrUrl).build()) {
            // Delete everything:   solrClient.deleteByQuery("*:*");
            // Delete by exact id:  solrClient.deleteById("1");
            // Delete by query (the query string goes through field analysis).
            solrClient.deleteByQuery("id:solr-word.pdf");
            solrClient.commit();
        }
    }

    /**
     * Import test: uploads one file to the {@code /update/extract} handler and
     * commits as part of the same request.
     *
     * @param fileName name used to derive the content type from its extension
     * @param solrId   value stored as the document's {@code id}; when {@code null}
     *                 the {@code fileName} is used instead
     * @param path     location of the file to upload
     * @throws IOException         if the file cannot be read or the request fails
     * @throws SolrServerException if the Solr server reports an error
     */
    public static void indexFilesSolrCell(String fileName, String solrId, String path)
            throws IOException, SolrServerException {
        // The original ignored solrId and always used fileName as the id; honour the
        // parameter and keep fileName only as a fallback.
        String documentId = (solrId != null) ? solrId : fileName;

        // The original never closed this client; try-with-resources releases it.
        try (SolrClient solr = new HttpSolrClient.Builder(solrUrl).build()) {
            ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
            up.addFile(new File(path), getFileContentType(fileName));
            up.setParam("literal.id", documentId);
            up.setParam("uprefix", "ignored_");
            up.setParam("fmap.content", "text"); // file content goes to the "text" field
            up.setAction(ACTION.COMMIT, true, true);
            solr.request(up);
        }
        System.out.println("upload ok! \r\n");
    }

    /**
     * Returns the content type for a file name, based on its extension.
     *
     * <p>The extension is the text after the last {@code '.'} and is compared
     * case-insensitively. A name without a dot, or with an unrecognised
     * extension, yields {@code "othertype"}.
     *
     * @param filename file name or path; must not be {@code null}
     * @return the matching MIME type, or {@code "othertype"} when unknown
     */
    public static String getFileContentType(String filename) {
        int dot = filename.lastIndexOf('.');
        if (dot < 0) {
            // No extension at all: do not mistake a bare name such as "pdf" for one.
            return "othertype";
        }
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        String extension = filename.substring(dot + 1).toLowerCase(java.util.Locale.ROOT);
        switch (extension) {
            case "xlsx":
                return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
            case "pdf":
                return "application/pdf";
            case "doc":
                return "application/msword";
            case "txt":
                return "text/plain";
            case "xls":
                return "application/vnd.ms-excel";
            case "docx":
                return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
            case "ppt":
                return "application/vnd.ms-powerpoint";
            case "pptx":
                return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
            default:
                return "othertype";
        }
    }
}

2. 在solr的core目錄下的solrconfig.xml增加如下內容:

 <!-- Registers the /update/extract request handler (ExtractingRequestHandler), declared with startup="lazy". -->
 <requestHandler name="/update/extract" startup="lazy" class="solr.extraction.ExtractingRequestHandler">
      <lst name="defaults">
      <str name="lowernames">true</str>
      <!-- uprefix: per the notes below, lets fields that need no mapping be ignored via the ignored_ prefix. -->
      <str name="uprefix">ignored_</str> 

       <!-- fmap.content: per the notes below, maps the extracted content to the Solr "text" field. -->
       <str name="fmap.content">text</str>
      </lst>
  </requestHandler>

其中 <str name="uprefix">ignored_</str> 部分是為擷取檔案時 schema 中未定義(不需要對映)的欄位加上 ignored_ 字首,以便把這些欄位忽略掉

  <str name="fmap.content">text</str>  是把從檔案擷取出的 content 欄位(檔案內容)對映為 solr 的 text 欄位

3. 修改 managed-schema 檔案,增加

  <!-- Dynamic field matching every name that starts with ignored_; it uses the "ignored" type declared next. -->
  <dynamicField name="ignored_*" type="ignored" multiValued="true"/>
  <!-- The "ignored" type is neither stored nor indexed (stored="false", indexed="false"). -->
  <fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />

這個是生成一個動態欄位,型別為ignored,承接忽略的那些欄位

4. 檢查 solrconfig.xml

  <!-- Jar lookup directives: each names a directory and a regex selecting jar files in it. -->
  <!-- NOTE(review): the first entry selects solr-dataimporthandler jars, not solr-cell; confirm this is intended for extraction. -->
  <lib dir="${solr.install.dir:../../..}/dist/" regex="solr-dataimporthandler-.*\.jar"/>
  <lib dir="${solr.install.dir:../../..}/contrib/extraction/lib" regex=".*\.jar"/>

檢查以上路徑是否匹配,是相對於建立的mycore路徑

5. 在mycore目錄下建立lib目錄(如果沒有)

複製  solr-7.5.0/contrib/extraction/lib下的所有檔案 到mycore/lib目錄

複製 solr-7.5.0/dist/solr-cell-7.5.0.jar 到mycore/lib目錄

6. 重新啟動 solr,如正常啟動,再在 eclipse 執行前面建立的 updoctest 程式

7. 在solr網頁查詢可以檢查到已上傳doc檔案的索引