1. 程式人生 > >第二節 Elasticsearch加入中文分詞器IK

第二節 Elasticsearch加入中文分詞器IK

一、簡介 Elasticsearch 內建的分詞器是standard,對英文分詞還好,但對中文的支援就比較弱,所以需要額外引入一個中文分詞器。目前比較流行的中文分詞器有:IKAnalyzer、MMSeg4j、Paoding等等。此次引入的是IKAnalyzer。
二、下載和安裝 1、IK下載地址: https://github.com/medcl/elasticsearch-analysis-ik 需要根據 Elasticsearch  的版本號找到相應的IK版本。或者
https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v5.3.0/elasticsearch-analysis-ik-5.3.0.zip 直接下載對應的版本 2、解壓後/elasticsearch-analysis-ik-5.3.0在此目錄下執行mvn clean package進行打包(此過程需要從maven上下載相關的jar所以時間會有點長) 3、打完包後找到elasticsearch-analysis-ik-5.3.0\target\releases\elasticsearch-analysis-ik-5.3.0.zip並將此檔案copy到
ES_HOME/plugins/ik(如果沒有可新建)。然後解壓即可。 至此,即可安裝完成,注意:網上有很多還需要修改elasticsearch.yml。這是在5.X之前的版本配置方式,在5.X之後分詞器的配置則不在全域性配置檔案中了,而是在settings和mappings進行配置。
三、新增自己的分詞 隨著業務的發展可能會有新的詞語出現,這時搜尋也需要支援這些新詞,因此需要在IK配置中擴充套件詞典。 1、找到elasticsearch-5.3.0\plugins\ik\config\IKAnalyzer.cfg.xml,內容如下:
<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd"> <properties> <comment>IK Analyzer 擴充套件配置</comment> <!--使用者可以在這裡配置自己的擴充套件字典 --> <entry key="ext_dict">custom/mydict.dic;custom/mydict1.dic;custom/single_word_low_freq.dic</entry> <!--使用者可以在這裡配置自己的擴充套件停止詞字典--> <entry key="ext_stopwords">custom/ext_stopword.dic</entry> <!--使用者可以在這裡配置遠端擴充套件字典 --> <!-- <entry key="remote_ext_dict">words_location</entry> --> <!--使用者可以在這裡配置遠端擴充套件停止詞字典--> <!-- <entry key="remote_ext_stopwords">words_location</entry> --> </properties> 註釋寫的很清楚可以在custom/mydict.dic;中新增新詞,一個詞一行。注意:儲存時一定要儲存為UTF-8格式,否則不生效。也可以使用遠端擴充套件字典,返回的可以是一個頁面,也可以是一個txt的文件,但要保證輸出的內容是 utf-8 的格式 並且ik 接收兩個返回的頭部屬性 Last-Modified 和 ETag。 只要其中一個有變化,就會觸發更新,ik 會每分鐘獲取一次
四、測試 GET方式:http://localhost:9200/es1/_analyze?pretty&analyzer=ik_max_word&text="中華人民共和國" 可以檢視到分詞的結果。 &analyzer有兩種分詞方式: ik_max_word: 會將文字做最細粒度的拆分;儘可能多的拆分出詞語 ik_smart:會做最粗粒度的拆分;已被分出的詞語將不會再次被其它詞語佔有
URL使用方式可以參考: https://github.com/medcl/elasticsearch-analysis-ik 下的 Quick Example
java API方式:
package com.els.util;

import com.els.common.Const;
import com.google.common.collect.Maps;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.admin.indices.get.GetIndexRequest;
import org.elasticsearch.action.admin.indices.get.GetIndexResponse;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.bulk.byscroll.BulkByScrollResponse;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.get.MultiGetItemResponse;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.get.MultiGetResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.MultiSearchResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.update.UpdateRequest;
import org.elasticsearch.action.update.UpdateResponse;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.index.VersionType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.reindex.DeleteByQueryAction;
import org.elasticsearch.index.reindex.DeleteByQueryRequestBuilder;
import org.elasticsearch.script.Script;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Created by xiongps on 2017/8/11.
 */
public class ElsUtil2 {

    private static Logger logger = LoggerFactory.getLogger(ElsUtil2.class);

    private static TransportClient client;

    public static TransportClient getClient(){
        Settings settings = Settings.builder()
                .put("cluster.name", "myApplication").build();

        try {
            TransportClient client = new PreBuiltTransportClient(settings)

                    .addTransportAddress(new InetSocketTransportAddress(
                            InetAddress.getByName("localhost"), 9300));
            ElsUtil2.client = client;
            return client;
        } catch (UnknownHostException e) {
            e.printStackTrace();
        }
        return null;
    }

    public static void closeClient(){
        if(client != null) {
            client.close();
        }
    }

    public static void createIndex(String indices) throws IOException {

        //預設setting建立索引
       // client.admin().indices().prepareCreate("es1").get();

        //獲取到所有的索引名稱
       String []aa = client.admin().indices().prepareGetIndex().get().getIndices();

        for(String a:aa) {
            logger.debug("a:"+a);
        }
        try{
            //獲取具體的某一個索引的資訊,注意:若給出的索引名稱不存在,會丟擲異常
            GetIndexResponse getIndexResponse =
                    client.admin().indices().getIndex(new GetIndexRequest().indices("es2")).actionGet();

            logger.debug("getIndexResponse:"+getIndexResponse.indices().length);
        }catch (IndexNotFoundException e) {
            logger.debug("getIndexResponse IndexNotFoundException:未找到es2");
        }

        //settings設定分片數和副本數
        Map<String,Object> settingsBuilder = new HashMap<>();
        settingsBuilder.put("number_of_shards", "5");
        settingsBuilder.put("number_of_replicas", "1");

        //mappings
        XContentBuilder mappingsBuilder = getMappings("test_type");

        //建立索引
        client.admin().indices().prepareCreate(indices)
                .setSettings(settingsBuilder)
                .addMapping("test_type",mappingsBuilder).get();

    }
    public static void updateSettings(String indices,Map<String,Object> settingsBuilder){
        //修改indexsettings
        client.admin().indices()
                .updateSettings(
                        new UpdateSettingsRequest()
                                .indices(indices)
                                .settings(settingsBuilder)).actionGet();
    }
    public static void putMappings(String indices,String type,XContentBuilder mappingsBuilder){
        //修改indextypemappings
        client.admin().indices().putMapping(
                new PutMappingRequest()
                        .indices(indices).type(type)
                        .source(mappingsBuilder)).actionGet();
    }

    public static void addDatas(String indices,String type,List<Map<String,Object>> dataMapList,String idFieldName){

        for(Map<String,Object> dataMap:dataMapList) {
            /**同步的方式
            IndexResponse response = client.prepareIndex(indices,type)
                    .setSource(dataMap).setId(String.valueOf(dataMap.get(idFieldName)))
                    .execute().actionGet();
            */
            //非同步的方式
            client.prepareIndex(indices,type)
                    .setSource(dataMap).setId(String.valueOf(dataMap.get(idFieldName)))
                    .execute(new ActionListener<IndexResponse>() {
                        @Override
                        public void onResponse(IndexResponse indexResponse) {
                            logger.debug("addDatas IndexResponse:{}",indexResponse);
                        }

                        @Override
                        public void onFailure(Exception e) {
                            logger.error("addDatas Exception IndexResponse:{}",e);
                        }
                    });
        }


    }

    public static void updateDataById(String indices,String type,String id,Map<String,Object> updMap,Long version){
        UpdateRequest updateRequest = new UpdateRequest();
        updateRequest.index(indices);
        updateRequest.type(type);
        updateRequest.id(id);
        updateRequest.doc(updMap);
        if(version != null) {
            updateRequest.version(version);
            updateRequest.versionType(VersionType.INTERNAL);
        }
        client.update(updateRequest).actionGet();
    }

    public static void updateDataAsPrepareById(String indices,String type,String id,String field,String value,Long version){
        client.prepareUpdate(indices,type,id)
                .setVersion(version).setVersionType(VersionType.INTERNAL)
                //.setDoc(updMap)
                .setScript(new Script("ctx._source."+field+" = \""+value+"\""))
                .get();
    }

    public static void delete(String indices,String type,String id,Long version){
        //版本號(插入,刪除)        //VersionType.INTERNAL 內部版本號,只有等於當前版本號才可以進行操作
        //VersionType.EXTERNAL 外部版本號,只有大於當前版本號才可以進行操作,且update不支援此型別
        DeleteResponse response =
                client.prepareDelete(indices, type, id)
                        .setVersion(version)
                        .setVersionType(VersionType.EXTERNAL)//外部版本號,或內部版本號
                        .get();
    }

    public static void deleteByQuery(String indices,String type,String matchNm,String text){

        DeleteByQueryRequestBuilder deleteByQueryRequestBuilder =
                DeleteByQueryAction.INSTANCE.newRequestBuilder(client);
        deleteByQueryRequestBuilder.source().setIndices(indices).setTypes(type);
        //為什麼提供了.source(“es”)設定setIndices,而沒有提供setTypes??
        //同步刪除
        //BulkByScrollResponse response =
        //        deleteByQueryRequestBuilder.filter(QueryBuilders.matchQuery("name", "java"))
        //                .get();
        //long deleted = response.getDeleted();

        //logger.info("delete:{}",deleted);
        //非同步刪除
        deleteByQueryRequestBuilder.filter(QueryBuilders.matchQuery(matchNm, text))
                        .execute(new ActionListener<BulkByScrollResponse>() {
                            @Override
                            public void onResponse(BulkByScrollResponse bulkByScrollResponse) {
                                long deleted = bulkByScrollResponse.getDeleted();
                                logger.info("非同步delete:{}",deleted);
                            }

                            @Override
                            public void onFailure(Exception e) {
                                logger.info("非同步delete");
                                logger.error("非同步刪除錯誤:{}",e);
                            }
                        });

    }

    public static void highlightQuery(String indices,String type,String matchNm,String text){

        HighlightBuilder highlightBuilder = new HighlightBuilder()
                .field("*").requireFieldMatch(false)
               // .field("name").requireFieldMatch(false)
               // .highlightQuery(QueryBuilders.queryStringQuery(text))
                .preTags(Const.HIGHLIGHT_PRE_TAGS)
                .postTags(Const.HIGHLIGHT_POST_TAGS);

        SearchResponse searchResponse = client.prepareSearch().setIndices(indices).setTypes(type)

                .setQuery(QueryBuilders.disMaxQuery()
                        .add(QueryBuilders.queryStringQuery(text))
                        .add(QueryBuilders.matchQuery(matchNm,text)))
                .highlighter(highlightBuilder)
                .setFrom(0).setSize(5)//分頁
                .execute().actionGet();

        logger.info("查詢的結果:{}",searchResponse);

    }

    public static void countQuery(String indices,String type,String matchNm,String text){
        SearchResponse searchResponse = client.prepareSearch().setIndices(indices).setTypes(type)
                .setQuery(QueryBuilders.disMaxQuery()
                        .add(QueryBuilders.matchQuery(matchNm,text)))
                .setSize(0)//不要資料
                .execute().actionGet();
        logger.info("查詢的結果:{}",searchResponse.getHits().getTotalHits());
        logger.info("查詢的結果:{}",searchResponse);
    }

    public static XContentBuilder getMappings(String type){

        try{
            XContentBuilder mappingsBuilder = XContentFactory.jsonBuilder();
            if("test_type".equals(type)) {
                mappingsBuilder.startObject()
                        .field("dynamic", "stu")
                        .startObject("properties")
                        .startObject("id").field("type","long").field("store", "yes").field("index", "not_analyzed")
                        .endObject()
                        .startObject("name").field("type", "string").field("index", "analyzed").field("analyzer", "ik_max_word").field("search_analyzer", "ik_smart")
                        .endObject()
                        .startObject("desc").field("type", "string").field("index", "analyzed").field("analyzer", "ik_max_word").field("search_analyzer", "ik_smart")
                        .endObject()
                        .endObject()
                        .endObject();
            }
            return mappingsBuilder;
        }catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }


    public static List<Map<String,Object>> getDataList(){
        List<Map<String,Object>> list = new ArrayList<>();
        Map<String,Object> map = Maps.newHashMap();
        map.put("id","6");
        map.put("name","java陳港生");
        map.put("desc","港生中華人民共和國產生10個分片,藍瘦");
        list.add(map);

        map = Maps.newHashMap();
        map.put("id","7");
        map.put("name","IK港生測試");
        map.put("desc","一個從分片,那麼就有5個從分片陳港生,那麼預設配置會");
        list.add(map);
        return list;
    }


}