1. 程式人生 > >基於Kubernetes的機器學習微服務系統設計系列——(四)中文分詞微服務

基於Kubernetes的機器學習微服務系統設計系列——(四)中文分詞微服務

 內容提要

  中文分詞微服務包括的分詞方法有:RobinSeg(RS)、IKAnalyzer(IK)、JEAnalysis(JE)、MmSeg4j(MS)、PaoDing(PD)、SmallSeg4j(SS)。其中RS分詞實現見我的文章:知更鳥中文分詞RS設計實現 ,其他分詞方法都採用釋出的jar包進行封裝。

設計模式

  主要涉及外觀模式、介面卡模式、工廠模式和單例模式。分詞微服務類圖如圖所示:

中文分詞微服務類圖

  設計原則:(1)針對介面程式設計,不要針對實現;(2)只和最緊密的類互動;(3)封裝變化;(4)鬆耦合設計。
  外觀模式:提供一個統一的介面,用來訪問子系統中的一群介面,外觀定義了一個高層介面,讓子系統更容易使用。我們採用統一的分詞外觀類封裝各種分詞介面,提供一個一致的高層介面。
  介面卡模式:將一個類的介面,轉換成客戶期望的另一個介面。介面卡讓原本介面不相容的類可以合作無間。各種分詞的私有實現介面需要提供一個統一的介面呼叫。
  工廠模式:定義一個建立物件的介面,但由子類決定要例項化的類是哪一個。提供統一的分詞工廠,建立分詞例項物件。
  單例模式:確保一個類只有一個例項,並提供了一個全域性訪問點。由於各種分詞物件的建立、載入詞典等需要申請大量的記憶體,耗費大量的時間,所以所有分詞器例項都通過介面卡進行控制,只建立一個例項。

程式碼實現

中文分詞介面抽象類

package com.robin.segment;

import com.robin.log.RobinLogger;
import java.util.logging.Logger;

/**
 * <DT><B>Description:</B></DT>
 * <DD>Abstract base class that defines the Chinese word segmentation interface</DD>
 *
 * @version Version1.0
 * @author  Robin
 * @version <I> Date:2018-04-18</I>
 * @author  <I> E-mail:[email protected]</I>
 */
public abstract class AbstractSegmenter {

    /** Logger, visible to subclasses. */
    protected static final Logger LOGGER = RobinLogger.getLogger();

    /**
     * Abstract segmentation method.
     *
     * @param text the text to segment
     * @param SEPARATOR the separator
     * @return the segmented text
     */
    public abstract String segment(String text, String SEPARATOR);
}

統一分詞器外觀類

package com.robin.segment;

import com.robin.log.RobinLogger;
import com.robin.segment.SegmentFactory.SegmentMethod;
import com.robin.segment.robinseg.RobinSeg;
import com.robin.segment.robinseg.SegmentArgs;
import java.util.logging.Logger;

/**
 * <DT><B>Description:</B></DT>
 * <DD>Unified segmenter facade class</DD>
 * <DD>Facade pattern</DD>
 *
 * @version 1.0
 * @author Robin
 * @version <I> Date:2018-04-19</I>
 * @author  <I> E-mail:[email protected]</I>
 */
public class SegmentFacade {

    // Logger
    private static final Logger LOGGER = RobinLogger.getLogger();

    /**
     * Gets the segmenter configuration-argument object.
     *
     * <p>Only the RobinSeg method ({@code SegmentMethod.RS}) yields a
     * configuration object here; every other method returns {@code null}.
     * A {@code null} method name is treated as {@code SegmentMethod.RS}, which
     * is the same default {@code SegmentFactory.getSegInstance} applies, so the
     * two stay consistent instead of failing with a NullPointerException.
     *
     * @param methodName segmentation method; {@code null} means RobinSeg
     * @return the SegmentArgs of the RobinSeg instance, or {@code null} for
     *         any other segmentation method
     */
    public static SegmentArgs getSegmentArgsObj(SegmentMethod methodName) {
        AbstractSegmenter segment = SegmentFactory.getSegInstance(methodName);
        // Enum constants are singletons, so identity comparison is exact and null-safe.
        if (null == methodName || SegmentMethod.RS == methodName) {
            return ((RobinSeg) segment).getSegmentConfInstance();
        }
        return null;
    }

    /**
     * <DD>Segments text with the selected segmentation algorithm.</DD>
     * <DD>The segmenter instance is obtained from
     * {@code SegmentFactory.getSegInstance}, which uses RobinSeg when the
     * method name is {@code null}.</DD>
     *
     * @param methodName segmentation method name: "SegmentMethod.IK", ".JE",
     * ".MS", ".PD", ".SS", ".RS"
     * @param text the text to segment
     * @param separator the separator
     * @return the text segmented with the separator
     */
    public static String split(SegmentMethod methodName, String text, String separator) {
        AbstractSegmenter segmenter = SegmentFactory.getSegInstance(methodName);
        return segmenter.segment(text, separator);
    }
}

分詞Action實現類

package com.robin.segment.action;

import com.robin.loader.MircoServiceAction;
import com.robin.log.RobinLogger;
import com.robin.segment.SegmentFacade;
import com.robin.segment.SegmentFactory.SegmentMethod;
import com.robin.segment.robinseg.SegmentArgs;
import com.robin.segment.robinseg.SegmentArgs.SegAlgorithm;
import java.util.HashSet;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;

/**
 * <DT><B>Description:</B></DT>
 * <DD>Segmentation Action implementation class</DD>
 * <DD>Validates a JSON request, segments every text it carries and returns a
 * JSON response whose "status" field is one of {@link StatusCode}.</DD>
 *
 * @version Version1.0
 * @author Robin
 * @version <I> V1.0 Date:2018-06-05</I>
 * @author  <I> E-mail:[email protected]</I>
 */
public class SegmentAction implements MircoServiceAction {

    private static final Logger LOGGER = RobinLogger.getLogger();

    /** Status values written to the "status" field of an error response. */
    public enum StatusCode {
        OK,
        JSON_ERR,
        KIND_ERR,
        VERSION_ERR,
        SEGMETHOD_ERR,
        SEPARATOR_ERR,
        SEGMENT_FAILED,
        TEXTS_NULL,
    }

    /** Status code plus message; static because it needs no enclosing instance. */
    private static final class ActionStatus {

        StatusCode statusCode;
        String msg;

    }

    /** Builds an ActionStatus from a code and a message (message may be null). */
    private static ActionStatus newStatus(StatusCode statusCode, String msg) {
        ActionStatus actionStatus = new ActionStatus();
        actionStatus.statusCode = statusCode;
        actionStatus.msg = msg;
        return actionStatus;
    }

    /** Builds a mutable set holding the accepted values of one request field. */
    private static HashSet<String> valueSetOf(String... values) {
        HashSet<String> valueSet = new HashSet<>();
        for (String value : values) {
            valueSet.add(value);
        }
        return valueSet;
    }

    /**
     * Converts a status into the error response JSON with "status" and "msg".
     *
     * @param actionStatus the status to report
     * @return the error JSON object
     */
    private JSONObject getErrorJson(ActionStatus actionStatus) {
        JSONObject errJson = new JSONObject();
        try {
            errJson.put("status", actionStatus.statusCode.toString());
            errJson.put("msg", actionStatus.msg);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }
        return errJson;
    }

    /**
     * Checks that a field is present and holds one of the accepted values.
     *
     * @param jsonObj the JSON object that should contain the field
     * @param key the field name
     * @param valueSet the accepted values
     * @param errStatusCode the status code to report when the check fails
     * @return a status with {@code StatusCode.OK}, or {@code errStatusCode}
     *         and a message when the field is missing, unreadable or invalid
     */
    private ActionStatus checkJSONObjectTerm(JSONObject jsonObj,
            String key,
            HashSet<String> valueSet,
            StatusCode errStatusCode) {
        try {
            if (jsonObj.isNull(key)) {
                return newStatus(errStatusCode, "The input parameter is missing " + key + ".");
            }
            String value = jsonObj.getString(key);
            if (!valueSet.contains(value)) {
                return newStatus(errStatusCode, "The value [" + value + "] of " + key + " is error.");
            }
        } catch (JSONException ex) {
            // A field that cannot be read has not been validated: report the failure.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return newStatus(errStatusCode, "The value of " + key + " cannot be read: " + ex.getMessage());
        }
        return newStatus(StatusCode.OK, null);
    }

    /**
     * Copies the optional RobinSeg arguments from the request into the
     * segmenter configuration object.
     *
     * @param argsJson the "args" object of the request
     * @param segmentArgs the configuration object to update; ignored when null
     * @throws JSONException when an argument has the wrong JSON type
     * @throws IllegalArgumentException when "algorithm" names no SegAlgorithm
     */
    private void applyRobinSegArgs(JSONObject argsJson, SegmentArgs segmentArgs) throws JSONException {
        if (null == segmentArgs) {
            return;
        }
        if (!argsJson.isNull("algorithm")) {
            String algorithm = argsJson.getString("algorithm");
            // Locale.ROOT keeps the enum lookup independent of the JVM default locale.
            segmentArgs.setSegAlgorithm(SegAlgorithm.valueOf(algorithm.toUpperCase(java.util.Locale.ROOT)));
        }
        if (!argsJson.isNull("cleanSymbol")) {
            segmentArgs.setCleanSymbolFlag(argsJson.getBoolean("cleanSymbol"));
        }
        if (!argsJson.isNull("markNewWord")) {
            segmentArgs.setMarkNewWordFlag(argsJson.getBoolean("markNewWord"));
        }
        if (!argsJson.isNull("downcasing")) {
            segmentArgs.setDowncasingFlag(argsJson.getBoolean("downcasing"));
        }
        if (!argsJson.isNull("mergePattern")) {
            segmentArgs.setMergePatternFlag(argsJson.getBoolean("mergePattern"));
        }
        if (!argsJson.isNull("retrievalPattern")) {
            segmentArgs.setRetrievalPatternFlag(argsJson.getBoolean("retrievalPattern"));
        }
    }

    /**
     * Validates the request: "kind", "version", and under metadata.segment the
     * "method" and "separator" fields. When the method is RS and "args" is
     * present, the RobinSeg arguments are applied as well.
     *
     * @param jsonObj the request JSON
     * @return a status with {@code StatusCode.OK} when the request is valid,
     *         otherwise the status describing the first problem found
     */
    private ActionStatus checkInputJSONObject(JSONObject jsonObj) {
        try {
            ActionStatus retActionStatus = checkJSONObjectTerm(jsonObj, "kind",
                    valueSetOf("segment"), StatusCode.KIND_ERR);
            if (retActionStatus.statusCode != StatusCode.OK) {
                return retActionStatus;
            }

            retActionStatus = checkJSONObjectTerm(jsonObj, "version",
                    valueSetOf("v1"), StatusCode.VERSION_ERR);
            if (retActionStatus.statusCode != StatusCode.OK) {
                return retActionStatus;
            }

            JSONObject segmentMetadata = jsonObj.getJSONObject("metadata").getJSONObject("segment");

            retActionStatus = checkJSONObjectTerm(segmentMetadata, "method",
                    valueSetOf("RS", "IK", "JE", "MS", "PD", "SS"), StatusCode.SEGMETHOD_ERR);
            if (retActionStatus.statusCode != StatusCode.OK) {
                return retActionStatus;
            }

            retActionStatus = checkJSONObjectTerm(segmentMetadata, "separator",
                    valueSetOf(" ", "|", "/"), StatusCode.SEPARATOR_ERR);
            if (retActionStatus.statusCode != StatusCode.OK) {
                return retActionStatus;
            }

            // Set the RobinSeg segmentation arguments
            SegmentMethod segmentMethod = SegmentMethod.valueOf(segmentMetadata.getString("method"));
            if (segmentMethod == SegmentMethod.RS && !segmentMetadata.isNull("args")) {
                applyRobinSegArgs(segmentMetadata.getJSONObject("args"),
                        SegmentFacade.getSegmentArgsObj(segmentMethod));
            }
        } catch (JSONException | IllegalArgumentException ex) {
            // Malformed structure or an unknown enum name is a request error, not success.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return newStatus(StatusCode.JSON_ERR, "The input JSON is invalid: " + ex.getMessage());
        }

        return newStatus(StatusCode.OK, null);
    }

    /**
     * Segments every text of the request in place.
     *
     * <p>On success the response is {"status": "OK", "result": request}, where
     * each "text" has been replaced by its segmented form and
     * metadata.segment.spendTime holds the elapsed milliseconds. On failure
     * the response carries the failing status code and a message.
     *
     * @param obj the request; must be a JSONObject
     * @return the response JSONObject
     */
    @Override
    public Object action(Object obj) {

        if (!(obj instanceof JSONObject)) {
            ActionStatus actionStatus = newStatus(StatusCode.JSON_ERR,
                    "The action arguments is not JSONObject.");
            LOGGER.log(Level.SEVERE, actionStatus.msg);
            return this.getErrorJson(actionStatus);
        }

        JSONObject jsonObj = (JSONObject) obj;
        ActionStatus retActionStatus = this.checkInputJSONObject(jsonObj);
        if (retActionStatus.statusCode != StatusCode.OK) {
            LOGGER.log(Level.SEVERE, retActionStatus.msg);
            return this.getErrorJson(retActionStatus);
        }

        try {
            JSONObject segmentMetadata = jsonObj.getJSONObject("metadata").getJSONObject("segment");
            SegmentMethod segmentMethod = SegmentMethod.valueOf(segmentMetadata.getString("method"));
            String separator = segmentMetadata.getString("separator");

            // getJSONObject throws instead of returning null, so test for absence first.
            if (jsonObj.isNull("texts")) {
                ActionStatus actionStatus = newStatus(StatusCode.TEXTS_NULL, "The input texts is null.");
                LOGGER.log(Level.SEVERE, actionStatus.msg);
                return this.getErrorJson(actionStatus);
            }
            JSONObject texts = jsonObj.getJSONObject("texts");
            long beginTime = System.currentTimeMillis();

            Iterator<?> labelsIt = texts.keys();
            while (labelsIt.hasNext()) {
                String label = (String) labelsIt.next();
                JSONArray aLabelTexts = texts.getJSONArray(label);
                int len = aLabelTexts.length();
                for (int i = 0; i < len; i++) {
                    JSONObject textJson = aLabelTexts.getJSONObject(i);
                    // Entries without a "text" value are left untouched.
                    if (!textJson.isNull("text")) {
                        String text = textJson.getString("text");
                        String result = SegmentFacade.split(segmentMethod, text, separator);
                        textJson.put("text", result);
                    }
                }
            }

            long endTime = System.currentTimeMillis();
            int spendTime = (int) (endTime - beginTime);
            segmentMetadata.put("spendTime", spendTime);
        } catch (JSONException ex) {
            // Do not report OK for a request that could not be fully processed.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return this.getErrorJson(newStatus(StatusCode.SEGMENT_FAILED,
                    "Segmentation failed: " + ex.getMessage()));
        }

        JSONObject rsp = new JSONObject();
        try {
            rsp.put("status", "OK");
            rsp.put("result", jsonObj);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }
        return rsp;
    }
}

分詞例項工廠方法類

package com.robin.segment;

import com.robin.segment.adapter.SmallSeg4jAdapter;
import com.robin.segment.adapter.MmSeg4jAdapter;
import com.robin.segment.adapter.IKAnalyzerAdapter;
import com.robin.segment.adapter.JEAnalysisAdapter;
import com.robin.segment.adapter.PaoDingAdapter;
import com.robin.log.RobinLogger;
import com.robin.segment.robinseg.RobinSeg;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * <DT><B>描述:</B></DT>
 * <DD>分詞例項工廠方法類</DD>
 *
 * @version Version1.0
 * @author  Robin
 * @version <I> Date:2018-04-19</I>
 * @author  <I> E-mail:[email protected]</I>
 */
public class SegmentFactory {

    // 日誌
    private static final Logger LOGGER = RobinLogger.getLogger();

    /** 分詞演算法名稱標記 */
    public enum SegmentMethod {

        /** JE  = "JEAnalysis" */
        JE,
        /** IK  = "IKAnalyzer"*/
        IK,
        /** MS  = "MmSeg4j" */
        MS,
        /** PD  = "PaoDing" */
        PD,
        /** SS  = "SmallSeg4j" */
        SS,
        /** RS  = "RobinSeg" */
        RS
    }

    /**
     * 建立具體分詞類例項
     *
     * @param methodName 分詞方法名稱,“SegmentMethod.IK”,“.JE”,“.MS”,“.PD”,“.SS”,“.RS”
     * @return 具體分詞方法例項
     */
    public static AbstractSegmenter getSegInstance(SegmentMethod methodName) {
        if (null == methodName) {
            methodName = SegmentMethod.RS;
        }
        switch (methodName) {
            case JE:
                return JEAnalysisAdapter.getInstance()
            
           

相關推薦

no