1. 程式人生 > >基於Kubernetes的機器學習微服務系統設計系列——(五)預處理微服務

基於Kubernetes的機器學習微服務系統設計系列——(五)預處理微服務

 內容提要

  文字分類任務的預處理階段一般包括中文分詞。這裡將中文分詞單獨提出來,因此本文預處理階段的主要任務是:停用詞去除、索引詞典的構建,以及詞-文件矩陣(word-document matrix)的生成。

實現程式碼

預處理Action實現類

package com.robin.pretreatment.action;

import com.robin.loader.MircoServiceAction;
import com.robin.log.RobinLogger;
import com.robin.pretreatment.DicIndex;
import com.robin.pretreatment.DicIndex.Language; import com.robin.pretreatment.WordDocMatrix; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.logging.Level; import java.util.logging.Logger; import org.codehaus.jettison.json.JSONArray; import org.codehaus.
jettison.json.JSONException; import org.codehaus.jettison.json.JSONObject; /** * <DT><B>描述:</B></DT> * <DD>預處理Action實現類</DD> * * @version Version1.0 * @author Robin * @version <I> V1.0 Date:2018-04-08</I> * @author <I> E-mail:[email protected]
</I> */
public class PretreatAction implements MircoServiceAction { private static final Logger LOGGER = RobinLogger.getLogger(); public enum StatusCode { OK, JSON_ERR, KIND_ERR, VERSION_ERR, MIN_FREQUENCY_ERR, TEXTS_NULL, } private class ActionStatus { StatusCode statusCode; String msg; } private JSONObject getErrorJson(ActionStatus actionStatus) { JSONObject errJson = new JSONObject(); try { errJson.put("status", actionStatus.statusCode.toString()); errJson.put("msg", actionStatus.msg); } catch (JSONException ex) { LOGGER.log(Level.SEVERE, ex.getMessage()); } return errJson; } private ActionStatus checkJSONObjectTerm(JSONObject jsonObj, String key, HashSet<String> valueSet, StatusCode errStatusCode) { ActionStatus actionStatus = new ActionStatus(); try { if (!jsonObj.isNull(key)) { String value = jsonObj.getString(key); if (!valueSet.contains(value)) { actionStatus.msg = "The value [" + value + "] of " + key + " is error."; actionStatus.statusCode = errStatusCode; return actionStatus; } } else { actionStatus.msg = "The input parameter is missing " + key + "."; actionStatus.statusCode = errStatusCode; return actionStatus; } } catch (JSONException ex) { LOGGER.log(Level.SEVERE, ex.getMessage()); } actionStatus.statusCode = StatusCode.OK; return actionStatus; } private ActionStatus checkInputJSONObject(JSONObject jsonObj) { ActionStatus actionStatus = new ActionStatus(); ActionStatus retActionStatus; HashSet<String> valueSet = new HashSet(); valueSet.add("pretreatment"); retActionStatus = checkJSONObjectTerm(jsonObj, "kind", valueSet, StatusCode.KIND_ERR); if (!retActionStatus.statusCode.equals(StatusCode.OK)) { return retActionStatus; } valueSet.clear(); valueSet.add("v1"); retActionStatus = checkJSONObjectTerm(jsonObj, "version", valueSet, StatusCode.VERSION_ERR); if (!retActionStatus.statusCode.equals(StatusCode.OK)) { return retActionStatus; } actionStatus.statusCode = StatusCode.OK; return actionStatus; } @Override public Object action(Object obj) { ActionStatus 
actionStatus = new ActionStatus(); ActionStatus retActionStatus; if (!(obj instanceof JSONObject)) { actionStatus.msg = "The action arguments is not JSONObject."; LOGGER.log(Level.SEVERE, actionStatus.msg); actionStatus.statusCode = StatusCode.JSON_ERR; return this.getErrorJson(actionStatus); } JSONObject preJson = (JSONObject) obj; retActionStatus = this.checkInputJSONObject(preJson); if (!retActionStatus.statusCode.equals(StatusCode.OK)) { LOGGER.log(Level.SEVERE, retActionStatus.msg); return this.getErrorJson(retActionStatus); } try { long beginTime = System.currentTimeMillis(); JSONObject textsObj = preJson.getJSONObject("texts"); if (null == textsObj) { actionStatus.statusCode = StatusCode.TEXTS_NULL; actionStatus.msg = "The input texts is null."; LOGGER.log(Level.SEVERE, actionStatus.msg); return this.getErrorJson(actionStatus); } DicIndex dicIndex; String lang = preJson.getJSONObject("metadata").getJSONObject("corpus").getString("lang"); if (lang.equals("en")) { dicIndex = new DicIndex(Language.EN); } else { dicIndex = new DicIndex(Language.CN); } JSONObject preMetadataJson = preJson.getJSONObject("metadata").getJSONObject("pretreatment"); dicIndex.create(preJson, preMetadataJson.getInt("minFrequency")); HashMap<String, Integer> dicMap = dicIndex.getDicMap(preJson); if (dicMap.isEmpty()) { JSONObject errJson = new JSONObject(); errJson.put("status", StatusCode.MIN_FREQUENCY_ERR.toString()); errJson.put("result", "The minFrequency is too big."); return errJson; } // 迴圈所有文字 Iterator<String> labelsIt = textsObj.keys(); while (labelsIt.hasNext()) { String label = labelsIt.next(); JSONArray aLabelTextsArr = textsObj.getJSONArray(label); int len = aLabelTextsArr.length(); for (int i = 0; i < len; i++) { JSONObject textJson = aLabelTextsArr.getJSONObject(i); String text = textJson.getString("text"); if (null != text) { String result = WordDocMatrix.create(text, dicMap); String[] wordsDocArr = result.split("-"); textJson.remove("text"); textJson.put("totalWords", 
Integer.valueOf(wordsDocArr[0])); textJson.put("text", wordsDocArr[1]); } } } long endTime = System.currentTimeMillis(); int spendTime = (int) (endTime - beginTime); preMetadataJson.put("spendTime", spendTime); } catch (JSONException ex) { LOGGER.log(Level.SEVERE, ex.getMessage()); } JSONObject rsp = new JSONObject(); try { rsp.put("status", "OK"); rsp.put("result", preJson); } catch (JSONException ex) { LOGGER.log(Level.SEVERE, ex.getMessage()); } return rsp; } }

停用詞類

package com.robin.pretreatment;

import com.robin.config.ConfigUtil;
import java.util.Arrays;

import com.robin.file.FileUtil;
import com.robin.log.RobinLogger;
import java.util.HashSet;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Loads stop-word and symbol sets from the text files whose paths are
 * configured under the {@code stopWords.*} configuration keys.
 *
 * @author Robin
 * @version 1.0 (2018-04-21)
 */
public class StopWords {

    // Logger
    private static final Logger LOGGER = RobinLogger.getLogger();

    /**
     * Creates a stop-word loader.
     */
    public StopWords() {
    }

    /**
     * Gets the Chinese stop words from the file configured as
     * {@code stopWords.chinese}.
     *
     * @return the Chinese stop words, or {@code null} if the file could not be read
     */
    public HashSet<String> getChineseSet() {
        String cnStopWordsPath = ConfigUtil.getConfig("stopWords.chinese");
        return this.load(cnStopWordsPath);
    }

    /**
     * Gets the English stop words from the file configured as
     * {@code stopWords.english}.
     *
     * @return the English stop words, or {@code null} if the file could not be read
     */
    public HashSet<String> getEnglishSet() {
        String enStopWordsPath = ConfigUtil.getConfig("stopWords.english");
        return this.load(enStopWordsPath);
    }

    /**
     * Gets the special symbols from the file configured as
     * {@code stopWords.symbol}.
     *
     * @return the special symbols, or {@code null} if the file could not be read
     */
    public HashSet<String> getSymbolSet() {
        String symbolPath = ConfigUtil.getConfig("stopWords.symbol");
        return this.load(symbolPath);
    }

    /**
     * Loads a stop-words file and splits it into tokens.
     *
     * <p>Tokens are separated by any run of whitespace (spaces, tabs, line
     * breaks). Splitting on a single space left line terminators attached to
     * tokens and produced empty tokens for repeated spaces.
     *
     * @param stopWordsPath path of the stop-words file
     * @return the set of tokens (possibly empty), or {@code null} if
     * {@code FileUtil.readText} returned {@code null}; callers must check
     */
    private HashSet<String> load(String stopWordsPath) {
        String stopWordsText = FileUtil.readText(stopWordsPath);
        if (null == stopWordsText) {
            LOGGER.log(Level.SEVERE, "讀取停止詞檔案失敗,檢查檔案及路徑.");
            // null is kept as the failure signal for existing callers.
            return null;
        }
        HashSet<String> set = new HashSet<>();
        for (String word : stopWordsText.split("\\s+")) {
            // Leading whitespace yields one empty first token; skip it.
            if (!word.isEmpty()) {
                set.add(word);
            }
        }
        return set;
    }
}

請求JSON

  預處理微服務請求的JSON格式如下,紅框所示為請求型別以及需要回填的資料引數。

響應JSON

  預處理服務響應的JSON格式如下,紅框所示為返回的結果。