文字分類任務的預處理階段一般包括中文分詞。本文將中文分詞單獨提出來另行處理,因此這裡預處理階段的主要任務是:停用詞去除、索引詞典的構建,以及詞文件矩陣化。

實現程式碼

預處理Action實現類

package com.robin.pretreatment.action;

import com.robin.loader.MircoServiceAction;
import com.robin.log.RobinLogger;
import com.robin.pretreatment.DicIndex;
import com.robin.pretreatment.DicIndex.Language;
import com.robin.pretreatment.WordDocMatrix;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;

/**
 * <DT><B>Description:</B></DT>
 * <DD>Action implementation for the pretreatment (pre-processing) step.</DD>
 *
 * <p>The action validates the request envelope ({@code kind} and
 * {@code version}), builds a dictionary index from the request, and then
 * replaces the {@code text} of every entry under {@code texts} with the
 * output of {@link WordDocMatrix#create}, adding a {@code totalWords} field
 * next to it.</p>
 *
 * @version Version1.0
 * @author Robin
 * @version <I> V1.0 Date:2018-04-08</I>
 * @author  <I> E-mail:[email protected]</I>
 */
public class PretreatAction implements MircoServiceAction {

    private static final Logger LOGGER = RobinLogger.getLogger();

    /** Status values written to the {@code status} field of an error response. */
    public enum StatusCode {
        OK,
        JSON_ERR,
        KIND_ERR,
        VERSION_ERR,
        MIN_FREQUENCY_ERR,
        TEXTS_NULL,
    }

    /**
     * Status code plus human-readable message. Declared static because it
     * never touches the enclosing instance.
     */
    private static class ActionStatus {

        StatusCode statusCode;
        String msg;

    }

    /**
     * Builds an {@link ActionStatus} in one step.
     *
     * @param statusCode status code to report
     * @param msg message to report; may be {@code null} for {@link StatusCode#OK}
     * @return the populated status
     */
    private static ActionStatus newStatus(StatusCode statusCode, String msg) {
        ActionStatus actionStatus = new ActionStatus();
        actionStatus.statusCode = statusCode;
        actionStatus.msg = msg;
        return actionStatus;
    }

    /**
     * Converts a status into the error response JSON
     * ({@code {"status": ..., "msg": ...}}).
     *
     * @param actionStatus status to convert
     * @return the error JSON; fields may be missing if building it failed
     */
    private JSONObject getErrorJson(ActionStatus actionStatus) {
        JSONObject errJson = new JSONObject();
        try {
            errJson.put("status", actionStatus.statusCode.toString());
            errJson.put("msg", actionStatus.msg);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }
        return errJson;
    }

    /**
     * Checks that {@code key} is present in {@code jsonObj} and that its
     * string value is one of {@code valueSet}.
     *
     * @param jsonObj request JSON
     * @param key key to check
     * @param valueSet accepted values for the key
     * @param errStatusCode status code to report when the check fails
     * @return a status with {@link StatusCode#OK} on success, otherwise
     *         {@code errStatusCode} with an explanatory message
     */
    private ActionStatus checkJSONObjectTerm(JSONObject jsonObj,
            String key,
            HashSet<String> valueSet,
            StatusCode errStatusCode) {
        if (jsonObj.isNull(key)) {
            return newStatus(errStatusCode, "The input parameter is missing " + key + ".");
        }

        try {
            String value = jsonObj.getString(key);
            if (!valueSet.contains(value)) {
                return newStatus(errStatusCode,
                        "The value [" + value + "] of " + key + " is error.");
            }
        } catch (JSONException ex) {
            // A value that cannot be read is a failed check, not a success.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return newStatus(errStatusCode, ex.getMessage());
        }

        return newStatus(StatusCode.OK, null);
    }

    /**
     * Validates the request envelope: {@code kind} must be
     * {@code "pretreatment"} and {@code version} must be {@code "v1"}.
     *
     * @param jsonObj request JSON
     * @return the first failing status, or a status with {@link StatusCode#OK}
     */
    private ActionStatus checkInputJSONObject(JSONObject jsonObj) {
        HashSet<String> kindValues = new HashSet<>();
        kindValues.add("pretreatment");
        ActionStatus kindStatus
                = checkJSONObjectTerm(jsonObj, "kind", kindValues, StatusCode.KIND_ERR);
        if (kindStatus.statusCode != StatusCode.OK) {
            return kindStatus;
        }

        HashSet<String> versionValues = new HashSet<>();
        versionValues.add("v1");
        ActionStatus versionStatus
                = checkJSONObjectTerm(jsonObj, "version", versionValues, StatusCode.VERSION_ERR);
        if (versionStatus.statusCode != StatusCode.OK) {
            return versionStatus;
        }

        return newStatus(StatusCode.OK, null);
    }

    /**
     * Runs the pretreatment on a request.
     *
     * <p>The request object is modified in place: each text entry gets its
     * {@code text} replaced and a {@code totalWords} field added, and the
     * elapsed time in milliseconds is written to
     * {@code metadata.pretreatment.spendTime}.</p>
     *
     * @param obj the request; must be a {@link JSONObject}
     * @return {@code {"status": "OK", "result": <request>}} on success,
     *         otherwise an error JSON carrying a {@link StatusCode} name
     */
    @Override
    public Object action(Object obj) {
        if (!(obj instanceof JSONObject)) {
            ActionStatus typeStatus = newStatus(StatusCode.JSON_ERR,
                    "The action arguments is not JSONObject.");
            LOGGER.log(Level.SEVERE, typeStatus.msg);
            return this.getErrorJson(typeStatus);
        }

        JSONObject preJson = (JSONObject) obj;
        ActionStatus checkStatus = this.checkInputJSONObject(preJson);
        if (checkStatus.statusCode != StatusCode.OK) {
            LOGGER.log(Level.SEVERE, checkStatus.msg);
            return this.getErrorJson(checkStatus);
        }

        try {
            long beginTime = System.currentTimeMillis();

            // optJSONObject yields null for a missing key, whereas
            // getJSONObject throws, which made the TEXTS_NULL branch dead.
            JSONObject textsObj = preJson.optJSONObject("texts");
            if (null == textsObj) {
                ActionStatus textsStatus = newStatus(StatusCode.TEXTS_NULL,
                        "The input texts is null.");
                LOGGER.log(Level.SEVERE, textsStatus.msg);
                return this.getErrorJson(textsStatus);
            }

            JSONObject metadataJson = preJson.getJSONObject("metadata");
            String lang = metadataJson.getJSONObject("corpus").getString("lang");
            // Every language other than "en" is treated as Chinese.
            DicIndex dicIndex = new DicIndex("en".equals(lang) ? Language.EN : Language.CN);

            JSONObject preMetadataJson = metadataJson.getJSONObject("pretreatment");
            dicIndex.create(preJson, preMetadataJson.getInt("minFrequency"));
            HashMap<String, Integer> dicMap = dicIndex.getDicMap(preJson);
            if (null == dicMap || dicMap.isEmpty()) {
                JSONObject errJson = new JSONObject();
                errJson.put("status", StatusCode.MIN_FREQUENCY_ERR.toString());
                // "result" is kept for existing clients; "msg" matches the
                // key used by every other error response of this action.
                errJson.put("result", "The minFrequency is too big.");
                errJson.put("msg", "The minFrequency is too big.");
                return errJson;
            }

            // Loop over all texts, grouped by label.
            Iterator<String> labelsIt = textsObj.keys();
            while (labelsIt.hasNext()) {
                String label = labelsIt.next();
                JSONArray aLabelTextsArr = textsObj.getJSONArray(label);

                int len = aLabelTextsArr.length();
                for (int i = 0; i < len; i++) {
                    JSONObject textJson = aLabelTextsArr.getJSONObject(i);
                    String text = textJson.getString("text");
                    if (null == text) {
                        continue;
                    }
                    String result = WordDocMatrix.create(text, dicMap);
                    // The result is consumed as "<totalWords>-<matrix text>".
                    // Limit 2 keeps an empty matrix part and any further
                    // dashes inside it instead of dropping or truncating.
                    String[] wordsDocArr = result.split("-", 2);
                    if (wordsDocArr.length < 2) {
                        throw new JSONException(
                                "Unexpected word-doc matrix result: " + result);
                    }
                    // Parse before mutating so a bad count leaves the entry intact.
                    Integer totalWords = Integer.valueOf(wordsDocArr[0]);
                    textJson.remove("text");
                    textJson.put("totalWords", totalWords);
                    textJson.put("text", wordsDocArr[1]);
                }
            }

            long endTime = System.currentTimeMillis();
            int spendTime = (int) (endTime - beginTime);
            preMetadataJson.put("spendTime", spendTime);
        } catch (JSONException ex) {
            // Previously the failure was only logged and the method went on
            // to answer "OK" with a half-processed request.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return this.getErrorJson(newStatus(StatusCode.JSON_ERR, ex.getMessage()));
        }

        JSONObject rsp = new JSONObject();
        try {
            rsp.put("status", "OK");
            rsp.put("result", preJson);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }
        return rsp;
    }
}

停用詞類

package com.robin.pretreatment;

import com.robin.config.ConfigUtil;
import java.util.Arrays;

import com.robin.file.FileUtil;
import com.robin.log.RobinLogger;
import java.util.HashSet;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * <DT><B>Description:</B></DT>
 * <DD>Loads stop-word sets from the files named in the configuration.</DD>
 *
 * <p>Each getter reads its file again on every call; nothing is cached.</p>
 *
 * @version Version1.0
 * @author Robin
 * @version <I> Date:2018-04-21</I>
 * @author  <I> E-mail:[email protected]</I>
 */
public class StopWords {

    // Logger
    private static final Logger LOGGER = RobinLogger.getLogger();

    /**
     * Creates a stop-word loader.
     */
    public StopWords() {
    }

    /**
     * Returns the Chinese stop words, read from the file configured under
     * {@code stopWords.chinese}.
     *
     * @return the Chinese stop words, or {@code null} if the file could not be read
     */
    public HashSet<String> getChineseSet() {
        String cnStopWordsPath = ConfigUtil.getConfig("stopWords.chinese");
        return this.load(cnStopWordsPath);
    }

    /**
     * Returns the English stop words, read from the file configured under
     * {@code stopWords.english}.
     *
     * @return the English stop words, or {@code null} if the file could not be read
     */
    public HashSet<String> getEnglishSet() {
        String enStopWordsPath = ConfigUtil.getConfig("stopWords.english");
        return this.load(enStopWordsPath);
    }

    /**
     * Returns the special symbols, read from the file configured under
     * {@code stopWords.symbol}.
     *
     * @return the special symbols, or {@code null} if the file could not be read
     */
    public HashSet<String> getSymbolSet() {
        String symbolPath = ConfigUtil.getConfig("stopWords.symbol");
        return this.load(symbolPath);
    }

    /**
     * Loads a stop-word file and splits its content into individual words.
     *
     * <p>Words may be separated by any run of whitespace (spaces, tabs, line
     * breaks). Splitting on a single space only would leave line breaks
     * attached to words and add empty entries for repeated spaces.</p>
     *
     * @param stopWordsPath path of the stop-word file
     * @return the set of words, or {@code null} if the file could not be read
     */
    private HashSet<String> load(String stopWordsPath) {
        String stopWordsText = FileUtil.readText(stopWordsPath);
        if (null == stopWordsText) {
            LOGGER.log(Level.SEVERE, "讀取停止詞檔案失敗,檢查檔案及路徑.");
            // null is kept as the failure signal; callers may rely on it.
            return null;
        }

        HashSet<String> set = new HashSet<>();
        for (String word : stopWordsText.split("\\s+")) {
            // Leading whitespace or an empty file produces an empty token.
            if (!word.isEmpty()) {
                set.add(word);
            }
        }
        return set;
    }
}

請求JSON

  預處理微服務請求的JSON格式如下,紅框所示為請求型別以及回填資料引數。

響應JSON

  預處理服務響應的JSON格式如下,紅框所示為返回的結果。