基於Kubernetes的機器學習微服務系統設計系列——(五)預處理微服務
阿新 • • 發佈:2018-11-10
內容提要
文字分類任務的預處理階段一般包括中文分詞。這裡將中文分詞單獨提出來,因此本預處理階段的主要任務是停用詞去除、索引詞典的構建,以及詞-文件矩陣化。
實現程式碼
預處理Action實現類
package com.robin.pretreatment.action;
import com.robin.loader.MircoServiceAction;
import com.robin.log.RobinLogger;
import com.robin.pretreatment.DicIndex;
import com.robin.pretreatment.DicIndex.Language;
import com.robin.pretreatment.WordDocMatrix;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus. jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
/**
* <DT><B>描述:</B></DT>
* <DD>預處理Action實現類</DD>
*
* @version Version1.0
* @author Robin
* @version <I> V1.0 Date:2018-04-08</I>
* @author <I> E-mail:[email protected] </I>
*/
/**
 * Pretreatment micro-service action.
 *
 * <p>Validates the incoming request JSON ({@code kind} must be
 * {@code "pretreatment"}, {@code version} must be {@code "v1"}), builds a
 * dictionary index from the request, and rewrites every text under
 * {@code texts} in place: the {@code text} field is replaced by the second
 * part of the {@link WordDocMatrix#create} result and a {@code totalWords}
 * field is added from the first part.
 *
 * <p>Every failure is reported as a JSON object whose {@code status} is the
 * name of a {@link StatusCode}; a request is only answered with
 * {@code "status": "OK"} when the whole pipeline completed.
 *
 * @version Version1.0
 * @author Robin
 * @version <I> V1.0 Date:2018-04-08</I>
 * @author <I> E-mail:[email protected] </I>
 */
public class PretreatAction implements MircoServiceAction {

    private static final Logger LOGGER = RobinLogger.getLogger();

    /** Status values placed in the {@code status} field of a response. */
    public enum StatusCode {
        OK,
        JSON_ERR,
        KIND_ERR,
        VERSION_ERR,
        MIN_FREQUENCY_ERR,
        TEXTS_NULL,
    }

    /**
     * Immutable (status, message) pair produced by the validation steps.
     * Declared static because it never touches the enclosing instance.
     */
    private static final class ActionStatus {

        final StatusCode statusCode;
        final String msg;

        ActionStatus(StatusCode statusCode, String msg) {
            this.statusCode = statusCode;
            this.msg = msg;
        }
    }

    /**
     * Converts a failed status into the error response JSON.
     *
     * @param actionStatus the failure to report
     * @return a JSON object carrying {@code status} and {@code msg}
     */
    private JSONObject getErrorJson(ActionStatus actionStatus) {
        JSONObject errJson = new JSONObject();
        try {
            errJson.put("status", actionStatus.statusCode.toString());
            errJson.put("msg", actionStatus.msg);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }
        return errJson;
    }

    /**
     * Checks that {@code key} is present in {@code jsonObj} and that its
     * string value is one of {@code valueSet}.
     *
     * @param jsonObj       request JSON
     * @param key           field to check
     * @param valueSet      accepted values
     * @param errStatusCode status to report when the check fails
     * @return a status with {@link StatusCode#OK} on success, otherwise
     *         {@code errStatusCode} and an explanatory message
     */
    private ActionStatus checkJSONObjectTerm(JSONObject jsonObj,
            String key,
            HashSet<String> valueSet,
            StatusCode errStatusCode) {
        if (jsonObj.isNull(key)) {
            return new ActionStatus(errStatusCode,
                    "The input parameter is missing " + key + ".");
        }
        try {
            String value = jsonObj.getString(key);
            if (!valueSet.contains(value)) {
                return new ActionStatus(errStatusCode,
                        "The value [" + value + "] of " + key + " is error.");
            }
        } catch (JSONException ex) {
            // An unreadable value is a failed check, not a pass.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return new ActionStatus(errStatusCode, ex.getMessage());
        }
        return new ActionStatus(StatusCode.OK, null);
    }

    /**
     * Validates the {@code kind} and {@code version} fields of the request.
     *
     * @param jsonObj request JSON
     * @return the first failing status, or an OK status
     */
    private ActionStatus checkInputJSONObject(JSONObject jsonObj) {
        HashSet<String> kinds = new HashSet<>();
        kinds.add("pretreatment");
        ActionStatus kindStatus
                = checkJSONObjectTerm(jsonObj, "kind", kinds, StatusCode.KIND_ERR);
        if (kindStatus.statusCode != StatusCode.OK) {
            return kindStatus;
        }

        HashSet<String> versions = new HashSet<>();
        versions.add("v1");
        ActionStatus versionStatus
                = checkJSONObjectTerm(jsonObj, "version", versions, StatusCode.VERSION_ERR);
        if (versionStatus.statusCode != StatusCode.OK) {
            return versionStatus;
        }
        return new ActionStatus(StatusCode.OK, null);
    }

    /**
     * Logs a failure and turns it into the error response.
     *
     * @param failure the failure to report
     * @return the error response JSON
     */
    private JSONObject fail(ActionStatus failure) {
        LOGGER.log(Level.SEVERE, failure.msg);
        return getErrorJson(failure);
    }

    /**
     * Replaces the raw {@code text} of every entry under {@code textsObj}
     * with its word-document representation.
     *
     * @param textsObj JSON object mapping each label to an array of text objects
     * @param dicMap   dictionary passed through to {@link WordDocMatrix#create}
     * @throws JSONException if the structure under {@code textsObj} is not
     *                       label -&gt; array of objects
     */
    private void vectorizeTexts(JSONObject textsObj, HashMap<String, Integer> dicMap)
            throws JSONException {
        @SuppressWarnings("unchecked") // keys() is declared as a raw Iterator
        Iterator<String> labelsIt = textsObj.keys();
        while (labelsIt.hasNext()) {
            JSONArray labelTexts = textsObj.getJSONArray(labelsIt.next());
            int len = labelTexts.length();
            for (int i = 0; i < len; i++) {
                JSONObject textJson = labelTexts.getJSONObject(i);
                // getString() throws instead of returning null, so absence
                // has to be tested explicitly; entries without text are skipped.
                if (textJson.isNull("text")) {
                    continue;
                }
                String result = WordDocMatrix.create(textJson.getString("text"), dicMap);
                // NOTE(review): assumes the result is "<totalWords>-<matrix>";
                // confirm against WordDocMatrix.create.
                String[] wordsDocArr = result.split("-");
                textJson.remove("text");
                textJson.put("totalWords", Integer.valueOf(wordsDocArr[0]));
                textJson.put("text", wordsDocArr[1]);
            }
        }
    }

    /**
     * Runs the pretreatment pipeline on a request.
     *
     * @param obj the request; must be a {@link JSONObject}
     * @return a {@link JSONObject}: on success {@code status} is {@code "OK"}
     *         and {@code result} holds the request object, modified in place;
     *         on failure {@code status} is the name of the {@link StatusCode}
     *         describing the problem
     */
    @Override
    public Object action(Object obj) {
        if (!(obj instanceof JSONObject)) {
            return fail(new ActionStatus(StatusCode.JSON_ERR,
                    "The action arguments is not JSONObject."));
        }
        JSONObject preJson = (JSONObject) obj;

        ActionStatus inputStatus = checkInputJSONObject(preJson);
        if (inputStatus.statusCode != StatusCode.OK) {
            return fail(inputStatus);
        }
        // getJSONObject() throws on a missing key, so test for absence first.
        if (preJson.isNull("texts")) {
            return fail(new ActionStatus(StatusCode.TEXTS_NULL,
                    "The input texts is null."));
        }

        try {
            long beginTime = System.currentTimeMillis();
            JSONObject textsObj = preJson.getJSONObject("texts");
            JSONObject metadataJson = preJson.getJSONObject("metadata");

            String lang = metadataJson.getJSONObject("corpus").getString("lang");
            DicIndex dicIndex
                    = new DicIndex("en".equals(lang) ? Language.EN : Language.CN);

            JSONObject preMetadataJson = metadataJson.getJSONObject("pretreatment");
            dicIndex.create(preJson, preMetadataJson.getInt("minFrequency"));
            HashMap<String, Integer> dicMap = dicIndex.getDicMap(preJson);
            if (dicMap.isEmpty()) {
                // This response has always carried its message under "result"
                // rather than "msg"; kept as is for existing clients.
                JSONObject errJson = new JSONObject();
                errJson.put("status", StatusCode.MIN_FREQUENCY_ERR.toString());
                errJson.put("result", "The minFrequency is too big.");
                return errJson;
            }

            // Loop over all texts.
            vectorizeTexts(textsObj, dicMap);

            int spendTime = (int) (System.currentTimeMillis() - beginTime);
            preMetadataJson.put("spendTime", spendTime);
        } catch (JSONException ex) {
            // A malformed request must not be answered with "OK".
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return getErrorJson(new ActionStatus(StatusCode.JSON_ERR, ex.getMessage()));
        }

        JSONObject rsp = new JSONObject();
        try {
            rsp.put("status", "OK");
            rsp.put("result", preJson);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }
        return rsp;
    }
}
停用詞類
package com.robin.pretreatment;
import com.robin.config.ConfigUtil;
import java.util.Arrays;
import com.robin.file.FileUtil;
import com.robin.log.RobinLogger;
import java.util.HashSet;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* <DT><B>描述:</B></DT>
* <DD>停用詞類</DD>
*
* @version Version1.0
* @author Robin
* @version <I> Date:2018-04-21</I>
* @author <I> E-mail:[email protected]</I>
*/
/**
 * Loader for stop-word lists.
 *
 * <p>Each getter asks {@code ConfigUtil} for the value configured under a
 * fixed key, treats that value as a file path, and returns the file's
 * space-separated tokens as a set.
 *
 * @version Version1.0
 * @author Robin
 * @version <I> Date:2018-04-21</I>
 * @author <I> E-mail:[email protected]</I>
 */
public class StopWords {

    // Logger
    private static final Logger LOGGER = RobinLogger.getLogger();

    /**
     * Creates a stop-word loader.
     */
    public StopWords() {
    }

    /**
     * Returns the Chinese stop words.
     *
     * @return the tokens of the file configured as {@code stopWords.chinese},
     *         or {@code null} if that file could not be read
     */
    public HashSet<String> getChineseSet() {
        return this.load(ConfigUtil.getConfig("stopWords.chinese"));
    }

    /**
     * Returns the English stop words.
     *
     * @return the tokens of the file configured as {@code stopWords.english},
     *         or {@code null} if that file could not be read
     */
    public HashSet<String> getEnglishSet() {
        return this.load(ConfigUtil.getConfig("stopWords.english"));
    }

    /**
     * Returns the special symbols.
     *
     * @return the tokens of the file configured as {@code stopWords.symbol},
     *         or {@code null} if that file could not be read
     */
    public HashSet<String> getSymbolSet() {
        return this.load(ConfigUtil.getConfig("stopWords.symbol"));
    }

    /**
     * Loads a stop-word file.
     *
     * @param stopWordsPath path of the stop-word file
     * @return the set of tokens obtained by splitting the file content on a
     *         single space, or {@code null} when the content is {@code null}
     */
    private HashSet<String> load(String stopWordsPath) {
        final String content = FileUtil.readText(stopWordsPath);
        if (content == null) {
            LOGGER.log(Level.SEVERE, "讀取停止詞檔案失敗,檢查檔案及路徑.");
            return null;
        }
        return new HashSet<>(Arrays.asList(content.split(" ")));
    }
}
請求JSON
預處理微服務請求的JSON格式如下,紅框所示為請求型別以及回填資料引數。
響應JSON
預處理服務響應的JSON格式如下,紅框所示為返回的結果。