基於Kubernetes的機器學習微服務系統設計系列——(四)中文分詞微服務
阿新 • • 發佈:2018-11-10
內容提要
中文分詞微服務包括的分詞方法有:RobinSeg(RS)、IKAnalyzer(IK)、JEAnalysis(JE)、MmSeg4j(MS)、PaoDing(PD)、SmallSeg4j(SS)。其中RS分詞實現見我的文章:知更鳥中文分詞RS設計實現,其他分詞方法都採用釋出的jar包進行封裝。
設計模式
主要涉及外觀模式、介面卡模式、工廠模式和單例模式。分詞微服務類圖如圖所示:
設計原則:(1)針對介面程式設計,不要針對實現;(2)只和最緊密的類互動;(3)封裝變化;(4)鬆耦合設計。
外觀模式:提供一個統一的介面,用來訪問子系統中的一群介面。外觀定義了一個高層介面,讓子系統更容易使用。統一分詞器外觀類對外提供單一的分詞呼叫入口。
介面卡模式:將一個類的介面,轉換成客戶期望的另一個介面。介面卡讓原本介面不相容的類可以合作無間。各種分詞的私有實現介面需要提供一個統一的介面呼叫。
工廠模式:定義一個建立物件的介面,但由子類決定要例項化的類是哪一個。提供統一的分詞工廠,建立分詞例項物件。
單例模式:確保一個類只有一個例項,並提供一個全域性訪問點。由於各種分詞物件的建立、載入詞典等需要申請大量的記憶體,耗費大量的時間,所以所有分詞器例項都通過介面卡進行控制,只建立一個例項。
程式碼實現
中文分詞介面抽象類
package com.robin.segment;
import com.robin.log.RobinLogger;
import java.util.logging.Logger;
/**
* <DT><B>Description:</B></DT>
* <DD>Abstract base type for Chinese word segmenters.</DD>
* <DD>{@code SegmentFactory.getSegInstance} hands out instances of this type,
* so callers depend on this abstraction rather than on a concrete segmenter.</DD>
*
* @version Version1.0
* @author Robin
* @version <I> Date:2018-04-18</I>
* @author <I> E-mail: [email protected]</I>
*/
public abstract class AbstractSegmenter {
/** Logger obtained from {@code RobinLogger}; visible to subclasses. */
protected static final Logger LOGGER = RobinLogger.getLogger();
/**
* Segments the given text into words.
*
* @param text the text to segment
* @param SEPARATOR the separator placed between words in the result
* @return the segmented text
*/
public abstract String segment(String text, String SEPARATOR);
}
統一分詞器外觀類
package com.robin.segment;
import com.robin.log.RobinLogger;
import com.robin.segment.SegmentFactory.SegmentMethod;
import com.robin.segment.robinseg.RobinSeg;
import com.robin.segment.robinseg.SegmentArgs;
import java.util.logging.Logger;
/**
 * <DT><B>Description:</B></DT>
 * <DD>Facade that gives callers a single entry point to every word segmenter.</DD>
 * <DD>Segmenter instances are always obtained through {@link SegmentFactory}.</DD>
 *
 * @version 1.0
 * @author Robin
 * @version <I> Date:2018-04-19</I>
 * @author <I> E-mail:[email protected]</I>
 */
public class SegmentFacade {

    // Logger obtained from RobinLogger (not used by the methods below).
    private static final Logger LOGGER = RobinLogger.getLogger();

    /**
     * Returns the configuration object of the RobinSeg segmenter.
     *
     * <p>{@code null} is treated like {@code SegmentMethod.RS}, matching
     * {@link SegmentFactory#getSegInstance}, which maps a {@code null} method to RS.
     * Previously a {@code null} argument caused a {@link NullPointerException} here.
     *
     * @param methodName the segmentation method; may be {@code null}
     * @return the RobinSeg configuration object, or {@code null} when the requested
     *         method is not RS (only RobinSeg exposes a configuration object here)
     */
    public static SegmentArgs getSegmentArgsObj(SegmentMethod methodName) {
        AbstractSegmenter segmenter = SegmentFactory.getSegInstance(methodName);
        boolean robinSegRequested = (methodName == null) || (methodName == SegmentMethod.RS);
        // The instanceof guard keeps an unexpected factory result from surfacing
        // as a ClassCastException.
        if (robinSegRequested && segmenter instanceof RobinSeg) {
            return ((RobinSeg) segmenter).getSegmentConfInstance();
        }
        return null;
    }

    /**
     * Segments text with the segmenter selected by {@code methodName}.
     *
     * @param methodName the segmentation method: {@code SegmentMethod.IK}, {@code JE},
     *                   {@code MS}, {@code PD}, {@code SS} or {@code RS}; the factory
     *                   uses RS when this is {@code null}
     * @param text       the text to segment
     * @param separator  the separator placed between words
     * @return the text segmented with the given separator
     */
    public static String split(SegmentMethod methodName, String text, String separator) {
        AbstractSegmenter segmenter = SegmentFactory.getSegInstance(methodName);
        return segmenter.segment(text, separator);
    }
}
分詞Action實現類
package com.robin.segment.action;
import com.robin.loader.MircoServiceAction;
import com.robin.log.RobinLogger;
import com.robin.segment.SegmentFacade;
import com.robin.segment.SegmentFactory.SegmentMethod;
import com.robin.segment.robinseg.SegmentArgs;
import com.robin.segment.robinseg.SegmentArgs.SegAlgorithm;
import java.util.HashSet;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
/**
 * <DT><B>Description:</B></DT>
 * <DD>Micro-service action that segments the texts carried by a JSON request.</DD>
 *
 * <p>A request must be a {@link JSONObject} with:
 * <ul>
 * <li>{@code kind} equal to {@code "segment"} and {@code version} equal to {@code "v1"};</li>
 * <li>{@code metadata.segment.method}: one of RS, IK, JE, MS, PD, SS;</li>
 * <li>{@code metadata.segment.separator}: one of {@code " "}, {@code "|"}, {@code "/"};</li>
 * <li>optional {@code metadata.segment.args} (only read when the method is RS);</li>
 * <li>{@code texts}: an object mapping each label to an array of objects that
 * carry a {@code text} string.</li>
 * </ul>
 *
 * <p>On success every {@code text} is replaced in place by its segmented form and
 * {@code metadata.segment.spendTime} (milliseconds) is added. Any failure yields
 * a JSON object holding a {@code status} code and a {@code msg}.
 *
 * @version Version1.0
 * @author Robin
 * @version <I> V1.0 Date:2018-06-05</I>
 * @author <I> E-mail:[email protected]</I>
 */
public class SegmentAction implements MircoServiceAction {

    private static final Logger LOGGER = RobinLogger.getLogger();

    /** Outcome codes written to the {@code status} field of a response. */
    public enum StatusCode {
        OK,
        JSON_ERR,
        KIND_ERR,
        VERSION_ERR,
        SEGMETHOD_ERR,
        SEPARATOR_ERR,
        SEGMENT_FAILED,
        TEXTS_NULL,
    }

    /** Pairs a status code with a human-readable message. */
    private static class ActionStatus {
        StatusCode statusCode;
        String msg;
    }

    /** Builds an {@link ActionStatus} in one call. */
    private static ActionStatus status(StatusCode code, String msg) {
        ActionStatus actionStatus = new ActionStatus();
        actionStatus.statusCode = code;
        actionStatus.msg = msg;
        return actionStatus;
    }

    /** Builds the set of accepted values for one request field. */
    private static HashSet<String> setOf(String... values) {
        HashSet<String> set = new HashSet<String>();
        for (String value : values) {
            set.add(value);
        }
        return set;
    }

    /**
     * Converts a failed status into the JSON error response.
     *
     * @param actionStatus the failure to report
     * @return a JSON object with {@code status} and {@code msg}
     */
    private JSONObject getErrorJson(ActionStatus actionStatus) {
        JSONObject errJson = new JSONObject();
        try {
            errJson.put("status", actionStatus.statusCode.toString());
            errJson.put("msg", actionStatus.msg);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }
        return errJson;
    }

    /**
     * Checks that {@code key} is present in {@code jsonObj} and holds an accepted value.
     *
     * @param jsonObj       the object to inspect
     * @param key           the field name
     * @param valueSet      the accepted values
     * @param errStatusCode the code to report when the field is missing or invalid
     * @return a status with {@code StatusCode.OK} when the field is valid, otherwise
     *         {@code errStatusCode} with an explanatory message
     */
    private ActionStatus checkJSONObjectTerm(JSONObject jsonObj,
            String key,
            HashSet<String> valueSet,
            StatusCode errStatusCode) {
        if (jsonObj.isNull(key)) {
            return status(errStatusCode, "The input parameter is missing " + key + ".");
        }
        String value;
        try {
            value = jsonObj.getString(key);
        } catch (JSONException ex) {
            // An unreadable value is a validation failure, never a silent OK.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return status(errStatusCode, "The value of " + key + " cannot be read: " + ex.getMessage());
        }
        if (!valueSet.contains(value)) {
            return status(errStatusCode, "The value [" + value + "] of " + key + " is error.");
        }
        return status(StatusCode.OK, null);
    }

    /**
     * Validates the request envelope and applies the optional RobinSeg arguments.
     *
     * @param jsonObj the request
     * @return {@code StatusCode.OK} when the request is usable, otherwise the first
     *         failure that was found
     */
    private ActionStatus checkInputJSONObject(JSONObject jsonObj) {
        ActionStatus checked = checkJSONObjectTerm(jsonObj, "kind", setOf("segment"), StatusCode.KIND_ERR);
        if (checked.statusCode != StatusCode.OK) {
            return checked;
        }
        checked = checkJSONObjectTerm(jsonObj, "version", setOf("v1"), StatusCode.VERSION_ERR);
        if (checked.statusCode != StatusCode.OK) {
            return checked;
        }

        JSONObject segmentMetadata;
        try {
            segmentMetadata = jsonObj.getJSONObject("metadata").getJSONObject("segment");
        } catch (JSONException ex) {
            // Previously this was logged and the request was still reported as OK.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return status(StatusCode.JSON_ERR,
                    "The input parameter is missing metadata.segment: " + ex.getMessage());
        }

        checked = checkJSONObjectTerm(segmentMetadata, "method",
                setOf("RS", "IK", "JE", "MS", "PD", "SS"), StatusCode.SEGMETHOD_ERR);
        if (checked.statusCode != StatusCode.OK) {
            return checked;
        }
        checked = checkJSONObjectTerm(segmentMetadata, "separator",
                setOf(" ", "|", "/"), StatusCode.SEPARATOR_ERR);
        if (checked.statusCode != StatusCode.OK) {
            return checked;
        }
        return applyRobinSegArgs(segmentMetadata);
    }

    /**
     * Copies the optional {@code args} object onto the RobinSeg configuration.
     * Does nothing unless the method is RS and {@code args} is present.
     *
     * <p>NOTE(review): the configuration object comes from
     * {@code SegmentFacade.getSegmentArgsObj}; if it is shared between requests,
     * these settings persist after this request and concurrent requests may
     * interfere - confirm against RobinSeg. Arguments are applied in order, so
     * those preceding an invalid one have already taken effect.
     *
     * @param segmentMetadata the validated {@code metadata.segment} object
     * @return {@code StatusCode.OK}, {@code JSON_ERR} when an argument has the wrong
     *         JSON type, or {@code SEGMETHOD_ERR} when the algorithm name is unknown
     */
    private ActionStatus applyRobinSegArgs(JSONObject segmentMetadata) {
        try {
            SegmentMethod segmentMethod = SegmentMethod.valueOf(segmentMetadata.getString("method"));
            if (segmentMethod != SegmentMethod.RS || segmentMetadata.isNull("args")) {
                return status(StatusCode.OK, null);
            }
            JSONObject argsJson = segmentMetadata.getJSONObject("args");
            SegmentArgs segmentArgs = SegmentFacade.getSegmentArgsObj(segmentMethod);
            if (null == segmentArgs) {
                return status(StatusCode.OK, null);
            }
            if (!argsJson.isNull("algorithm")) {
                String algorithm = argsJson.getString("algorithm");
                SegAlgorithm segAlgorithm;
                try {
                    // Locale.ROOT keeps the upper-casing independent of the server locale.
                    segAlgorithm = SegAlgorithm.valueOf(algorithm.toUpperCase(java.util.Locale.ROOT));
                } catch (IllegalArgumentException ex) {
                    LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
                    return status(StatusCode.SEGMETHOD_ERR,
                            "The value [" + algorithm + "] of algorithm is error.");
                }
                segmentArgs.setSegAlgorithm(segAlgorithm);
            }
            if (!argsJson.isNull("cleanSymbol")) {
                Boolean flag = argsJson.getBoolean("cleanSymbol");
                segmentArgs.setCleanSymbolFlag(flag);
            }
            if (!argsJson.isNull("markNewWord")) {
                Boolean flag = argsJson.getBoolean("markNewWord");
                segmentArgs.setMarkNewWordFlag(flag);
            }
            if (!argsJson.isNull("downcasing")) {
                Boolean flag = argsJson.getBoolean("downcasing");
                segmentArgs.setDowncasingFlag(flag);
            }
            if (!argsJson.isNull("mergePattern")) {
                Boolean flag = argsJson.getBoolean("mergePattern");
                segmentArgs.setMergePatternFlag(flag);
            }
            if (!argsJson.isNull("retrievalPattern")) {
                Boolean flag = argsJson.getBoolean("retrievalPattern");
                segmentArgs.setRetrievalPatternFlag(flag);
            }
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return status(StatusCode.JSON_ERR,
                    "The segment args are invalid: " + ex.getMessage());
        }
        return status(StatusCode.OK, null);
    }

    /**
     * Segments every text of the request.
     *
     * @param obj the request; must be a {@link JSONObject}
     * @return a JSON object with {@code status} "OK" and the updated request under
     *         {@code result}, or an error object with {@code status} and {@code msg}
     */
    @Override
    public Object action(Object obj) {
        if (!(obj instanceof JSONObject)) {
            ActionStatus failure = status(StatusCode.JSON_ERR, "The action arguments is not JSONObject.");
            LOGGER.log(Level.SEVERE, failure.msg);
            return this.getErrorJson(failure);
        }
        JSONObject jsonObj = (JSONObject) obj;
        ActionStatus checked = this.checkInputJSONObject(jsonObj);
        if (checked.statusCode != StatusCode.OK) {
            LOGGER.log(Level.SEVERE, checked.msg);
            return this.getErrorJson(checked);
        }
        // getJSONObject throws instead of returning null, so absence must be
        // tested explicitly for TEXTS_NULL to be reachable.
        if (jsonObj.isNull("texts")) {
            ActionStatus failure = status(StatusCode.TEXTS_NULL, "The input texts is null.");
            LOGGER.log(Level.SEVERE, failure.msg);
            return this.getErrorJson(failure);
        }
        try {
            JSONObject segmentMetadata = jsonObj.getJSONObject("metadata").getJSONObject("segment");
            SegmentMethod segmentMethod = SegmentMethod.valueOf(segmentMetadata.getString("method"));
            String separator = segmentMetadata.getString("separator");
            JSONObject texts = jsonObj.getJSONObject("texts");
            long beginTime = System.currentTimeMillis();
            Iterator<?> labelsIt = texts.keys();
            while (labelsIt.hasNext()) {
                String label = (String) labelsIt.next();
                JSONArray aLabelTexts = texts.getJSONArray(label);
                int len = aLabelTexts.length();
                for (int i = 0; i < len; i++) {
                    JSONObject textJson = aLabelTexts.getJSONObject(i);
                    // Entries without a text are skipped rather than failing the request.
                    if (textJson.isNull("text")) {
                        continue;
                    }
                    String text = textJson.getString("text");
                    String result = SegmentFacade.split(segmentMethod, text, separator);
                    textJson.put("text", result);
                }
            }
            long endTime = System.currentTimeMillis();
            int spendTime = (int) (endTime - beginTime);
            segmentMetadata.put("spendTime", spendTime);
        } catch (JSONException ex) {
            // Report the failure instead of answering OK with partially processed texts.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return this.getErrorJson(
                    status(StatusCode.SEGMENT_FAILED, "Segmentation failed: " + ex.getMessage()));
        }
        JSONObject rsp = new JSONObject();
        try {
            rsp.put("status", StatusCode.OK.toString());
            rsp.put("result", jsonObj);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }
        return rsp;
    }
}
分詞例項工廠方法類
package com.robin.segment;
import com.robin.segment.adapter.SmallSeg4jAdapter;
import com.robin.segment.adapter.MmSeg4jAdapter;
import com.robin.segment.adapter.IKAnalyzerAdapter;
import com.robin.segment.adapter.JEAnalysisAdapter;
import com.robin.segment.adapter.PaoDingAdapter;
import com.robin.log.RobinLogger;
import com.robin.segment.robinseg.RobinSeg;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* <DT><B>描述:</B></DT>
* <DD>分詞例項工廠方法類</DD>
*
* @version Version1.0
* @author Robin
* @version <I> Date:2018-04-19</I>
* @author <I> E-mail:[email protected]</I>
*/
public class SegmentFactory {
// 日誌
private static final Logger LOGGER = RobinLogger.getLogger();
/** Identifiers of the supported segmentation algorithms. */
public enum SegmentMethod {
/** JE = "JEAnalysis" */
JE,
/** IK = "IKAnalyzer" */
IK,
/** MS = "MmSeg4j" */
MS,
/** PD = "PaoDing" */
PD,
/** SS = "SmallSeg4j" */
SS,
/** RS = "RobinSeg" */
RS
}
/**
* 建立具體分詞類例項
*
* @param methodName 分詞方法名稱,“SegmentMethod.IK”,“.JE”,“.MS”,“.PD”,“.SS”,“.RS”
* @return 具體分詞方法例項
*/
public static AbstractSegmenter getSegInstance(SegmentMethod methodName) {
if (null == methodName) {
methodName = SegmentMethod.RS;
}
switch (methodName) {
case JE:
return JEAnalysisAdapter.getInstance()
相關推薦
no