1. 程式人生 > >NLP: 中文分詞演算法---交集歧義檢測 (cross ambiguity detect)

NLP: 中文分詞演算法---交集歧義檢測 (cross ambiguity detect)

這裡, 可以採用FMM（Forward Maximum Matching, 正向最大匹配）演算法進行切分: 首先切分出最大的正向匹配, 然後進行交集歧義檢測; 如果檢測到存在交集歧義, 那麼對可能存在歧義的那一段文字進行FM（正向匹配, 列出所有可能的詞）切分, 獲取所有可能的切分結果; 然後對剩下的query子句重複進行FMM切分, 直到query為空。

例如, 使用者查詢 query = 123456789, 假設首先FMM切分到了“123”, 交集歧義檢測長度為6, 這時候存在歧義, 那麼對“123456”進行FM切分, 獲取“123456”的所有的切分可能, 然後再對剩下的子句“789”迴圈進行FMM切分。

交集歧義檢測演算法描述:

假設 query = "互聯網金寶"（即程式碼與測試中的“網際網路金寶”; 該寫法是繁體用語轉換的結果, 此處還原為三字詞“互聯網”, 才能與 word_len = 3 及子句“聯網金寶”一致）, 首先進行FMM, 切分出“互聯網”, 然後我們要檢測是否存在交集歧義, 將“互聯網”的長度作為輸入, 這裡我們使用變數word_len表示FMM切分結果的長度, 這裡word_len = 3; 同時將query的子句“聯網金寶”作為輸入, 進行以下迭代:

如此迴圈下去, 迴圈繼續的條件是 i < word_len && i < str.length (任一條件不成立即結束迴圈)

注: 這裡因為對於字串, 索引是從0開始的, 所以有 i < word_len && i < str.length

word_len 在這裡表示的是可能存在的歧義的最大長度。

1.  

對輸入的 str = "聯網金寶" 進行FMM切分, 獲取切分後長度, 假設為“聯網”, len = 2

如果此時 word_len < i + len, 則 word_len = i + len; 這裡 i + len = 3, 不大於 word_len, 所以 word_len 不變

此時, i = 1, len = 2, word_len = 3

str = str.slice(1)

2.  

對 str = "網金寶" 進行FMM切分, 獲取切分後長度, 假設為“網金寶”, len = 3

如果此時 word_len < i + len, 則 word_len = i + len; 這裡 i + len = 5, i + len > word_len, 設定 word_len = 5

此時, i = 2, len = 3, word_len = 5

str = str.slice(1)

3. 

str = "金寶"

此時 i =3, str.length 為2, 迴圈條件不成立, 退出迭代。

程式碼實現:

var lunr = require("./lunr.js")
var idxdata = require("./idx.json")

// Load the serialized index data required from ./idx.json and take its token
// store; `ii.root` is passed to tokenizer() below as the dictionary trie root.
var idx = lunr.Index.load(idxdata)
var ii = idx.tokenStore

// Sample queries for manual experiments.
var query1 = "中國人民銀行指出我國最近經濟不景氣"
var query2 = "習近平今日出席了中央氣象臺的聯歡晚會"
var query3 = "中國銀行今日出臺了最新的貸款政策"
var query4 = "習近平的中央氣象臺"
var query5 = "全部門"
var query6 = "網際網路金寶"
var query7 = "上下級別"
var query8 = "網際網路中國人民銀行"
var query9 = "引領土完整"

// Select the sample to run. Declared with `var` so that no implicit global is
// created (an undeclared assignment throws a ReferenceError in strict mode).
var query = query8
var result = tokenizer(ii.root, query)
console.log(result)

/**
 * Segment `str` into tokens using the dictionary trie rooted at `root`.
 *
 * Each round takes the longest forward match (matchLongest) at the head of
 * the remaining text, then asks getAmbiguiousLength how far overlapping
 * matches extend. If they extend past the match, that whole span is handed
 * to ambiguityTokenizer and all of its candidate terms are emitted;
 * otherwise the match itself is emitted. The consumed span is then dropped
 * and the process repeats until no text is left.
 *
 * Progress is written to the console on every round.
 *
 * @param {Object} root - trie root node
 * @param {string} str - text to segment
 * @returns {string[]} tokens in emission order; [] when `root` is
 *   null/undefined or `str` is null/undefined/empty
 */
function tokenizer(root, str) {
  if ( root == null ) return []
  if ( str == null || str.length == 0 ) return []

  var tokens = []
  var rest = str
  while ( rest.length > 0 ) {
    var longest = matchLongest(root, rest)
    var span = getAmbiguiousLength(root, rest, longest.length)
    console.log("FMM: " + longest + ", ambituity length: " + span)

    if ( span > longest.length ) {
      // Some later match overlaps the end of the longest match.
      console.log("ambiguity detected!!!")
      var piece = rest.slice(0, span)
      console.log("do FM again for ambiguity piece: " + piece)
      tokens = tokens.concat(ambiguityTokenizer(root, piece))
    } else {
      tokens.push(longest)
    }

    rest = rest.slice(span)
  }

  return tokens
}

/**
 * Forward maximum matching: return the longest dictionary word that is a
 * prefix of `str`.
 *
 * The trie is walked one character (UTF-16 code unit) at a time. A prefix
 * counts as a word only when its node carries a non-empty `docs` object,
 * the same test forwardMatching applies, so a path that is merely the
 * beginning of a longer word is not returned as a token.
 *
 * @param {Object} root - trie root node
 * @param {string} str - text to match from its first character
 * @returns {string} the longest matching word; the first character of `str`
 *   when no dictionary word starts there (out-of-vocabulary character);
 *   "" when `root` is null/undefined or `str` is null/undefined/empty
 */
function matchLongest(root, str) {
  // Return a string on invalid input too, since callers read `.length`.
  if ( root == null ) return ""
  if ( str == null || str.length == 0 ) return ""

  var longestWord = ""
  var path = ""
  var currentNode = root
  for ( var i = 0; i < str.length; i++ ) {
    if ( !(str[i] in currentNode) ) break
    path += str[i]
    currentNode = currentNode[str[i]]
    // Only remember prefixes that end on a real word.
    var docs = currentNode.docs || {}
    if ( Object.keys(docs).length ) longestWord = path
  }

  // Out-of-vocabulary: consume one character so the caller always advances.
  if ( longestWord.length == 0 ) return str[0]
  return longestWord
}

/**
 * Tokenizer for an ambiguous piece of text: collect the candidate terms that
 * forwardMatching finds starting at every character offset of `str`.
 *
 * @param {Object} root - trie root node
 * @param {string} str - the ambiguous piece
 * @returns {string[]} terms ordered by start offset; [] when `root` is
 *   null/undefined or `str` is null/undefined/empty
 */
function ambiguityTokenizer(root, str) {
  if ( root == null ) return []
  if ( str == null || str.length == 0 ) return []

  var terms = []
  for ( var start = 0; start < str.length; start++ ) {
    terms = terms.concat(forwardMatching(root, str.slice(start)))
  }

  return terms
}

/**
 * Forward matching (FM): return every dictionary term found along the
 * longest trie path that starts at the first character of `str`.
 *
 * A prefix is reported as a term when the node it ends on has a non-empty
 * `docs` object.
 *
 * @param {Object} root - trie root node
 * @param {string} str - text to match from its first character
 * @returns {string[]} matching terms, shortest first; [] when nothing
 *   matches, when `root` is null/undefined, or when `str` is
 *   null/undefined/empty
 */
function forwardMatching(root, str) {
  // Always return an array: callers concat() the result.
  if ( root == null ) return []
  if ( str == null || str.length == 0 ) return []

  var out = []
  var matches = ""
  var currentNode = root
  for ( var i = 0; i < str.length; i++ ) {
    // Stop at the first character that leaves the trie. Out-of-vocabulary
    // characters are deliberately not emitted, because they would degrade
    // search results.
    if ( !(str[i] in currentNode) ) break

    matches += str[i]
    currentNode = currentNode[str[i]]
    // Declared locally; an undeclared `docs` leaked a global and throws in
    // strict mode.
    var docs = currentNode.docs || {}
    if ( Object.keys(docs).length ) {
      out.push(matches)
    }
  }

  return out
}

/**
 * Compute how far a crossing ambiguity extends from the start of `str`.
 *
 * Starting from the length of the first match, every offset inside the
 * current span is probed with matchLongest; whenever a match starting there
 * ends beyond the span, the span grows to that end. Probing stops when the
 * offset reaches the span or the end of `str`.
 *
 * @param {Object} root - trie root node
 * @param {string} str - text whose head was matched
 * @param {number} word_length - length of the match at the head of `str`
 * @returns {number} the final span length; equals `word_length` when no
 *   later match crosses the end of the first match
 */
function getAmbiguiousLength(root, str, word_length) {
  var span = word_length
  for ( var offset = 1; offset < span && offset < str.length; offset++ ) {
    var reach = offset + matchLongest(root, str.slice(offset)).length
    if ( span < reach ) span = reach
  }

  return span
}

測試:

query: "網際網路金寶"

結果: 

FMM: 網際網路, ambituity length: 5
ambiguity detected!!!
do FM again for ambiguity piece: 網際網路金寶
[ '網際網路', '網', '網金寶', '金', '寶' ]

query: "網際網路中國人民銀行"

結果: 

FMM: 網際網路, ambituity length: 3
FMM: 中國人民銀行, ambituity length: 6
[ '網際網路', '中國人民銀行' ]