1. 程式人生 > 基於jieba改寫的分詞演算法(待完成)

基於jieba改寫的分詞演算法(待完成)

基於jieba改寫的分詞演算法

import os, re, pandas as pd
from math import log
from time import time
# Base directory: the folder that contains this module.
BASE_PATH = os.path.dirname(__file__)


def _get_abs_path(path):
    """Return *path* joined onto BASE_PATH and normalised."""
    return os.path.normpath(os.path.join(BASE_PATH, path))


# General-purpose dictionary
JIEBA_DICT = _get_abs_path('jieba_dict.txt')  # jieba dictionary file


def txt2df2dt(filename=JIEBA_DICT, sep=' '):
    """Load a dictionary file into a plain ``dict``.

    The file is read with ``pandas.read_table`` without a header row; the
    first column becomes the keys and the second column the values.

    Args:
        filename: Path of the dictionary file.
        sep: Column separator used in the file.

    Returns:
        A dict mapping column 0 to column 1.
    """
    # ``sep`` must be passed by keyword: pandas 2.x made every argument after
    # the path keyword-only, so the positional form raises TypeError there.
    df = pd.read_table(filename, sep=sep, header=None)
    return dict(df[[0, 1]].values)


class Cutter:
    """Dictionary-based word segmenter following jieba's DAG approach.

    For every start position the candidate word ends are collected into a
    DAG, then the route with the highest summed log-probability is chosen.

    Attributes are set in ``__init__``: ``dt`` (word -> frequency), ``total``
    (sum of all frequencies), ``max_len`` (longest candidate word scanned)
    and ``t`` (creation timestamp used by ``__del__``).
    """

    # Raw strings: ``\-`` is not a valid escape in an ordinary string literal.
    re_eng = re.compile(r'[a-zA-Z0-9_\-]+')
    re_num = re.compile(r'[0-9.\-+%/~]+')

    def __init__(self, dt=None, max_len=0):
        """Create a segmenter.

        Args:
            dt: Word -> frequency mapping. A falsy value loads the default
                dictionary through ``txt2df2dt()``.
            max_len: Longest candidate word to scan. A falsy value means
                "use the length of the longest word in the dictionary".
        """
        self.t = time()
        self.dt = dt or txt2df2dt()
        self.total = sum(self.dt.values())
        # Remember whether the limit was derived from the dictionary so that
        # add_word() can widen it; an explicit limit is never changed.
        self._auto_max_len = not max_len
        if not max_len:
            max_len = max((len(word) for word in self.dt), default=0)
        self.max_len = max_len

    def __del__(self):
        """Print the time elapsed since this instance was created."""
        t = time() - self.t
        if t < 60:
            print('分詞耗時:%.2f秒' % t)
        else:
            print('分詞耗時:%.2f分鐘' % (t / 60))

    def _get_DAG(self, sentence):
        """Return ``{start: [end, ...]}`` of candidate words in *sentence*.

        ``end`` is the inclusive index of a word's last character. A single
        character is always a candidate; longer spans qualify when they are
        in the dictionary or fully match ``re_eng`` / ``re_num``.
        """
        length = len(sentence)
        dag = {}
        for head in range(length):
            tail = min(head + self.max_len, length)
            ends = [head]
            for middle in range(head + 2, tail + 1):
                word = sentence[head:middle]
                # ------------- dictionary + regex ------------- #
                if (word in self.dt
                        or self.re_eng.fullmatch(word)
                        or self.re_num.fullmatch(word)):
                    ends.append(middle - 1)
            dag[head] = ends
        return dag

    def _calculate(self, sentence):
        """Return the best route as ``{start: (log_probability, end)}``.

        Positions are processed right to left so that the score of the
        remainder (``route[end + 1]``) is already known.
        """
        dag = self._get_DAG(sentence)
        n = len(sentence)
        route = {n: (0, 0)}
        logtotal = log(self.total)
        for idx in range(n - 1, -1, -1):
            # ``or 1`` covers both unknown words and zero frequencies, which
            # would otherwise make log() raise ValueError.
            route[idx] = max(
                (log(self.dt.get(sentence[idx:x + 1]) or 1) - logtotal
                 + route[x + 1][0], x)
                for x in dag[idx]
            )
        return route

    def cut(self, sentence):
        """Yield the segments of *sentence* along the best route.

        Consecutive single-character segments are concatenated and emitted
        as one piece, either before the next multi-character word or at the
        end of the sentence.
        """
        route = self._calculate(sentence)
        x = 0
        n = len(sentence)
        buf = ''
        while x < n:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if len(l_word) == 1:
                buf += l_word
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
            x = y
        # Flush what is still buffered; without this, trailing
        # single-character segments were silently dropped.
        if buf:
            yield buf

    def lcut(self, sentence):
        """Return ``list(self.cut(sentence))``."""
        return list(self.cut(sentence))

    def add_word(self, word, freq=0):
        """Add *word* or overwrite its frequency (a falsy *freq* becomes 1)."""
        new_freq = freq or 1
        original_freq = self.dt.get(word, 0)
        self.dt[word] = new_freq
        self.total = self.total - original_freq + new_freq
        # Keep the scan window wide enough for the new word, otherwise
        # _get_DAG() could never propose it.
        if self._auto_max_len and len(word) > self.max_len:
            self.max_len = len(word)

    def del_word(self, word):
        """Remove *word* from the dictionary; unknown words are ignored."""
        original_freq = self.dt.pop(word, None)
        if original_freq:
            self.total -= original_freq


def cut(sentence):
    """Segment *sentence* lazily with a freshly created default ``Cutter``."""
    return Cutter().cut(sentence)


def lcut(sentence):
    """Segment *sentence* into a list with a freshly created default ``Cutter``."""
    return Cutter().lcut(sentence)

jieba分詞原理(0.39版)

  • 1、基於詞典,對句子進行詞圖掃描,生成所有成詞情況所構成的有向無環圖DAG
def get_DAG(self, sentence):
    """Build the word DAG of *sentence* from the frequency table.

    Returns a dict mapping every start index ``k`` to the list of inclusive
    end indices ``i`` such that ``sentence[k:i + 1]`` has a truthy entry in
    ``self.FREQ``. When nothing matches, the single character at ``k`` is
    used as a fallback.
    """
    self.check_initialized()
    DAG = {}
    N = len(sentence)
    # ``range`` instead of ``xrange``: the latter does not exist on Python 3
    # and nothing in this snippet provides it.
    for k in range(N):
        tmplist = []
        i = k
        frag = sentence[k]
        # The scan continues while the fragment is a key of FREQ, but only
        # fragments with a truthy frequency are recorded as words.
        while i < N and frag in self.FREQ:
            if self.FREQ[frag]:
                tmplist.append(i)
            i += 1
            frag = sentence[k:i + 1]
        if not tmplist:
            tmplist.append(k)
        DAG[k] = tmplist
    return DAG
  • 2、根據DAG,反向計算最大概率路徑
def calc(self, sentence, DAG, route):
    """Fill *route* with the most probable segmentation path.

    After the call ``route[idx]`` is ``(log_probability, end)`` for the best
    word starting at ``idx``; ``route[len(sentence)]`` is the ``(0, 0)``
    sentinel. The dict is modified in place and nothing is returned.

    Args:
        sentence: Text being segmented.
        DAG: Mapping of start index to candidate inclusive end indices.
        route: Dict that receives the result.
    """
    N = len(sentence)
    route[N] = (0, 0)
    logtotal = log(self.total)
    # Right-to-left so that route[x + 1] is known when idx is evaluated.
    # ``range`` instead of ``xrange``, which does not exist on Python 3.
    for idx in range(N - 1, -1, -1):
        # ``or 1`` treats missing or zero frequencies as 1 to keep log() defined.
        route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
                          logtotal + route[x + 1][0], x) for x in DAG[idx])
  • 3、根據路徑獲取最大概率的分詞序列
def __cut_DAG_NO_HMM(self, sentence):
    """Yield the segments of *sentence* along the most probable route.

    Single characters matched by ``re_eng`` are accumulated in a buffer and
    emitted as one piece, either before the next other segment or at the end
    of the sentence.
    """
    # NOTE(review): ``re_eng`` is not defined in this snippet; presumably it
    # is jieba's module-level compiled pattern for alphanumerics - confirm.
    DAG = self.get_DAG(sentence)
    route = {}
    self.calc(sentence, DAG, route)
    x = 0
    N = len(sentence)
    buf = ''
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if re_eng.match(l_word) and len(l_word) == 1:
            buf += l_word
            x = y
        else:
            if buf:
                yield buf
                buf = ''
            yield l_word
            x = y
    # Flush the buffer: without this a sentence ending in buffered
    # characters would lose them.
    if buf:
        yield buf
        buf = ''

圖論知識補充

1、圖的表示方法

在這裡插入圖片描述

1.1、networkx

%matplotlib inline
import networkx as nx
# 建立圖
G = nx.DiGraph()
# 新增邊
G.add_edges_from([(0, 1), (0, 2), (1, 2), (2, 3)])
# 繪圖
nx.draw(G, with_labels=True, font_size=36, node_size=1500, width=4, node_color='lightgreen')

1.2、矩陣

class G:
    """Directed graph stored as a square adjacency matrix."""

    def __init__(self, nodes):
        """Create a ``nodes`` x ``nodes`` matrix filled with zeros."""
        self.matrix = [[0 for _ in range(nodes)] for _ in range(nodes)]

    def add_edge(self, start, end, value=1):
        """Store *value* in the cell for the edge ``start -> end``."""
        row = self.matrix[start]
        row[end] = value

# Build the four-node example graph and show its adjacency matrix.
g = G(4)
for start, end in [(0, 1), (0, 2), (1, 2), (2, 3)]:
    g.add_edge(start, end)
print(g.matrix)

1.3、字典

class G:
    """Directed graph stored as a dict of dicts: ``{start: {end: value}}``."""

    def __init__(self):
        """Create a graph without edges."""
        self.dt = {}

    def add_edge(self, start, end, value=1):
        """Store *value* for the edge ``start -> end``."""
        neighbours = self.dt.setdefault(start, {})
        neighbours[end] = value

# Build the four-node example graph and show its dict representation.
g = G()
for start, end in [(0, 1), (0, 2), (1, 2), (2, 3)]:
    g.add_edge(start, end)
print(g.dt)

基於詞典生成DAG

def dag(sentence, corpus, size=2):
    """Return the word DAG of *sentence* for the words found in *corpus*.

    Args:
        sentence: Text to scan.
        corpus: Container of known words (only membership is tested).
        size: Maximum length of a candidate word.

    Returns:
        A dict mapping every start index to the list of inclusive end
        indices of the corpus words that begin there (possibly empty).
    """
    n = len(sentence)
    graph = {}
    for start in range(n):
        limit = min(start + size, n)
        graph[start] = [
            stop - 1
            for stop in range(start + 1, limit + 1)
            if sentence[start:stop] in corpus
        ]
    return graph

# Example 1: dictionary given as a word -> frequency mapping.
dt = dict.fromkeys(('南海中學', '南海', '中學', '放假', '南', '海', '中', '學', '放', '假'), 1)
sentence1 = '南海中學放假'
graph1 = dag(sentence1, dt)
print(graph1)
# Example 2: dictionary given as a plain set of words.
s = {'空調', '調和', '和風', '風扇', '空', '調', '和', '風', '扇'}
sentence2 = '空調和風扇'
graph2 = dag(sentence2, s)
print(graph2)

{0: [0, 1], 1: [1], 2: [2, 3], 3: [3], 4: [4, 5], 5: [5]}
{0: [0, 1], 1: [1, 2], 2: [2, 3], 3: [3, 4], 4: [4]}

附錄

DAG 有向無環圖(Directed Acyclic Graph)