
Dictionary-Based Maximum Matching ("Mechanical") Word Segmentation

2017-05-18

Segmentation code. The segmenter below performs forward maximum matching: at each position it tries the longest dictionary word that starts with the current character, falling back to a single character (or a whole run of ASCII text) when nothing matches.


# -*- coding:utf-8 -*-

# A simple dictionary-based ("mechanical") forward maximum matching
# segmenter with support for Chinese text.

import string

__dict = {}

def load_dict(dict_file='words.dic'):
    # Load the word list into a dict: key = first character,
    # value = list of words starting with that character
    words = [line.split() for line in open(dict_file, encoding='utf-8')]
    for word in words:
        if not word:
            continue  # skip blank lines
        first_char = word[0][0]
        __dict.setdefault(first_char, [])
        __dict[first_char].append(word[0])
    # Sort each list by word length, longest first, so the longest match wins
    for first_char, twords in __dict.items():
        __dict[first_char] = sorted(twords, key=lambda x: len(x), reverse=True)

def __match_ascii(i, input):
    # Return the run of consecutive ASCII letters/digits/symbols starting
    # at i; ASCII text is kept as one token rather than segmented further
    result = ''
    for i in range(i, len(input)):
        if input[i] in string.printable:
            result += input[i]
        else:
            break
    return result.strip()

def __match_word(first_char, i, input):
    # Match one token at position i: ASCII runs are read whole,
    # Chinese words are looked up in the dictionary
    if not __dict.get(first_char):
        if first_char in string.printable:
            return __match_ascii(i, input)
        return first_char
    words = __dict[first_char]
    for word in words:
        if input[i:i + len(word)] == word:
            return word
    return first_char

def tokenize(input):
    # Segment the input string into a list of tokens
    if not input:
        return []
    tokens = []
    i = 0
    while i < len(input):
        first_char = input[i]
        matched_word = __match_word(first_char, i, input)
        if matched_word:
            tokens.append(matched_word)
            i += len(matched_word)
        else:
            i += 1  # __match_ascii stripped a pure-whitespace run; skip it
    return tokens

if __name__ == '__main__':
    def get_test_text():
        import requests
        url = "http://www.zhb.gov.cn/xxgk/gzdt/201703/t20170321_408538.shtml"
        #url = "http://mil.news.sina.com.cn/2016-12-30/doc-ifxzczff3445251.shtml"
        text = requests.get(url).content
        return text.decode('utf8')

    def load_dict_test():
        load_dict()
        for i, (first_char, words) in enumerate(__dict.items()):
            print('%d. %s: %s' % (i, first_char, ' '.join(words)))
            if i > 10:
                break

    def tokenize_test(text):
        load_dict()
        tokens = tokenize(text)
        for token in tokens:
            print(token)

    #load_dict_test()
    tokenize_test('美麗的花園裡有各種各樣的小動物')
    tokenize_test('他購買了一盒Rosetta Stone品牌的SHA-PA型號24/6的訂書釘,總價¥24.3元.')
    tokenize_test('1949年10月1日,毛主席站在天安門城樓上莊嚴宣佈:中華人民共和國中央人民政府成立了!')
    tokenize_test('A Happy New Yeear and a Merry Christmas')
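
A minimal sketch of how the segmenter might be exercised, assuming words.dic is a plain UTF-8 text file with one word per line. The post does not include its actual dictionary, so the entries below are hypothetical; the snippet is meant to run in the same module as the code above (or with load_dict and tokenize imported from it):

# Hypothetical dictionary: one word per line, UTF-8 encoded
with open('words.dic', 'w', encoding='utf-8') as f:
    f.write('美麗\n花園\n花\n各種各樣\n小動物\n動物\n')

load_dict()
print(tokenize('美麗的花園裡有各種各樣的小動物'))
# Because each entry list is sorted longest-first, 花園 beats 花 and
# 小動物 beats 動物, giving:
# ['美麗', '的', '花園', '裡', '有', '各種各樣', '的', '小動物']

Characters with no dictionary entry (的, 裡, 有) simply come out as single-character tokens, which is the expected fallback behavior of forward maximum matching.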