jieba 去除停用詞、提取關鍵詞、詞性標註
阿新 • 發佈:2019-01-10
# -*- coding: utf-8 -*-
"""Cut Chinese news text with jieba, drop stop words, and extract the top
keywords of each article (Python 3 port of the original Python 2 script;
the ``reload(sys)``/``setdefaultencoding`` hack is obsolete in Python 3,
where files are opened with an explicit encoding instead).
"""
import jieba
import jieba.analyse

# Number of keywords to extract per article.  The original defined this
# constant but then hard-coded 10 at the call site; it is now actually used.
topk = 10

# Input corpus: one article per line, fields separated by '\001'
# (field 1 = article id, field 2 = body text -- inferred from usage below;
# TODO confirm against the producer of this file).
testGuanJian = open(r'D:\PythonFiles\CINS\201722.news_zhengwen', 'r',
                    encoding='utf-8')

# Load the stop-word list once.  Kept as a list for backward compatibility
# with any external user of ``stopkey``; a set copy gives O(1) membership
# tests instead of O(n) list scans per token.
with open(r'D:\PythonFiles\files\stopword.txt', 'r', encoding='utf-8') as fenci:
    stopkey = [line.strip() for line in fenci]
_stopword_set = set(stopkey)


def jiebaData(rtitleContentComment):
    """Segment *rtitleContentComment* with jieba (accurate mode) and return
    the non-stop-word tokens joined by single spaces, with a trailing
    space -- the exact format the original produced.

    :param rtitleContentComment: raw text of one article.
    :return: space-separated filtered tokens (``str``).
    """
    words = jieba.cut(rtitleContentComment, cut_all=False)
    # join() replaces the original quadratic ``stayed_line += word + " "``.
    return "".join(word + " " for word in words
                   if word not in _stopword_set)


def testJieBaGuanJianCi():
    """Process every article in *testGuanJian*: strip stop words, extract
    the top ``topk`` keywords, and write ``<id>\\x01<kw1> <kw2> ... \\n``
    lines to the output file.

    Malformed lines (e.g. fewer than three '\001' fields) are logged and
    skipped, preserving the original best-effort behaviour.
    """
    print('start')
    # ``with`` guarantees the output file is closed even if a line raises,
    # unlike the original's manual f.close() after the loop.
    with open(r'D:\PythonFiles\CINS\test\cungj.news_zhengwen', 'w',
              encoding='utf-8') as f:
        for line in testGuanJian:
            try:
                fields = line.strip().split('\001')
                body = fields[2]          # raises IndexError on short lines
                print(body)               # debug output kept from original
                stayed_line = jiebaData(body)
                print(stayed_line)
                keywords = jieba.analyse.extract_tags(stayed_line, topk)
                # Rebuild the original "kw1 kw2 ... " format (trailing
                # space) without shadowing the ``str`` builtin.
                f.write(fields[1] + '\001')
                f.write("".join(kw + ' ' for kw in keywords) + '\n')
            except Exception as exc:
                # Best-effort: log the bad line and continue, as before.
                print(exc, 'er')


# Entry point left disabled, matching the original commented-out call.
# testJieBaGuanJianCi()