Python word segmentation, custom dictionary, stopwords, word frequency and TF-IDF weights, POS tagging and removal of selected POS
阿新 • Published: 2018-12-22
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 17 15:11:44 2018
@author: NAU
"""
############## Segmentation, custom dictionary, stopwords ################
import jieba

jieba.load_userdict('E:\\userdict.txt')   # custom dictionary
inputs = open('E:\\wdkb.txt', 'r')        # text to segment
outputs = open('E:\\wdkbfenci.txt', 'w')  # segmented output
# Read the stopword file into a set: testing membership against the raw file
# object would compare whole lines (newlines included) and only work on the first pass.
stopwords = set(line.strip() for line in open('E:\\stop.txt'))

def seg_sentence(sentence):  # segment one line and drop stopwords
    sentence_seged = jieba.cut(sentence.strip())
    outstr = ""
    for word in sentence_seged:
        if word not in stopwords and word != '\t':
            outstr += word
            outstr += " "
    return outstr

for line in inputs:  # segment the input file line by line
    line_seg = seg_sentence(line)
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()

############## Word frequency and TF-IDF weights ################
import jieba
import jieba.analyse

content = u'中國特色社會主義是我們黨領導的偉大事業,全面推進黨的建設新的偉大工程,是這一偉大事業取得勝利的關鍵所在。黨堅強有力,事業才能興旺發達,國家才能繁榮穩定,人民才能幸福安康。黨的十八大以來,我們黨堅持黨要管黨、從嚴治黨,凝心聚力、直擊積弊、扶正祛邪,黨的建設開創新局面,黨風政風呈現新氣象。習近平總書記圍繞從嚴管黨治黨提出一系列新的重要思想,為全面推進黨的建設新的偉大工程進一步指明瞭方向。'
keywords = jieba.analyse.extract_tags(content, topK=20, withWeight=True, allowPOS=())
for item in keywords:  # each item is a (word, weight) pair
    print(item[0], item[1])

############## TF-IDF top terms ################
inputs = open('C:\\Users\\NAU\\Desktop\\top.txt', 'r', encoding='utf8')           # segmented data
outputs = open('C:\\Users\\NAU\\Desktop\\top_feature.txt', 'w', encoding='utf8')  # output file
nagetive_top_words = inputs.read()  # read the data
inputs.close()                      # close the input file
tags = jieba.analyse.extract_tags(nagetive_top_words, topK=100, withWeight=True)  # TF-IDF call
print(str(tags) + '\n')  # print all top-100 terms at once
for i in tags:           # print and write one term per line
    print(i)
    outputs.write(str(i) + '\n')
outputs.close()

############## POS tagging ################
import jieba
import jieba.posseg as pseg

jieba.load_userdict('E:\\userdict.txt')
inputs = open('E:\\negetive_sentence.txt', 'r')
outputs = open('E:\\negetive_tag.txt', 'w')
negative = inputs.read()
lines = negative.strip().split('\n')

def seg_sentence(sentence):  # tag one line; pseg.cut yields word/flag pairs
    sentence_seged = pseg.cut(sentence.strip())
    outstr = ""
    for w in sentence_seged:
        if w.word != '\t':    # compare the word itself, not the pair object
            outstr += str(w)  # str(pair) renders as "word/flag"
            outstr += " "
    return outstr

for line in lines:  # tag the text line by line
    line_seg = seg_sentence(line)
    outputs.write(line_seg + '\n')
    print(line_seg + '\n')
outputs.close()
inputs.close()

############## POS removal ###############
import re

inputs = open('C:\\Users\\NAU\\Desktop\\data1.txt', 'r', encoding='utf8')
outputs = open('C:\\Users\\NAU\\Desktop\\data2.txt', 'w', encoding='utf8')
negative = inputs.readlines()
inputs.close()
txtlist = []
remove_word = ["/nz", "/zg", "/m"]  # POS tags to drop
for line in negative:  # drop every token carrying an unwanted POS tag
    line_list2 = re.split(' ', line)
    line_list = line_list2[:]  # remove from a copy so iteration is not disturbed
    for segs in line_list2:
        for k in remove_word:
            if k in segs:
                line_list.remove(segs)
                break
    txtlist.append(line_list)

resultlist = txtlist[:]
for sent in resultlist:
    for word in sent:  # strip the "/flag" marker, keep the word
        if "/" in word:
            slope = word.index("/")
            letter = word[0:slope] + " "
            outputs.write(letter)
            print(letter)
        else:
            outputs.write(word)
outputs.close()
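A note on the extract_tags calls above: allowPOS=() disables POS filtering, but that same jieba parameter can restrict keywords to chosen parts of speech up front, which often makes a separate POS-deletion pass unnecessary. A minimal sketch, assuming the segmented file E:\\wdkb.txt from the first section is UTF-8 encoded; 'n' (noun), 'vn' (verbal noun) and 'ns' (place name) are standard jieba POS flags:

import jieba.analyse

# Keep only keywords whose POS flag is in allowPOS; with withWeight=True
# extract_tags returns (word, weight) pairs sorted by TF-IDF weight.
text = open('E:\\wdkb.txt', 'r', encoding='utf8').read()
for word, weight in jieba.analyse.extract_tags(
        text, topK=20, withWeight=True, allowPOS=('n', 'vn', 'ns')):
    print(word, weight)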
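The last two sections write a tagged file to disk and then cut the "/flag" markers back off with string indexing. An alternative sketch that skips the round trip: the pair objects yielded by jieba.posseg.cut expose .word and .flag attributes, so unwanted POS can be dropped in a single pass. The input path and the dropped tags nz, zg and m are taken from the scripts above; the output path E:\\negetive_clean.txt is hypothetical:

import jieba.posseg as pseg

REMOVE_FLAGS = {'nz', 'zg', 'm'}  # same POS tags the script above removes

def strip_pos(sentence):
    # pseg.cut yields pairs with .word and .flag, so filtering by POS
    # needs no "word/flag" string surgery afterwards.
    return ' '.join(w.word for w in pseg.cut(sentence.strip())
                    if w.flag not in REMOVE_FLAGS)

with open('E:\\negetive_sentence.txt', 'r', encoding='utf8') as fin, \
     open('E:\\negetive_clean.txt', 'w', encoding='utf8') as fout:
    for line in fin:
        fout.write(strip_pos(line) + '\n')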