# Python3 text preprocessing (article by 阿新, published 2018-12-30)
# -*- coding: utf-8 -*-
import pandas as pd
import jieba
from nltk.stem import WordNetLemmatizer


def stopwords_filter(filename, list_words_lemmatizer):
    """Filter stop words out of each tokenized document and lowercase the rest.

    Parameters:
        filename: path to the stop-word file, one word per line
                  (assumed lowercase and UTF-8 encoded — TODO confirm).
        list_words_lemmatizer: list of documents, each a list of
                  (lemmatized) word tokens.

    Returns:
        list_filter_stopwords: one list per input document containing the
        lowercased tokens that are not stop words.
    """
    # Read the stop-word file once; a set gives O(1) membership tests
    # instead of the original list's O(n) scan per token.
    with open(filename, 'r', encoding='utf-8') as fr:
        stop_words = set(fr.read().split('\n'))
    list_filter_stopwords = []
    for word_doc in list_words_lemmatizer:
        # Lowercase BEFORE the membership test: the original compared the
        # raw token against the (lowercase) stop-word list, so capitalized
        # stop words like "The" slipped through and were then emitted as
        # "the". Lowercasing first filters them correctly.
        word_list = [w.lower() for w in word_doc if w.lower() not in stop_words]
        list_filter_stopwords.append(word_list)
    return list_filter_stopwords


if __name__ == '__main__':
    # NOTE(review): word_split and word_lemmatizer are not defined in this
    # fragment — presumably defined elsewhere in the original article; verify
    # before running this script stand-alone.
    list_word_split, category_labels = word_split('testdata.xls')  # per-document tokens and labels
    print('分詞成功')
    list_words_lemmatizer = word_lemmatizer(list_word_split)  # lemmatize each token list
    print('詞性還原成功')
    list_filter_stopwords = stopwords_filter('stopwords.txt', list_words_lemmatizer)  # drop stop words
    print("停用詞過濾成功")