1. 程式人生 > >Python3實現文字預處理

Python3實現文字預處理

# -*- coding: utf-8 -*-

import pandas as pd
import jieba
from nltk.stem import WordNetLemmatizer

"""
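Purpose: stop-word filtering.
Parameters:
     filename: path to the stop-word file (one stop word per line)
     list_words_lemmatizer: list of token lists, one inner list per text
Returns:
     list_filter_stopwords: list of token lists with the stop words removed and the remaining tokens lowercased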
"""
def stopwords_filter(filename, list_words_lemmatizer):
    """Remove stop words from every token list and lowercase what remains.

    Parameters:
        filename: Path to the stop-word file, one stop word per line,
            encoded as UTF-8 (a leading BOM is tolerated).
        list_words_lemmatizer: Iterable of token lists; each inner list
            holds the string tokens of one text.

    Returns:
        list_filter_stopwords: A list with one inner list per input token
        list, containing the lowercased tokens that are not stop words.
        The order of texts and of tokens within a text is preserved.

    Raises:
        OSError: If the stop-word file cannot be opened.
        UnicodeDecodeError: If the stop-word file is not valid UTF-8.
    """
    # Decode explicitly instead of relying on the platform default encoding,
    # which is not UTF-8 everywhere and would garble non-ASCII stop words.
    # 'utf-8-sig' also drops a leading BOM so the first stop word still matches.
    with open(filename, 'r', encoding='utf-8-sig') as fr:
        # A set gives O(1) membership tests. Splitting on '\n' (rather than
        # splitlines) keeps the empty string in the set when the file ends
        # with a newline, so empty tokens continue to be filtered out.
        stop_words = set(fr.read().split('\n'))

    list_filter_stopwords = []
    for words in list_words_lemmatizer:
        word_list = []
        for word in words:
            lowered = word.lower()
            # Test both spellings: the raw token (as before) and its
            # lowercase form, so that e.g. "The" cannot slip past a stop
            # list containing "the" and end up in the output as "the".
            if word not in stop_words and lowered not in stop_words:
                word_list.append(lowered)
        list_filter_stopwords.append(word_list)
    return list_filter_stopwords

if __name__ == "__main__":
    # NOTE(review): word_split and word_lemmatizer are not defined in the
    # visible part of this file; they must be provided elsewhere.

    # Step 1: tokenize. Presumably returns the per-text token lists together
    # with the label list read from the spreadsheet -- confirm in word_split.
    list_word_split, category_labels = word_split("testdata.xls")
    print("分詞成功")

    # Step 2: lemmatize the tokens produced by step 1.
    list_words_lemmatizer = word_lemmatizer(list_word_split)
    print("詞性還原成功")

    # Step 3: drop the stop words listed in stopwords.txt.
    list_filter_stopwords = stopwords_filter(
        "stopwords.txt",
        list_words_lemmatizer,
    )
    print("停用詞過濾成功")