Python文字特徵及分類
阿新 • • 發佈:2019-01-24
1、情感分析
# 簡單的例子
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
# Sample sentences for the sentiment demo; text1-text3 are labelled 1 and
# text4-text5 are labelled 0 when the training set is built further down.
text1, text2, text3 = (
    'I like the movie so much!',
    'That is a good movie.',
    'This is a great one.',
)
text4, text5 = (
    'That is a really bad movie.',
    'This is a terrible movie.',
)
def proc_text(text):
    """
    Preprocess a piece of English text into an NLTK feature dict.

    The text is tokenized, every token is lemmatized with WordNet, and
    tokens found in NLTK's English stop word list are dropped.

    :param text: raw text string
    :return: dict mapping each remaining token to True (the feature format
             consumed by the NLTK classifier used in this file)
    """
    # Fetch the stop word list once per call instead of once per token,
    # and keep it in a set for constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))

    # Tokenize.
    raw_words = nltk.word_tokenize(text)

    # Normalize word forms.
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(raw_word) for raw_word in raw_words)

    # Drop stop words; True marks the word as present in the text.
    return {lemma: True for lemma in lemmas if lemma not in english_stopwords}
# Build the training samples: each entry is [feature dict, label].
labelled_texts = [(text1, 1), (text2, 1), (text3, 1), (text4, 0), (text5, 0)]
train_data = [[proc_text(sample), label] for sample, label in labelled_texts]
print(train_data)

# Train the model.
nb_model = NaiveBayesClassifier.train(train_data)

# Classify an unseen sentence and print the predicted label.
text6 = 'That is a not bad one.'
text6_features = proc_text(text6)
print(nb_model.classify(text6_features))
2、文字相似度
import nltk
from nltk import FreqDist
# Sample sentences; all but the last end with a space so they can be
# concatenated without gluing words together.
text1, text2, text3, text4, text5 = (
    'I like the movie so much ',
    'That is a good movie ',
    'This is a great one ',
    'That is a really bad movie ',
    'This is a terrible movie',
)

# Concatenate everything into one string and count token frequencies.
text = ''.join([text1, text2, text3, text4, text5])
words = nltk.word_tokenize(text)
freq_dist = FreqDist(words)
print(freq_dist['That'])
print(freq_dist)

# Build the "common word list" from the n = 5 most frequent tokens.
n = 5
most_common_words = freq_dist.most_common(n)
print(most_common_words)
def lookup_pos(most_common_words):
    """
    Map each common word to its position in the list.

    :param most_common_words: sequence of entries whose first element is the
                              word, e.g. the (word, count) pairs produced by
                              ``most_common``
    :return: dict of word -> zero-based index in ``most_common_words``; if a
             word occurs more than once, its last index wins
    """
    return {entry[0]: pos for pos, entry in enumerate(most_common_words)}
# Record the vector slot of every common word.
std_pos_dict = lookup_pos(most_common_words)
print(std_pos_dict)

# New text to vectorize.
new_text = 'That one is a good movie. This is so good!'

# Initialize the vector with one zeroed slot per common word.
freq_vec = [0] * n

# Tokenize.
new_words = nltk.word_tokenize(new_text)

# Count occurrences of the words that appear in the common word list;
# a dict supports membership tests directly, no key list is needed.
for new_word in new_words:
    if new_word in std_pos_dict:
        freq_vec[std_pos_dict[new_word]] += 1
print(freq_vec)

# Kept from the original: new_text is rebound after the vector is printed.
new_text = 'That one is a good movie.'
3、文字分類及TF-IDF
# 3.1 NLTK中的TF-IDF
from nltk.text import TextCollection
import nltk

text1 = 'I like the movie so much '
text2 = 'That is a good movie '
text3 = 'This is a great one '
text4 = 'That is a really bad movie '
text5 = 'This is a terrible movie'

# Build the TextCollection from token lists. Given raw strings it would
# match terms as substrings and divide by the character count instead of
# the token count, so every text is tokenized first.
tokenized_texts = [nltk.word_tokenize(sample)
                   for sample in (text1, text2, text3, text4, text5)]
tc = TextCollection(tokenized_texts)

new_text = 'That one is a good movie. This is so good!'
word = 'That'

# The text passed to tf_idf must be tokenized for the same reason.
tf_idf_val = tc.tf_idf(word, nltk.word_tokenize(new_text))
print('{}的TF-IDF值為:{}'.format(word, tf_idf_val))
# 3.2 sklearn中的TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
feat = vectorizer.fit_transform([text1, text2, text3, text4, text5])

# Materialize the dense array once. The original wrote
# print(feat_array = feat.toarray()), which raises TypeError and never
# binds feat_array.
feat_array = feat.toarray()
print(feat_array)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    print(vectorizer.get_feature_names_out())
else:
    print(vectorizer.get_feature_names())

print(feat_array.shape)
# TF-IDF rows of the first two documents.
print(feat_array[0:2, :])
# 3.3 中文TF-IDF
import os
import re
import jieba.posseg as pseg
# Five Chinese review strings used as the corpus for the Chinese TF-IDF demo.
ch_text1 = ' 非常失望,劇本完全敷衍了事,主線劇情沒突破大家可以理解,可所有的人物都缺乏動機,正邪之間、婦聯內部都沒什麼火花。團結-分裂-團結的三段式雖然老套但其實也可以利用積攢下來的形象魅力搞出意思,但劇本寫得非常膚淺、平面。場面上排程混亂呆板,滿屏的鐵甲審美疲勞。只有笑點算得上差強人意。'
ch_text2 = ' 2015年度最失望作品。以為面面俱到,實則畫蛇添足;以為主題深刻,實則老調重彈;以為推陳出新,實則俗不可耐;以為場面很high,實則high勁不足。氣!上一集的趣味全無,這集的笑點明顯刻意到心虛。全片沒有任何片段給我有緊張激動的時候,太弱了,跟奧創一樣。'
ch_text3 = ' 《鐵人2》中勾引鋼鐵俠,《婦聯1》中勾引鷹眼,《美隊2》中勾引美國隊長,在《婦聯2》中終於……跟綠巨人表白了,黑寡婦用實際行動告訴了我們什麼叫忠貞不二;而且為了治療不孕不育連作戰武器都變成了兩支驗孕棒(堅決相信快銀沒有死,後面還得回來)'
ch_text4 = ' 雖然從頭打到尾,但是真的很無聊啊。'
ch_text5 = ' 劇情不如第一集好玩了,全靠密集笑點在提神。僧多粥少的直接後果就是每部寡姐都要換著隊友談戀愛,這特麼比打鬥還辛苦啊,真心求放過~~~(結尾彩蛋還以為是洛基呢,結果我呸!)'
# Collected in order so they can be preprocessed together.
ch_texts = [ch_text1, ch_text2, ch_text3, ch_text4, ch_text5]
# Load the stop word lists.
stop_words_path = './stop_words/'


def _read_stopword_file(file_name):
    """Return the right-stripped lines of one UTF-8 file under stop_words_path."""
    # The context manager closes the file; the original left it open.
    with open(os.path.join(stop_words_path, file_name), 'r', encoding='utf-8') as stop_file:
        return [line.rstrip() for line in stop_file]


stopwords1 = _read_stopword_file('中文停用詞庫.txt')
stopwords2 = _read_stopword_file('哈工大停用詞表.txt')
stopwords3 = _read_stopword_file('四川大學機器智慧實驗室停用詞庫.txt')

# Concatenate the three lists (duplicates are kept) and report the total.
stopwords = stopwords1 + stopwords2 + stopwords3
print(len(stopwords))
def proc_text(raw_line):
    """
    Process one piece of Chinese text and return its segmentation result.

    Every character outside the range U+4E00-U+9FD5 is removed, the rest is
    segmented and POS-tagged with jieba, and only words that are not in the
    module-level ``stopwords`` list and carry one of the kept POS tags are
    retained.

    :param raw_line: raw text string
    :return: the retained words joined by single spaces
    """
    # 1. Strip everything that is not in the kept character range.
    filter_pattern = re.compile('[^\u4E00-\u9FD5]+')
    chinese_only = filter_pattern.sub('', raw_line)

    # 2. jieba segmentation with part-of-speech tagging.
    word_list = pseg.cut(chinese_only)

    # 3. Drop stop words and keep only the wanted POS tags (the original
    #    comment describes them as verbs, adjectives and adverbs).
    used_flags = {'v', 'a', 'ad'}
    # Build the lookup set once per call: the stop word list is long and
    # a list membership test would scan it for every token.
    stopword_set = set(stopwords)
    meaningful_words = [
        word
        for word, flag in word_list
        if word not in stopword_set and flag in used_flags
    ]
    return ' '.join(meaningful_words)
# Preprocess every review into a space-separated string of kept words.
corpus = [proc_text(ch_text) for ch_text in ch_texts]
print(corpus)

ch_vectorizer = TfidfVectorizer()
ch_feats = ch_vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older releases.
if hasattr(ch_vectorizer, 'get_feature_names_out'):
    print(ch_vectorizer.get_feature_names_out())
else:
    print(ch_vectorizer.get_feature_names())

# TF-IDF vector of the first review.
print(ch_feats.toarray()[0, :])