在使用 TfidfVectorizer 且不去掉停用詞的條件下，對文字特徵進行量化的樸素貝葉斯分類效能測試
阿新 • • 發佈:2019-02-03
# Naive Bayes text classification on the 20 Newsgroups corpus using TF-IDF
# features built by a default TfidfVectorizer (no stop-word list supplied).
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the corpus with fetch_20newsgroups' default arguments.
news = fetch_20newsgroups()

# Hold out 25% of the documents for evaluation; the fixed seed keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    news.data,
    news.target,
    test_size=0.25,
    random_state=33,
)

# Fit the TF-IDF vocabulary and weights on the training documents only, then
# reuse the fitted vectorizer to transform the held-out documents.
tfidf_vec = TfidfVectorizer()
x_tfidf_train = tfidf_vec.fit_transform(x_train)
x_tfidf_test = tfidf_vec.transform(x_test)

# Train a multinomial Naive Bayes classifier on the TF-IDF features.
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(x_tfidf_train, y_train)

# Report the score on the held-out split.
print(
    'The accuracy of classifying 20nesgroups with Naive Bayes(TfidVectorizer without filtering stopswords):',
    mnb_tfidf.score(x_tfidf_test, y_test),
)

# Per-class precision / recall / F1 for the same held-out predictions.
y_tfidf_predict = mnb_tfidf.predict(x_tfidf_test)
print(
    classification_report(
        y_test,
        y_tfidf_predict,
        target_names=news.target_names,
    )
)
執行結果如下:
The accuracy of classifying 20nesgroups with Naive Bayes(TfidVectorizer without filtering stopswords): 0.824673029339

                          precision    recall  f1-score   support

             alt.atheism       0.90      0.73      0.81       108
           comp.graphics       0.83      0.83      0.83       130
 comp.os.ms-windows.misc       0.93      0.67      0.78       163
comp.sys.ibm.pc.hardware       0.67      0.81      0.74       141
   comp.sys.mac.hardware       0.93      0.86      0.89       145
          comp.windows.x       0.89      0.86      0.87       141
            misc.forsale       0.96      0.67      0.79       159
               rec.autos       0.82      0.93      0.87       139
         rec.motorcycles       0.93      0.93      0.93       153
      rec.sport.baseball       0.95      0.93      0.94       141
        rec.sport.hockey       0.90      0.99      0.94       148
               sci.crypt       0.60      0.99      0.75       143
         sci.electronics       0.94      0.76      0.84       160
                 sci.med       0.99      0.84      0.90       158
               sci.space       0.89      0.90      0.89       149
  soc.religion.christian       0.53      0.98      0.68       157
      talk.politics.guns       0.77      0.93      0.84       134
   talk.politics.mideast       0.90      0.98      0.94       133
      talk.politics.misc       0.99      0.53      0.69       130
      talk.religion.misc       1.00      0.14      0.25        97

             avg / total       0.86      0.82      0.82      2829