
Machine Learning: Text Classification (TF-IDF)

Text data is unstructured, so it must first be converted into a structured form. Typically the text is turned into a "document-term matrix", whose entries are raw term counts or TF-IDF weights.
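
For example, a minimal sketch using scikit-learn's CountVectorizer (the same class that appears in the script below) turns a toy two-document corpus into a 2-row document-term matrix:

from sklearn.feature_extraction.text import CountVectorizer

corpus = ['the cat sat on the mat', 'the dog chased the cat']  # toy corpus, for illustration only
vect = CountVectorizer()
dtm = vect.fit_transform(corpus)      # sparse document-term matrix
print(vect.get_feature_names_out())   # vocabulary = column labels (use get_feature_names() on scikit-learn < 1.0)
print(dtm.toarray())                  # one row of term counts per document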

The main idea of TF-IDF: if a word or phrase appears frequently in one article but rarely in other articles, it is considered to have good discriminative power between classes and is well suited for classification. TF-IDF = TF * IDF.

The idea behind IDF: the fewer documents that contain term t (i.e., the smaller n is), the larger the IDF, indicating that term t discriminates well between documents.

TF is the frequency of a given term within a document, i.e., its raw count normalized by the document's word count. IDF measures how important a term is: IDF = log(D/Dn), where the logarithm is base 2, D is the total number of documents, and Dn is the number of documents in which the term appears (the n above). For the derivation, see the discussion of TF-IDF in 《數學之美》 (The Beauty of Mathematics); IDF is in fact the cross-entropy of a keyword's probability distribution under particular conditions, a notion from information theory.
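
A worked example of the formula by hand (a minimal sketch, assuming the base-2 logarithm and the plain TF-IDF = TF * IDF definition above; note that scikit-learn's TfidfVectorizer uses a smoothed natural-log variant, so its values will differ):

import math

# Assume a corpus of D = 4 documents, and a term that appears in Dn = 2 of them
# and occurs 3 times in a 100-word document.
D, Dn = 4, 2
tf = 3 / 100             # TF: raw count normalized by document length = 0.03
idf = math.log2(D / Dn)  # IDF = log2(4 / 2) = 1.0
print(tf * idf)          # TF-IDF = 0.03 * 1.0 = 0.03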

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt

# 1) Load the data
categories = ['alt.atheism',
              'rec.sport.hockey',
              'comp.graphics',
              'sci.crypt',
              'comp.os.ms-windows.misc',
              'sci.electronics',
              'comp.sys.ibm.pc.hardware',
              'sci.med',
              'comp.sys.mac.hardware',
              'sci.space',
              'comp.windows.x',
              'soc.religion.christian',
              'misc.forsale',
              'talk.politics.guns',
              'rec.autos',
              'talk.politics.mideast',
              'rec.motorcycles',
              'talk.politics.misc',
              'rec.sport.baseball',
              'talk.religion.misc']
# Load the training data
train_path = '20news-bydate-train'
dataset_train = load_files(container_path=train_path, categories=categories)
# Load the evaluation data
test_path = '20news-bydate-test'
dataset_test = load_files(container_path=test_path, categories=categories)
# Compute term counts
count_vect = CountVectorizer(stop_words='english', decode_error='ignore')
X_train_counts = count_vect.fit_transform(dataset_train.data)
# Compute TF-IDF features
tf_transformer = TfidfVectorizer(stop_words='english', decode_error='ignore')
X_train_counts_tf = tf_transformer.fit_transform(dataset_train.data)
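# (Optional sanity check, not part of the original script.) Both matrices have the
# same shape: one row per training document, one column per vocabulary term; only
# the cell values differ (raw counts vs. TF-IDF weights).
print(X_train_counts.shape)
print(X_train_counts_tf.shape)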

# Baseline for algorithm evaluation
'''Compare the accuracy of the algorithms using 10-fold cross-validation'''
num_folds = 10
seed = 7
scoring = 'accuracy'
# Evaluate the algorithms
models = {}
models['LR'] = LogisticRegression()  # Logistic regression
models['SVM'] = SVC()  # Support vector machine
models['CART'] = DecisionTreeClassifier()  # Classification and regression tree
models['MNB'] = MultinomialNB()  # Multinomial naive Bayes
models['KNN'] = KNeighborsClassifier()  # K-nearest neighbors
results = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(models[key], X_train_counts_tf, dataset_train.target, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))
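
# Visual comparison of the cross-validation score distributions (a sketch using the
# matplotlib import above; the plot itself is an assumption, not in the original script)
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(models.keys())
plt.show()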

# Tune the logistic regression hyper-parameters
'''The hyper-parameter of logistic regression is C; the smaller C is, the stronger the regularization'''
param_grid = {}
param_grid['C'] = [0.1,5,13,15]
model = LogisticRegression()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train_counts_tf, y=dataset_train.target)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
# Tune the naive Bayes hyper-parameters
param_grid = {}
param_grid['alpha'] = [0.001,0.01,0.1,1.5]
model = MultinomialNB()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train_counts_tf, y=dataset_train.target)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))

# Ensemble methods
ensembles = {}
ensembles['RF'] = RandomForestClassifier()  # Random forest
ensembles['AB'] = AdaBoostClassifier()  # AdaBoost
results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(ensembles[key], X_train_counts_tf, dataset_train.target, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))
# Tune the ensemble hyper-parameters
param_grid = {}
param_grid['n_estimators'] = [10,100,150,200]
model = RandomForestClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train_counts_tf, y=dataset_train.target)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))

# Build the final model
model = LogisticRegression(C=13)
model.fit(X_train_counts_tf, dataset_train.target)
# Transform the test data with the vectorizer fitted on the training set
X_test_counts = tf_transformer.transform(dataset_test.data)
predictions = model.predict(X_test_counts)
print(accuracy_score(dataset_test.target, predictions))
print(classification_report(dataset_test.target, predictions))