機器學習之模型評估與引數調優
阿新 • • 發佈:2018-12-09
一、流水線工作流
在利用訓練資料對模型進行擬合時已經得到一些引數,使用流水線可以避免在將模型用於新資料時重新設定這些引數。利用sklearn中的Pipeline類,使得我們可以擬合出包含任意多個處理步驟的模型,並將模型用於新資料的預測。
# Title     : TODO
# Objective : TODO
# Created by: Chen Da
# Created on: 2018/9/13

import pandas as pd
import numpy as np
import os
import time
import sys

# Load the breast cancer (WDBC) data set; the file has no header row, so
# columns are addressed by integer position.
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data", header=None)

# Build the feature matrix and the class-label vector:
# column 1 is used as the class label, columns 2 onward as the features
# (column 0 is left out; presumably a sample id -- TODO confirm).
from sklearn.preprocessing import LabelEncoder
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)  # encode the class labels as integers

# sklearn.cross_validation has been removed from scikit-learn;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0)
為了避免對訓練集和測試集上的資料分別進行模型擬合、資料轉換等操作,這裡通過流水線將標準化、PCA和LR迴歸封裝在一起。
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Chain standardisation, PCA (2 components) and logistic regression so the
# same fitted transformations are applied to training and test data.
# solver='liblinear' is set explicitly because the L1 penalty is not
# supported by the default solver of current scikit-learn releases.
pip_lr = Pipeline([('std', StandardScaler()),
                   ('PCA', PCA(n_components=2)),
                   ('LR', LogisticRegression(penalty='l1',
                                             solver='liblinear',
                                             random_state=0))])

pip_lr.fit(X_train, y_train)
print("Test score is {}".format(pip_lr.score(X_test, y_test)))

# Output recorded in the original post:
# Test score is 0.9385964912280702
二、k折交叉檢驗
通常情況下,我們將k折交叉驗證用於模型的調優,也就是找到使得模型泛化效能最優的超參值。一旦找到了滿意的超參值,我們就可以在全部的訓練資料上重新訓練模型,並使用獨立的測試資料集對模型效能做出最終評價。分層交叉檢驗是一種改進的方法,可以得到方差和偏差都較小的結果。
import numpy as np
# sklearn.cross_validation has been removed; the splitter now lives in
# sklearn.model_selection and takes the data in .split() instead of the
# constructor.
from sklearn.model_selection import StratifiedKFold  # stratified k-fold cross-validation

# random_state is omitted: no shuffling is requested, so it had no effect in
# the original call, and current releases reject it when shuffle is False.
kfold = StratifiedKFold(n_splits=10)

scores = []

for k, (train, test) in enumerate(kfold.split(X_train, y_train)):
    # Fit on the k-1 training folds ...
    pip_lr.fit(X_train[train], y_train[train])
    # ... and score on the held-out fold. The original listing scored on
    # X_train[train], which reports training accuracy, not a CV estimate.
    score = pip_lr.score(X_train[test], y_train[test])
    scores.append(score)
    print("Fold: {}; Class dist: {}; score: {}"
          .format(k+1, np.bincount(y_train[train]), score))

print("CV score is %.3f +/- %.3f" % (np.mean(scores), np.std(scores)))

# Output recorded in the original post. NOTE: it was produced while each fold
# was scored on its own training subset, so these are training accuracies;
# the corrected loop above will print different (held-out) scores.
# Fold: 1; Class dist: [261 148]; score: 0.960880195599022
# Fold: 2; Class dist: [261 148]; score: 0.960880195599022
# Fold: 3; Class dist: [261 148]; score: 0.9633251833740831
# Fold: 4; Class dist: [261 148]; score: 0.960880195599022
# Fold: 5; Class dist: [261 148]; score: 0.9755501222493888
# Fold: 6; Class dist: [261 149]; score: 0.9658536585365853
# Fold: 7; Class dist: [261 149]; score: 0.9609756097560975
# Fold: 8; Class dist: [261 149]; score: 0.9658536585365853
# Fold: 9; Class dist: [261 149]; score: 0.9585365853658536
# Fold: 10; Class dist: [261 149]; score: 0.9609756097560975
#
# CV score is 0.963 +/- 0.005
Sklearn中也封裝了k折交叉驗證的API:
import numpy as np
# sklearn.cross_validation has been removed; cross_val_score now lives in
# sklearn.model_selection.
from sklearn.model_selection import cross_val_score

# Let scikit-learn run the cross-validation loop; n_jobs=-1 uses all CPU cores.
# NOTE(review): cv is not passed, so the library's default number of folds is
# used, which is not the 10 folds of the manual loop -- confirm this is intended.
scores = cross_val_score(estimator=pip_lr,
                         X=X_train,
                         y=y_train,
                         n_jobs=-1)
print("CV score is %.3f +/- %.3f" % (np.mean(scores), np.std(scores)))

# Output recorded in the original post:
# CV score is 0.954 +/- 0.014
三、利用學習曲線判定方差和偏差
import matplotlib.pyplot as plt
import numpy as np
# sklearn.learning_curve has been removed; learning_curve now lives in
# sklearn.model_selection.
from sklearn.model_selection import learning_curve

# Rebuild the pipeline without PCA: standardisation followed by an
# L2-regularised logistic regression.
pip_lr = Pipeline([('std', StandardScaler()),
                   ('clf', LogisticRegression(
                       penalty='l2', random_state=1))])

# Evaluate the model on 10 evenly spaced training-set fractions (10% .. 100%),
# each with 10-fold cross-validation.
train_sizes, train_scores, test_scores = learning_curve(estimator=pip_lr,
                                                        X=X_train,
                                                        y=y_train,
                                                        train_sizes=np.linspace(0.1, 1, 10),
                                                        cv=10,
                                                        n_jobs=-1)

# Mean and standard deviation across the folds for every training-set size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5,
         label='training accuracy')  # fixed typo: was 'training sccuracy'
# Shaded band: mean +/- one standard deviation.
plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean,
         color='green', marker='*',
         markersize=5,
         label='testing accuracy')
plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 color='green', alpha=0.15)
plt.grid()
plt.xlabel('Numbers of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.ylim([0.7, 1.1])
plt.savefig('111.png')
plt.show()
還可以通過驗證曲線判定過擬合或者欠擬合來提高模型效能,與學習曲線不同的是,驗證曲線繪製的不是樣本大小與訓練準確率、測試準確率之間的函式關係,而是準確率與模型引數之間的關係。
import numpy as np
# sklearn.learning_curve has been removed; validation_curve now lives in
# sklearn.model_selection.
from sklearn.model_selection import validation_curve

param_range = [0.001, 0.01, 0.1, 1, 10, 100]
# Vary the C parameter of the pipeline's 'clf' step (the logistic regression's
# regularisation setting) and score each value with 10-fold cross-validation.
train_scores, test_scores = validation_curve(estimator=pip_lr,
                                             X=X_train,
                                             y=y_train,
                                             param_name='clf__C',
                                             param_range=param_range,
                                             cv=10)

# Mean and standard deviation across the folds for every value of C.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean,
         color='blue', marker='o',
         markersize=5,
         label='training accuracy')  # fixed typo: was 'training sccuracy'
# Shaded band: mean +/- one standard deviation.
plt.fill_between(param_range,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(param_range, test_mean,
         color='green', marker='*',
         markersize=5,
         label='testing accuracy')
plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 color='green', alpha=0.15)
plt.grid()
plt.xscale('log')  # logarithmic x axis, since C spans several orders of magnitude
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.ylim([0.7, 1.1])
plt.savefig('222.png')
plt.show()
關於調優超參的方法常用的還有網格搜尋(暴力窮舉)、巢狀交叉驗證。
四、其他常用的效能評價指標
1、混淆矩陣
# Compute the confusion matrix on the held-out test set
from sklearn.metrics import confusion_matrix

pip_lr.fit(X_train, y_train)
y_pred = pip_lr.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

# Output recorded in the original post:
# [[65 2]
# [ 2 45]]

# Compute precision, recall and the F1 score from the same predictions
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_true=y_test, y_pred=y_pred)
print("precision score is {}".format(precision))
recall = recall_score(y_true=y_test, y_pred=y_pred)
print("recall score is {}".format(recall))
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print("f1 score is {}".format(f1))

# Output recorded in the original post:
# precision score is 0.9574468085106383
# recall score is 0.9574468085106383
# f1 score is 0.9574468085106385
2、ROC曲線
這裡給出的是真正率-假正率曲線,也可以繪製精確率-召回率曲線。
import numpy as np
from sklearn.metrics import roc_curve, auc
# StratifiedKFold is imported here from sklearn.model_selection (the old
# sklearn.cross_validation module has been removed); the data is passed to
# .split() rather than to the constructor.
from sklearn.model_selection import StratifiedKFold

# Use only two feature columns for this ROC example.
X_train2 = X_train[:, [4, 14]]
# random_state is omitted: no shuffling is requested, so it had no effect in
# the original call, and current releases reject it when shuffle is False.
cv = StratifiedKFold(n_splits=3)  # stratified sampling
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
all_tpr = []

for i, (train, test) in enumerate(cv.split(X_train2, y_train)):
    # Fit on the training folds and predict class probabilities on the
    # held-out fold.
    probas = pip_lr.fit(X_train2[train],
                        y_train[train]).predict_proba(X_train2[test])
    fpr, tpr, thresholds = roc_curve(y_train[test],
                                     probas[:, 1],
                                     pos_label=1)
    # Accumulate this fold's ROC curve, interpolated onto the common FPR grid.
    # np.interp replaces scipy.interp, a removed alias of the same function.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0  # force the curve to start at the origin
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr,
             tpr,
             lw=1,
             label='ROC fold %d (area=%0.2f)'
                   % (i+1, roc_auc))

# Diagonal reference line.
plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label="random guessing")

# Average the accumulated curves over the folds; the current splitter has no
# len(), so the fold count comes from get_n_splits().
mean_tpr /= cv.get_n_splits()
mean_tpr[-1] = 1.0  # force the curve to end at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr,
         mean_tpr,
         'k--',
         label='mean ROC (area=%0.2f)' % mean_auc,
         lw=2)

# Reference line through (0, 0), (0, 1) and (1, 1).
plt.plot([0, 0, 1],
         [0, 1, 1],
         lw=2,
         linestyle=':',
         color='black',
         label='perfect performance')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc='best')
plt.savefig('333.png')
plt.show()