1. 程式人生 > >機器學習之模型評估與引數調優

機器學習之模型評估與引數調優

一、流水線工作流

       在利用訓練資料對模型進行擬合時已經得到一些引數,使用流水線可以避免在將模型用於新資料時重新設定這些引數。利用sklearn中的Pipeline類,我們可以擬合出包含任意多個處理步驟的模型,並將模型用於新資料的預測。

# Title     : TODO
# Objective : TODO
# Created by: Chen Da
# Created on: 2018/9/13

import os
import sys
import time

import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the breast cancer data set. The file has no header row, so the
# columns are addressed by integer position below.
WDBC_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
df = pd.read_csv(WDBC_URL, header=None)

# Build the feature matrix (columns 2 onward) and the class vector (column 1).
X = df.loc[:, 2:].values
raw_labels = df.loc[:, 1].values

# Encode the class labels as integers.
le = LabelEncoder()
y = le.fit_transform(raw_labels)

# Hold out 20% of the samples as the test set; the seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

為了避免對訓練集和測試集上的資料分別進行模型擬合、資料轉換等操作,這裡通過流水線將標準化、PCA和LR迴歸封裝在一起。

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Chain standardisation, a 2-component PCA and an L1-penalised logistic
# regression so that they are fitted and applied as one estimator.
pipeline_steps = [
    ('std', StandardScaler()),
    ('PCA', PCA(n_components=2)),
    ('LR', LogisticRegression(penalty='l1', random_state=0)),
]
pip_lr = Pipeline(pipeline_steps)

# Fit every step on the training split, then score on the held-out test split.
pip_lr.fit(X_train, y_train)
test_score = pip_lr.score(X_test, y_test)
print("Test score is {}".format(test_score))
12.	
13.	Test score is 0.9385964912280702 

二、k折交叉檢驗

       通常情況下,我們將k折交叉驗證用於模型的調優,也就是找到使得模型泛化效能最優的超參值。一旦找到了滿意的超參值,我們就可以在全部的訓練資料上重新訓練模型,並使用獨立的測試資料集對模型效能做出最終評價。分層k折交叉驗證是對標準k折交叉驗證的改進:它在每個分塊中保持各類別的比例與整體訓練集一致,在類別比例不均衡時尤其可以得到方差和偏差都較小的評估結果。

from sklearn.cross_validation import StratifiedKFold    # stratified k-fold cross-validation

# NOTE(review): this is the pre-0.18 scikit-learn API (labels and n_folds are
# passed to the constructor and the object itself is iterated). Newer
# releases expose StratifiedKFold under sklearn.model_selection with a
# different signature -- confirm the target version before migrating.
kfold = StratifiedKFold(y=y_train,
                        n_folds=10,
                        random_state=0)

scores = []

for k, (train, test) in enumerate(kfold):
    # Fit on the training part of this split ...
    pip_lr.fit(X_train[train], y_train[train])
    # ... and score on the held-out fold. Scoring on X_train[train] would
    # report training accuracy, which is not a cross-validation estimate.
    score = pip_lr.score(X_train[test], y_train[test])
    scores.append(score)
    # "Class dist" is the class distribution of the training part of the split.
    print("Fold: {}; Class dist: {}; score: {}"
          .format(k+1, np.bincount(y_train[train]), score))

# NOTE: the sample output recorded after this listing was produced while the
# loop still scored the training folds, so its numbers will not match what
# this corrected loop prints.
print("CV score is %.3f +/- %.3f" % (np.mean(scores), np.std(scores)))
19.	
20.	
21.	Fold: 1; Class dist: [261 148]; score: 0.960880195599022
22.	Fold: 2; Class dist: [261 148]; score: 0.960880195599022
23.	Fold: 3; Class dist: [261 148]; score: 0.9633251833740831
24.	Fold: 4; Class dist: [261 148]; score: 0.960880195599022
25.	Fold: 5; Class dist: [261 148]; score: 0.9755501222493888
26.	Fold: 6; Class dist: [261 149]; score: 0.9658536585365853
27.	Fold: 7; Class dist: [261 149]; score: 0.9609756097560975
28.	Fold: 8; Class dist: [261 149]; score: 0.9658536585365853
29.	Fold: 9; Class dist: [261 149]; score: 0.9585365853658536
30.	Fold: 10; Class dist: [261 149]; score: 0.9609756097560975
31.	
32.	
33.	CV score is 0.963 +/- 0.005

Sklearn中也封裝了k折交叉驗證的API:

from sklearn.cross_validation import cross_val_score

# Let scikit-learn run the fold loop itself. No cv argument is given, so the
# library default number of folds is used; n_jobs=-1 requests all CPU cores.
scores = cross_val_score(pip_lr, X_train, y_train, n_jobs=-1)

cv_mean = np.mean(scores)
cv_std = np.std(scores)
print("CV score is {:.3f} +/- {:.3f}".format(cv_mean, cv_std))
8.	
9.	CV score is 0.954 +/- 0.014

三、利用學習曲線判定方差和偏差

import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve

# Pipeline for the learning-curve study: standardisation followed by an
# L2-penalised logistic regression (no PCA step). The classifier step is
# named 'clf'.
pip_lr = Pipeline([('std', StandardScaler()),
                   ('clf', LogisticRegression(
                                penalty='l2', random_state=1))])

# Evaluate the pipeline with 10-fold cross-validation at ten training-set
# sizes, evenly spaced from 10% to 100% of the training data.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pip_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1, 10),
    cv=10,
    n_jobs=-1)

# Reduce along axis 1 to get one mean and one standard deviation per
# training-set size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Training curve with a +/- one standard deviation band.
plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5,
         label='training accuracy')   # legend typo ('sccuracy') corrected
plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

# Held-out (cross-validation) curve with its band.
plt.plot(train_sizes, test_mean,
         color='green', marker='*',
         markersize=5,
         label='testing accuracy')
plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 color='green', alpha=0.15)

plt.grid()
plt.xlabel('Numbers of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.ylim([0.7, 1.1])
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('111.png')
plt.show()

還可以通過驗證曲線判定過擬合或者欠擬合來提高模型效能,與學習曲線不同的是,驗證曲線繪製的不是樣本大小與訓練準確率、測試準確率之間的函式關係,而是準確率與模型引數之間的關係。

from sklearn.learning_curve import validation_curve

# Values tried for the logistic-regression regularisation parameter C.
# 'clf__C' addresses parameter C of the pipeline step named 'clf'.
param_range = [0.001, 0.01, 0.1, 1, 10, 100]
train_scores, test_scores = validation_curve(estimator=pip_lr,
                                             X=X_train,
                                             y=y_train,
                                             param_name='clf__C',
                                             param_range=param_range,
                                             cv=10)

# Reduce along axis 1 to get one mean and one standard deviation per value
# of C.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Training curve with a +/- one standard deviation band.
plt.plot(param_range, train_mean,
         color='blue', marker='o',
         markersize=5,
         label='training accuracy')   # legend typo ('sccuracy') corrected
plt.fill_between(param_range,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

# Held-out (cross-validation) curve with its band.
plt.plot(param_range, test_mean,
         color='green', marker='*',
         markersize=5,
         label='testing accuracy')
plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 color='green', alpha=0.15)

plt.grid()
plt.xscale('log')   # C spans several orders of magnitude, so use a log x-axis
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.ylim([0.7, 1.1])
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('222.png')
plt.show()

關於調優超參的方法常用的還有網格搜尋(暴力窮舉)、巢狀交叉驗證。

四、其他常用的效能評價指標

1、混淆矩陣

# Compute the confusion matrix on the test split.
from sklearn.metrics import confusion_matrix

# fit() returns the fitted pipeline, so the prediction can be chained onto it.
y_pred = pip_lr.fit(X_train, y_train).predict(X_test)
confmat = confusion_matrix(y_test, y_pred)
print(confmat)
7.	
8.	[[65  2]
9.	 [ 2 45]]
10.	  
# Report precision, recall and F1 score for the test-set predictions.
from sklearn.metrics import precision_score, recall_score, f1_score

for metric_name, metric_fn in (("precision", precision_score),
                               ("recall", recall_score),
                               ("f1", f1_score)):
    metric_value = metric_fn(y_true=y_test, y_pred=y_pred)
    print("{} score is {}".format(metric_name, metric_value))
17.	
18.	precision score is 0.9574468085106383
19.	recall score is 0.9574468085106383
20.	f1 score is 0.9574468085106385

2、ROC曲線

       這裡給出的是真正率-假正率曲線(即ROC曲線),也可以繪製精確率-召回率曲線。

from sklearn.metrics import roc_curve, auc

# Use only two feature columns (indices 4 and 14) for this ROC study.
X_train2 = X_train[:, [4, 14]]

# Three stratified folds over the training labels (pre-0.18 scikit-learn API:
# the object is iterated directly and len(cv) is the number of folds).
cv = StratifiedKFold(y_train,
                     n_folds=3,
                     random_state=0)
fig = plt.figure(figsize=(7, 5))

# Common false-positive-rate grid on which every fold's curve is resampled,
# so the curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
all_tpr = []

for i, (train, test) in enumerate(cv):
    # Fit on the training part of the split and get class probabilities for
    # the held-out fold.
    probas = pip_lr.fit(X_train2[train],
                        y_train[train]).predict_proba(X_train2[test])
    # Column 1 of probas is used as the score for the positive class (label 1).
    fpr, tpr, thresholds = roc_curve(y_train[test],
                                     probas[:, 1],
                                     pos_label=1)
    # Accumulate this fold's curve, interpolated onto the common grid.
    # np.interp replaces the deprecated `from scipy import interp` alias.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr,
             tpr,
             lw=1,
             label='ROC fold %d (area=%0.2f)'
                    % (i+1, roc_auc))

# Diagonal: the expected curve of a random classifier.
plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label="random guessing")

# Average over the folds and pin the end points to (0, 0) and (1, 1).
mean_tpr /= len(cv)
mean_tpr[0] = 0.0
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr,
         mean_tpr,
         'k--',
         label='mean ROC (area=%0.2f)' % mean_auc,
         lw=2)

# Upper-left corner path: the curve of a perfect classifier.
plt.plot([0, 0, 1],
         [0, 1, 1],
         lw=2,
         linestyle=':',
         color='black',
         label='perfect performance')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc='best')
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('333.png')
plt.show()