Python機器學習庫sklearn網格搜尋與交叉驗證
阿新 • • 發佈:2019-02-01
網格搜尋一般是針對模型的超引數(hyperparameter)進行尋優,交叉驗證則是用來評估訓練模型的擬合程度(泛化能力)。sklearn中的相關API如下:
(1)交叉驗證的首要工作:切分資料集train/validation/test
A.)沒指定資料切分方式,直接選用cross_val_score按預設切分方式進行交叉驗證並取得評估得分,如下列程式碼所示:
# Cross-validate a logistic-regression classifier on the iris data without
# building a splitter object: the folds are derived from the integer cv.
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
iris = load_iris()
logreg = LogisticRegression()
scores = cross_val_score(logreg, iris.data, iris.target, cv=5 )
# cv=5 is passed explicitly here. When cv is omitted, scikit-learn used 3 folds
# before version 0.22 and uses 5 folds since. The folds are cut from the data
# passed in (iris.data / iris.target); no separate train/test split is needed.
scores
#Output:
#array([ 1. , 0.96666667, 0.93333333, 0.9 , 1. ])
B.)K折交叉驗證KFold
# K-fold cross-validation with an explicit KFold splitter: 5 folds, no shuffling.
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
iris = load_iris()
kfold = KFold(n_splits=5)
# cross_val_score and logreg are not defined in this snippet; they come from
# the cross_val_score example earlier in the file.
cross_val_score(logreg, iris.data, iris.target, cv=kfold)
#Output:
#array([ 1. , 0.93333333, 0.43333333, 0.96666667, 0.43333333])
# NOTE(review): two folds score only 0.43 -- presumably because the iris
# samples are stored ordered by class, so unshuffled folds are not
# representative of all classes; confirm against the dataset.
# Same K-fold evaluation, but with 3 folds and shuffling enabled.
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
# shuffle=True adds a random perturbation: the sample order is shuffled before
# the data is cut into k folds.
# iris, logreg and cross_val_score come from the earlier snippets in the file.
cross_val_score(logreg, iris.data, iris.target, cv=kfold)
#Output:
#array([ 0.9 , 0.96, 0.96])
C.)留一交叉驗證LeaveOneOut(工業實踐很少用)
# Leave-one-out cross-validation: the recorded output below shows 150
# iterations, i.e. one per iris sample.
from sklearn.datasets import load_iris
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
# iris, logreg and cross_val_score come from the earlier snippets in the file.
scores = cross_val_score(logreg, iris.data, iris.target, cv=loo)
# The cv parameter determines the cross-validation splitting strategy; here it
# is an object used as a cross-validation generator.
print("number of cv iterations: ", len(scores))
print("mean accuracy: ", scores.mean())
#Output:
#number of cv iterations: 150
#mean accuracy: 0.953333333333
D.)亂序分割交叉驗證ShuffleSplit
# Shuffle-split cross-validation: 10 independent random 50% / 50% train/test
# splits, made reproducible with random_state=0.
from sklearn.datasets import load_iris
from sklearn.model_selection import ShuffleSplit
# Random splits do not guarantee that all folds will be different, although
# this is still very likely for sizeable datasets.
shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10,random_state=0)
# iris, logreg and cross_val_score come from the earlier snippets in the file.
cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
#Output:
#array([ 0.84 , 0.93333333, 0.90666667, 1. , 0.90666667,
# 0.93333333, 0.94666667, 1. , 0.90666667, 0.88 ])
E.)資料與分組交叉驗證GroupKFold
The same group will not appear in two different folds (the number of distinct groups has to be at least equal to the number of folds).
# GroupKFold demo: split four samples that belong to two groups (0 and 2) into
# two folds and print the train/test indices and data of every split.
import numpy as np  # np.array is used below; the import was missing here
from sklearn.model_selection import GroupKFold

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
groups = np.array([0, 0, 2, 2])
gkf = GroupKFold(n_splits=2)
# Everything below the "for" line belongs to the loop body: the recorded
# output shows the whole set of prints once per split.
for train_index, test_index in gkf.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print('X_train:', X_train)
    print('X_test:', X_test)
    print('y_train:', y_train)
    print('y_test:', y_test)
#Output:
#TRAIN: [0 1] TEST: [2 3]
#X_train: [[1 2]
# [3 4]]
#X_test: [[5 6]
# [7 8]]
#y_train: [1 2]
#y_test: [3 4]
#TRAIN: [2 3] TEST: [0 1]
#X_train: [[5 6]
# [7 8]]
#X_test: [[1 2]
# [3 4]]
#y_train: [3 4]
#y_test: [1 2]
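F.)按樣本的標籤分層切分StratifiedKFold:每一折中各類別的樣本比例與整體資料集保持一致;用法與KFold相同,例如 cross_val_score(logreg, iris.data, iris.target, cv=StratifiedKFold(n_splits=5))。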
(2.)有關模型的引數調優過程,即網格搜尋/交叉驗證
a.)最簡單的網格搜尋:兩層for迴圈
# Naive grid search: try every (gamma, C) pair with two nested loops and keep
# the pair with the highest accuracy. The score is measured on the test set
# itself, so the reported best score is an optimistic estimate.
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
print("Size of training set: %d size of test set: %d" % (X_train.shape[0], X_test.shape[0]))
best_score = 0
# Initialised so the final print cannot hit an unbound name if no score beats 0.
best_parameters = {}
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVC
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the test set
        score = svm.score(X_test, y_test)
        # strict ">" keeps the first pair that reaches the best score
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
print("best score: ", best_score)
print("best parameters: ", best_parameters)
#Output:
#Size of training set: 112 size of test set: 38
#best score: 0.973684210526
#best parameters: {'gamma': 0.001, 'C': 100}
在訓練集上再切出來一部分作為驗證集:用驗證集挑選超引數,測試集只用於最終評估,避免依測試集調參而高估模型表現(對測試集過擬合)
# Grid search with a held-out validation set: hyperparameters are chosen on the
# validation set, and the test set is touched only once for the final score.
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

iris = load_iris()
# whole data -> (train + validation) set and test set
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, random_state=0)
# (train + validation) set -> training set and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)
print("Size of training set: %d size of validation set: %d size of test set: %d"
      % (X_train.shape[0], X_valid.shape[0], X_test.shape[0]))
best_score = 0
# Initialised so the refit below falls back to SVC defaults if no score beats 0.
best_parameters = {}
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVC on the training set
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the validation set, never on the test set
        score = svm.score(X_valid, y_valid)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
# rebuild a model on the combined training and validation set with the
# selected parameters, then evaluate it once on the untouched test set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
print("best score: ", best_score)
print("best parameters: ", best_parameters)
print("test set score with best parameters: ", test_score)
#Output (sizes follow from two default 75% / 25% splits of the 150 samples;
#the scores depend on the split and are not reproduced here):
#Size of training set: 84 size of validation set: 28 size of test set: 38
b.)網格搜尋內部嵌套了交叉驗證
# Grid search where every (gamma, C) pair is scored by 5-fold cross-validation
# on the train+validation set; the best pair is then refit on that whole set
# and evaluated once on the test set.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split  # used below; was not imported in this snippet

iris = load_iris()
# whole data -> (train + validation) set and test set
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, random_state=0)
# (train + validation) set -> training set and validation set. These four
# names are not used by the search below, which cross-validates on X_trainval.
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)
best_score = 0
# Initialised so SVC(**best_parameters) falls back to defaults if no score beats 0.
best_parameters = {}
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)
        # cross-validate on the combined train+validation set
        scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)
        # compute mean cross-validation accuracy
        score = np.mean(scores)
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
# rebuild a model on the combined training and validation set
print('網格搜尋for迴圈<有cross_val_score交叉驗證>獲得的最好引數組合:',best_parameters)
print(' ')
svmf = SVC(**best_parameters)
svmf.fit(X_trainval, y_trainval)
# plain (non cross-validated) accuracy on the data the model was fit on
print('網格搜尋<有交叉驗證>獲得的最好估計器,在訓練驗證集上沒做交叉驗證的得分:',svmf.score(X_trainval,y_trainval))
print(' ')
# mean cross-validation accuracy of the best estimator on the same data
scores = cross_val_score(svmf, X_trainval, y_trainval, cv=5)
print('網格搜尋<有交叉驗證>獲得的最好估計器,在訓練驗證集上做交叉驗證的平均得分:',np.mean(scores))
print(' ')
# final, single evaluation on the held-out test set
print('網格搜尋<有交叉驗證>獲得的最好估計器,在測試集上的得分:',svmf.score(X_test,y_test))
# print(' ')
# print(' ')
# scoreall = cross_val_score(svmf, iris.data, iris.target, cv=5)
# print(scoreall ,np.mean(scoreall))
Output:
網格搜尋for迴圈<有cross_val_score交叉驗證>獲得的最好引數組合: {'gamma': 0.01, 'C': 100}
網格搜尋<有交叉驗證>獲得的最好估計器,在訓練驗證集上沒做交叉驗證的得分: 0.982142857143
網格搜尋<有交叉驗證>獲得的最好估計器,在訓練驗證集上做交叉驗證的平均得分: 0.972689629211
網格搜尋<有交叉驗證>獲得的最好估計器,在測試集上的得分: 0.973684210526
c.)構造引數字典,代替雙層for迴圈進行網格搜尋
# Replace the two nested for loops with a parameter dictionary handed to
# GridSearchCV, which runs the grid search and the cross-validation itself.
# iris and train_test_split are not defined in this snippet; they come from
# the earlier snippets in the file.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
X_trainvalid, X_test, y_trainvalid, y_test = train_test_split(iris.data, iris.target, random_state=0) # default test_size=0.25
grid_search = GridSearchCV(SVC(), param_grid, cv=5) # grid search + 5-fold cross-validation
grid_search.fit(X_trainvalid, y_trainvalid)
print('GridSearchCV交叉驗證網格搜尋字典獲得的最好引數組合',grid_search.best_params_)
print(' ')
print('GridSearchCV交叉驗證網格搜尋獲得的最好估計器,在訓練驗證集上沒做交叉驗證的得分',grid_search.score(X_trainvalid,y_trainvalid)) # plain accuracy on the data used for fitting
print(' ')
print('GridSearchCV交叉驗證網格搜尋獲得的最好估計器,在**集上做交叉驗證的平均得分',grid_search.best_score_) # best_score_ is the mean cross-validated score of the best parameter setting on the data passed to fit(), i.e. the train+validation set
# print(' ')
# print('BEST_ESTIMATOR:',grid_search.best_estimator_) # the estimator with the highest score
print(' ')
print('GridSearchCV交叉驗證網格搜尋獲得的最好估計器,在測試集上的得分',grid_search.score(X_test, y_test)) # accuracy on the held-out test set
Output:
GridSearchCV交叉驗證網格搜尋字典獲得的最好引數組合 {'gamma': 0.01, 'C': 100}
GridSearchCV交叉驗證網格搜尋獲得的最好估計器,在訓練驗證集上沒做交叉驗證的得分 0.982142857143
GridSearchCV交叉驗證網格搜尋獲得的最好估計器,在**集上做交叉驗證的平均得分 0.973214285714
GridSearchCV交叉驗證網格搜尋獲得的最好估計器,在測試集上的得分 0.973684210526
d.)巢狀交叉驗證:字典引數+cross_val_score
# Nested cross-validation in one call: GridSearchCV (inner 5-fold search over
# param_grid) is itself the estimator evaluated by cross_val_score (outer
# 5-fold split of the whole iris data).
# iris is not loaded in this snippet; it comes from the earlier snippets.
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
scores = cross_val_score(GridSearchCV(SVC(), param_grid, cv=5), iris.data, iris.target, cv=5)
# For each outer split, the grid search picks hyperparameters by inner
# cross-validation on the outer training part; the refit model is then scored
# on the outer test part. No splitter object is given, so both levels use the
# default strategy for cv=5.
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())
#Output:
#Cross-validation scores: [ 0.96666667 1. 0.96666667 0.96666667 1. ]
#Mean cross-validation score: 0.98
def nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
    """Score a classifier with hand-written nested cross-validation.

    For every outer split, each parameter setting is scored by inner
    cross-validation restricted to the outer training samples. The setting
    with the highest mean inner score is refit on the whole outer training
    part and scored on the outer test part.

    Args:
        X: Sample array supporting indexing with integer index arrays.
        y: Target array aligned with ``X``, indexable the same way.
        inner_cv: Object whose ``split(X, y)`` yields ``(train, test)`` index
            pairs; used for parameter selection.
        outer_cv: Object whose ``split(X, y)`` yields ``(train, test)`` index
            pairs; used for the final evaluation.
        Classifier: Callable that accepts a parameter setting as keyword
            arguments and returns an object with ``fit(X, y)`` and
            ``score(X, y)``.
        parameter_grid: Iterable of keyword-argument dicts. It is iterated
            once per outer split, so it must be re-iterable.

    Returns:
        A list with one outer test score per outer split.
    """
    outer_scores = []
    # for each split of the data in the outer cross-validation
    # (split method returns indices)
    for training_samples, test_samples in outer_cv.split(X, y):
        # The inner splitter only sees the outer training subset, so the
        # indices it yields are positions within that subset, not within X.
        # Indexing X directly with them would leak outer test samples.
        X_outer_train = X[training_samples]
        y_outer_train = y[training_samples]
        # find best parameters using inner cross-validation; an empty grid
        # falls back to the classifier's defaults
        best_params = {}
        best_score = -np.inf
        for parameters in parameter_grid:
            # accumulate scores over the inner splits
            cv_scores = []
            for inner_train, inner_test in inner_cv.split(X_outer_train, y_outer_train):
                # build classifier given parameters and inner training data
                clf = Classifier(**parameters)
                clf.fit(X_outer_train[inner_train], y_outer_train[inner_train])
                # evaluate on inner test set
                cv_scores.append(
                    clf.score(X_outer_train[inner_test], y_outer_train[inner_test])
                )
            # compute mean score over inner folds
            mean_score = np.mean(cv_scores)
            if mean_score > best_score:
                # if better than so far, remember parameters
                best_score = mean_score
                best_params = parameters
        # build classifier on best parameters using the outer training set
        clf = Classifier(**best_params)
        clf.fit(X_outer_train, y_outer_train)
        # evaluate on the outer test set
        outer_scores.append(clf.score(X[test_samples], y[test_samples]))
    return outer_scores
# Run the hand-written nested_cv on iris with stratified 5-fold splitters for
# both the inner and the outer level.
# iris is not loaded in this snippet; it comes from the earlier snippets.
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import ParameterGrid, StratifiedKFold
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
#http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html#sklearn.model_selection.ParameterGrid
#http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
# ParameterGrid expands the parameter dictionary into the individual parameter
# combinations to try (it does not split any data); StratifiedKFold produces
# the stratified train/test splits.
nested_cv(iris.data, iris.target, StratifiedKFold(5), StratifiedKFold(5), SVC, ParameterGrid(param_grid))
#Output (as recorded by the original author):
#[0.96666666666666667, 1.0, 0.96666666666666667, 0.96666666666666667, 1.0]