[Machine Learning] Cross-Validation and Regularization: Worked Python Examples
阿新 · Published 2019-02-01
Preface
A link to datasets commonly used in machine learning: datasets
Runtime environment: Python 3.6 (here I use Jupyter Notebook from Anaconda)
1. Comparing cross-validation results across different models
Dataset source: the wine dataset
This dataset contains 178 records of wines from 3 different origins. The 13 attributes are 13 chemical constituents of the wine, so the origin can be inferred from the chemical analysis. Notably, all of the attribute variables are continuous.
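A quick sanity check of those numbers (a minimal sketch, not part of the original post) right after loading the dataset:

import numpy as np
from sklearn import datasets

wine = datasets.load_wine()
print(wine.data.shape)           # (178, 13): 178 records, 13 attributes
print(np.bincount(wine.target))  # records per origin class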
from sklearn import datasets  # provides sklearn's built-in datasets

# Load the data with load_wine
wine_data = datasets.load_wine()
print(wine_data.feature_names)  # prints the 13 attribute names

# Input and output data
data_input = wine_data.data
data_output = wine_data.target

from sklearn.ensemble import RandomForestClassifier  # random forest model
from sklearn.linear_model import LogisticRegression  # logistic regression model
from sklearn import svm  # support vector machine
from sklearn.model_selection import cross_val_score

# Instantiate the models under short names
rf_class = RandomForestClassifier(n_estimators=10)
log_class = LogisticRegression()
svm_class = svm.LinearSVC()

# Split the data into four folds and print each fold's validation score
print(cross_val_score(rf_class, data_input, data_output, scoring='accuracy', cv=4))
# cross_val_score wires up the whole cross-validation loop, so there is no
# need to split the data manually; cv sets the number of folds

accuracy = cross_val_score(rf_class, data_input, data_output, scoring='accuracy', cv=4).mean() * 100
print("Accuracy of Random Forests is: ", accuracy)

accuracy = cross_val_score(log_class, data_input, data_output, scoring='accuracy', cv=4).mean() * 100
print("Accuracy of logistic is: ", accuracy)

accuracy = cross_val_score(svm_class, data_input, data_output, scoring='accuracy', cv=4).mean() * 100
print("Accuracy of SVM is: ", accuracy)
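For classifiers, passing an integer cv makes cross_val_score stratify the folds by class. A minimal sketch of the equivalent manual loop with StratifiedKFold (not in the original post) shows what happens under the hood:

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

skf = StratifiedKFold(n_splits=4)
scores = []
for train_idx, test_idx in skf.split(data_input, data_output):
    model = RandomForestClassifier(n_estimators=10)
    model.fit(data_input[train_idx], data_output[train_idx])
    scores.append(model.score(data_input[test_idx], data_output[test_idx]))
print(scores)  # one accuracy per fold, mirroring cross_val_score(..., cv=4)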
2. Regularization
Data preparation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 10

x = np.array([1.4*i*np.pi/180 for i in range(0, 300, 4)])
np.random.seed(20)  # fix the random seed for reproducibility
y = np.sin(x) + np.random.normal(0, 0.2, len(x))  # add Gaussian noise
data = pd.DataFrame(np.column_stack([x, y]), columns=['x', 'y'])
plt.plot(data['x'], data['y'], '.')
# Add higher-order terms of x to vary model complexity
for i in range(2, 16):
    colname = 'x_%d'%i  # new column named x_i
    data[colname] = data['x']**i
print(data.head())  # show the first five rows
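As an aside, the same polynomial feature matrix can be produced with scikit-learn's PolynomialFeatures; a sketch (this alternative is not in the original post):

from sklearn.preprocessing import PolynomialFeatures

# Degree-15 powers of x without the constant column (include_bias=False);
# column k holds x**(k+1), matching the x, x_2, ..., x_15 built above
poly = PolynomialFeatures(degree=15, include_bias=False)
X_poly = poly.fit_transform(data[['x']])
print(X_poly.shape)  # (75, 15)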
LinearRegression(normalize=True): linear regression without regularization, as the baseline. Note that normalize=True only rescales the input features; it does not add any penalty term.
# Model complexity (the polynomial degree `power`) is variable
from sklearn.linear_model import LinearRegression

def linear_regression(data, power, models_to_plot):
    # Assemble the predictor columns
    predictors = ['x']
    if power >= 2:
        predictors.extend(['x_%d'%i for i in range(2, power+1)])

    # Fit the model
    linreg = LinearRegression(normalize=True)
    linreg.fit(data[predictors], data['y'])

    # Predict
    y_pred = linreg.predict(data[predictors])

    # Plot only the selected complexities (those in models_to_plot),
    # to make the comparison easier to read
    if power in models_to_plot:
        plt.subplot(models_to_plot[power])
        plt.tight_layout()
        plt.plot(data['x'], y_pred)
        plt.plot(data['x'], data['y'], '.')
        plt.title('Plot for power: %d'%power)

    # Return RSS, intercept, and coefficients
    rss = sum((y_pred - data['y'])**2)
    ret = [rss]
    ret.extend([linreg.intercept_])
    ret.extend(linreg.coef_)
    return ret

col = ['rss','intercept'] + ['coef_x_%d'%i for i in range(1, 16)]
ind = ['model_pow_%d'%i for i in range(1, 16)]
coef_matrix_simple = pd.DataFrame(index=ind, columns=col)

# Map the plotted complexities to subplot positions
models_to_plot = {1:231, 3:232, 6:233, 8:234, 11:235, 14:236}

# Fit and plot each polynomial degree
for i in range(1, 16):
    coef_matrix_simple.iloc[i-1, 0:i+2] = linear_regression(data, power=i, models_to_plot=models_to_plot)
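One caveat for readers on newer library versions: the normalize argument used in this post was deprecated and then removed in scikit-learn 1.2. A sketch of the modern equivalent using a Pipeline (not part of the original code; for plain least squares, rescaling the features does not change the predictions, although the old normalize used the L2 norm rather than the standard deviation):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

predictors = ['x'] + ['x_%d'%i for i in range(2, 16)]
# Standardize each feature, then fit ordinary least squares
model = make_pipeline(StandardScaler(), LinearRegression())
model.fit(data[predictors], data['y'])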
Ridge(L2-norm)
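Ridge adds a squared L2 penalty on the coefficient vector to the least-squares objective; in scikit-learn's parameterization, Ridge minimizes

$\min_w \; \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_2^2$

so a larger alpha shrinks the coefficients more strongly toward zero.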
from sklearn.linear_model import Ridge
def ridge_regression(data, predictors, alpha, models_to_plot={}):
    # Fit the model
    ridgereg = Ridge(alpha=alpha, normalize=True)
    ridgereg.fit(data[predictors], data['y'])

    # Predict
    y_pred = ridgereg.predict(data[predictors])

    # Plot only for the selected alpha values
    if alpha in models_to_plot:
        plt.subplot(models_to_plot[alpha])
        plt.tight_layout()
        plt.plot(data['x'], y_pred)
        plt.plot(data['x'], data['y'], '.')
        plt.title('Plot for alpha: %.3g'%alpha)

    # Return RSS, intercept, and coefficients
    rss = sum((y_pred - data['y'])**2)
    ret = [rss]
    ret.extend([ridgereg.intercept_])
    ret.extend(ridgereg.coef_)
    return ret

predictors = ['x']
predictors.extend(['x_%d'%i for i in range(2, 16)])

# Define the alpha values to test
alpha_ridge = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]

col = ['rss','intercept'] + ['coef_x_%d'%i for i in range(1, 16)]
ind = ['alpha_%.2g'%alpha_ridge[i] for i in range(0, 10)]
coef_matrix_ridge = pd.DataFrame(index=ind, columns=col)

# Map the plotted alphas to subplot positions
models_to_plot = {1e-15:231, 1e-10:232, 1e-4:233, 1e-3:234, 1e-2:235, 5:236}

for i in range(10):
    coef_matrix_ridge.iloc[i,] = ridge_regression(data, predictors, alpha_ridge[i], models_to_plot)
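Rather than scanning a hand-picked alpha grid, scikit-learn's RidgeCV can select alpha by cross-validation. A minimal sketch over the same feature columns (not part of the original post):

from sklearn.linear_model import RidgeCV

# Pick the best alpha from the candidate list via RidgeCV's built-in
# (leave-one-out by default) cross-validation
ridge_cv = RidgeCV(alphas=alpha_ridge)
ridge_cv.fit(data[predictors], data['y'])
print(ridge_cv.alpha_)  # alpha with the best cross-validated score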
Lasso(L1-norm)
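Lasso replaces the squared penalty with an L1 penalty, which can drive coefficients exactly to zero and thus performs feature selection; scikit-learn's Lasso minimizes

$\min_w \; \frac{1}{2n} \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_1$

where n is the number of samples.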
from sklearn.linear_model import Lasso
def lasso_regression(data, predictors, alpha, models_to_plot={}):
    # Fit the model (lasso needs more iterations to converge at small alpha)
    lassoreg = Lasso(alpha=alpha, normalize=True, max_iter=100000)
    lassoreg.fit(data[predictors], data['y'])
    y_pred = lassoreg.predict(data[predictors])

    # Plot only for the selected alpha values
    if alpha in models_to_plot:
        plt.subplot(models_to_plot[alpha])
        plt.tight_layout()
        plt.plot(data['x'], y_pred)
        plt.plot(data['x'], data['y'], '.')
        plt.title('Plot for alpha: %.3g'%alpha)

    # Return RSS, intercept, and coefficients in the pre-defined format
    rss = sum((y_pred - data['y'])**2)
    ret = [rss]
    ret.extend([lassoreg.intercept_])
    ret.extend(lassoreg.coef_)
    return ret
predictors = ['x']
predictors.extend(['x_%d'%i for i in range(2, 16)])

# Define the alpha values to test
alpha_lasso = [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10]

col = ['rss','intercept'] + ['coef_x_%d'%i for i in range(1, 16)]
ind = ['alpha_%.2g'%alpha_lasso[i] for i in range(0, 10)]
coef_matrix_lasso = pd.DataFrame(index=ind, columns=col)

# Map the plotted alphas to subplot positions
models_to_plot = {1e-10:231, 1e-5:232, 1e-4:233, 1e-3:234, 1e-2:235, 1:236}

# Iterate over the 10 alpha values
for i in range(10):
    coef_matrix_lasso.iloc[i,] = lasso_regression(data, predictors, alpha_lasso[i], models_to_plot)
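A quick way to see the sparsity that the L1 penalty induces is to count, for each alpha, how many coefficients were shrunk exactly to zero (a small check, not in the original post):

# Count coefficients that are exactly zero for each alpha;
# the count grows as alpha increases
zeros_per_alpha = (coef_matrix_lasso.iloc[:, 2:].astype(float) == 0).sum(axis=1)
print(zeros_per_alpha)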