
[Machine Learning] A Hands-On Regression Case: Data Processing, Modeling, and Parameter Tuning

# -*- coding: utf-8 -*-
"""迴歸問題案例.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1l8xlYKSd8nljVVEEriZyoc0oivqMDWR0
"""

# Import the required packages
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
from pandas import set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error

# Load the data
filename = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
         'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv(filename, names=names, delim_whitespace=True)  # the file is whitespace-separated

print(data.shape)
print(data.head())

# Understand the data
print(data.dtypes)
print(data.describe())

# Pairwise correlations between the features
set_option('display.precision', 2)
print(data.corr(method='pearson'))

# Data visualization
# Single-feature plots: histograms
data.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1, layout=(3, 5), bins=100)
plt.show()

# Density plots: a smoother view of each feature's distribution
data.plot(kind='density', subplots=True, layout=(4, 4), sharex=False, fontsize=1)
plt.show()

# Box plots
data.plot(kind='box', subplots=True, layout=(4, 4), sharex=False, sharey=False, fontsize=8)
plt.show()

"""### Multi-feature plots

Look at how the different features influence one another.
"""

# Scatter matrix
scatter_matrix(data)
plt.show()

# Correlation matrix plot
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(data.corr(), vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
ticks = np.arange(0, 14, 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()

"""The structure of this dataset is fairly complex, so it is worth transforming the data to improve model accuracy:

- feature selection, to drop most of the highly correlated features
- standardization, to reduce the impact of the features' differing measurement units
- normalization, to reduce the impact of the features' differing distributions and improve accuracy
"""

# Split off a hold-out test set
array = data.values
X = array[:, 0:13]
y = array[:, 13]
test_size = 0.2  # hold out 20% of the data as the test set
seed = 7
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

"""### Evaluate algorithms

Intuitively, because part of the data looks linearly distributed, linear regression and elastic net may work well; and because the data is fairly scattered, a decision tree or an SVM may also produce an accurate model.

**But we still do not know which algorithm will produce the most accurate model.** We need an evaluation framework to help us choose. Here we use 10-fold cross-validation to split the data and mean squared error to compare accuracy: the closer the MSE is to 0, the more accurate the algorithm.
"""

num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'

# First evaluate the raw, untransformed data to establish a performance baseline
models = {}
models['LR'] = LinearRegression()
models['Lasso'] = Lasso()
models['EN'] = ElasticNet()
models['KNN'] = KNeighborsRegressor()
models['CART'] = DecisionTreeRegressor()
models['SVM'] = SVR()

# Evaluate each algorithm directly to get a baseline
results = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # shuffle=True is required when random_state is set
    cv_result = cross_val_score(models[key], X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))

# Compare the algorithms with a box plot
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(models.keys())
plt.show()

"""### Analysis

The linear algorithms have fairly similar score distributions, and K-nearest neighbors has the tightest one. How do we judge the relative quality of the results from the box plot?
"""
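"""Because scikit-learn's `neg_mean_squared_error` scoring returns negated MSE values (larger, i.e. closer to 0, is better), the raw numbers above can be awkward to read. A minimal added sketch, reusing `models` and `results` from the loop above, converts the scores to RMSE, which is in the same units as `MEDV`; the helper name `report_rmse` is illustrative, not part of the original notebook.
"""

# Added sketch: convert the negated-MSE cross-validation scores above to RMSE.
# `report_rmse` is an illustrative helper name, not from the original notebook.
def report_rmse(name, cv_scores):
    rmse = np.sqrt(-cv_scores)  # scores are negated MSE, so flip the sign before the square root
    print('%s: RMSE %.3f (+/- %.3f)' % (name, rmse.mean(), rmse.std()))

for name, cv_scores in zip(models.keys(), results):
    report_rmse(name, cv_scores)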
"""### Standardize the data

Transform the data to zero mean and unit variance. Using a `Pipeline` to standardize the data and evaluate the model in one step prevents data leakage.
"""

# Evaluate the algorithms on standardized data
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])
pipelines['ScalerLASSO'] = Pipeline([('Scaler', StandardScaler()), ('Lasso', Lasso())])
pipelines['ScalerEN'] = Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVR())])

results = []
for key in pipelines:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(pipelines[key], X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))

"""### Analysis

After standardization, KNN has the best MSE.
"""

# Compare the algorithms with a box plot
fig = plt.figure()
fig.suptitle('Algorithm Evaluation')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(pipelines.keys())
plt.show()

"""### Parameter tuning

The analysis above shows that KNN gives the best result. Can it do better? That is where tuning comes in, using methods covered earlier such as grid search and random search (a random-search sketch follows the ensemble tuning below).
"""

# Grid search
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

# Sweep the parameter grid
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
model = KNeighborsRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=y_train)

print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))

"""### Ensemble algorithms

Besides tuning, ensembles can also raise accuracy. Let's build ensembles around linear regression, K-nearest neighbors, and regression trees and see how they do.
"""

ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])
ensembles['ScaledAB-KNN'] = Pipeline([('Scaler', StandardScaler()),
                                      ('ABKNN', AdaBoostRegressor(KNeighborsRegressor(n_neighbors=3)))])
ensembles['ScaledAB-LR'] = Pipeline([('Scaler', StandardScaler()), ('ABLR', AdaBoostRegressor(LinearRegression()))])
ensembles['ScaledRFR'] = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestRegressor())])
ensembles['ScaledETR'] = Pipeline([('Scaler', StandardScaler()), ('ETR', ExtraTreesRegressor())])
ensembles['ScaledGBR'] = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])

results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(ensembles[key], X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))

# Compare the ensembles with a box plot
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(ensembles.keys())
plt.show()

"""### Ensemble tuning

Ensembles have an `n_estimators` parameter that can be tuned; let's see whether tuning it improves the results.
"""

# Tune GBM with grid search
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

# Sweep the parameter grid
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = GradientBoostingRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
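"""The tuning section above mentioned random search alongside grid search. As a hedged sketch (the parameter ranges and `n_iter` below are illustrative choices, not from the original notebook), `RandomizedSearchCV` samples a fixed number of candidates rather than trying every combination, which scales better once several parameters are tuned at once:
"""

# Added sketch: random search over GradientBoostingRegressor parameters.
# The ranges and n_iter are illustrative, not from the original notebook.
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': [50, 100, 200, 300, 400, 500],
              'max_depth': [2, 3, 4, 5],
              'learning_rate': [0.01, 0.05, 0.1, 0.2]}
random_search = RandomizedSearchCV(estimator=GradientBoostingRegressor(),
                                   param_distributions=param_dist,
                                   n_iter=20, scoring=scoring, cv=kfold, random_state=seed)
random_result = random_search.fit(X=rescaledX, y=y_train)
print('Best: %s using %s' % (random_result.best_score_, random_result.best_params_))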
# Tune Extra Trees with grid search
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

# Sweep the parameter grid
param_grid = {'n_estimators': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
model = ExtraTreesRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))

"""### Final result

The best model is the ET (Extra Trees) model.
"""

# Train the final model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = ExtraTreesRegressor(n_estimators=30)
model.fit(X=rescaledX, y=y_train)

# Evaluate the final model on the hold-out test set
rescaledX_test = scaler.transform(X_test)
predictions = model.predict(rescaledX_test)
print(mean_squared_error(y_test, predictions))  # final mean squared error; a decent result
# Output:
# 14.653818518518516
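"""A single MSE number can be hard to interpret on its own. As a short added sketch (these extra metrics are an addition, not part of the original notebook), reporting RMSE, MAE, and R^2 on the same test predictions gives a fuller picture:
"""

# Added sketch: complementary test-set metrics; not part of the original notebook.
from sklearn.metrics import mean_absolute_error, r2_score

print('RMSE: %.3f' % np.sqrt(mean_squared_error(y_test, predictions)))  # same units as MEDV
print('MAE:  %.3f' % mean_absolute_error(y_test, predictions))          # average absolute error
print('R^2:  %.3f' % r2_score(y_test, predictions))                     # fraction of variance explained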
