
Modeling and Analysis with Several Regression Algorithms from the Python Machine Learning Library sklearn (Experiments)

The simplest regression model is linear regression.

Data Import and Visual Exploration

from IPython.display import Image
%matplotlib inline
# Version check: scikit-learn changed several APIs at 0.18, so we branch on the installed version
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
# The original data URL moved; the raw file at http://lib.stat.cmu.edu/datasets/boston needs preprocessing
import pandas as pd
import numpy as np
#df = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston',header=19,sep='\s{1,3}')
#df.head()
dfnp=np.genfromtxt('boston.txt')
df=pd.DataFrame(dfnp,columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'])
df.to_csv('boston.csv')
df.head()
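
If a preprocessed boston.txt is not at hand, the raw file behind the URL above can be parsed directly; each record there spans two physical lines, so the rows must be re-interleaved. A minimal sketch (assuming the file's 22-line header):

raw = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston', sep='\s+', skiprows=22, header=None)
data = np.hstack([raw.values[::2, :], raw.values[1::2, :2]])   # 11 + 2 feature columns per record
target = raw.values[1::2, 2]                                   # MEDV closes each record
df = pd.DataFrame(np.column_stack([data, target]),
                  columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                           'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'])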

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', context='notebook')   # style controls the default look, context scales the default figure elements
cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
sns.pairplot(df[cols], size=2.5)   # size controls panel size; newer seaborn renames this parameter to height
plt.tight_layout()
# plt.savefig('./figures/scatter.png', dpi=300)
plt.show()



import numpy as np
cm = np.corrcoef(df[cols].values.T)   # corrcoef computes Pearson correlations between rows; cm is a symmetric matrix
# np.corrcoef(a) correlates the rows of a; np.corrcoef(a, rowvar=0) correlates the columns, returning a correlation matrix
sns.set(font_scale=1.5)   # font_scale sets the font size
hm = sns.heatmap(cm,cbar=True,annot=True,square=True,fmt='.2f',annot_kws={'size': 15},yticklabels=cols,xticklabels=cols)
# plt.tight_layout()
# plt.savefig('./figures/corr_mat.png', dpi=300)
plt.show()
sns.reset_orig()   # restore the plotting parameters that were in effect before seaborn styling
%matplotlib inline
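
As a quick cross-check, pandas computes the same matrix column-wise; a minimal sketch:

cm_pd = df[cols].corr()                # Pearson correlations between columns, as a DataFrame
print(np.allclose(cm, cm_pd.values))   # True: agrees with np.corrcoef on the transposed array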

Least Squares for Linear Regression (Gradient Descent)


# A hand-written class that minimizes the least-squares cost by batch gradient descent
class LinearRegressionGD(object):
    def __init__(self, eta=0.001, n_iter=20):
        self.eta = eta
        self.n_iter = n_iter
    def fit(self, X, y):   # X is an (n_samples, 1) matrix, y a 1-D array of length n_samples
        self.w_ = np.zeros(1 + X.shape[1])   # zero vector of shape (2,), holding the fitted line's intercept and slope
        self.cost_ = []
        for i in range(self.n_iter):
            output = self.net_input(X)
            errors = (y - output)   # error term; a 1-D array with the same shape as y
            self.w_[1:] += self.eta * X.T.dot(errors)   # update the slope of the fitted line
            self.w_[0] += self.eta * errors.sum()   # update the intercept of the fitted line
            cost = (errors**2).sum() / 2.0   # objective: half the sum of squared errors
            self.cost_.append(cost)
        return self
    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]   
    def predict(self, X):
        return self.net_input(X)
# cost_ records half the SSE at each iteration, w_ holds the fitted line's two parameters, errors are the per-iteration residuals
X = df[['RM']].values   # X is an (n, 1) column matrix
y = df['MEDV'].values   # y is an (n,) 1-D array
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
# fit_transform combines StandardScaler's fit and transform in one call; they are merged here to set it apart from LinearRegressionGD's own fit
y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()
# y[:, np.newaxis] is equivalent to y[np.newaxis].T, i.e. df[['MEDV']].values; flatten turns the result back into a 1-D array
# fit_transform standardizes the column vector directly
lr = LinearRegressionGD()
lr.fit(X_std, y_std)   # this fit belongs to LinearRegressionGD; note how fit methods differ across sklearn-style classes
#Output:<__main__.LinearRegressionGD at 0x16add278>
plt.plot(range(1, lr.n_iter+1), lr.cost_)
plt.ylabel('SSE')
plt.xlabel('Epoch')
plt.tight_layout()
# plt.savefig('./figures/cost.png', dpi=300)
plt.show()
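
The learning rate matters: the gradient step sums over all ~500 samples, so on this standardized single-feature problem an eta much above roughly 2/n makes the updates overshoot. A minimal sketch with an illustrative, deliberately too-large eta:

lr_bad = LinearRegressionGD(eta=0.01, n_iter=10).fit(X_std, y_std)
print(lr_bad.cost_)   # the SSE grows each epoch instead of shrinking: eta is too large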

def lin_regplot(X, y, model):
    plt.scatter(X, y, c='lightblue')
    plt.plot(X, model.predict(X), color='red', linewidth=2)    
    return 
lin_regplot(X_std, y_std, lr)
plt.xlabel('Average number of rooms [RM] (standardized)')
plt.ylabel('Price in $1000\'s [MEDV] (standardized)')
plt.tight_layout()
# plt.savefig('./figures/gradient_fit.png', dpi=300)
plt.show()

print('Slope: %.3f' % lr.w_[1])   # a in ax + b
print('Intercept: %.3f' % lr.w_[0])   # b in ax + b
#Output:
#Slope: 0.695
#Intercept: -0.000
#(the intercept is zero because X and y were both standardized to zero mean)
num_rooms_std = sc_x.transform(np.array([[5.0]]))   # apply the same standardization used when training
price_std = lr.predict(num_rooms_std)   # a * num_rooms_std + b
print("Price in $1000's: %.3f" % sc_y.inverse_transform(price_std))   # newer scikit-learn expects 2-D input here: price_std.reshape(-1, 1)
#Output:
#Price in $1000's: 10.840
# Fit the regression with sklearn and inspect the coefficients, comparing against the hand-written LinearRegressionGD above
from sklearn.linear_model import LinearRegression
slr = LinearRegression()
slr.fit(X, y)
y_pred = slr.predict(X)
print('Slope: %.3f' % slr.coef_[0])
print('Intercept: %.3f' % slr.intercept_)
#Output:
#Slope: 9.102
#Intercept: -34.671
lin_regplot(X, y, slr)
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $1000\'s [MEDV]')
plt.tight_layout()
# plt.savefig('./figures/scikit_lr_fit.png', dpi=300)
plt.show()

Next we solve for the fitted line's coefficients with the normal equation w = (X^T X)^(-1) X^T y.

# adding a column vector of "ones"
Xb = np.hstack((np.ones((X.shape[0], 1)), X))   # prepend a column of ones to X for the intercept term
z = np.linalg.inv(np.dot(Xb.T, Xb))
# np.linalg.inv inverts the matrix via the linear algebra package; np.dot(Xb.T, Xb) is a 2x2 matrix
w = np.dot(z, np.dot(Xb.T, y))

print('Slope: %.3f' % w[1])
print('Intercept: %.3f' % w[0])
#Output:
#Slope: 9.102
#Intercept: -34.671
# The results match: sklearn's LinearRegression fits the line by solving the normal equations
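
Forming the inverse explicitly works for this 2x2 system but is numerically fragile in general; a minimal sketch of the standard alternatives, which yield the same coefficients:

w_lstsq = np.linalg.lstsq(Xb, y, rcond=None)[0]   # least-squares solve without an explicit inverse
w_pinv = np.linalg.pinv(Xb).dot(y)                # Moore-Penrose pseudoinverse
print('Slope: %.3f, Intercept: %.3f' % (w_lstsq[1], w_lstsq[0]))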

Improving Robustness with sklearn's RANSAC

RANSAC removes noisy points and fits the regression on a subset of inliers (as opposed to outliers) drawn from the full data.

from sklearn.linear_model import RANSACRegressor
#http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RANSACRegressor.html#sklearn.linear_model.RANSACRegressor
# RANSACRegressor repeatedly draws random subsets of the samples, fitting a line to those judged noise-free
if Version(sklearn_version) < '0.18':
    ransac = RANSACRegressor(LinearRegression(), max_trials=100, min_samples=50, 
                             residual_metric=lambda x: np.sum(np.abs(x), axis=1), 
                             residual_threshold=5.0, random_state=0)
else:
    ransac = RANSACRegressor(LinearRegression(), max_trials=100, min_samples=50, 
                             loss='absolute_loss', residual_threshold=5.0, random_state=0)
# base_estimator (default LinearRegression) is the underlying model; max_trials caps the random-subset iterations;
# min_samples is the minimum number of points drawn per subset; loss is the per-sample loss criterion;
# residual_threshold is the loss cutoff beyond which a sample counts as an outlier
# (note: scikit-learn 1.x renames the loss to 'absolute_error' and base_estimator to estimator)
ransac.fit(X, y)
inlier_mask = ransac.inlier_mask_   # boolean mask, True for the points RANSAC labeled inliers
outlier_mask = np.logical_not(inlier_mask)   # element-wise negation; numpy's logical_not, since Python's own not cannot be overloaded for arrays

line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])   # predictions on constructed points, used to draw the RANSAC line

plt.scatter(X[inlier_mask], y[inlier_mask], c='blue', marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask], c='lightgreen', marker='s', label='Outliers')

plt.plot(line_X, line_y_ransac, color='red')   # regression line through the constructed prediction points
plt.plot(X[inlier_mask],ransac.predict(X[inlier_mask]),color='m')   # regression line over the inliers used for fitting
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $1000\'s [MEDV]')
plt.legend(loc='upper left')

plt.tight_layout()
# plt.savefig('./figures/ransac_fit.png', dpi=300)
plt.show()

print('Slope: %.3f' % ransac.estimator_.coef_[0])
print('Intercept: %.3f' % ransac.estimator_.intercept_)
#Output:
#Slope: 9.621
#Intercept: -37.137

Evaluating the Performance of Regression Models

if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split

X = df.iloc[:, :-1].values   # every column except the last (MEDV) is treated as a candidate feature
y = df['MEDV'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
slr = LinearRegression()
slr.fit(X_train, y_train)
y_train_pred = slr.predict(X_train)
y_test_pred = slr.predict(X_test)
plt.scatter(y_train_pred, y_train_pred - y_train, c='blue', marker='o', label='Training data')
plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen', marker='s', label='Test data')
# residuals plotted against predicted values
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
plt.xlim([-10, 50])
plt.tight_layout()
# plt.savefig('./figures/slr_residuals.png', dpi=300)
plt.show()


from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error   # mean squared error regression loss
#http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),r2_score(y_test, y_test_pred)))
#Output:
#MSE train: 19.958, test: 27.196
#R^2 train: 0.765, test: 0.673
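
MSE penalizes large residuals quadratically; the mean absolute error is a more outlier-robust companion metric. A minimal sketch:

from sklearn.metrics import mean_absolute_error
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred),
                                       mean_absolute_error(y_test, y_test_pred)))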

Adding a Regularization Term

from sklearn.linear_model import Lasso
#http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso
lasso = Lasso(alpha=0.1)   # alpha is the strength of the L1 penalty
lasso.fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)
print(lasso.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),r2_score(y_test, y_test_pred)))
#Output:
#[-0.11311792  0.04725111 -0.03992527  0.96478874 -0.          3.72289616
# -0.02143106 -1.23370405  0.20469    -0.0129439  -0.85269025  0.00795847
# -0.52392362]
#MSE train: 20.926, test: 28.876
#R^2 train: 0.753, test: 0.653
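
Besides the L1 penalty, sklearn provides an L2 penalty (Ridge) and a mixed L1/L2 penalty (ElasticNet); a minimal sketch with illustrative alpha values:

from sklearn.linear_model import Ridge, ElasticNet
ridge = Ridge(alpha=1.0).fit(X_train, y_train)                    # L2 shrinks coefficients without zeroing them
enet = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X_train, y_train)  # l1_ratio blends the L1 and L2 penalties
for name, model in (('Ridge', ridge), ('ElasticNet', enet)):
    print('%s R^2 test: %.3f' % (name, r2_score(y_test, model.predict(X_test))))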

Polynomial Regression and Curve Fitting

X = np.array([258.0, 270.0, 294.0, 320.0, 342.0, 368.0, 396.0, 446.0, 480.0, 586.0])[:, np.newaxis]
y = np.array([236.4, 234.4, 252.8, 298.6, 314.2, 342.2, 360.8, 368.0, 391.2, 390.8])
from sklearn.preprocessing import PolynomialFeatures   # generates polynomial features; the fit itself still uses a linear model
#http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html#sklearn.preprocessing.PolynomialFeatures
lr = LinearRegression()
pr = LinearRegression()
quadratic = PolynomialFeatures(degree=2)   # degree sets the highest power of the polynomial
X_quad = quadratic.fit_transform(X)   # X is a column; fit_transform builds [1, X, X^2] column-wise, i.e. the quadratic terms
#For example, if an input sample is two dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].
# fit linear features
lr.fit(X, y)
X_fit = np.arange(250, 600, 10)[:, np.newaxis]   # X_fit is constructed for prediction; X is the training data
y_lin_fit = lr.predict(X_fit)   # linear-regression predictions on the constructed X_fit

# fit quadratic features
pr.fit(X_quad, y)   # X_quad is the training data; fitting it yields the polynomial coefficients
y_quad_fit = pr.predict(quadratic.fit_transform(X_fit))   # quadratic predictions on the constructed X_fit

# plot results
plt.scatter(X, y, label='training points')
plt.plot(X_fit, y_lin_fit, label='linear fit', linestyle='--')
plt.plot(X_fit, y_quad_fit, label='quadratic fit')
plt.legend(loc='upper left')

plt.tight_layout()
# plt.savefig('./figures/poly_example.png', dpi=300)
plt.show()
# Verify that pr.fit(X_quad, y) recovers the polynomial coefficients through LinearRegression's coefficient machinery
pr.fit(X_quad, y)
pr.coef_
#Output:
#array([  0.00000000e+00,   2.39893018e+00,  -2.25020109e-03])
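# The leading zero in pr.coef_ appears because PolynomialFeatures prepends its own bias
# column while LinearRegression also fits an intercept; a sketch of the usual workaround:
quad_nb = PolynomialFeatures(degree=2, include_bias=False)   # drop the redundant column of ones
pr_nb = LinearRegression().fit(quad_nb.fit_transform(X), y)
print(pr_nb.intercept_, pr_nb.coef_)   # intercept plus the two genuine polynomial coefficients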
y_lin_pred = lr.predict(X)   # predictions of the trained linear model on its own training data (the fit never recovers the exact model)
y_quad_pred = pr.predict(X_quad)
print('Training MSE linear: %.3f, quadratic: %.3f' % (mean_squared_error(y, y_lin_pred),mean_squared_error(y, y_quad_pred)))
print('Training R^2 linear: %.3f, quadratic: %.3f' % (r2_score(y, y_lin_pred),r2_score(y, y_quad_pred)))
#Output:
#Training MSE linear: 569.780, quadratic: 61.330
#Training R^2 linear: 0.832, quadratic: 0.982

Nonlinear Fits on the Housing Data

X = df[['LSTAT']].values
y = df['MEDV'].values
regr = LinearRegression()
# create quadratic features
quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X)
X_cubic = cubic.fit_transform(X)
# fit features
X_fit = np.arange(X.min(), X.max(), 1)[:, np.newaxis]

regr = regr.fit(X, y)   # fit on the training set X, y; predict on the constructed X_fit; score on the training data (the true y for new data is often unknown)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

regr = regr.fit(X_quad, y)
y_quad_fit = regr.predict(quadratic.fit_transform(X_fit))
quadratic_r2 = r2_score(y, regr.predict(X_quad))

regr = regr.fit(X_cubic, y)
y_cubic_fit = regr.predict(cubic.fit_transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))

# plot results
plt.scatter(X, y, label='training points', color='lightgray')
plt.plot(X_fit, y_lin_fit, label='linear (d=1), $R^2=%.2f$' % linear_r2, color='blue', lw=2, linestyle=':')
plt.plot(X_fit, y_quad_fit, label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2, color='red', lw=2, linestyle='-')
plt.plot(X_fit, y_cubic_fit, label='cubic (d=3), $R^2=%.2f$' % cubic_r2, color='green', lw=2, linestyle='--')
plt.xlabel('% lower status of the population [LSTAT]')
plt.ylabel('Price in $1000\'s [MEDV]')
plt.legend(loc='upper right')
plt.tight_layout()
# plt.savefig('./figures/polyhouse_example.png', dpi=300)
plt.show()
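
An alternative to adding polynomial features is transforming the variables themselves; a sketch assuming a log transform of LSTAT and a square root of MEDV, which tends to straighten this particular relationship:

X_log = np.log(X)     # LSTAT is strictly positive, so the log is well defined
y_sqrt = np.sqrt(y)
regr_t = LinearRegression().fit(X_log, y_sqrt)
print('R^2 on transformed variables: %.3f' % r2_score(y_sqrt, regr_t.predict(X_log)))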

Regression with Tree-Based Algorithms: Regression Trees

from sklearn.tree import DecisionTreeRegressor
#http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
X = df[['LSTAT']].values
y = df['MEDV'].values

tree = DecisionTreeRegressor(max_depth=3)   # max_depth limits the depth of the tree
tree.fit(X, y)   # see the attributes section of the docs for what the fitted model exposes: the tree, the features used, and their importances

sort_idx = X.flatten().argsort()   # indices that order the elements of X from smallest to largest

lin_regplot(X[sort_idx], y[sort_idx], tree)
plt.xlabel('% lower status of the population [LSTAT]')
plt.ylabel('Price in $1000\'s [MEDV]')
# plt.savefig('./figures/tree_regression.png', dpi=300)
plt.show()
# the horizontal red segments are the constant predictions within each region; the jumps sit at the split points chosen for the feature
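
How finely the staircase follows the data is governed by max_depth; a minimal sketch comparing training fits at a few illustrative depths:

for depth in (1, 3, 10):
    t = DecisionTreeRegressor(max_depth=depth).fit(X, y)
    print('max_depth=%2d  training R^2: %.3f' % (depth, r2_score(y, t.predict(X))))
# deeper trees track the training data ever more closely, at growing risk of overfitting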

Random Forest Regression

X = df.iloc[:, :-1].values   # all 13 feature columns
y = df['MEDV'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators=1000, criterion='mse', random_state=1, n_jobs=-1)   # note: scikit-learn 1.x renames this criterion to 'squared_error'
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),r2_score(y_test, y_test_pred)))
#Output:
#MSE train: 1.642, test: 11.052
#R^2 train: 0.979, test: 0.878
plt.scatter(y_train_pred, y_train_pred - y_train, c='black', marker='o', s=35, alpha=0.5, label='Training data')
plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen', marker='s', s=35, alpha=0.7, label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
plt.xlim([-10, 50])
plt.tight_layout()
# plt.savefig('./figures/slr_residuals.png', dpi=300)
plt.show()
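
A fitted forest also ranks its inputs; a minimal sketch printing the impurity-based feature importances (column names taken from df above):

for name, imp in sorted(zip(df.columns[:-1], forest.feature_importances_),
                        key=lambda t: t[1], reverse=True)[:5]:
    print('%-8s %.3f' % (name, imp))
# RM and LSTAT usually rank highest, matching the single-feature experiments above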