
[SciKit-Learn Study Notes] 3: Linear regression on Boston house prices, Logistic regression to predict breast cancer

Some hands-on practice from working through the book 《scikit-learn機器學習》 (scikit-learn Machine Learning).


Linear Regression

This part is much like the "plotting learning curves for randomly jittered samples" section of the first note. Linear regression can be strengthened by adding polynomial features.
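To see what the expansion does, here is a toy sketch (the numbers are made up for illustration): at degree 2, PolynomialFeatures turns each one-feature sample [x] into [x, x^2].

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[2.0], [3.0]])  # two samples with a single feature each
poly = PolynomialFeatures(degree=2, include_bias=False)
print(poly.fit_transform(X))  # [[2. 4.], [3. 9.]] -- the columns are x and x^2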

Fitting a sine curve with polynomial-feature linear regression

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error  # MSE loss
from matplotlib.figure import SubplotParams

# Make matplotlib render the minus sign correctly
plt.rcParams['axes.unicode_minus'] = False

# 200 sample points of the sine function from -2*pi to 2*pi, with some random jitter
n_dots = 200
X = np.linspace(-2 * np.pi, 2 * np.pi, n_dots)
y = np.sin(X) + 0.2 * np.random.randn(n_dots) - 0.1
X = X.reshape(-1, 1)
y = y.reshape(-1, 1)


# Polynomial regression model
def polynomial_model(degree=1):
    # Polynomial features: specify the degree and whether to add a bias (constant) column
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    # Linear regression model, normalizing each feature to (0, 1)
    # (normalization only speeds up convergence; it does not improve accuracy)
    linear_regression = LinearRegression(normalize=True)
    # Assemble the pipeline
    pipeline = Pipeline([("多項式", polynomial_features), ("線性迴歸", linear_regression)])
    return pipeline


if __name__ == '__main__':
    degrees = [2, 3, 5, 10]
    models = []
    for d in degrees:
        model = polynomial_model(degree=d)
        # fit() calls fit and transform (or fit_transform) on each pipeline step in turn;
        # the last step only gets fit()
        model.fit(X, y)
        train_score = model.score(X, y)  # R^2 score
        mse = mean_squared_error(y, model.predict(X))  # MSE loss
        print("degree:{}\tscore:{}\tmse loss:{}".format(d, train_score, mse))
        models.append({"model": model, "degree": d})  # keep the trained models around

    # Plot the fit for each degree; SubplotParams can set shared subplot parameters, unused here
    # plt.figure(figsize=(12, 6), dpi=200, subplotpars=SubplotParams(hspace=3.0))
    # fig, axes = plt.subplots(2, 2)
    for i, mod in enumerate(models):
        fig = plt.subplot(2, 2, i + 1)
        plt.xlim(-8, 8)
        plt.title("Polynomial-feature linear regression (degree={})".format(mod["degree"]))
        plt.scatter(X, y, s=5, c='b', alpha=0.5)
        plt.plot(X, mod["model"].predict(X), 'r-')
        # fig.tight_layout()
    plt.show()

Output:

degree:2	score:0.13584632282895104	mse loss:0.4661817984705142
degree:3	score:0.2876206298259192	mse loss:0.3843046725996981
degree:5	score:0.8483945839508078	mse loss:0.08178601489384577
degree:10	score:0.9330965409663139	mse loss:0.036092162401397294

[Figure: the noisy sine samples with the fitted curves for degree = 2, 3, 5, 10]

Estimating Boston House Prices

On this dataset, the underfitting and overfitting learning-curve shapes covered in the first note show up much more clearly.
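The plot_learning_curve helper imported below was written in the first note. For reference, here is a minimal sketch of what it does, following the standard scikit-learn learning-curve recipe (the original may differ in details such as shaded score bands):

import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None):
    # Plot the mean train/CV score against the number of training examples
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv)
    plt.grid()
    plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', color='r', label="Training score")
    plt.plot(train_sizes, test_scores.mean(axis=1), 'o-', color='g', label="Cross-validation score")
    plt.legend(loc="best")
    return plt  # returned so callers can keep using plt afterwards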

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import time
from z5.liner_fit_sin import polynomial_model
from z3.learning_curve import plot_learning_curve
from sklearn.model_selection import ShuffleSplit
from matplotlib import pyplot as plt

# Load the Boston house-price dataset and split it into training and test sets
boston = load_boston()
X = boston.data  # shape=(506,13)
y = boston.target  # shape=(506,)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

# Use the polynomial-feature linear regression model written earlier
model = polynomial_model(degree=2)  # with degree=3 the validation score drops to -104.825038, i.e. badly overfit
start = time.perf_counter()  # timing: training plus scoring (time.clock() was removed in Python 3.8)
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
cv_score = model.score(X_test, y_test)
end = time.perf_counter()
print("Time: {0:.6f}, train score: {1:.6f}, validation score: {2:.6f}".format(end - start, train_score, cv_score))

# Plot the learning curves
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
plt.figure(figsize=(18, 4), dpi=100)
org_title = "Learning curve (degree={})"
degrees = (1, 2, 3)

for i in range(len(degrees)):
    plt.subplot(1, 3, i + 1)
    plt = plot_learning_curve(polynomial_model(degrees[i]), org_title.format(degrees[i]), X, y, ylim=(0.01, 1.01),
                              cv=cv)
plt.show()

Output:

Time: 0.027985, train score: 0.930547, validation score: 0.860465

[Figure: learning curves for degree = 1, 2, 3]

Logistic Regression on the Breast Cancer Dataset

Logistic regression can also be augmented with polynomial features.

Without polynomial features

Note: the way the book computes the ACC is wrong; it is corrected here.

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
import time
from z3.learning_curve import plot_learning_curve
from sklearn.model_selection import ShuffleSplit
from matplotlib import pyplot as plt

'''
Logistic regression: breast cancer data
'''

# Load and split the data
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
print("X的shape={},正樣本數:{},負樣本數:{}".format(X.shape, y[y == 1].shape[0], y[y == 0].shape[0]))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Model scores
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("訓練集得分:{trs:.6f},測試集得分:{tss:.6f}".format(trs=train_score, tss=test_score))

# Predict on the test set
y_pred = model.predict(X_test)
# The book makes a real mistake here. The correct way: np.equal compares the two arrays
# element-wise and returns a True/False array; np.count_nonzero() then counts the Trues,
# i.e. the number of correctly predicted samples
print("ACC:{}/{}".format(np.count_nonzero(np.equal(y_pred, y_test)), y_test.shape[0]))

# Find the samples predicted with probability below 0.9: the two class probabilities sum to 1,
# so the predicted probability is below 0.9 exactly when both are > 0.1.
# predict_proba returns the probability of each class for every sample
y_pred_proba = model.predict_proba(X_test)
# The classes are 0 and 1, so the first column should be the probability of class 0 and the second of class 1; the assertion double-checks this
assert y_pred[0] == (0 if y_pred_proba[0, 0] > y_pred_proba[0, 1] else 1)
# Among all test samples, those with p(negative) > 0.1
y_pp_big = y_pred_proba[y_pred_proba[:, 0] > 0.1]
# Of those, the ones whose p(positive) > 0.1 as well
y_pp_big = y_pp_big[y_pp_big[:, 1] > 0.1]
print(y_pp_big.shape)
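As an aside, the two-step filter above can be collapsed into a single boolean mask; an equivalent sketch:

# Samples where neither class reaches probability 0.9,
# i.e. both columns of predict_proba exceed 0.1
mask = (y_pred_proba[:, 0] > 0.1) & (y_pred_proba[:, 1] > 0.1)
y_pp_big = y_pred_proba[mask]  # same rows as the two-step version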

Output:

X shape=(569, 30), positive samples: 357, negative samples: 212
Train score: 0.962637, test score: 0.921053
ACC:105/114
(16, 2)
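Incidentally, the corrected ACC of 105/114 is just the mean accuracy that model.score(X_test, y_test) already reports for a classifier. A quick sanity check, reusing the variables from the script above:

n_correct = np.count_nonzero(np.equal(y_pred, y_test))
assert n_correct == (y_pred == y_test).sum()  # the same count, written more idiomatically
assert abs(model.score(X_test, y_test) - n_correct / y_test.shape[0]) < 1e-12  # score() is accuracy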

Adding polynomial features and an L1 penalty on top of the previous model

Because the point where the error contours touch the L1-norm contour lies on a coordinate axis, L1 regularization fights overfitting essentially by reducing the number of features: some parameters (here, the coefficients of the polynomial features) shrink all the way to 0.

The L2-norm contour is a circle, so its tangent point with the error contours generally does not lie on an axis; L2 regularization instead keeps every parameter as small as possible so that all features contribute a little, but none shrink exactly to 0.
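Written out, the two penalized objectives differ only in the penalty term, where L(w) is the unregularized loss (log-loss for logistic regression) and λ the regularization strength; note that scikit-learn's C parameter is its inverse, C = 1/λ:

$$J_{L1}(w) = L(w) + \lambda \sum_j |w_j| \qquad J_{L2}(w) = L(w) + \lambda \sum_j w_j^2$$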

# Use a pipeline to add polynomial features to the logistic regression model
def polynomial_model(degree=2, **kwargs):
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    logistic_regression = LogisticRegression(**kwargs)
    pipeline = Pipeline([("多項式特徵", polynomial_features), ("logistic迴歸", logistic_regression)])
    return pipeline


# The model with polynomial features added.
# Use the L1 norm as the penalty to sparsify the parameters (drive some to 0),
# keeping only the features that actually help the model
# (solver='liblinear' supports the L1 penalty; the lbfgs default of newer scikit-learn does not)
model = polynomial_model(degree=2, penalty='l1', solver='liblinear')
start = time.perf_counter()  # timing: training and testing
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
end = time.perf_counter()
print("Time: {0:.6f}, train score: {1:.6f}, test score: {2:.6f}".format(end - start, train_score, test_score))

# See how many features were NOT discarded (shrunk to 0) by the L1 penalty.
# Pull the logistic regression estimator out of the pipeline by the name it was registered under
logistic_regression = model.named_steps["logistic迴歸"]
# coef_ holds the learned parameter values
print("coef shape: {}, non-zero entries: {}".format(logistic_regression.coef_.shape, np.count_nonzero(logistic_regression.coef_)))

Output:

Time: 0.237963, train score: 1.000000, test score: 0.947368
coef shape: (1, 495), non-zero entries: 88
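The 495 comes straight from the degree-2 expansion of the 30 original features: 30 linear terms, 30 squares, and C(30, 2) = 435 cross terms, for 30 + 30 + 435 = 495 columns (there is no bias column because include_bias=False). The L1 penalty then zeroes out all but 88 of them.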

Comparing learning curves with L1 vs. L2 regularization

# Plot learning curves for the new models
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
org_title = "Learning curve: logistic regression with degree-{} polynomial features and {} penalty"
degrees = [1, 2]
penaltys = ["l1", "l2"]
fig = plt.figure(figsize=(12, 10), dpi=100)
for p in range(len(penaltys)):
    for i in range(len(degrees)):
        plt.subplot(len(penaltys), len(degrees), p * len(degrees) + i + 1)
        # liblinear handles both the l1 and l2 penalties
        plt = plot_learning_curve(polynomial_model(degree=degrees[i], penalty=penaltys[p], solver='liblinear'),
                                  org_title.format(degrees[i], penaltys[p]), X, y, ylim=(0.8, 1.01), cv=cv)
fig.tight_layout()
plt.show()

Output:
[Figure: four learning curves, degree 1 and 2 with L1 and L2 penalties]