1. 程式人生 > >吳裕雄 資料探勘與分析案例實戰(6)——線性迴歸預測模型

吳裕雄 資料探勘與分析案例實戰(6)——線性迴歸預測模型

# 工作年限與收入之間的散點圖
# 匯入第三方模組
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 匯入資料集
income = pd.read_csv(r'F:\\python_Data_analysis_and_mining\\07\\Salary_Data.csv')
print(income.shape)
print(income.head())
# 繪製散點圖
sns.lmplot(x = 'YearsExperience', y = 'Salary', data = income, ci = None)
# 顯示圖形
plt.show()

# 簡單線性迴歸模型的引數求解
# 樣本量
n = income.shape[0]
# 計算自變數、因變數、自變數平方、自變數與因變數乘積的和
sum_x = income.YearsExperience.sum()
sum_y = income.Salary.sum()
sum_x2 = income.YearsExperience.pow(2).sum()
xy = income.YearsExperience * income.Salary
sum_xy = xy.sum()
# 根據公式計算迴歸模型的引數
b = (sum_xy-sum_x*sum_y/n)/(sum_x2-sum_x**2/n)
a = income.Salary.mean()-b*income.YearsExperience.mean()
# 打印出計算結果
print('迴歸引數a的值:',a)
print('迴歸引數b的值:',b)

# 匯入第三方模組
import statsmodels.api as sm

# 利用收入資料集,構建迴歸模型
fit = sm.formula.ols('Salary ~ YearsExperience', data = income).fit()
# 返回模型的引數值
print(fit.params)

# 多元線性迴歸模型的構建和預測
# 匯入模組
from sklearn import model_selection

# 匯入資料
Profit = pd.read_excel(r'F:\\python_Data_analysis_and_mining\\07\\Predict to Profit.xlsx')
print(Profit.shape)
print(Profit.head())
# 將資料集拆分為訓練集和測試集
train, test = model_selection.train_test_split(Profit, test_size = 0.2, random_state=1234)
print(train.head())
print(test.head())
# 根據train資料集建模
model = sm.formula.ols('Profit ~ RD_Spend + Administration + Marketing_Spend + C(State)', data = train).fit()
print('模型的偏回歸係數分別為:\n', model.params)
# 刪除test資料集中的Profit變數,用剩下的自變數進行預測
test_X = test.drop(labels = 'Profit', axis = 1)
pred = model.predict(exog = test_X)
print('對比預測值和實際值的差異:\n',pd.DataFrame({'Prediction':pred,'Real':test.Profit}))

# 生成由State變數衍生的啞變數
dummies = pd.get_dummies(Profit.State)
# 將啞變數與原始資料集水平合併
Profit_New = pd.concat([Profit,dummies], axis = 1)
print(Profit_New.shape)
print(Profit_New.head())
# 刪除State變數和California變數(因為State變數已被分解為啞變數,New York變數需要作為參照組)
Profit_New.drop(labels = ['State','New York'], axis = 1, inplace = True)

# 拆分資料集Profit_New
train, test = model_selection.train_test_split(Profit_New, test_size = 0.2, random_state=1234)
# 建模
model2 = sm.formula.ols('Profit ~ RD_Spend + Administration + Marketing_Spend + Florida + California', data = train).fit()
print('模型的偏回歸係數分別為:\n', model2.params)

# 匯入第三方模組
import numpy as np

# 計算建模資料中,因變數的均值
ybar = train.Profit.mean()
print(ybar)
# 統計變數個數和觀測個數
p = model2.df_model
print(p)
n = train.shape[0]
print(n)
# 計算迴歸離差平方和
RSS = np.sum((model2.fittedvalues-ybar) ** 2)
print(RSS)
# 計算誤差平方和
ESS = np.sum(model2.resid ** 2)
print(ESS)
# 計算F統計量的值
F = (RSS/p)/(ESS/(n-p-1))
print('F統計量的值:',F)
# 返回模型中的F值
print(model2.fvalue)

# 匯入模組
from scipy.stats import f

# 計算F分佈的理論值
F_Theroy = f.ppf(q=0.95, dfn = p,dfd = n-p-1)
print('F分佈的理論值為:',F_Theroy)

# 模型的概覽資訊
print(model2.summary())

# 正態性檢驗
# 直方圖法
# 匯入第三方模組
import scipy.stats as stats

# 中文和負號的正常顯示
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
# 繪製直方圖
sns.distplot(a = Profit_New.Profit, bins = 10, fit = stats.norm, norm_hist = True,
hist_kws = {'color':'steelblue', 'edgecolor':'black'},
kde_kws = {'color':'black', 'linestyle':'--', 'label':'核密度曲線'},
fit_kws = {'color':'red', 'linestyle':':', 'label':'正態密度曲線'})
# 顯示圖例
plt.legend()
# 顯示圖形
plt.show()

# 殘差的正態性檢驗(PP圖和QQ圖法)
pp_qq_plot = sm.ProbPlot(Profit_New.Profit)
# 繪製PP圖
pp_qq_plot.ppplot(line = '45')
plt.title('P-P圖')
# 繪製QQ圖
pp_qq_plot.qqplot(line = 'q')
plt.title('Q-Q圖')
# 顯示圖形
plt.show()

# 匯入模組
import scipy.stats as stats

a = stats.shapiro(Profit_New.Profit)
print(a)
# 生成正態分佈和均勻分佈隨機數
rnorm = np.random.normal(loc = 5, scale=2, size = 10000)
print(rnorm)
runif = np.random.uniform(low = 1, high = 100, size = 10000)
print(runif)
# 正態性檢驗
KS_Test1 = stats.kstest(rvs = rnorm, args = (rnorm.mean(), rnorm.std()), cdf = 'norm')
print(KS_Test1)
KS_Test2 = stats.kstest(rvs = runif, args = (runif.mean(), runif.std()), cdf = 'norm')
print(KS_Test2)

# 匯入statsmodels模組中的函式
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 自變數X(包含RD_Spend、Marketing_Spend和常數列1)
X = sm.add_constant(Profit_New.ix[:,['RD_Spend','Marketing_Spend']])
print(X.shape)
print(X.head())

# 構造空的資料框,用於儲存VIF值
vif = pd.DataFrame()
vif["features"] = X.columns
print(vif)
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)
# 返回VIF值
print(vif)

# 計算資料集Profit_New中每個自變數與因變數利潤之間的相關係數
a = Profit_New.drop('Profit', axis = 1).corrwith(Profit_New.Profit)
print(a)

# 散點圖矩陣
# 匯入模組
import matplotlib.pyplot as plt
import seaborn

# 繪製散點圖矩陣
seaborn.pairplot(Profit_New.ix[:,['RD_Spend','Administration','Marketing_Spend','Profit']])
# 顯示圖形
plt.show()

import statsmodels.formula.api as smf

# 模型修正
model3 = smf.ols('Profit ~ RD_Spend + Marketing_Spend', data = train).fit()
# 模型迴歸係數的估計值
print(model3.params)
# 異常值檢驗
outliers = model3.get_influence()
print(outliers)
# 高槓杆值點(帽子矩陣)
leverage = outliers.hat_matrix_diag
print(leverage)
# dffits值
dffits = outliers.dffits[0]
print(dffits)
# 學生化殘差
resid_stu = outliers.resid_studentized_external
print(resid_stu)
# cook距離
cook = outliers.cooks_distance[0]
print(cook)
# 合併各種異常值檢驗的統計量值
contat1 = pd.concat([pd.Series(leverage, name = 'leverage'),pd.Series(dffits, name = 'dffits'),
pd.Series(resid_stu,name = 'resid_stu'),pd.Series(cook, name = 'cook')],axis = 1)
print(contat1)

# 重設train資料的行索引
train.index = range(train.shape[0])
# 將上面的統計量與train資料集合並
profit_outliers = pd.concat([train,contat1], axis = 1)
print(profit_outliers.head())
# 計算異常值數量的比例
outliers_ratio = sum(np.where((np.abs(profit_outliers.resid_stu)>2),1,0))/profit_outliers.shape[0]
print(outliers_ratio)
# 挑選出非異常的觀測點
none_outliers = profit_outliers.ix[np.abs(profit_outliers.resid_stu)<=2,]
print(none_outliers)

# 應用無異常值的資料集重新建模
model4 = smf.ols('Profit ~ RD_Spend + Marketing_Spend', data = none_outliers).fit()
print(model4.params)
# Durbin-Watson統計量
# 模型概覽
print(model4.summary())

# 方差齊性檢驗
# 設定第一張子圖的位置
ax1 = plt.subplot2grid(shape = (2,1), loc = (0,0))
# 繪製散點圖
ax1.scatter(none_outliers.RD_Spend, (model4.resid-model4.resid.mean())/model4.resid.std())
# 新增水平參考線
ax1.hlines(y = 0 ,xmin = none_outliers.RD_Spend.min(),xmax = none_outliers.RD_Spend.max(), color = 'red', linestyles = '--')
# 新增x軸和y軸標籤
ax1.set_xlabel('RD_Spend')
ax1.set_ylabel('Std_Residual')

# 設定第二張子圖的位置
ax2 = plt.subplot2grid(shape = (2,1), loc = (1,0))
# 繪製散點圖
ax2.scatter(none_outliers.Marketing_Spend, (model4.resid-model4.resid.mean())/model4.resid.std())
# 新增水平參考線
ax2.hlines(y = 0 ,xmin = none_outliers.Marketing_Spend.min(),xmax = none_outliers.Marketing_Spend.max(), color = 'red', linestyles = '--')
# 新增x軸和y軸標籤
ax2.set_xlabel('Marketing_Spend')
ax2.set_ylabel('Std_Residual')

# 調整子圖之間的水平間距和高度間距
plt.subplots_adjust(hspace=0.6, wspace=0.3)
# 顯示圖形
plt.show()

# BP檢驗
sm.stats.diagnostic.het_breushpagan(model4.resid, exog_het = model4.model.exog)

# 模型預測
# model4對測試集的預測
pred4 = model4.predict(exog = test.ix[:,['RD_Spend','Marketing_Spend']])
# 繪製預測值與實際值的散點圖
plt.scatter(x = test.Profit, y = pred4)
# 新增斜率為1,截距項為0的參考線
plt.plot([test.Profit.min(),test.Profit.max()],[test.Profit.min(),test.Profit.max()],
color = 'red', linestyle = '--')
# 新增軸標籤
plt.xlabel('實際值')
plt.ylabel('預測值')
# 顯示圖形
plt.show()