1. 程式人生 > >機器學習入門之房價預測(線性迴歸)

機器學習入門之房價預測(線性迴歸)

#!/usr/bin/env python
# coding: utf-8

# In[1]:


# 1.定義問題

# 2.匯入資料

# 匯入類庫
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

import warnings

# Notebook setup and data loading.
#
# The original export had these statements collapsed onto one physical line,
# so everything after the first '#' was parsed as a comment and the
# StandardScaler import was split across two lines. They are restored to one
# statement per line here.

# Enable inline plotting when running under IPython/Jupyter. get_ipython() is
# only defined by IPython, so guard the call to keep the file runnable as a
# plain Python script as well.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython: there is no inline backend to configure.
    pass

# NOTE(review): this silences every warning, including deprecation warnings
# from pandas/seaborn -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Show all columns when displaying a DataFrame
pd.set_option('display.max_columns', None)

# Load the data
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')

# In[2]:

# 3. Understand the data

# Column names, non-null counts and dtypes
train_data.info()
# In[3]:

# Dimensions of the training data
train_data.shape

# In[4]:

# First 5 rows
train_data.head(5)

# In[5]:

# Descriptive statistics, transposed
train_data.describe().T

# In[6]:

# 4. Data visualisation

# Analyse SalePrice
# Printed explicitly: a bare expression in the middle of a cell is discarded.
print(train_data['SalePrice'].describe())
# NOTE(review): distplot is reportedly deprecated in newer seaborn releases in
# favour of histplot/displot -- confirm against the installed version.
sns.distplot(train_data['SalePrice'])
plt.show()

# In[7]:

# Correlation matrix.
# Restrict to numeric columns so the result does not depend on how the
# installed pandas treats non-numeric columns in corr(); compute it once and
# reuse it below instead of recomputing it for every lookup.
corr = train_data.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corr, vmax=1, vmin=-1, square=True)

# In[8]:

# Feature selection:
# keep the features whose correlation with SalePrice has absolute value > 0.5
# (the original author reports ten such features).
# The absolute value must be taken of the correlation, not of the threshold,
# otherwise strongly negatively correlated features are never selected.
corr[corr['SalePrice'].abs() > 0.5]

# In[9]:

# The ten selected features followed by the target column
cols = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
        'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea',
        'SalePrice']
train_data = train_data[cols]
train_data.info()

# In[10]:

# 5. Build the model

# Split features (first 10 columns) from the target (last column)
X = train_data.values[:, 0:10]
Y = train_data.values[:, 10]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)

# Model
model = LinearRegression()

# Fit on the training split and predict the held-out split
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
# Mean absolute error, in the units of SalePrice
print('cost:' + str(np.mean(np.abs(y_pred - Y_test))))

# In[11]:

# The cost on the raw data is large, so repeat the experiment on standardised
# data.
# NOTE(review): both scalers are fit on the full data before splitting, so
# statistics of the held-out rows leak into the transform.
X_scaled = StandardScaler().fit_transform(X)
Y_scaled = StandardScaler().fit_transform(Y.reshape(-1, 1))
X_scaled_train, X_scaled_test, Y_scaled_train, Y_scaled_test = train_test_split(
    X_scaled, Y_scaled, test_size=0.33, random_state=42)
model_scaled = LinearRegression()
model_scaled.fit(X_scaled_train, Y_scaled_train)
# Predict with the model that was fit on the scaled data. The original code
# called `model` (fit on raw data) here, and subtracting an (n, 1) target from
# an (n,) prediction broadcast to an (n, n) matrix; together these produced the
# puzzlingly larger error.
y_pred = model_scaled.predict(X_scaled_test)
# Flatten both sides so the difference is element-wise regardless of shape.
# This cost is in standardised target units, so it is not directly comparable
# with the cost printed in In[10].
print('cost:' + str(np.mean(np.abs(y_pred.ravel() - Y_scaled_test.ravel()))))

# In[12]:

# Add a placeholder target column so the same column list can be selected
test_data['SalePrice'] = None
# Take an explicit copy: filling values on a slice of another frame is chained
# assignment and is not guaranteed to modify test_data.
test_data = test_data[cols].copy()

# Fill missing values with the column median
for column in ('TotalBsmtSF', 'GarageCars', 'GarageArea'):
    test_data[column] = test_data[column].fillna(test_data[column].median())

# In[13]:

# Use only the feature columns (cols without the trailing 'SalePrice'), so the
# placeholder column does not force an object-dtype array.
X = test_data[cols[:-1]].values
y_test_pre = model.predict(X)
test_data['SalePrice'] = y_test_pre
test_data.head(10)