
House Price Prediction (Advanced Version, Test)


#coding=utf8

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

# Drop the first column (Id) from the features; it serves only as the index.
train_df = pd.read_csv('./input/train.csv', index_col=0)
test_df = pd.read_csv('./input/test.csv', index_col=0)

# SalePrice is right-skewed, so we model log(price + 1) rather than the raw price.
prices = pd.DataFrame({'price': train_df['SalePrice'],
                       'log(price + 1)': np.log1p(train_df['SalePrice'])})
# prices.hist()
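Because the model is trained on log1p(SalePrice), every prediction has to be mapped back with np.expm1 before it means anything in dollars (the commented-out blend at the end does exactly this). A minimal round-trip check:

# log1p and expm1 are exact inverses, so log-scale predictions can be
# mapped back to prices without bias from the +1 offset.
p = np.array([100000.0, 250000.0])
assert np.allclose(np.expm1(np.log1p(p)), p)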
# The target: log1p of SalePrice, popped off the training frame.
y_train = np.log1p(train_df.pop('SalePrice'))

# Merge train and test so every preprocessing step is applied to both consistently.
all_df = pd.concat((train_df, test_df), axis=0)

# Variable conversion: MSSubClass is a categorical code, not a quantity,
# so cast it to str to keep it out of the numeric columns (see the toy check below).
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)
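A quick illustration, on toy data rather than the real dataset, of why the cast matters: left as an integer, MSSubClass would be standardized like a quantity, but as a string, get_dummies (used in the next step) expands it into indicator columns.

demo = pd.DataFrame({'MSSubClass': [20, 60, 20]})
# As str, each code becomes its own 0/1 column instead of one "ordered" number.
print(pd.get_dummies(demo['MSSubClass'].astype(str), prefix='MSSubClass'))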
# When numbers are used to encode categories, beware: numbers carry magnitude,
# so treating category codes as quantities will mislead the model later on.
# Instead, express categories with One-Hot encoding.
# pandas' built-in get_dummies does One-Hot in a single call.
# One-Hot all of the categorical columns:
all_dummy_df = pd.get_dummies(all_df)
# print(pd.get_dummies(all_df['MSSubClass'], prefix='MSSubClass').head())

# Handle missing values: fill each column with its mean.
mean_cols = all_dummy_df.mean()
all_dummy_df = all_dummy_df.fillna(mean_cols)
# print(all_dummy_df.isnull().sum().sum())  # should now be 0

# Standardize the numerical data. The One-Hot 0/1 columns obviously should not
# be standardized; the targets are the columns that were numeric to begin with.
# First, find which columns are numerical.
numeric_cols = all_df.columns[all_df.dtypes != 'object']
numeric_col_means = all_dummy_df.loc[:, numeric_cols].mean()
numeric_col_std = all_dummy_df.loc[:, numeric_cols].std()
all_dummy_df.loc[:, numeric_cols] = (all_dummy_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std

# Split the combined frame back into train and test by index.
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]

X_train = dummy_train_df.values
X_test = dummy_test_df.values
print(X_train.shape)

# Ridge: scan alpha and record cross-validated RMSE (on the log target).
alphas = np.logspace(-3, 2, 50)
test_scores = []
for alpha in alphas:
    clf = Ridge(alpha=alpha)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(alphas, test_scores)
plt.title('Alpha vs CV Error')
plt.show()

# Random forest: scan max_features.
max_features = [.1, .3, .5, .7, .9, .99]
test_scores = []
for max_feat in max_features:
    clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(max_features, test_scores)
plt.title('Max Features vs CV Error')
plt.show()

# A bit of more advanced ensembling.
# base_estimator can be omitted here (the built-in default is used), but the
# results fall short of an already-tuned base estimator, as the plots confirm.
ridge = Ridge(alpha=15)

# Bagging
params = [1, 10, 15, 20, 25, 30, 40]
test_scores = []
for param in params:
    clf = BaggingRegressor(n_estimators=param, base_estimator=ridge)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title('n_estimators vs CV Error')
plt.show()

# Boosting
params = [10, 15, 20, 25, 30, 35, 40, 45, 50]
test_scores = []
for param in params:
    clf = AdaBoostRegressor(n_estimators=param, base_estimator=ridge)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title('n_estimators vs CV Error')
plt.show()

# XGBoost: scan max_depth.
params = [1, 2, 3, 4, 5, 6]
test_scores = []
for param in params:
    clf = XGBRegressor(max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title('max_depth vs CV Error')
plt.show()

# Final blend (left commented out in this test run):
"""
rf = RandomForestRegressor(n_estimators=500, max_features=.3)
ridge.fit(X_train, y_train)
rf.fit(X_train, y_train)
y_ridge = np.expm1(ridge.predict(X_test))
y_rf = np.expm1(rf.predict(X_test))
y_final = (y_ridge + y_rf) / 2
"""
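Each loop above records the mean CV RMSE per setting, so rather than eyeballing the plot, the minimizer can be read off directly. A minimal sketch for the Ridge scan, which must run immediately after that loop since every loop reuses the test_scores name (the same pattern applies to the others):

# Pick the alpha with the lowest mean cross-validated RMSE.
best_alpha = alphas[int(np.argmin(test_scores))]
print('best alpha:', best_alpha)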
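To turn the blend into a Kaggle submission, one more step writes the predictions out. A minimal sketch, assuming the commented-out blend has been run so y_final exists, and assuming the competition's standard Id/SalePrice column format; the output path is my own choice:

# Hypothetical output file; columns follow the sample_submission.csv convention.
submission = pd.DataFrame({'Id': test_df.index, 'SalePrice': y_final})
submission.to_csv('./submission.csv', index=False)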
