
Shifting labels to predict future data in machine learning

Preface

  This article follows up on the previous post about the effect of normalization on classic machine-learning models. The new task this time is to predict future data by shifting (offsetting) the labels.

Experimental procedure

  Using different normalization methods and different models, shift the labels of the evaluation set and compute the MSE;

  Keep increasing the number of positions the labels are shifted by, compute the MSE each time, and plot the curves. Conclusions are drawn by comparing the MSE (mean squared error) values. A compact sketch of this loop is given below.
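In outline, the experiment looks roughly like the following. This is only a sketch under assumptions: shift_labels and mse_by_shift are hypothetical helpers standing in for the concrete code further down, and normalization is omitted here for brevity.

from sklearn.metrics import mean_squared_error as mse

def shift_labels(X, y, k):
    # Hypothetical helper: pair feature row t with the label k positions later.
    if k == 0:
        return X, y
    return X.iloc[:-k], y.iloc[k:]

def mse_by_shift(models, X_train, y_train, X_eval, y_eval, shifts):
    # Fit each model once on the training block, then record its MSE on the
    # evaluation block for every label shift k.
    curves = {name: [] for name in models}
    for name, model in models.items():
        model.fit(X_train, y_train)
        for k in shifts:
            Xk, yk = shift_labels(X_eval, y_eval, k)
            curves[name].append(mse(yk, model.predict(Xk)))
    return curves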

Process and results

Data processing (same as in the previous post):

test_sort_data = sort_data[:5000]
test_sort_target = sort_target[:5000]

sort_data1 = _sort_data[5000:16060]
sort_data2 = _sort_data[16060:]
sort_target1 = _sort_target[5000:16060]
sort_target2 = _sort_target[16060:]

Full data-processing code:

# sort by time
sort_data = data.sort_values(by='time', ascending=True)
sort_data.reset_index(inplace=True, drop=True)

target = data['T1AOMW_AV']
sort_target = sort_data['T1AOMW_AV']
del data['T1AOMW_AV']
del sort_data['T1AOMW_AV']

from sklearn.model_selection import train_test_split
test_sort_data = sort_data[16160:]
test_sort_target = sort_target[16160:]

_sort_data = sort_data[:16160]
_sort_target = sort_target[:16160]

from sklearn.model_selection import train_test_split
test_sort_data = sort_data[:5000]
test_sort_target = sort_target[:5000]

sort_data1 = _sort_data[5000:16060]
sort_data2 = _sort_data[16060:]
sort_target1 = _sort_target[5000:16060]
sort_target2 = _sort_target[16060:]

import pandas as pd   # needed for pd.DataFrame below
import scipy.stats as stats

dict_corr = {
    'spearman': [],
    'pearson': [],
    'kendall': [],
    'columns': []
}

for i in data.columns:
    corr_pear, pval = stats.pearsonr(sort_data[i], sort_target)
    corr_spear, pval = stats.spearmanr(sort_data[i], sort_target)
    corr_kendall, pval = stats.kendalltau(sort_data[i], sort_target)

    dict_corr['pearson'].append(abs(corr_pear))
    dict_corr['spearman'].append(abs(corr_spear))
    dict_corr['kendall'].append(abs(corr_kendall))
    dict_corr['columns'].append(i)

# screen features by their correlation with the target
dict_corr = pd.DataFrame(dict_corr)
dict_corr.describe()

Select the features whose correlations are above the 25% quantile:

new_fea = list(dict_corr[(dict_corr['pearson']>0.41) & (dict_corr['spearman']>0.45) & (dict_corr['kendall']>0.29)]['columns'].values)
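The hard-coded thresholds above appear to be read off dict_corr.describe(); if so, they could be derived from the 25% quantiles directly, as in the sketch below (this is an assumption about where 0.41/0.45/0.29 come from, not part of the original code):

# Hypothetical variant: take the cut-offs from the 25% quantile instead of hard-coding them
q = dict_corr[['pearson', 'spearman', 'kendall']].quantile(0.25)
mask = ((dict_corr['pearson'] > q['pearson']) &
        (dict_corr['spearman'] > q['spearman']) &
        (dict_corr['kendall'] > q['kendall']))
new_fea = list(dict_corr.loc[mask, 'columns'].values)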

The following lists are also needed; they collect results for plotting later:

import matplotlib.pyplot as plt

lr_plt = []
ridge_plt = []
svr_plt = []
RF_plt = []

Baseline MSE (labels not shifted):

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# min-max normalization
mm = MinMaxScaler()

lr = Lasso(alpha=0.5)
lr.fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
lr_ans = lr.predict(mm.transform(sort_data2[new_fea]))
lr_mse = mse(lr_ans, sort_target2)
lr_plt.append(lr_mse)
print('lr:', lr_mse)

ridge = Ridge(alpha=0.5)
ridge.fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
ridge_ans = ridge.predict(mm.transform(sort_data2[new_fea]))
ridge_mse = mse(ridge_ans, sort_target2)
ridge_plt.append(ridge_mse)
print('ridge:', ridge_mse)

svr = SVR(kernel='rbf', C=100, epsilon=0.1).fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
svr_ans = svr.predict(mm.transform(sort_data2[new_fea]))
svr_mse = mse(svr_ans, sort_target2)
svr_plt.append(svr_mse)
print('svr:', svr_mse)

estimator_RF = RandomForestRegressor().fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
predict_RF = estimator_RF.predict(mm.transform(sort_data2[new_fea]))
RF_mse = mse(predict_RF, sort_target2)
RF_plt.append(RF_mse)
print('RF:', RF_mse)

bst = xgb.XGBRegressor(learning_rate=0.1, n_estimators=550, max_depth=4, min_child_weight=5, seed=0,
                       subsample=0.7, colsample_bytree=0.7, gamma=0.1, reg_alpha=1, reg_lambda=1)
bst.fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
bst_ans = bst.predict(mm.transform(sort_data2[new_fea]))
print('bst:', mse(bst_ans, sort_target2))
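The same fit / predict / MSE pattern repeats for every model above, so it could be factored into a small helper, roughly as sketched below (evaluate_model is a hypothetical name, not from the original post; it reuses the mse alias and the scaler pattern from the code above):

def evaluate_model(model, scaler, X_train, y_train, X_eval, y_eval):
    # Fit the scaler and the model on the training block, then score on the evaluation block.
    model.fit(scaler.fit_transform(X_train), y_train)
    pred = model.predict(scaler.transform(X_eval))
    return mse(y_eval, pred)

# Example use with the objects defined above:
# lr_plt.append(evaluate_model(Lasso(alpha=0.5), MinMaxScaler(),
#                              sort_data1[new_fea], sort_target1,
#                              sort_data2[new_fea], sort_target2))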

First shift the labels by 5 positions:

change_sort_data2 = sort_data2.shift(periods=5, axis=0)
change_sort_target2 = sort_target2.shift(periods=-5, axis=0)
change_sort_data2.dropna(inplace=True)
change_sort_target2.dropna(inplace=True)
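In effect, this shift/dropna pair lines up each feature row with the label 5 positions later. A minimal toy check on a small Series (a hypothetical stand-in for sort_data2 / sort_target2, not part of the original post):

import pandas as pd

vals = pd.Series(range(20))
feat = vals.shift(periods=5).dropna()    # keeps values 0..14: the feature side still starts at row 0
lab = vals.shift(periods=-5).dropna()    # keeps values 5..19: the label side now starts 5 rows later
print(feat.values[:3], lab.values[:3])   # position k pairs row k with label k+5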

Shift the labels in multiples of 5 (each loop iteration repeats the shift/dropna pair, so the effective offset grows by another 5 positions per iteration):

mm = MinMaxScaler()

for i in range(0, 45, 5):
    print(i)
    lr = Lasso(alpha=0.5)
    lr.fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
    lr_ans = lr.predict(mm.transform(change_sort_data2[new_fea]))
    lr_mse = mse(lr_ans, change_sort_target2)
    lr_plt.append(lr_mse)
    print('lr:', lr_mse)

    ridge = Ridge(alpha=0.5)
    ridge.fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
    ridge_ans = ridge.predict(mm.transform(change_sort_data2[new_fea]))
    ridge_mse = mse(ridge_ans, change_sort_target2)
    ridge_plt.append(ridge_mse)
    print('ridge:', ridge_mse)

    svr = SVR(kernel='rbf', C=100, epsilon=0.1).fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
    svr_ans = svr.predict(mm.transform(change_sort_data2[new_fea]))
    svr_mse = mse(svr_ans, change_sort_target2)
    svr_plt.append(svr_mse)
    print('svr:', svr_mse)

    estimator_RF = RandomForestRegressor().fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
    predict_RF = estimator_RF.predict(mm.transform(change_sort_data2[new_fea]))
    RF_mse = mse(predict_RF, change_sort_target2)
    RF_plt.append(RF_mse)
    print('RF:', RF_mse)

#     bst = xgb.XGBRegressor(learning_rate=0.1, n_estimators=550, max_depth=4, min_child_weight=5, seed=0,
#                            subsample=0.7, colsample_bytree=0.7, gamma=0.1, reg_alpha=1, reg_lambda=1)
#     bst.fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
#     bst_ans = bst.predict(mm.transform(change_sort_data2[new_fea]))
#     print('bst:', mse(bst_ans, change_sort_target2))

    change_sort_target2 = change_sort_target2.shift(periods=-5, axis=0)
    change_sort_target2.dropna(inplace=True)
    change_sort_data2 = change_sort_data2.shift(periods=5, axis=0)
    change_sort_data2.dropna(inplace=True)

The results are as shown in the figure:

Next, plot the curves:

# x collects the shift sizes; it is not defined in the original snippet.
# With one unshifted baseline point plus nine shifted runs, it is assumed to be:
x = list(range(0, 50, 5))

plt.plot(x, lr_plt, label='lr', color='r', marker='o')
plt.plot(x, ridge_plt, label='ridge', color='b', marker='o')
plt.plot(x, svr_plt, label='svr', color='g', marker='o')
plt.plot(x, RF_plt, label='RF', color='y', marker='o')
plt.legend()
plt.show()

Drop lr and zoom in on the y-axis:

#plt.plot(x, lr_plt, label='lr', color='r', marker='o')
plt.plot(x, ridge_plt, label='ridge', color='b', marker='o')
plt.plot(x, svr_plt, label='svr', color='g', marker='o')
plt.plot(x, RF_plt, label='RF', color='y', marker='o')
plt.legend()
plt.show()

For the other normalization methods, simply replace MinMaxScaler with MaxAbsScaler or StandardScaler; one way to avoid editing the code by hand is sketched below.
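The three scalers could also be looped over in one pass. A minimal sketch, assuming the training/evaluation objects defined above (shown for Ridge only; the other models follow the same pattern):

for scaler in (MinMaxScaler(), MaxAbsScaler(), StandardScaler()):
    ridge = Ridge(alpha=0.5)
    ridge.fit(scaler.fit_transform(sort_data1[new_fea]), sort_target1)
    pred = ridge.predict(scaler.transform(sort_data2[new_fea]))
    print(type(scaler).__name__, 'ridge mse:', mse(pred, sort_target2))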

In general, shifting the labels increases the MSE; the gap relative to the unshifted baseline is smallest at a shift of about 10, which gives the most acceptable result.
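To read that off the recorded curves rather than the plot, a minimal sketch (assuming x and the *_plt lists defined above; the selection rule here, smallest gap to the unshifted baseline, is an interpretation of the conclusion and not part of the original post):

import numpy as np

for name, curve in [('ridge', ridge_plt), ('svr', svr_plt), ('RF', RF_plt)]:
    gaps = np.abs(np.array(curve[1:]) - curve[0])   # distance of each shifted run from the baseline MSE
    print(name, 'smallest gap at shift', x[1 + int(np.argmin(gaps))])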