
Implementing a Random Forest Classifier with scikit-learn

Practicing some of the methods learned in this chapter on my own.

 

First, the core workflow: build a classification model, tune it with a validation curve, and finally use the trained model to make predictions.

In [20]:
# Load the preprocessed data
import pandas as pd

df = pd.read_csv('../data/hr-analytics/hr_data_processed.csv')
df.columns
Out[20]:
Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'work_accident', 'left',
       'promotion_last_5years', 'department_IT', 'department_RandD',
       'department_accounting', 'department_hr', 'department_management',
       'department_marketing', 'department_product_mng', 'department_sales',
       'department_support', 'department_technical', 'salary_high',
       'salary_low', 'salary_medium'],
      dtype='object')
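The department_* and salary_* columns are one-hot encodings of the original categorical fields. The preprocessed file already contains them, but as a rough sketch of how they could have been produced (the raw file path and original column names here are my assumptions, not from the notebook):

# Hypothetical sketch of the preprocessing that produced the one-hot
# department_* and salary_* columns; file path and column names assumed
raw = pd.read_csv('../data/hr-analytics/hr_data.csv')
df = pd.get_dummies(raw, columns=['department', 'salary'])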
In [21]:
# Select the features and the target variable

features = ['satisfaction_level', 'last_evaluation', 'number_project',
            'average_montly_hours', 'time_spend_company', 'work_accident',
            'promotion_last_5years', 'department_IT', 'department_RandD',
            'department_accounting', 'department_hr', 'department_management',
            'department_marketing', 'department_product_mng', 'department_sales',
            'department_support', 'department_technical', 'salary_high',
            'salary_low', 'salary_medium']
X = df[features].values
y = df.left.values
In [33]:
# Use a random forest classifier and compute a validation curve over max_depth
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
import numpy as np

np.random.seed(1)  # fix the seed so the random draws are reproducible
clf = RandomForestClassifier(n_estimators=20)
max_depths = [3, 4, 5, 6, 7, 9, 12, 15, 18, 21]
print('Training {} models'.format(len(max_depths)))
train_scores, test_scores = validation_curve(estimator=clf,
                                             X=X, y=y,
                                             param_name='max_depth',
                                             param_range=max_depths,
                                             cv=5)
 
Training 10 models
In [43]:
def plot_validation_curve(train_scores, test_scores, param_range,
                          xlabel='', log=False):
    '''
    This code is from the scikit-learn docs:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

    Also here:
    https://github.com/rasbt/python-machine-learning-book-2nd-edition/blob/master/code/ch06/ch06.ipynb
    '''
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    fig = plt.figure()
    plt.plot(param_range, train_mean,
             color=sns.color_palette('Set1')[1], marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(param_range,
                     train_mean + train_std, train_mean - train_std,
                     alpha=0.15, color=sns.color_palette('Set1')[1])
    plt.plot(param_range, test_mean,
             color=sns.color_palette('Set1')[0], linestyle='--',
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(param_range,
                     test_mean + test_std, test_mean - test_std,
                     alpha=0.15, color=sns.color_palette('Set1')[0])
    if log:
        plt.xscale('log')
    plt.legend(loc='lower right')
    if xlabel:
        plt.xlabel(xlabel)
    plt.ylabel('Accuracy')
    plt.ylim(0.9, 1.0)
    return fig
In [45]:
import matplotlib.pyplot as plt
import seaborn as sns 
In [47]:
# Plot the validation curve
plot_validation_curve(train_scores, test_scores, max_depths, xlabel='max_depth')
plt.xlim(3, 21)
plt.savefig('../figures/test_classfication_model.png',
            bbox_inches='tight', dpi=300)
In [58]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from IPython.display import display
from mlxtend.plotting import plot_decision_regions

def cross_val_class_score(clf, X, y, cv=10):
    kfold = StratifiedKFold(n_splits=cv).split(X, y)
    class_accuracy = []
    for k, (train, test) in enumerate(kfold):
        clf.fit(X[train], y[train])  # fit the model on the training folds
        y_test = y[test]
        y_pred = clf.predict(X[test])
        # Build the confusion matrix for this fold, then divide its diagonal
        # by the row sums to get the per-class accuracy for class 0 and class 1
        cmat = confusion_matrix(y_test, y_pred)
        class_acc = cmat.diagonal() / cmat.sum(axis=1)
        class_accuracy.append(class_acc)
        print('fold: {:d} accuracy {:s}'.format(k + 1, str(class_acc)))
    return np.array(class_accuracy)
In [61]:
# Show the k-fold cross-validation results
np.random.seed(1)
clf = RandomForestClassifier(n_estimators=200, max_depth=6)
scores = cross_val_class_score(clf, X, y)
print('accuracy {} +/- {}'.format(scores.mean(axis=0), scores.std(axis=0)))
 
fold: 1 accuracy [ 0.99825022  0.88826816]
fold: 2 accuracy [ 0.99825022  0.84033613]
fold: 3 accuracy [ 0.99387577  0.81232493]
fold: 4 accuracy [ 0.99300087  0.85154062]
fold: 5 accuracy [ 0.99475066  0.82633053]
fold: 6 accuracy [ 0.99387577  0.85994398]
fold: 7 accuracy [ 0.99650044  0.87394958]
fold: 8 accuracy [ 0.99650044  0.83473389]
fold: 9 accuracy [ 0.99474606  0.87394958]
fold: 10 accuracy [ 0.99562172  0.89635854]
accuracy [ 0.99553722  0.85577359] +/- [ 0.00172575  0.02614334]
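The per-class accuracies above come from dividing the confusion matrix diagonal by its row sums, which is the recall of each class. A tiny self-contained check with made-up numbers (not from the HR data) shows the computation:

# Hypothetical 2x2 confusion matrix: rows = true class, columns = predicted
import numpy as np
cmat = np.array([[90, 10],   # class 0: 90 correct, 10 misclassified
                 [ 5, 45]])  # class 1: 45 correct, 5 misclassified
per_class_acc = cmat.diagonal() / cmat.sum(axis=1)
print(per_class_acc)  # [0.9 0.9] -> recall for class 0 and class 1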
In [69]:
# Draw a box plot of the per-class accuracies
fig = plt.figure(figsize=(5, 7))
sns.boxplot(data=pd.DataFrame(scores, columns=[0, 1]),
            palette=sns.color_palette('Set1'))
plt.xlabel('Left')
plt.ylabel('accuracy')
plt.show()
In [71]:
# Compute the feature importances, pairing each value with its feature name.
# Note: zip against df[features].columns rather than df.columns; df.columns
# still contains the 'left' target, which would shift the labels out of
# alignment with the 20 features the model was actually trained on.
d = (clf.feature_importances_, df[features].columns)
list(zip(*d))
Out[71]:
[(0.36430881606946935, 'satisfaction_level'),
 (0.10606469651847085, 'last_evaluation'),
 (0.19088737947190054, 'number_project'),
 (0.13082595880187356, 'average_montly_hours'),
 (0.17955451160561237, 'time_spend_company'),
 (0.012101773234080513, 'work_accident'),
 (0.0008113047024873478, 'promotion_last_5years'),
 (0.00021062542962211009, 'department_IT'),
 (0.00077649873359240354, 'department_RandD'),
 (0.00022487937663401313, 'department_accounting'),
 (0.00043794363826079859, 'department_hr'),
 (0.00031980481539390949, 'department_management'),
 (0.00011370864098983321, 'department_marketing'),
 (0.00015365441067812497, 'department_product_mng'),
 (0.00031929963267123197, 'department_sales'),
 (0.00036881031257490304, 'department_support'),
 (0.00039082790477380948, 'department_technical'),
 (0.0050013161512548546, 'salary_high'),
 (0.005775253267745778, 'salary_low'),
 (0.0013529372819138833, 'salary_medium')]
In [75]:
# Visualize the feature importances
pd.Series(clf.feature_importances_,
          name='Feature importance',
          index=df[features].columns).sort_values().plot.barh()
plt.show()
In [76]:
# Print the low-importance features (everything below the top five)
importances = list(pd.Series(clf.feature_importances_,
                             index=df[features].columns)
                   .sort_values(ascending=False).index)
np.array(importances[5:])