1. 程式人生 > >記一次隨機森林小實踐

記一次隨機森林小實踐

前言

程式碼是從Jupyter Notebook匯出來的
過程中借鑑了一些資料清洗的寫法,有時間再補充。
好記性不如爛筆頭,免得下次又到處查語法。

py版本

# -*- coding: utf-8 -*-
# @Time    : 18-11-1 上午10:43
# @Author  : wanghai
# @Email   : 
# @File    : testt.py
# @Software: PyCharm Community Edition

# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split

# In[2]:


raw_df = pd.read_csv('data.csv')
df1 = raw_df.drop(['apply_id'], axis=1)
# Summary statistics, to check whether there are many outliers
df1.describe()


# In[3]:


def scatterplot(x_data, y_data, area, alpha, x_label="", y_label="", title="", color="g"):
    """Draw and show a scatter plot of ``y_data`` against ``x_data``.

    ``area`` is passed to ``plt.scatter`` as the marker size ``s``, ``alpha``
    as the opacity and ``color`` as ``c``.  ``x_label``, ``y_label`` and
    ``title`` set the axis labels and the plot title.
    """
    # Plot the arguments, not the module-level ``x`` / ``y`` variables.
    plt.scatter(x_data, y_data, s=area, alpha=alpha, c=color)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(loc='upper left')
    plt.show()


# # Data cleaning and label preparation

# In[4]:


# Difference between the actual and the planned repayment time, in days
df1['date'] = (pd.to_datetime(df1['act_repay_dt']) - pd.to_datetime(df1['plan_repay_dt'])).dt.total_seconds() / (
    24 * 60 * 60)
# Visualisation: y is the same series as x, so every point lies on the diagonal
x = df1['date']
y = x
area = np.pi * 3
scatterplot(x, y, area, 0.7, x_label="date", y_label="y", title="pay time img")


# In[5]:


date_show = df1['date'].dropna()

# matplotlib histogram
plt.hist(date_show, facecolor='blue', edgecolor='black', bins=155)

# seaborn histogram; kde=False, so no kernel density estimate is drawn
sns.distplot(date_show, hist=True, kde=False,
             bins=500, color='blue',
             hist_kws={'edgecolor': 'black'})
plt.title('Histogram of pay date')
plt.xlabel('pay date')
plt.ylabel('people count')
plt.show()


# In[6]:


print('The shape of our features is:', df1.shape)

# Label preparation: y = 1 when there is no actual repayment date or the
# repayment came more than 7 days after the planned date, otherwise 0
df1['y'] = np.where((pd.isnull(df1['act_repay_dt'])) | (df1['date'] > 7), 1, 0)

illegal = df1[(pd.isnull(df1['act_repay_dt'])) | (df1['date'] > 7)]
print("至今未還款或者還款時間逾期的人有 %d 人,佔比 %.3f" % (len(illegal), float(len(illegal)) / float(len(df1))))
columns = ['act_repay_dt', 'plan_repay_dt', 'date']
# Drop the columns the label was derived from (first pass)
df1.drop(columns, inplace=True, axis=1)

# Crude outlier removal: for every remaining column, drop the 3 rows that hold
# its largest values (TODO: this approach needs improving)
# NOTE(review): `columns` also contains the label column 'y' -- confirm that
# dropping rows based on the label is intended.
columns = df1.columns.tolist()
for col in columns:
    indexs = df1.nlargest(3, columns=[col]).index.values
    for i in indexs:
        df1.drop(i, inplace=True)

print('The shape of our features after del is:', df1.shape)
# TODO: compute correlations and remove features with very high correlation


# In[7]:


df1.head(3)


# # Fill missing values with the column mean

# In[8]:


df1 = df1.fillna(df1.mean())
# Features are every column except the last; the label 'y' is the last column
x = np.array(df1.iloc[:, 0:-1])
y = np.array(df1.iloc[:, -1])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=11)

# dt = DictVectorizer(sparse=False)
# x_train = dt.fit_transform(x_train.to_dict())
# x_test = dt.fit_transform(x_test.to_dict())

print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)


# In[9]:


# # Decision-tree version
# dtc = DecisionTreeClassifier()
# dtc.fit(x_train, y_train)
# dt_predict = dtc.predict(x_test)
# print(dtc.score(x_test, y_test))
# print(classification_report(y_test, dt_predict, target_names=["died", "survived"]))

# Random-forest version
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)
# score() returns the mean accuracy on the given test data and labels
print("均值填充平均精度為:{:.2f}".format(rfc.score(x_test, y_test)))


# In[11]:


print("The accuracy/recall rate and other results are as follows:")
print(classification_report(y_test, rfc_y_predict, target_names=["plan_repay", "overdue_repay"]))


# In[12]:


print(rfc_y_predict)


# In[13]:


print(y_test)


# In[14]:


# Feature importances
print(rfc.feature_importances_)

markdown版本

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
/home/c/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
# Load the raw data set from data.csv
raw_df = pd.read_csv('data.csv')
# Drop one column; its name is shown as ××× in this listing (presumably redacted)
df1 = raw_df.drop(['×××'], axis = 1)
# Summary statistics, to check whether there are many outliers
df1.describe()
def scatterplot(x_data, y_data, area, alpha, x_label="", y_label="", title="", color = "g"):
    """Draw and show a scatter plot of ``y_data`` against ``x_data``.

    ``area`` is passed to ``plt.scatter`` as the marker size ``s``, ``alpha``
    as the opacity and ``color`` as ``c``.  ``x_label``, ``y_label`` and
    ``title`` set the axis labels and the plot title.
    """
    # Plot the arguments rather than the module-level ``x`` / ``y`` variables,
    # so the function works for any data passed in.
    plt.scatter(x_data, y_data, s=area, alpha=alpha, c=color)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(loc='upper left')
    plt.show()

# 資料清洗,標籤準備

# Difference between the actual and the planned repayment time, converted from
# seconds to days (the column names are shown as ××× in this listing)
df1['date'] = (pd.to_datetime(df1['×××']) - pd.to_datetime(df1['×××'])).dt.total_seconds()/(24*60*60)
# Visualisation: y is the same series as x, so every point lies on the diagonal
x = df1['date']
y = x
# Marker size, passed to plt.scatter as `s` inside scatterplot
area = np.pi*3
scatterplot(x, y, area, 0.7, x_label="date", y_label="y", title="pay time img")

png

# Drop missing values from the 'date' column before plotting
date_show = df1['date'].dropna()

# matplotlib histogram with 155 bins
plt.hist(date_show, facecolor = 'blue', edgecolor = 'black',bins = 155)

# seaborn histogram with 500 bins, issued before the same plt.show();
# kde=False, so no kernel density estimate is drawn
sns.distplot(date_show, hist=True, kde=False, 
             bins=500, color = 'blue',
             hist_kws={'edgecolor':'black'})
plt.title('Histogram of pay date')
plt.xlabel('pay date')
plt.ylabel('people count')
plt.show()
/home/c/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "

在這裡插入圖片描述

print('The shape of our features is:', df1.shape)

# Label preparation: y = 1 when there is no actual repayment date or the
# repayment came more than 7 days after the planned date, otherwise 0.
# The mask is computed once and reused for both the label and the report.
overdue_mask = pd.isnull(df1['act_repay_dt']) | (df1['date'] > 7)
df1['y'] = np.where(overdue_mask, 1, 0)

illegal = df1[overdue_mask]
print("至今未還款或者還款時間逾期的人有 %d 人,佔比 %.3f" % (len(illegal), float(len(illegal)) / float(len(df1))))
columns = ['act_repay_dt', 'plan_repay_dt', 'date']
# Drop the columns the label was derived from (first pass)
df1.drop(columns, inplace=True, axis=1)

# Crude outlier removal: for every remaining column, drop the 3 rows that hold
# its largest values (TODO: this approach needs improving)
# NOTE(review): `columns` also contains the label column 'y' -- confirm that
# dropping rows based on the label is intended.
columns = df1.columns.tolist()
for col in columns:
    indexs = df1.nlargest(3, columns=[col]).index.values
    # Drop the selected rows in a single call instead of one at a time
    df1.drop(indexs, inplace=True)

print('The shape of our features after del is:', df1.shape)
# TODO: compute correlations and remove features with very high correlation
# TODO:計算相關性,幹掉相關係數特別高的

('The shape of our features is:', (12154, 221))
至今未還款或者還款時間逾期的人有 1837 人,佔比 0.151
('The shape of our features after del is:', (11497, 219))
df1.head(3)

# 均值填充空值

# Fill missing values with each column's mean.
# NOTE(review): the means are computed on the full data set before the
# train/test split, so test rows contribute to the imputation values.
df1 = df1.fillna(df1.mean())
# Features are every column except the last; the last column is used as the label
x = np.array(df1.iloc[:,0:-1])
y = np.array(df1.iloc[:,-1])
# Hold out 30% of the rows for testing; random_state fixes the split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=11)

# dt = DictVectorizer(sparse=False)
# x_train = dt.fit_transform(x_train.to_dict())
# x_test = dt.fit_transform(x_test.to_dict())

print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)
('Training Features Shape:', (8047, 218))
('Training Labels Shape:', (8047,))
('Testing Features Shape:', (3450, 218))
('Testing Labels Shape:', (3450,))
# # 決策樹版本
# dtc = DecisionTreeClassifier()
 
# dtc.fit(x_train, y_train)
 
# dt_predict = dtc.predict(x_test)
 
# print(dtc.score(x_test, y_test))
# print(classification_report(y_test, dt_predict, target_names=["died", "survived"]))

# Random-forest version
# NOTE(review): no random_state is passed here, so results may differ between runs
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)
# score() returns the mean accuracy on the given test data and labels
print("均值填充平均精度為:{:.2f}".format(rfc.score(x_test,y_test)))
均值填充平均精度為:0.86
# Per-class precision / recall / f1.  target_names are listed in label order:
# label 0 is reported as "plan_repay" and label 1 as "overdue_repay".
print("The accuracy/recall rate and other results are as follows:")
print(classification_report(y_test, rfc_y_predict, target_names=["plan_repay", "overdue_repay"]))
The accuracy/recall rate and other results are as follows:
               precision    recall  f1-score   support

   plan_repay       0.87      0.99      0.92      2976
overdue_repay       0.33      0.04      0.07       474

  avg / total       0.79      0.86      0.81      3450
rfc_y_predict
array([0, 0, 0, ..., 0, 0, 0])
y_test
array([0, 0, 0, ..., 0, 0, 0])
# 特徵重要性
rfc.feature_importances_

調優

max_features、n_estimators、min_samples_leaf

設定交叉驗證

# Parameter grid for the cross-validated search; every key must map to its
# candidate list with ':' (the original used '=' for max_depth, a syntax error).
cv_parameter = [{'min_samples_leaf':[5,15,25,35], 'n_estimators':[50,200,500], 'max_depth': [2, 3, 5]}]
n_jobs:並行執行的作業數(n_jobs=1 表示不並行)
clf = GridSearchCV(estimator=rfc,param_grid=cv_parameter, cv=5, n_jobs=1)

max_depth :

整數或None,可選(預設=None)
樹的最大深度。如果為None,則持續擴展節點,直到所有葉子都是純的,或直到所有葉子包含的樣本數少於 min_samples_split。

from sklearn.model_selection import GridSearchCV
# Base estimator for the search; random_state fixes the forest's randomness
rfc = RandomForestClassifier(max_features = 'sqrt', random_state = 3)
# 3 x 4 x 3 = 36 parameter combinations
cv_parameter = [{'n_estimators':[50,200,500], 'min_samples_leaf':[5,15,25,35], 'max_depth':[2, 3, 5]}]
# 5-fold cross-validation for every combination, run as a single job (n_jobs=1)
clf = GridSearchCV(estimator=rfc,param_grid=cv_parameter, cv=5, n_jobs=1)

clf.fit(x_train, y_train)
print('Best parameters:')
print(clf.best_params_)

在這裡插入圖片描述

設定權重

# Weight class 1 five times as heavily as class 0
rfc = RandomForestClassifier(random_state = 3, class_weight={0: 1, 1: 5})

關於結果classification_report

在這裡插入圖片描述
預測出25個正樣本,其中11個正確;真實正樣本共474個。精確率(precision)為 11/25 = 0.44,召回率(recall)為 11/474 ≈ 0.023。
在這裡插入圖片描述