1. 程式人生 > >Python_sklearn機器學習庫學習筆記(四)decision_tree(決策樹)

Python_sklearn機器學習庫學習筆記(四)decision_tree(決策樹)

min n) 空間 strong output epo from 標簽 ict

# 決策樹

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
import zipfile
#壓縮節省空間
z=zipfile.ZipFile(‘ad-dataset.zip‘)
# df=pd.read_csv(z.open(z.namelist()[0]),header=None,low_memory=False)
# df = pd.read_csv(z.open(z.namelist()[0]), header=None, low_memory=False)
df=pd.read_csv(‘.\\tree_data\\ad.data‘,header=None)
explanatory_variable_columns=set(df.columns.values)
response_variable_column=df[len(df.columns.values)-1]
#最後一列是代表的標簽類型
explanatory_variable_columns.remove(len(df.columns)-1)
y=[1 if e ==‘ad.‘ else 0 for e in response_variable_column]
X=df.loc[:,list(explanatory_variable_columns)]
#匹配?字符,並把值轉化為-1
X.replace(to_replace=‘ *\?‘, value=-1, regex=True, inplace=True)
技術分享
X_train,X_test,y_train,y_test=train_test_split(X,y)
#用信息增益啟發式算法建立決策樹
pipeline=Pipeline([(‘clf‘,DecisionTreeClassifier(criterion=‘entropy‘))])
parameters = {
‘clf__max_depth‘: (150, 155, 160),
‘clf__min_samples_split‘: (1, 2, 3),
‘clf__min_samples_leaf‘: (1, 2, 3)
}
#f1查全率和查準率的調和平均
grid_search=GridSearchCV(pipeline,parameters,n_jobs=-1,
                         verbose=1,scoring=‘f1‘)
grid_search.fit(X_train,y_train)
print ‘最佳效果:%0.3f‘%grid_search.best_score_
print ‘最優參數‘
best_parameters=grid_search.best_estimator_.get_params()
best_parameters
技術分享

輸出結果:

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   34.7s finished
最佳效果:0.888
最優參數
Out[123]:
{‘clf‘: DecisionTreeClassifier(class_weight=None, criterion=‘entropy‘, max_depth=160,
             max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
             min_samples_split=3, min_weight_fraction_leaf=0.0,
             presort=False, random_state=None, splitter=‘best‘),
 ‘clf__class_weight‘: None,
 ‘clf__criterion‘: ‘entropy‘,
 ‘clf__max_depth‘: 160,
 ‘clf__max_features‘: None,
 ‘clf__max_leaf_nodes‘: None,
 ‘clf__min_samples_leaf‘: 1,
 ‘clf__min_samples_split‘: 3,
 ‘clf__min_weight_fraction_leaf‘: 0.0,
 ‘clf__presort‘: False,
 ‘clf__random_state‘: None,
 ‘clf__splitter‘: ‘best‘,
 ‘steps‘: [(‘clf‘,
   DecisionTreeClassifier(class_weight=None, criterion=‘entropy‘, max_depth=160,
               max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
               min_samples_split=3, min_weight_fraction_leaf=0.0,
               presort=False, random_state=None, splitter=‘best‘))]}

for param_name in sorted(parameters.keys()):
    print (‘\t%s:%r‘%(param_name,best_parameters[param_name]))
predictions=grid_search.predict(X_test)
print classification_report(y_test,predictions)

輸出結果:

clf__max_depth:150
clf__min_samples_leaf:1
clf__min_samples_split:1
precision recall f1-score support

0 0.97 0.99 0.98 703
1 0.91 0.84 0.87 117

avg / total 0.96 0.96 0.96 820

df.head()

輸出結果;

0123456789...1549155015511552155315541555155615571558
0 125 125 1.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
3 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
4 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.

# 決策樹集成

技術分享
#coding:utf-8
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

df=pd.read_csv(‘.\\tree_data\\ad.data‘,header=None,low_memory=False)
explanatory_variable_columns=set(df.columns.values)
response_variable_column=df[len(df.columns.values)-1]
技術分享
df.head()
0123456789...1549155015511552155315541555155615571558
0 125 125 1.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
3 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
4 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
技術分享
#The last column describes the targets(去掉最後一列)
explanatory_variable_columns.remove(len(df.columns.values)-1)
y=[1 if e==‘ad.‘ else 0 for e in response_variable_column]
X=df.loc[:,list(explanatory_variable_columns)]
#置換有?的為-1
X.replace(to_replace=‘ *\?‘, value=-1, regex=True, inplace=True)
X_train,X_test,y_train,y_test=train_test_split(X,y)
pipeline=Pipeline([(‘clf‘,RandomForestClassifier(criterion=‘entropy‘))])
parameters = {
‘clf__n_estimators‘: (5, 10, 20, 50),
‘clf__max_depth‘: (50, 150, 250),
‘clf__min_samples_split‘: (1, 2, 3),
‘clf__min_samples_leaf‘: (1, 2, 3)
}
grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=1,scoring=‘f1‘)
grid_search.fit(X_train,y_train)
技術分享
print(u‘最佳效果:%0.3f‘%grid_search.best_score_)
print u‘最優的參數:‘
best_parameters=grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print(‘\t%s:%r‘%(param_name,best_parameters[param_name]))

輸出結果:

最佳效果:0.929 最優的參數: clf__max_depth:250 clf__min_samples_leaf:1 clf__min_samples_split:3 clf__n_estimators:50
predictions=grid_search.predict(X_test)
print classification_report(y_test,predictions)

輸出結果:

precision recall f1-score support

0 0.98 1.00 0.99 705
1 0.97 0.90 0.93 115

avg / total 0.98 0.98 0.98 820

Python_sklearn機器學習庫學習筆記(四)decision_tree(決策樹)