
Ensemble Learning in Machine Learning

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

1. Data Loading

# Load the dataset
fruits_df = pd.read_table('fruit_data_with_colors.txt')
print(fruits_df.head())
print('Number of samples:', len(fruits_df))

# Build a dictionary mapping fruit labels to fruit names
fruit_name_dict = dict(zip(fruits_df['fruit_label'], fruits_df['fruit_name']))
print(fruit_name_dict)

# Split the dataset into training and test sets
X = fruits_df[['mass', 'width', 'height', 'color_score']]
y = fruits_df['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=0)
print('Total samples: {}, training samples: {}, test samples: {}'.format(len(X), len(X_train), len(X_test)))
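Before modeling, it can help to check how balanced the classes are, since accuracy is easier to interpret on roughly balanced data. The snippet below is a small optional addition (not part of the original walkthrough) that counts samples per class using the fruit_name column loaded above.

# Optional sanity check (addition): number of samples per class
print(fruits_df['fruit_name'].value_counts())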

2. Feature Normalization

from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training data only, then apply the same
# transformation to the test data to avoid information leakage
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for i in range(4):
    print('Before normalization, training feature {}: max {:.3f}, min {:.3f}'.format(
        i + 1, X_train.iloc[:, i].max(), X_train.iloc[:, i].min()))
    print('After normalization, training feature {}: max {:.3f}, min {:.3f}'.format(
        i + 1, X_train_scaled[:, i].max(), X_train_scaled[:, i].min()))
    print()
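With its default feature_range of (0, 1), MinMaxScaler rescales each feature as x_scaled = (x - min) / (max - min), using the minimum and maximum of the training data. A minimal sketch (an addition to the original post) verifying this against the scaler's output:

# Manual min-max scaling, assuming the default feature_range of (0, 1)
col_min = X_train.values.min(axis=0)
col_max = X_train.values.max(axis=0)
X_manual = (X_train.values - col_min) / (col_max - col_min)
print(np.allclose(X_manual, X_train_scaled))  # expected: True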

3. Model Building

# 3.1 Stacking
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier

clf1 = KNeighborsClassifier()
clf2 = SVC()
clf3 = DecisionTreeClassifier()
lr = LogisticRegression()

sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 
                          meta_classifier=lr)

clf1.fit(X_train_scaled, y_train)
clf2.fit(X_train_scaled, y_train)
clf3.fit(X_train_scaled, y_train)
sclf.fit(X_train_scaled, y_train)

print('kNN test set accuracy: {:.3f}'.format(clf1.score(X_test_scaled, y_test)))
print('SVM test set accuracy: {:.3f}'.format(clf2.score(X_test_scaled, y_test)))
print('DT test set accuracy: {:.3f}'.format(clf3.score(X_test_scaled, y_test)))
print('Stacking test set accuracy: {:.3f}'.format(sclf.score(X_test_scaled, y_test)))
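Stacking first trains the base classifiers, then feeds their predictions as input features to the meta-classifier (logistic regression here). A single train/test split can be noisy on a dataset this small, so one option is to compare the models with cross-validation on the training set; the comparison below is a minimal sketch added to the original post, using sklearn's cross_val_score (mlxtend's StackingClassifier is sklearn-compatible).

from sklearn.model_selection import cross_val_score

# Compare all four models with 3-fold cross-validation on the training set
for model, name in zip([clf1, clf2, clf3, sclf], ['kNN', 'SVM', 'DT', 'Stacking']):
    scores = cross_val_score(model, X_train_scaled, y_train, cv=3, scoring='accuracy')
    print('{}: mean CV accuracy {:.3f} (std {:.3f})'.format(name, scores.mean(), scores.std()))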

# 3.2 AdaBoost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': [20, 40, 60, 80, 100, 120, 140]}
clf = GridSearchCV(AdaBoostClassifier(), parameters, cv=3, scoring='accuracy')
clf.fit(X_train_scaled, y_train)
print('Best parameters:', clf.best_params_)
print('Best CV score:', clf.best_score_)
print('Test set accuracy: {:.3f}'.format(clf.score(X_test_scaled, y_test)))
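To see how the number of weak learners affects validation accuracy, the cv_results_ attribute of the fitted GridSearchCV can be plotted against the grid values. This visualization is an addition to the original post, reusing the matplotlib import from the top of the article.

# Mean cross-validated accuracy for each n_estimators value tried above
mean_scores = clf.cv_results_['mean_test_score']
plt.plot(parameters['n_estimators'], mean_scores, marker='o')
plt.xlabel('n_estimators')
plt.ylabel('mean CV accuracy')
plt.title('AdaBoost grid search')
plt.show()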

# 3.3 GBDT
from sklearn.ensemble import GradientBoostingClassifier

parameters = {'learning_rate': [0.001, 0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(GradientBoostingClassifier(), parameters, cv=3, scoring='accuracy')
clf.fit(X_train_scaled, y_train)
print('Best parameters:', clf.best_params_)
print('Best CV score:', clf.best_score_)
print('Test set accuracy: {:.3f}'.format(clf.score(X_test_scaled, y_test)))
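In gradient boosting, learning_rate scales each tree's contribution and trades off against the number of boosting stages. The best estimator's staged_predict method yields predictions after each stage, which makes it possible to watch test accuracy evolve as trees are added; the sketch below is an addition to the original post.

from sklearn.metrics import accuracy_score

# Test accuracy after each boosting stage of the tuned GBDT
best_gbdt = clf.best_estimator_
staged_acc = [accuracy_score(y_test, y_pred)
              for y_pred in best_gbdt.staged_predict(X_test_scaled)]
plt.plot(range(1, len(staged_acc) + 1), staged_acc)
plt.xlabel('boosting stage')
plt.ylabel('test accuracy')
plt.title('GBDT: accuracy per boosting stage')
plt.show()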

# 3.4 Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators':[10, 50, 100, 150, 200]}
clf = GridSearchCV(RandomForestClassifier(random_state=0), parameters, cv=3, scoring='accuracy')
clf.fit(X_train_scaled, y_train)

print('Best parameters:', clf.best_params_)
print('Best CV score:', clf.best_score_)
print('Test set accuracy: {:.3f}'.format(clf.score(X_test_scaled, y_test)))
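Random forests also expose impurity-based feature importances (the mean decrease in impurity across trees), which indicate which of the four measurements drive the predictions. The short sketch below, an addition to the original walkthrough, reads them from the best estimator found by the grid search.

# Impurity-based feature importances of the tuned random forest
best_rf = clf.best_estimator_
for name, imp in sorted(zip(X.columns, best_rf.feature_importances_),
                        key=lambda t: t[1], reverse=True):
    print('{}: {:.3f}'.format(name, imp))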