機器學習之整合學習
阿新 • 發佈:2019-02-02
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
1、資料載入
# Load the fruit dataset (tab-separated) and show a quick preview.
fruits_df = pd.read_table('fruit_data_with_colors.txt')
print(fruits_df.head())
print('樣本個數:', len(fruits_df))

# Build a lookup from numeric fruit label to human-readable fruit name.
fruit_name_dict = {label: name
                   for label, name in zip(fruits_df['fruit_label'],
                                          fruits_df['fruit_name'])}
print(fruit_name_dict)

# Select the four numeric features and the label column, then hold out
# 25% of the samples as a test set (fixed seed for reproducibility).
feature_cols = ['mass', 'width', 'height', 'color_score']
X = fruits_df[feature_cols]
y = fruits_df['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/4, random_state=0)
print('資料集樣本數:{},訓練集樣本數:{},測試集樣本數:{}'.format(
    len(X), len(X_train), len(X_test)))
2、特徵歸一化
# Fit a min-max scaler on the training data only and reuse it on the test
# data, so no information from the test set leaks into preprocessing.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report each feature's range before and after scaling (after: [0, 1]).
for dim in range(4):
    raw_col = X_train.iloc[:, dim]
    scaled_col = X_train_scaled[:, dim]
    print('歸一化前,訓練資料第{}維特徵最大值:{:.3f},最小值:{:.3f}'.format(dim + 1, raw_col.max(), raw_col.min()))
    print('歸一化後,訓練資料第{}維特徵最大值:{:.3f},最小值:{:.3f}'.format(dim + 1, scaled_col.max(), scaled_col.min()))
    print()
3、資料建模
# 3.1 Stacking: three heterogeneous base learners (kNN, RBF-SVM, decision
# tree) whose predictions are combined by a logistic-regression meta-learner.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier

clf1 = KNeighborsClassifier()
clf2 = SVC()
clf3 = DecisionTreeClassifier()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

# Fit each base learner on its own (for standalone scores), then the stack;
# order matches the original sequence of fit calls.
for model in (clf1, clf2, clf3, sclf):
    model.fit(X_train_scaled, y_train)

print('kNN測試集準確率:{:.3f}'.format(clf1.score(X_test_scaled, y_test)))
print('SVM測試集準確率:{:.3f}'.format(clf2.score(X_test_scaled, y_test)))
print('DT測試集準確率:{:.3f}'.format(clf3.score(X_test_scaled, y_test)))
print('Stacking測試集準確率:{:.3f}'.format(sclf.score(X_test_scaled, y_test)))
# 3.2 AdaBoost: tune the number of boosting rounds with a 3-fold
# cross-validated grid search, then score the best model on the test set.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

ada_grid = {'n_estimators': [20, 40, 60, 80, 100, 120, 140]}
clf = GridSearchCV(AdaBoostClassifier(), ada_grid, cv=3, scoring='accuracy')
clf.fit(X_train_scaled, y_train)

print('最優引數:', clf.best_params_)
print('驗證集最高得分:', clf.best_score_)
print('測試集準確率:{:.3f}'.format(clf.score(X_test_scaled, y_test)))
# 3.3 GBDT: tune the learning rate (shrinkage) with a 3-fold
# cross-validated grid search; GridSearchCV is imported in 3.2 above.
from sklearn.ensemble import GradientBoostingClassifier

gbdt_grid = {'learning_rate': [0.001, 0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(GradientBoostingClassifier(), gbdt_grid,
                   cv=3, scoring='accuracy')
clf.fit(X_train_scaled, y_train)

print('最優引數:', clf.best_params_)
print('驗證集最高得分:', clf.best_score_)
print('測試集準確率:{:.3f}'.format(clf.score(X_test_scaled, y_test)))
# 3.4 Random forest: tune the number of trees with a 3-fold cross-validated
# grid search and report validation/test accuracy.
# NOTE: the duplicate `from sklearn.model_selection import GridSearchCV`
# was removed — it is already imported in the AdaBoost section (3.2),
# matching how the GBDT section (3.3) relies on that single import.
from sklearn.ensemble import RandomForestClassifier

parameters = {'n_estimators': [10, 50, 100, 150, 200]}
# random_state=0 fixes the forest's bootstrap/feature sampling for
# reproducible results across runs.
clf = GridSearchCV(RandomForestClassifier(random_state=0), parameters,
                   cv=3, scoring='accuracy')
clf.fit(X_train_scaled, y_train)

print('最優引數:', clf.best_params_)
print('驗證集最高得分:', clf.best_score_)
print('測試集準確率:{:.3f}'.format(clf.score(X_test_scaled, y_test)))