使用Sklearn模組建立聚類、迴歸、分類模型並評價
阿新 • • 發佈:2018-12-17
資料預處理和降維
首先來學習下載入資料集、劃分資料集、資料預處理以及PCA降維
# Load the Boston housing dataset with sklearn's built-in loader and report
# the shapes of the feature matrix, target vector, and feature-name array.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2 — confirm the installed sklearn version before running.
from sklearn.datasets import load_boston

boston = load_boston()
boston_data, boston_target = boston.data, boston.target
boston_names = boston.feature_names
print('boston資料集資料形狀為:', boston_data.shape)
print('boston資料集標籤形狀為:', boston_target.shape)
print('boston資料集特徵名形狀為:', boston_names.shape)
# Split the Boston data into an 80% training / 20% test partition; the
# fixed random_state makes the split reproducible across runs.
from sklearn.model_selection import train_test_split
import numpy as np

split_result = train_test_split(boston_data, boston_target,
                                test_size=0.2, random_state=42)
boston_data_train, boston_data_test = split_result[0], split_result[1]
boston_target_train, boston_target_test = split_result[2], split_result[3]
print('訓練集資料的形狀為:', boston_data_train.shape)
print('訓練集標籤的形狀為:', boston_target_train.shape)
print('測試集資料的形狀為:', boston_data_test.shape)
print('測試集標籤的形狀為:', boston_target_test.shape)
# Standardise features with a sklearn transformer: fit the scaler on the
# training split only, then apply the same transform to both splits so no
# test-set statistics leak into preprocessing.
from sklearn.preprocessing import StandardScaler
stdScaler = StandardScaler().fit(boston_data_train)
boston_trainScaler = stdScaler.transform(boston_data_train)
boston_testScaler = stdScaler.transform(boston_data_test)
print('標準差標準化後訓練集資料的方差', np.var(boston_trainScaler))
print('標準差標準化後訓練集資料的均值', np.mean(boston_trainScaler))
# Fixed: this line computes the variance (np.var) but was labelled
# "均值" (mean) — the label now correctly says "方差" (variance).
print('標準差標準化後測試集資料的方差', np.var(boston_testScaler))
print('標準差標準化後測試集資料的均值', np.mean(boston_testScaler))
# Reduce the standardised features to 5 principal components; the PCA is
# fitted on the training split and the same projection is reused for the
# test split.
from sklearn.decomposition import PCA

pca_model = PCA(n_components=5)
pca_model.fit(boston_trainScaler)
boston_trainPca = pca_model.transform(boston_trainScaler)
boston_testPca = pca_model.transform(boston_testScaler)
print('降維後訓練集形狀:', boston_trainPca.shape)
print('降維後測試集形狀:', boston_testPca.shape)
執行結果如圖
聚類模型的構建與評價
聚類演算法中我們以kmeans為例對種子資料進行分析,建立模型並加以評價
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Seeds dataset: the first 7 columns are numeric features, column 8 is the
# ground-truth class label. TODO confirm the file path/separator locally.
seeds = pd.read_csv('f:/data/seeds_dataset.txt', sep='\t')
seeds_data = seeds.iloc[:, :7].values
seeds_target = seeds.iloc[:, 7].values
seeds_names = seeds.columns[:7]
# Standardise so every feature contributes comparably to the Euclidean
# distances K-means minimises.
stdScale = StandardScaler().fit(seeds_data)
seeds_dataScale = stdScale.transform(seeds_data)
# Fixed: fit on the *scaled* data. The original fitted on raw seeds_data,
# which left the standardisation above as dead code.
kmeans = KMeans(n_clusters=3, random_state=42).fit(seeds_dataScale)
print('構建的kmeans模型為:', kmeans)
# Evaluate cluster counts 2..6 with the Calinski-Harabasz index (higher is
# better). Fixed: the metric was renamed from calinski_harabaz_score to
# calinski_harabasz_score in scikit-learn 0.20 (old name removed in 0.23).
from sklearn.metrics import calinski_harabasz_score
for i in range(2, 7):
    kmeans = KMeans(n_clusters=i, random_state=123).fit(seeds_dataScale)
    score = calinski_harabasz_score(seeds_dataScale, kmeans.labels_)
    print('seeds資料聚%d類calinski_harabaz指數為:%f' % (i, score))
結果如圖 執行結果顯示,在聚類數目為3時,calinski_harabaz指數最大,聚類效果最好
分類模型的構建與評價(對鮑魚年齡特徵進行預測)
分類模型中我們以SVM為例對鮑魚年齡特徵進行分析
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Abalone dataset: the first 8 columns are features, column 9 is the ring
# count (a proxy for age) used as the class label.
abalone = pd.read_csv('f:/data/abalone.data', sep=',')
abalone_data = abalone.iloc[:, :8]
abalone_target = abalone.iloc[:, 8]
# One-hot encode the categorical 'sex' column. (The original comment called
# this "discretising a continuous feature", which it is not — 'sex' is
# categorical.)
sex = pd.get_dummies(abalone_data['sex'])
abalone_data = pd.concat([abalone_data, sex], axis=1)
abalone_data.drop('sex', axis=1, inplace=True)
# Split 80% train / 20% test.
# Fixed: the original passed train_size=0.2, which trained on only 20% of
# the data; the intended hold-out fraction is test_size=0.2.
abalone_data_train, abalone_data_test, \
abalone_target_train, abalone_target_test = \
    train_test_split(abalone_data, abalone_target,
                     test_size=0.2, random_state=42)
# Standardise using statistics fitted on the training split only.
abaloneScaler = StandardScaler().fit(abalone_data_train)
abalone_data_train_std = abaloneScaler.transform(abalone_data_train)
abalone_data_test_std = abaloneScaler.transform(abalone_data_test)
# Project onto the first 6 principal components.
pca = PCA(n_components=6).fit(abalone_data_train_std)
abalone_data_train_pca = pca.transform(abalone_data_train_std)
abalone_data_test_pca = pca.transform(abalone_data_test_std)
# Train an SVM classifier (default RBF kernel) and report per-class metrics
# on the held-out split.
svm_abalone = SVC().fit(abalone_data_train_pca, abalone_target_train)
abalone_target_pre = svm_abalone.predict(abalone_data_test_pca)
print('svm的分類報告為:\n',
      classification_report(abalone_target_test, abalone_target_pre))
結果如圖 結果表明對9的預測效果較好,其他較為欠缺
構建迴歸模型並評價
迴歸模型中我們以梯度提升迴歸樹模型為例對房價資料進行分析建模並評價(注意:梯度提升迴歸樹屬於整合學習方法,並非線性迴歸模型)
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, \
    mean_absolute_error, mean_squared_error, \
    median_absolute_error, r2_score

# California-housing data: every column but the last is a feature; the
# last column is the house-value target.
house = pd.read_csv('f:/data/cal_housing.data', sep=',')
house_data = house.iloc[:, :-1]
house_target = house.iloc[:, -1]
# Reproducible 80/20 train/test split.
(house_train, house_test,
 house_target_train, house_target_test) = train_test_split(
    house_data, house_target, test_size=0.2, random_state=42)
# Fit a gradient-boosted regression-tree ensemble with default settings.
GBR_house = GradientBoostingRegressor()
GBR_house.fit(house_train, house_target_train)
print(GBR_house)
# Score the held-out split with several regression metrics.
house_target_pre = GBR_house.predict(house_test)
print('梯度提升迴歸樹模型的平均絕對誤差為:', mean_absolute_error(house_target_test, house_target_pre))
print('梯度提升迴歸樹模型的均方誤差為:', mean_squared_error(house_target_test, house_target_pre))
print('梯度提升迴歸樹模型的中值絕對誤差為:', median_absolute_error(house_target_test, house_target_pre))
print('梯度提升迴歸樹模型的可解釋方差值為:', explained_variance_score(house_target_test, house_target_pre))
print('梯度提升迴歸樹模型的R^2值為:', r2_score(house_target_test, house_target_pre))
結果如下 結果表明本次構建的梯度提升迴歸樹模型的平均絕對誤差和均方誤差相對合理,且可解釋方差值和R^2值較接近1,故本次構建的模型是一個較為有效的模型。