matplotlib.pyplot繪製kmeans的聚合程度,以及輪廓係數
阿新 • • 發佈:2018-12-15
Kmeans2Pmml.py
# -*- coding:utf-8 -*-
"""Train a KMeans model from a CSV file, export it to PMML, and plot results.

The feature matrix may have any number of columns; it is reduced to two
dimensions with PCA before the cluster scatter plot is drawn, so the
visualisation works for data with more than two features.
"""
import random
import sys

import numpy as np                                   # numerical helpers
import pandas
import matplotlib.pyplot as plt                      # plotting
from sklearn import metrics                          # model evaluation
from sklearn.cluster import KMeans                   # clustering
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline


def make_result_pic(x_train, silhouette_s, pic_kmeans, data_view, n_clusters, result_pic):
    """Reduce ``x_train`` to 2-D with PCA, re-cluster it, and save a figure.

    Parameters
    ----------
    x_train : DataFrame/array
        Training features (any number of columns).
    silhouette_s : float
        Silhouette score computed on the full data set (shown in the figure).
    pic_kmeans : KMeans
        An unfitted KMeans instance used only for this visualisation.
    data_view : str
        Human-readable summary of the train/test split (shown in the figure).
    n_clusters : int
        Number of clusters (drives colours and centre markers).
    result_pic : str
        Path of the PNG file to write.
    """
    # Project onto 2 components so clusters can be drawn on a plane.
    reduced_data = PCA(n_components=2).fit_transform(x_train)
    print(len(reduced_data))
    pic_kmeans.fit(reduced_data)
    x_pre = pic_kmeans.predict(reduced_data)

    # Count how many samples fall into each cluster
    # (renamed from `dict`, which shadowed the builtin).
    cluster_sizes = {}
    for label in x_pre:
        cluster_sizes[label] = cluster_sizes.get(label, 0) + 1

    centers = pic_kmeans.cluster_centers_  # one centre per cluster
    # One random RGB colour per cluster.
    color_list = []
    for _ in range(n_clusters):
        color_list.append((round(random.uniform(0, 1), 4),
                           round(random.uniform(0, 1), 4),
                           round(random.uniform(0, 1), 4)))

    plt.figure(figsize=(10, 10))  # create the canvas
    # Subplot 1: scatter plot of the clusters and their centres.
    plt.subplot(2, 2, 1)
    for i in range(n_clusters):
        index_sets = np.where(x_pre == i)      # indices of samples in cluster i
        cluster = reduced_data[index_sets]     # that cluster's 2-D points
        # Wrap the colour in a list so matplotlib treats it as a single RGB
        # colour rather than a sequence of scalars to colour-map.
        plt.scatter(cluster[:, 0], cluster[:, 1], c=[color_list[i]], marker='.')
        plt.plot(centers[i][0], centers[i][1], 'o',
                 markerfacecolor=color_list[i], markeredgecolor='k', markersize=6)
    # Subplot 2: silhouette score and data-split summary (text only).
    plt.subplot(2, 2, 2)
    plt.axis('off')
    plt.title('silhouette_s:' + str(silhouette_s) + " " + data_view, loc='center')
    # Subplot 3: per-cluster sample distribution (text only).
    plt.subplot(2, 2, 3)
    plt.axis('off')
    plt.title('distribution:' + str(cluster_sizes), loc='center')
    # Auto-adjust subplot sizes/spacing, save, and free the figure.
    fig = plt.gcf()
    fig.tight_layout()
    fig.savefig(result_pic)
    plt.close()


def main(modelName, trainingFilePath, code, k_test_size, k_clusters, k_random_state, k_max_iter):
    """Train KMeans on a CSV file and write a PMML model plus a result PNG.

    By convention the first CSV column is a row number and is dropped from
    the features; at least two feature columns are required.

    Parameters
    ----------
    modelName : str       Output PMML path (PNG path is derived from it).
    trainingFilePath : str CSV file with the training data.
    code : str            Encoding of the CSV file.
    k_test_size : float   Fraction of rows held out as the test set.
    k_clusters : int      Number of clusters.
    k_random_state : int or None  Random seed for KMeans.
    k_max_iter : int      Maximum KMeans iterations.
    """
    # The evaluation picture is written next to the PMML file.
    result_pic = str(modelName).replace("pmml", "png")
    iris_df = pandas.read_csv(trainingFilePath, encoding=code)
    columns = iris_df.columns.tolist()
    # Fail fast: need at least id column + two feature columns
    # (the original checked this only after loading and splitting the data).
    if len(columns) < 3:
        print("columnNum error")
        sys.exit(1)
    print("check success")
    # Default: the first column is the row number, drop it from the features.
    first_colName = columns[0]
    X = iris_df[iris_df.columns.difference([first_colName])]
    # Split into train/test sets by the given ratio.
    x_train, x_test = train_test_split(X, test_size=k_test_size, random_state=0)
    data_view = ("total: " + str(len(iris_df))
                 + " train:" + str(len(x_train))
                 + " test:" + str(len(x_test)))
    print(data_view)
    # Two identical models: one trained on the full data for PMML export,
    # one refit on the PCA-reduced data inside make_result_pic().
    model_kmeans = KMeans(n_clusters=k_clusters, random_state=k_random_state,
                          max_iter=k_max_iter)
    pic_kmeans = KMeans(n_clusters=k_clusters, random_state=k_random_state,
                        max_iter=k_max_iter)
    pipeline = PMMLPipeline([
        ("classifier", model_kmeans)
    ])
    pipeline.fit(X)               # train the clustering model
    y_pre = pipeline.predict(X)   # cluster assignments for evaluation
    # Mean silhouette coefficient as the model-quality metric.
    silhouette_s = metrics.silhouette_score(X, y_pre, metric='euclidean')
    silhouette_s = round(silhouette_s, 4)
    # Draw the result picture.
    make_result_pic(x_train, silhouette_s, pic_kmeans, data_view, k_clusters, result_pic)
    sklearn2pmml(pipeline, modelName, with_repr=True)
kmeans_main.py
# -*- coding:utf-8 -*-
"""Command-line entry point: parse arguments and run the KMeans trainer.

Usage:
    python kmeans_main.py <modelName> <trainingFilePath> <k_clusters>
                          <k_random_state|None> <k_max_iter> <k_test_size>

Example:
    python kmeans_main.py E:/data/out/kmeans.pmml E:/data/cluster2.csv 3 None 200 0.2
"""
import sys

import kmeans_pmml.Kmeans2Pmml as m
from util import codingUtil

# sys.argv[0] is the script itself.
try:
    # Validate the argument count before indexing sys.argv.
    if len(sys.argv) < 7:
        raise ValueError("expected 6 arguments: modelName trainingFilePath "
                         "k_clusters k_random_state k_max_iter k_test_size")
    modelName = sys.argv[1]
    trainingFilePath = sys.argv[2]
    # Number of clusters (sklearn's own default is 8).
    k_clusters = int(sys.argv[3])
    # The literal string "None" means: let sklearn choose the random seed.
    if sys.argv[4] == "None":
        k_random_state = None
    else:
        k_random_state = int(sys.argv[4])
    k_max_iter = int(sys.argv[5])
    k_test_size = float(sys.argv[6])
    # Detect the CSV file's encoding before reading it.
    code = codingUtil.file_encoding(trainingFilePath)
    m.main(modelName, trainingFilePath, code, k_test_size,
           k_clusters, k_random_state, k_max_iter)
except Exception as e:
    print('Exception :\t\t', str(e))
    # Exit non-zero so callers can detect failure
    # (the original swallowed the error and exited 0).
    sys.exit(1)
聚類效果圖、資料分佈、聚合分佈情況
總體程式碼參考
但是案例裡面用的是兩個特徵的資料,剛好能直接畫出聚合效果圖
實際資料可能有多個特徵,所以需要先降維再去繪圖