1. 程式人生 > matplotlib.pyplot繪製kmeans的聚合程度,以及輪廓係數

matplotlib.pyplot繪製kmeans的聚合程度,以及輪廓係數

Kmeans2Pmml.py
# -*- coding:utf-8 -*-
import pandas
from sklearn.model_selection import train_test_split
import numpy as np  # 匯入numpy庫
import matplotlib.pyplot as plt  # 匯入matplotlib庫
from sklearn.cluster import KMeans  # 匯入sklearn聚類模組
from sklearn import metrics  # 匯入sklearn效果評估模組
import random

from sklearn.decomposition import PCA
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

def make_result_pic(x_train,silhouette_s,pic_kmeans,data_view,n_clusters,result_pic):
    """Render a summary figure for a k-means clustering and save it to disk.

    The data is projected to two dimensions with PCA so that it can be drawn,
    ``pic_kmeans`` is fitted on that 2-D projection, and the figure is laid
    out on a 2x2 grid:

    * panel 1 - scatter plot of every cluster plus its centre,
    * panel 2 - text only: the silhouette score followed by ``data_view``,
    * panel 3 - text only: the number of samples assigned to each cluster.

    Args:
        x_train: Feature matrix passed to ``PCA.fit_transform``.
        silhouette_s: Silhouette score to display; it is only printed in the
            title, not recomputed here.
        pic_kmeans: Clustering estimator; it is (re)fitted in place on the
            2-D projection, so it is modified by this call.
        data_view: Free text appended to the silhouette title.
        n_clusters: Number of clusters to draw. ``cluster_centers_`` is
            indexed for every value in ``range(n_clusters)``, so it must not
            exceed the estimator's own cluster count.
        result_pic: Destination path handed to ``Figure.savefig``.
    """
    # Reduce the data to 2 dimensions so the model can be visualised.
    reduced_data = PCA(n_components=2).fit_transform(x_train)
    print(len(reduced_data))
    pic_kmeans.fit(reduced_data)
    labels = pic_kmeans.predict(reduced_data)

    # Count samples per cluster. Labels are cast to plain ints so the title
    # shows e.g. {0: 10} rather than NumPy scalar reprs, and the name does not
    # shadow the builtin ``dict``.
    distribution = {}
    for label in labels:
        label = int(label)
        distribution[label] = distribution.get(label, 0) + 1

    centers = pic_kmeans.cluster_centers_  # centre of every cluster
    # One random (R, G, B) colour per cluster.
    color_list = [
        tuple(round(random.uniform(0, 1), 4) for _ in range(3))
        for _ in range(n_clusters)
    ]

    fig = plt.figure(figsize=(10, 10))

    # Panel 1: the clusters themselves.
    plt.subplot(2, 2, 1)
    for i in range(n_clusters):
        cluster = reduced_data[labels == i]  # samples assigned to cluster i
        # ``color=`` (not ``c=``): a bare RGB tuple given as ``c`` is treated
        # as three values to colour-map when the cluster has exactly 3 points.
        plt.scatter(cluster[:, 0], cluster[:, 1], color=color_list[i], marker='.')
        plt.plot(centers[i][0], centers[i][1], 'o', markerfacecolor=color_list[i], markeredgecolor='k',
                 markersize=6)  # cluster centre

    # Panel 2: silhouette score and data-split summary (text only).
    plt.subplot(2, 2, 2)
    plt.axis('off')
    plt.title('silhouette_s:' + str(silhouette_s)+" "+data_view, loc='center')

    # Panel 3: samples per cluster (text only).
    plt.subplot(2, 2, 3)
    plt.axis('off')
    plt.title('distribution:' + str(distribution), loc='center')

    # Let matplotlib adjust the panel sizes and spacing, then write the file.
    fig.tight_layout()
    fig.savefig(result_pic)
    plt.close(fig)

def main(modelName,trainingFilePath,code,k_test_size,k_clusters,k_random_state,k_max_iter):
    """Train a k-means model from a CSV file, plot it and export it as PMML.

    Args:
        modelName: Output path of the PMML file. The result picture is
            written next to it with the extension replaced by ``.png``.
        trainingFilePath: Path of the CSV file to read.
        code: Text encoding used to read the CSV file.
        k_test_size: ``test_size`` passed to ``train_test_split``.
        k_clusters: Number of clusters for ``KMeans``.
        k_random_state: ``random_state`` for ``KMeans`` (may be ``None``).
        k_max_iter: ``max_iter`` for ``KMeans``.

    Raises:
        SystemExit: With status 1 when the CSV has fewer than 3 columns.
    """
    import os  # local import: only needed to derive the picture path

    # Swap only the file extension. A plain str.replace("pmml", "png") would
    # also rewrite "pmml" inside directory names, and would leave the path
    # untouched (so the picture would be overwritten by the model) when the
    # name does not contain "pmml" at all.
    result_pic = os.path.splitext(str(modelName))[0] + ".png"
    iris_df = pandas.read_csv(trainingFilePath, encoding=code)
    columns = iris_df.columns.tolist()
    # The first column is treated as a row number and excluded from the
    # features.
    # NOTE(review): the original comment also called the last column a label
    # column, but it is kept as a feature here - confirm the intent.
    first_colName = columns[0]
    X = iris_df[iris_df.columns.difference([first_colName])]
    # Split into train and test sets by the requested ratio.
    x_train, x_test = train_test_split(X, test_size=k_test_size, random_state=0)
    data_view = "total: " + str(len(iris_df)) + " train:" + str(len(x_train)) + " test:" + str(len(x_test))
    print(data_view)
    if len(columns) < 3:
        print("columnNum error")
        # Same effect as exit(1), without relying on the ``site`` builtin.
        raise SystemExit(1)
    print("check success")

    # Two identically configured estimators: one goes into the exported
    # pipeline, the other is fitted on the 2-D projection for plotting.
    model_kmeans = KMeans(n_clusters=k_clusters, random_state=k_random_state, max_iter=k_max_iter)
    pic_kmeans = KMeans(n_clusters=k_clusters, random_state=k_random_state, max_iter=k_max_iter)
    pipeline = PMMLPipeline([
        ("classifier", model_kmeans)
    ])
    pipeline.fit(X)  # train the clustering model on the full feature set
    y_pre = pipeline.predict(X)  # cluster label of every sample
    # Model quality: mean silhouette coefficient, rounded for display.
    silhouette_s = metrics.silhouette_score(X, y_pre, metric='euclidean')
    silhouette_s = round(silhouette_s, 4)
    # Draw the result picture, then export the pipeline.
    make_result_pic(x_train,silhouette_s,pic_kmeans,data_view,k_clusters,result_pic)
    sklearn2pmml(pipeline, modelName, with_repr=True)
kmeans_main.py
# -*- coding:utf-8 -*-
import sys
import kmeans_pmml.Kmeans2Pmml as m
#sys.argv[0] 為指令碼自身
#m.main(sys.argv[1],sys.argv[2])
from util import codingUtil
try:
    # Command-line driver: read the training parameters from sys.argv and
    # run the training entry point. Example values for a local run:
    #   modelName        = "E:/data/out/kmeans.pmml"
    #   trainingFilePath = "E:/data/cluster2.csv"
    #   k_clusters       = 3
    #   k_random_state   = None
    #   k_max_iter       = 200
    #   k_test_size      = 0.2
    argv = sys.argv
    modelName = argv[1]
    trainingFilePath = argv[2]
    k_clusters = int(argv[3])  # original author's note: default is 8
    # The literal string "None" selects an unseeded random state.
    k_random_state = None if argv[4] == "None" else int(argv[4])
    k_max_iter = int(argv[5])
    k_test_size = float(argv[6])
    code = codingUtil.file_encoding(trainingFilePath)
    m.main(
        modelName=modelName,
        trainingFilePath=trainingFilePath,
        code=code,
        k_test_size=k_test_size,
        k_clusters=k_clusters,
        k_random_state=k_random_state,
        k_max_iter=k_max_iter,
    )
except Exception as err:
    # Any failure (bad arguments, I/O, training) is reported on stdout.
    print('Exception :\t\t', str(err))

聚類效果圖、資料分佈、聚合分佈情況

總體程式碼參考 

但是參考案例中的資料只有兩個特徵,剛好可以直接畫出聚合效果圖;

實際資料可能有多個特徵,所以需要先用 PCA 降到 2 維,再去繪圖。