python 手寫實現k-means

阿新 • • 發佈：2019-01-04

今天手寫實現了k-means，目的是加深對這個演算法原理的理解，有不足的地方請多指教。

ris鳶尾花資料集包含3個不同品種的鳶尾花（Setosa，Versicolour，and Virginica）資料，花瓣和萼片長度，儲存在一個150*4的 numpy.ndarry中

150行4列，150行指150多花，4列分別是Sepal Length，Sepal Width, Petal Length and Petal Width

使用pandas官方demo

本程式碼使用k-means實現對鶯尾花種類class的區分，最後進行了視覺化

匯入庫

from numpy import *
import matplotlib.pyplot as plt
import pandas as pd

# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pd.read_csv(url, names=names)

dataset['class'][dataset['class']=='Iris-setosa']=0
dataset['class'][dataset['class']=='Iris-versicolor']=1
dataset['class'][dataset['class']=='Iris-virginica']=2
#對類別進行編碼，3個類別分別賦值0，1，2

#算距離
def distEclud(vecA, vecB):                  #兩個向量間歐式距離
    return sqrt(sum(power(vecA - vecB, 2))) #la.norm(vecA-vecB)

#初始化聚類中心：通過在區間範圍隨機產生的值作為新的中心點
def randCent(dataSet, k):
    #獲取特徵維度
    n = shape(dataSet)[1]   
    #建立聚類中心0矩陣 k x n
    centroids = mat(zeros((k,n)))
    #遍歷n維特徵         
    for j in range(n):     
        #第j維特徵屬性值min   ,1x1矩陣                 
        minJ = min(dataSet[:,j])        
        #區間值max-min，float數值    
        rangeJ = float(max(dataSet[:,j]) - minJ)   
        #第j維，每次隨機生成k箇中心
        centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1))
    return centroids

def randChosenCent(dataSet,k):
    # 樣本數
    m=shape(dataSet)[0]
    # 初始化列表
    centroidsIndex=[]
    #生成類似於樣本索引的列表
    dataIndex=list(range(m))
    for i in range(k):
        #生成隨機數
        randIndex=random.randint(0,len(dataIndex))
        #將隨機產生的樣本的索引放入centroidsIndex
        centroidsIndex.append(dataIndex[randIndex])
        #刪除已經被抽中的樣本
        del dataIndex[randIndex]
    #根據索引獲取樣本
    centroids = dataSet.iloc[centroidsIndex]
    return mat(centroids)

def kMeans(dataSet, k):
    # 樣本總數
    m = shape(dataSet)[0]
    #分配樣本到最近的簇：存[簇序號,距離的平方]
    # m行  2 列
    clusterAssment = mat(zeros((m,2)))

    #step1:
    #通過隨機產生的樣本點初始化聚類中心
    centroids = randChosenCent(dataSet, k)
    print('最初的中心=',centroids)

    #標誌位，如果迭代前後樣本分類發生變化值為Tree，否則為False
    clusterChanged = True
    #檢視迭代次數
    iterTime=0
    #所有樣本分配結果不再改變，迭代終止
    while clusterChanged:   
        clusterChanged = False        
        #step2:分配到最近的聚類中心對應的簇中
        for i in range(m):
            #初始定義距離為無窮大
            minDist = inf;
            #初始化索引值
            minIndex = -1
            # 計算每個樣本與k箇中心點距離
            for j in range(k):
                #計算第i個樣本到第j箇中心點的距離
                distJI = distEclud(centroids[j,:],dataSet.values[i,:])
                #判斷距離是否為最小
                if distJI < minDist:
                    #更新獲取到最小距離
                    minDist = distJI
                    #獲取對應的簇序號
                    minIndex = j
            #樣本上次分配結果跟本次不一樣，標誌位clusterChanged置True
            if clusterAssment[i,0] != minIndex:
                clusterChanged = True
            clusterAssment[i,:] = minIndex,minDist**2 #分配樣本到最近的簇
        iterTime+=1
        sse=sum(clusterAssment[:,1])
        print('the SSE of %d'%iterTime + 'th iteration is %f'%sse)
        #step3:更新聚類中心
        for cent in range(k):#樣本分配結束後，重新計算聚類中心
            #獲取該簇所有的樣本點
            ptsInClust = dataSet.iloc[nonzero(clusterAssment[:,0].A==cent)[0]]
            #更新聚類中心：axis=0沿列方向求均值。
            centroids[cent,:] = mean(ptsInClust, axis=0) 
    return centroids, clusterAssment

def kMeansSSE(dataSet,k,distMeas=distEclud, createCent=randChosenCent):
    m = shape(dataSet)[0]
    #分配樣本到最近的簇：存[簇序號,距離的平方]
    clusterAssment=mat(zeros((m,2)))
    #step1:#初始化聚類中心
    centroids = createCent(dataSet, k)
    print('initial centroids=',centroids)
    sseOld=0
    sseNew=inf
    iterTime=0 #檢視迭代次數
    while(abs(sseNew-sseOld)>0.0001):
        sseOld=sseNew
        #step2:將樣本分配到最近的質心對應的簇中
        for i in range(m):
            minDist=inf;minIndex=-1
            for j in range(k):
                #計算第i個樣本與第j個質心之間的距離
                distJI=distMeas(centroids[j,:],dataSet.values[i,:])
                #獲取到第i樣本最近的質心的距離,及對應簇序號
                if distJI<minDist:
                    minDist=distJI;minIndex=j
            clusterAssment[i,:]=minIndex,minDist**2 #分配樣本到最近的簇
        iterTime+=1
        sseNew=sum(clusterAssment[:,1])
        print('the SSE of %d'%iterTime + 'th iteration is %f'%sseNew)
        #step3:更新聚類中心
        for cent in range(k):
            #樣本分配結束後，重新計算聚類中心
            ptsInClust=dataSet[nonzero(clusterAssment[:,0].A==cent)[0]]
            #按列取平均,mean()對array型別
            centroids[cent,:] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment

#2維資料聚類效果顯示
def datashow(dataSet,k,centroids,clusterAssment):  #二維空間顯示聚類結果
    from matplotlib import pyplot as plt
    num,dim=shape(dataSet)  #樣本數num ,維數dim
    
    if dim!=2:
        print('sorry,the dimension of your dataset is not 2!')
        return 1
    marksamples=['or','ob','og','ok','^r','^b','<g'] #樣本圖形標記
    if k>len(marksamples):
        print('sorry,your k is too large,please add length of the marksample!')
        return 1
        #繪所有樣本
    for i in range(num):
        markindex=int(clusterAssment[i,0])#矩陣形式轉為int值, 簇序號
        #特徵維對應座標軸x,y；樣本圖形標記及大小
        plt.plot(dataSet.iat[i,0],dataSet.iat[i,1],marksamples[markindex],markersize=6)

    #繪中心點            
    markcentroids=['o','*','^']#聚類中心圖形標記
    label=['0','1','2']
    c=['yellow','pink','red']
    for i in range(k):
        plt.plot(centroids[i,0],centroids[i,1],markcentroids[i],markersize=15,label=label[i],c=c[i])
        plt.legend(loc = 'upper left')
    plt.xlabel('sepal length')  
    plt.ylabel('sepal width') 
   
    plt.title('k-means cluster result') #標題        
    plt.show()

#畫出實際影象
def trgartshow(dataSet,k,labels):
    from matplotlib import pyplot as plt

    num,dim=shape(dataSet)
    label=['0','1','2']
    marksamples=['ob','or','og','ok','^r','^b','<g']
    # 通過迴圈的方式，完成分組散點圖的繪製
    for i in range(num): 
        plt.plot(datamat.iat[i,0],datamat.iat[i,1],marksamples[int(labels.iat[i,0])],markersize=6 )
    for i in range(0,num,50): 
        plt.plot(datamat.iat[i,0],datamat.iat[i,1],marksamples[int(labels.iat[i,0])],markersize=6,label=label[int(labels.iat[i,0])] )
    plt.legend(loc = 'upper left')
    # 新增軸標籤和標題

    plt.xlabel('sepal length')  
    plt.ylabel('sepal width') 
   
    plt.title('iris true result') #標題 

    # 顯示圖形
    plt.show()   
    #label=labels.iat[i,0]

#聚類前，繪製原始的樣本點
def originalDatashow(dataSet):
        #樣本的個數和特徵維數
    num,dim=shape(dataSet)
    marksamples=['ob'] #樣本圖形標記
    for i in range(num):
        plt.plot(datamat.iat[i,0],datamat.iat[i,1],marksamples[0],markersize=5)
    plt.title('original dataset')
    plt.xlabel('sepal length')  
    plt.ylabel('sepal width') #標題
    plt.show()

if __name__=='__main__':
#=====kmeans聚類
    # # #獲取樣本資料
    datamat=dataset.loc[:, ['sepal-length','sepal-width']]
    #真實的標籤
    labels=dataset.loc[:, ['class']]
    # #原始資料顯示
    originalDatashow(datamat)

    # #*****kmeans聚類
    k=3 #使用者定義聚類數
    mycentroids,clusterAssment=kMeans(datamat,k)
    #mycentroids,clusterAssment=kMeansSSE(datamat,k)

    #繪圖顯示
    datashow(datamat,k,mycentroids,clusterAssment)
    trgartshow(datamat,3,labels)

最初的中心= [[ 5.8  2.7]
 [ 6.7  3.3]
 [ 6.3  3.3]]
the SSE of 1th iteration is 89.870000
the SSE of 2th iteration is 56.076254
the SSE of 3th iteration is 51.738059
the SSE of 4th iteration is 50.664805
the SSE of 5th iteration is 49.569934
the SSE of 6th iteration is 48.696179
the SSE of 7th iteration is 46.824378
the SSE of 8th iteration is 45.085428
the SSE of 9th iteration is 44.384855
the SSE of 10th iteration is 43.591498
the SSE of 11th iteration is 41.904928
the SSE of 12th iteration is 39.066514
the SSE of 13th iteration is 38.316500
the SSE of 14th iteration is 37.912536
the SSE of 15th iteration is 37.423306
the SSE of 16th iteration is 37.136261
the SSE of 17th iteration is 37.123702

總結：

為了在平面上進行視覺化，只選取了4個特徵中的2個特徵，可能用3個特徵會更好，之前在tf的官網上看過minists資料集在tensorboard上的三維畫圖，挺漂亮的。還不會用matlibplot畫3維散點，echarts應該可以。有時間可以嘗試一下。

python 手寫實現k-means

今天手寫實現了k-means，目的是加深對這個演算法原理的理解，有不足的地方請多指教。ris鳶尾花資料集包含3個不同品種的鳶尾花（Setosa，Versicolour，and Virginica）資料，花瓣和萼片長度，儲存在一個150*4的 numpy.ndarry中150行

利用Python實現K-Means聚類並進行圖形化展示

利用K-means進行聚類，顯示聚類結果的各類別的數量，最終進行圖形化展示。 import pandas as pd import numpy as np import matplotlib.pyplot as plt from scipy import stats import c

Python實現K-Means聚類演算法

宣告：程式碼的執行環境為Python3。Python3與Python2在一些細節上會有所不同，希望廣大讀者注意。本部落格以程式碼為主，程式碼中會有詳細的註釋。相關文章將會發布在我的個人部落格專欄《Python從入門到深度學習》，歡迎大家關注~

Python實現K-means程式碼詳解（新手上路）

#coding=utf-8 2 from numpy import * 3 4 def loadDataSet(fileName): 5 dataMat = [] 6 fr = open(fileName) 7 for line in fr.readli

python手寫bp神經網路實現人臉性別識別1.0

寫在前面：本實驗用到的圖片均來自google圖片，侵刪！實驗介紹用python手寫一個簡單bp神經網路，實現人臉的性別識別。由於本人的機器配置比較差，所以無法使用網上很紅的人臉大資料資料集（如lfw資料集等等），所以我從google圖片下載了一些

python手寫神經網路實現識別手寫數字

實驗說明一直想自己寫一個神經網路來實現手寫數字的識別，而不是套用別人的框架。恰巧前幾天，有幸從同學那拿到5000張已經貼好標籤的手寫數字圖片，於是我就嘗試用matlab寫一個網路。實驗資料：5000張手寫數字圖片（.jpg)，圖片命名為

Python實現k-means演算法

這也是周志華《機器學習》的習題9.4。資料集是西瓜資料集4.0，如下編號,密度,含糖率 1,0.697,0.46 2,0.774,0.376 3,0.634,0.264 4,0.608,0.318 5,0.556,0.215 6,0.403,0.23

Python 純手寫實現感知機模型及對偶形式

根據《統計學習方法》P29頁演算法2.1，實現感知機模型及對偶形式。演算法2.1：輸入：訓練資料集，learning rate alpha。輸出：權重w，偏置b。（1）初始化w0，b0 （2）在資料集中選定Xi，Yi帶入（3）計算Yi * (Xi * w

tensorflow學習之識別單張圖片的實現（python手寫數字）

假設我們已經安裝好了tensorflow。一般在安裝好tensorflow後，都會跑它的demo，而最常見的demo就是手寫數字識別的demo,也就是mnist資料集。然而我們僅僅是跑了它的demo而已，可能很多人會有和我一樣的想法，如果拿來一張數字圖片，如何應用我們訓

通過IDEA及hadoop平臺實現k-means聚類算法

綜合 tle tostring html map apache cnblogs cos textfile 有段時間沒有操作過，發現自己忘記一些步驟了，這篇文章會記錄相關步驟，並隨時進行補充修改。 1 基礎步驟，即相關環境部署及數據準備數據文件類型為.csv文件，excel

透徹理解Spring事務設計思想之手寫實現

數據庫操作 cal 了解 hashmap 個數這一 use action 管道前言事務，是描述一組操作的抽象，比如對數據庫的一組操作，要麽全部成功，要麽全部失敗。事務具有4個特性：Atomicity（原子性），Consistency（一致性），Isolation（隔離

理解數據庫連接池底層原理之手寫實現

ring cda color 要去分配 .com 管理 roc tex 前言數據庫連接池的基本思想是：為數據庫連接建立一個“緩沖池”，預先在池中放入一定數量的數據庫連接管道，需要時，從池子中取出管道進行使用，操作完畢後，在將管道放入池子中，從而避免了頻繁的向數據庫申請資

react深入 - 手寫實現react-redux api

簡介：簡單實現react-redux基礎api react-redux api回顧 <Provider store>把store放在context裡，所有子元件可以直接拿到store資料使元件層級中的 connect() 方法都能夠獲得 Redux store 根元件應該巢狀在 &

spark 實現K-means演算法

spark 實現K-means演算法 package kmeans; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFou

使用Java實現K-Means聚類演算法

第一次寫部落格，隨便寫寫。關於K-Means介紹很多，還不清楚可以查一些相關資料。個人對其實現步驟簡單總結為4步: 1.選出k值,隨機出k個起始質心點。 2.分別計算每個點和k個起始質點之間的距離,就近歸類。 3.最終中心點集可以劃分為k類,

Python手寫模擬單向連結串列物件和棧物件

單向連結串列： class error(Exception): def __init__(self,msg): super(error,self).__init__() self.msg=msg def __str__(self):

【手寫系列】純手寫實現一個高可用的RPC

前言在實際後臺服務開發中，比如訂單服務（開發者A負責）需要呼叫商品服務（開發者B負責），那麼開發者B會和A約定呼叫API，以介面的形式提供給A。通常都是B把API上傳到Maven私服，然後B開始寫API的實現，A只需要引入API依賴進行開發即可。訂單

【原創】自己手寫實現Boost序列化簡易版

設計思路在與多個系統進行網路互動時，序列化是不可缺少的技術。編寫一個C++語言的序列化實現，是練習運用模板超程式設計的絕佳案例，理解C++模板是如何"面向編譯期程式設計"的（業內好像沒有這個說法）。序列化物件處理基礎資料型別和類型別，boost的序列化功能劃分得更細緻，基本支援了C++語言

LeanCloud SDK不好用，Python手寫一個ORM

Intro 慣例，感覺寫了好用的東西就來寫個部落格吹吹牛逼。 LeanCloud Storage 的資料模型不像是一般的 RDBMS，但有時候又很刻意地貼近那種感覺，所以用起來就很麻煩。 LeanCloud SDK 的缺陷不管別人認不認可，這些問題在使用中我是體會到不爽了。資

手寫實現滿足 Promise/A+ 規範的 Promise

最近看了 Promise/A+ 的規範，嘗試實現了一個滿足 promises-aplus-tests 測試的 Promise 類，在實現規範的過程中，對於 Promise 本身也加深了理解，這篇文章就將我的實現過程分享出來。本文的程式碼倉庫在這裡，歡迎 Star～。前置知識 Prom

python 手寫實現k-means

ris鳶尾花資料集包含3個不同品種的鳶尾花（Setosa，Versicolour，and Virginica）資料，花瓣和萼片長度，儲存在一個150*4的 numpy.ndarry中

150行4列，150行指150多花，4列分別是Sepal Length，Sepal Width, Petal Length and Petal Width

使用pandas官方demo

本程式碼使用k-means實現對鶯尾花種類class的區分，最後進行了視覺化

匯入庫

總結：

相關推薦