1. 程式人生 > >Python機器學習--聚類

Python機器學習--聚類

-- 省份 kmean def 數據包 his import clas times

  • K-means聚類算法

技術分享

技術分享

技術分享

  • 測試:

# -*- coding: utf-8 -*-
"""
Created on Thu Aug 31 10:59:20 2017

@author: Administrator
"""

'''
現有1999年全國31個省份城鎮居民家庭平均每人全年消費性支出的八個主
要變量數據,這八個變量分別是:食品、衣著、家庭設備用品及服務、醫療
保健、交通和通訊、娛樂教育文化服務、居住以及雜項商品和服務。利用已
有數據,對31個省份進行聚類。
'''

import numpy as np
def loadData(filePath):
    """Read a comma-separated table and split it into values and row labels.

    Every non-blank line is expected to look like ``name,v1,v2,...``: the
    first field is kept as the row label and all remaining fields are parsed
    as floats.

    Args:
        filePath: Path of the text file to read.

    Returns:
        A ``(retData, retCityName)`` tuple. ``retData`` holds one list of
        floats per row; ``retCityName`` holds the matching labels in the
        same order.

    Raises:
        ValueError: If a value field cannot be parsed as a float.
    """
    retData = []
    retCityName = []
    # Read-only access is all that is needed, and ``with`` guarantees the
    # handle is closed. The platform default encoding is used.
    with open(filePath, "r") as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would otherwise add an empty
                # label and an empty feature row.
                continue
            name, *values = line.split(",")
            retCityName.append(name)
            retData.append([float(value) for value in values])
    return retData, retCityName


# Number of clusters; also fixes how many result buckets are printed.
N_CLUSTERS = 4


def main():
    """Cluster the rows of ``city.txt`` with K-means and print each cluster."""
    # Imported here so that loadData stays importable without scikit-learn.
    from sklearn.cluster import KMeans

    fpath = "F:\\RANJIEWEN\\MachineLearning\\Python機器學習實戰_mooc\\data\\聚類\\"
    data, cityName = loadData(fpath + "city.txt")
    km = KMeans(n_clusters=N_CLUSTERS)
    label = km.fit_predict(data)
    # Sum of every feature of each cluster centre.
    expenses = np.sum(km.cluster_centers_, axis=1)
    CityCluster = [[] for _ in range(N_CLUSTERS)]
    for name, cluster in zip(cityName, label):
        CityCluster[cluster].append(name)
    for expense, cities in zip(expenses, CityCluster):
        print("Expenses:%.2f" % expense)
        print(cities)


if __name__ == "__main__":
    main()
  • DBSCAN密度聚類

技術分享

技術分享

技術分享

技術分享

技術分享

  • 測試

# -*- coding: utf-8 -*-
"""
Created on Thu Aug 31 11:14:37 2017

@author: Administrator
"""

'''
現有大學校園網的日誌數據,290條大學生的校園網使用情況數據,數據包
括用戶ID,設備的MAC地址,IP地址,開始上網時間,停止上網時間,上
網時長,校園網套餐等。利用已有數據,分析學生上網的模式。
實驗目的:
通過DBSCAN聚類,分析學生上網時間和上網時長的模式。

'''

import numpy as np
import sklearn.cluster as skc
from sklearn import metrics
import matplotlib.pyplot as plt
 
 
# Cluster the hour at which each device starts a session with DBSCAN and
# report the clusters, the noise share and the silhouette score.

# Maps each MAC address to the index of its record in ``onlinetimes``.
mac2id = dict()
# One (start hour, online duration) record per distinct MAC address.
onlinetimes = []

fpath = "F:\\RANJIEWEN\\MachineLearning\\Python機器學習實戰_mooc\\data\\聚類\\"
with open(fpath + "TestData.txt", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Blank lines have no fields to index.
            continue
        fields = line.split(",")
        mac = fields[2]
        onlinetime = int(fields[6])
        # Field 4 is split on a space and then on ':'; the first piece of the
        # second token is kept (presumably the hour of a "date HH:MM:SS"
        # timestamp - confirm against the data file).
        starttime = int(fields[4].split(" ")[1].split(":")[0])
        if mac not in mac2id:
            mac2id[mac] = len(onlinetimes)
            onlinetimes.append((starttime, onlinetime))
        else:
            # Overwrite with a plain tuple. Wrapping it in a list, as the
            # code used to, makes the records ragged and breaks the
            # reshape below.
            onlinetimes[mac2id[mac]] = (starttime, onlinetime)
real_X = np.array(onlinetimes).reshape((-1, 2))

# Cluster on the start hour only (first column, kept 2-D).
X = real_X[:, 0:1]

# Alternative input: log-transform the duration column before clustering.
# X = np.log(1 + real_X[:, 1:])

db = skc.DBSCAN(eps=0.01, min_samples=20).fit(X)
labels = db.labels_

print("Labels:")
print(labels)
# DBSCAN marks noise samples with the label -1.
raito = len(labels[labels == -1]) / len(labels)
print("Noise raito:", format(raito, ".2%"))

# The noise label is not a cluster, so it is excluded from the count.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print("Estimated number of clusters: %d" % n_clusters_)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

for i in range(n_clusters_):
    print("Cluster ", i, ":")
    print(list(X[labels == i].flatten()))

plt.hist(X, 24)
# Without show() the histogram never appears when run as a plain script.
plt.show()
  • 基於聚類的圖像分割

技術分享

技術分享

技術分享

  • 測試

# -*- coding: utf-8 -*-
"""
Created on Thu Aug 31 15:03:11 2017

@author: Administrator
"""

'''
目標:利用K-means聚類算法對圖像像素點顏色進行聚類實現簡單的圖像分割
輸出:同一聚類中的點使用相同顏色標記,不同聚類顏色不同
'''

import numpy as np
import PIL.Image as image
from sklearn.cluster import KMeans
 
def loadData(filePath):
    """Load an image and return its pixels as scaled RGB rows.

    Args:
        filePath: Path of the image file to read.

    Returns:
        A ``(data, m, n)`` tuple where ``m, n = img.size`` and ``data`` is
        an ``np.matrix`` with one ``[r/256, g/256, b/256]`` row per pixel.
        The row for pixel ``(i, j)`` sits at index ``i * n + j``, i.e. the
        first image coordinate varies slowest.
    """
    with open(filePath, "rb") as f:
        # convert() decodes the image while the file is still open and
        # guarantees three channels, so greyscale/RGBA inputs work too.
        img = image.open(f).convert("RGB")
        m, n = img.size
        # Array axes are (second coordinate, first coordinate, channel).
        pixels = np.asarray(img, dtype=np.float64)
    # Swap the two spatial axes so that flattening walks the pixels in the
    # same order as a nested ``for i in range(m): for j in range(n)`` loop.
    data = pixels.transpose(1, 0, 2).reshape(-1, 3) / 256.0
    # np.mat was removed in NumPy 2.0; asmatrix returns the same type.
    return np.asmatrix(data), m, n
 
# Segment bull.jpg into four colour clusters and save a greyscale label map.
imPath = "F:\\RANJIEWEN\\MachineLearning\\Python機器學習實戰_mooc\\data\\基於聚類的整圖分割\\"
# NOTE: ``row`` and ``col`` are the two values of ``img.size`` as returned by
# loadData; they are used consistently as the first and second pixel
# coordinate below.
imgData, row, col = loadData(imPath + "bull.jpg")
# Current scikit-learn rejects np.matrix input, so pass a plain ndarray view.
label = KMeans(n_clusters=4).fit_predict(np.asarray(imgData))

label = label.reshape([row, col])
pic_new = image.new("L", (row, col))
for i in range(row):
    for j in range(col):
        # 256 // (label + 1) spreads the clusters over distinct grey levels;
        # cluster 0 would give 256, which does not fit an 8-bit pixel, so cap
        # the value at 255.
        pic_new.putpixel((i, j), min(255, 256 // (int(label[i][j]) + 1)))
pic_new.save("result-bull-4.jpg", "JPEG")

Python機器學習--聚類