1. 程式人生 > >sklearn實戰:對文件進行聚類分析(KMeans演算法)

sklearn實戰:對文件進行聚類分析(KMeans演算法)

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from time import time
from sklearn.datasets import load_files

# Load every file below the dataset directory and report how many documents
# and categories were found, together with the wall-clock time taken.
print("loading documents ...")
t = time()
docs = load_files('datasets/clustering/data')
n_docs = len(docs.data)
n_categories = len(docs.target_names)
print("summary: {0} documents in {1} categories.".format(n_docs, n_categories))
elapsed = time() - t
print("done in {0} seconds".format(elapsed))
loading documents ...
summary: 7898 documents in 4 categories.
done in 1.8740148544311523 seconds
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw documents into a sparse TF-IDF matrix X (one row per document).
max_features = 20000
print("vectorizing documents ...")
t = time()
# max_df=0.4 drops terms found in more than 40% of the documents, min_df=2
# drops terms found in fewer than two documents, and max_features caps the
# vocabulary size. The raw bytes are decoded as latin-1.
vectorizer = TfidfVectorizer(max_df=0.4,
                             min_df=2,
                             max_features=max_features,
                             encoding='latin-1')
X = vectorizer.fit_transform(docs.data)
print("n_samples: %d, n_features: %d" % X.shape)
# NOTE(review): the recorded run reports 0 non-zero features for the first
# sample, whose file name starts with "._" -- presumably macOS metadata files
# are being loaded as documents; verify the contents of the dataset directory.
print("number of non-zero features in sample [{0}]: {1}".format(
    docs.filenames[0], X[0].getnnz()))
print("done in {0} seconds".format(time() - t))
vectorizing documents ...
n_samples: 7898, n_features: 20000
number of non-zero features in sample [datasets/clustering/data\sci.electronics\._12249-54259]: 0
done in 1.135350227355957 seconds
from sklearn.cluster import KMeans

# Fit KMeans on the TF-IDF matrix and report the final inertia ("cost")
# together with the wall-clock time taken.
print("clustering documents ...")
t = time()
n_clusters = 4
kmeans_options = dict(max_iter=100, tol=0.001, verbose=1, n_init=3)
kmean = KMeans(n_clusters=n_clusters, **kmeans_options)
kmean.fit(X)
cost = int(kmean.inertia_)
print("kmean: k={}, cost={}".format(n_clusters, cost))
print("done in {0} seconds".format(time() - t))
clustering documents ...
Initialization complete
Iteration  0, inertia 3944.720
Iteration  1, inertia 3846.168
Converged at iteration 1: center shift 0.000000e+00 within tolerance 2.438758e-08
Initialization complete
Iteration  0, inertia 3943.466
Iteration  1, inertia 3845.153
Iteration  2, inertia 3842.399
Iteration  3, inertia 3840.321
Iteration  4, inertia 3839.155
Iteration  5, inertia 3832.527
Iteration  6, inertia 3798.844
Iteration  7, inertia 3773.636
Iteration  8, inertia 3758.090
Iteration  9, inertia 3749.455
Iteration 10, inertia 3745.879
Iteration 11, inertia 3744.561
Iteration 12, inertia 3744.153
Iteration 13, inertia 3744.027
Iteration 14, inertia 3743.978
Iteration 15, inertia 3743.961
Iteration 16, inertia 3743.952
Iteration 17, inertia 3743.950
Iteration 18, inertia 3743.949
Iteration 19, inertia 3743.948
Iteration 20, inertia 3743.947
Converged at iteration 20: center shift 0.000000e+00 within tolerance 2.438758e-08
Initialization complete
Iteration  0, inertia 3943.208
Iteration  1, inertia 3844.309
Iteration  2, inertia 3843.867
Converged at iteration 2: center shift 0.000000e+00 within tolerance 2.438758e-08
kmean: k=4, cost=3743
done in 8.91970944404602 seconds

從上面的輸出可以看出,由於設定了 n_init=3,KMeans 共進行了三次聚類(每次各自重新初始化中心點),最後保留 inertia 最小的那一次結果(cost=3743)。

# Number of cluster assignments produced by the fit.
len(kmean.labels_)
7898
kmean.labels_[1000:1010]  # cluster labels of the ten documents starting at index 1000
array([1, 0, 0, 1, 0, 1, 1, 0, 0, 0])
# File paths of the ten documents at indices 1000-1009.
docs.filenames[1000:1010]
array(['datasets/clustering/data\\sci.crypt\\11475-15954',
       'datasets/clustering/data\\sci.med\\._13133-59218',
       'datasets/clustering/data\\sci.med\\._13072-59582',
       'datasets/clustering/data\\sci.crypt\\11228-15855',
       'datasets/clustering/data\\sci.med\\._13131-58806',
       'datasets/clustering/data\\sci.space\\14343-60918',
       'datasets/clustering/data\\sci.space\\14001-60226',
       'datasets/clustering/data\\sci.space\\._14348-61339',
       'datasets/clustering/data\\sci.space\\._14390-61342',
       'datasets/clustering/data\\sci.electronics\\._12203-54305'],
      dtype='<U54')
from __future__ import print_function

# Print the ten highest-weighted vocabulary terms of every cluster centroid.
print("Top terms per cluster:")

# For each centroid, the feature indices ordered from largest to smallest weight.
order_centroids = kmean.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old name on versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

for i in range(n_clusters):
    top_terms = ''.join(' %s' % terms[ind] for ind in order_centroids[i, :10])
    print("Cluster %d:" % i + top_terms)
Top terms per cluster:
Cluster 0: edu for re on thanks it anyone that this or
Cluster 1: flyback fix whine tv scott spray sony noise princeton repairman
Cluster 2: it that for you be edu this on are have
Cluster 3: ireland astronomy min 48p 0891 uk per 2888 mastercard mir
# Demo: per-row indices that order each row from largest to smallest value.
a = np.array([
    [20, 10, 30, 40],
    [100, 300, 200, 400],
    [1, 5, 3, 2],
])
np.fliplr(np.argsort(a, axis=1))
array([[3, 2, 0, 1],
       [3, 1, 2, 0],
       [1, 2, 3, 0]], dtype=int64)
# Demo: indices that order a 1-D array from largest to smallest value.
values = [10, 30, 20, 40]
a = np.array(values)
np.flip(np.argsort(a))
array([3, 1, 2, 0], dtype=int64)
from sklearn import metrics

# Adjusted Rand index for two independently drawn random labelings.
label_true = np.random.randint(1, 4, 6)
label_pred = np.random.randint(1, 4, 6)
ari_random = metrics.adjusted_rand_score(label_true, label_pred)
print("Adjusted Rand-Index for random sample: %.3f" % ari_random)

# Same grouping of the samples, but with the label values permuted.
label_true = [1, 1, 3, 3, 2, 2]
label_pred = [3, 3, 2, 2, 1, 1]
ari_same = metrics.adjusted_rand_score(label_true, label_pred)
print("Adjusted Rand-Index for same structure sample: %.3f" % ari_same)
Adjusted Rand-Index for random sample: 0.318
Adjusted Rand-Index for same structure sample: 1.000
from sklearn import metrics

# Hand-built examples for the homogeneity score: each entry is
# (message template, true labels, predicted labels).
cases = [
    ("Homogeneity score for same structure sample: %.3f",
     [1, 1, 2, 2], [2, 2, 1, 1]),
    ("Homogeneity score for each cluster come from only one class: %.3f",
     [1, 1, 2, 2], [0, 1, 2, 3]),
    ("Homogeneity score for each cluster come from two class: %.3f",
     [1, 1, 2, 2], [1, 2, 1, 2]),
]
for message, label_true, label_pred in cases:
    print(message % metrics.homogeneity_score(label_true, label_pred))

# Two independently drawn random labelings.
label_true = np.random.randint(1, 4, 6)
label_pred = np.random.randint(1, 4, 6)
random_score = metrics.homogeneity_score(label_true, label_pred)
print("Homogeneity score for random sample: %.3f" % random_score)
Homogeneity score for same structure sample: 1.000
Homogeneity score for each cluster come from only one class: 1.000
Homogeneity score for each cluster come from two class: 0.000
Homogeneity score for random sample: 0.667
from sklearn import metrics

# Hand-built examples for the completeness score, followed by a random pair.
label_true = [1, 1, 2, 2]
label_pred = [2, 2, 1, 1]
score = metrics.completeness_score(label_true, label_pred)
print("Completeness score for same structure sample: %.3f" % score)

label_true = [0, 1, 2, 3]
label_pred = [1, 1, 2, 2]
score = metrics.completeness_score(label_true, label_pred)
print("Completeness score for each class assign to only one cluster: %.3f" % score)

label_true = [1, 1, 2, 2]
label_pred = [1, 2, 1, 2]
score = metrics.completeness_score(label_true, label_pred)
print("Completeness score for each class assign to two class: %.3f" % score)

# Two independently drawn random labelings.
label_true = np.random.randint(1, 4, 6)
label_pred = np.random.randint(1, 4, 6)
score = metrics.completeness_score(label_true, label_pred)
print("Completeness score for random sample: %.3f" % score)
Completeness score for same structure sample: 1.000
Completeness score for each class assign to only one cluster: 1.000
Completeness score for each class assign to two class: 0.000
Completeness score for random sample: 0.315
from sklearn import metrics

# Hand-built examples for the V-measure score.
label_true = [1, 1, 2, 2]
label_pred = [2, 2, 1, 1]
score = metrics.v_measure_score(label_true, label_pred)
print("V-measure score for same structure sample: %.3f" % score)

label_true = [0, 1, 2, 3]
label_pred = [1, 1, 2, 2]
# The same pair is scored twice, the second time with the arguments swapped.
for first, second in ((label_true, label_pred), (label_pred, label_true)):
    print("V-measure score for each class assign to only one cluster: %.3f"
          % metrics.v_measure_score(first, second))

label_true = [1, 1, 2, 2]
label_pred = [1, 2, 1, 2]
score = metrics.v_measure_score(label_true, label_pred)
print("V-measure score for each class assign to two class: %.3f" % score)
V-measure score for same structure sample: 1.000
V-measure score for each class assign to only one cluster: 0.667
V-measure score for each class assign to only one cluster: 0.667
V-measure score for each class assign to two class: 0.000
from sklearn import metrics

# Score the KMeans assignment against the category labels of the dataset,
# then compute the silhouette coefficient on a 1000-sample subset of X.
labels = docs.target
predicted = kmean.labels_
supervised_reports = (
    ("Homogeneity: %0.3f", metrics.homogeneity_score),
    ("Completeness: %0.3f", metrics.completeness_score),
    ("V-measure: %0.3f", metrics.v_measure_score),
    ("Adjusted Rand-Index: %.3f", metrics.adjusted_rand_score),
)
for template, score_fn in supervised_reports:
    print(template % score_fn(labels, predicted))
silhouette = metrics.silhouette_score(X, predicted, sample_size=1000)
print("Silhouette Coefficient: %0.3f" % silhouette)
Homogeneity: 0.002
Completeness: 0.004
V-measure: 0.003
Adjusted Rand-Index: 0.001
Silhouette Coefficient: 0.330