1. 程式人生 > python_NLP實戰之豆瓣讀書資料聚類

python_NLP實戰之豆瓣讀書資料聚類

用k_means對豆瓣讀書資料聚類

1、讀取資料以及資料預處理

import pandas as pd  # `pd` is used below but was never imported in this snippet

# NOTE(review): `normalization` is a project-local module; presumably
# normalize_corpus cleans/tokenizes each document -- confirm against its source.
from normalization import normalize_corpus

# Load the book dataset from disk.
book_data = pd.read_csv('data/data.csv')

# Preview the first rows to check that the file was parsed as expected.
print(book_data.head())

# Pull the two columns used below out of the DataFrame as plain Python lists.
book_titles = book_data['title'].tolist()
book_content = book_data['content'].tolist()

# Sanity check: show the first title and the first 10 items of its content
# (assumes each content entry is a string, so this is 10 characters -- TODO confirm).
print('書名:', book_titles[0])
print('內容:', book_content[0][:10])

# Normalize the corpus before feature extraction.
norm_book_content = normalize_corpus(book_content)

2、提取特徵

# Extract TF-IDF features from the normalized corpus.
# NOTE(review): build_feature_matrix is neither defined nor imported in this
# snippet; it presumably lives in a project-local helper module -- confirm and
# import it before running.
vectorizer, feature_matrix = build_feature_matrix(norm_book_content,
                                                  feature_type='tfidf',
                                                  min_df=0.2, max_df=0.90,
                                                  ngram_range=(1, 2))
# Inspect the dimensions of the feature matrix.
print(feature_matrix.shape)

# Get the feature names.
# Assumes `vectorizer` is a scikit-learn vectorizer -- TODO confirm.
# scikit-learn deprecated get_feature_names() in 1.0 and removed it in 1.2 in
# favour of get_feature_names_out(), which returns an ndarray; convert it to a
# list so downstream code sees the same type as before. Fall back to the old
# method on releases that predate the new one.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Print a sample of the features.
print(feature_names[:10])

3、進行聚類

from sklearn.cluster import KMeans


def k_means(feature_matrix, num_clusters=10, max_iter=10000,
            random_state=None):
    """Fit a K-Means model on ``feature_matrix`` and return it with its labels.

    Args:
        feature_matrix: Samples-by-features matrix accepted by
            ``KMeans.fit``.
        num_clusters: Number of clusters to form.
        max_iter: Upper bound on the iterations of a single K-Means run.
            Defaults to 10000, the value this function previously hard-coded.
        random_state: Seed forwarded to ``KMeans`` for centroid
            initialisation. ``None`` (the default) keeps the previous
            non-deterministic behaviour; pass an int for reproducible
            cluster assignments.

    Returns:
        A ``(km, clusters)`` tuple: the fitted ``KMeans`` estimator and its
        ``labels_`` attribute (one cluster label per input row).
    """
    km = KMeans(n_clusters=num_clusters,
                max_iter=max_iter,
                random_state=random_state)
    km.fit(feature_matrix)
    return km, km.labels_


from collections import Counter

# Cluster the feature matrix into a fixed number of groups.
num_clusters = 10
km_obj, clusters = k_means(feature_matrix, num_clusters)

# Store each row's cluster label alongside the book data.
book_data['Cluster'] = clusters

# Count how many entries fall into each cluster and show the tallies.
c = Counter(clusters)
print(c.items())

4、列印每個聚類的關鍵特徵與書籍

def get_cluster_data(clustering_obj, book_data,
                     feature_names, num_clusters,
                     topn_features=10):
    """Summarise every cluster by its top features and member book titles.

    Args:
        clustering_obj: Fitted clustering model exposing ``cluster_centers_``.
        book_data: DataFrame with a ``'Cluster'`` label column and a
            ``'title'`` column.
        feature_names: Sequence mapping a feature index to its name.
        num_clusters: Number of clusters to report (labels ``0..num_clusters-1``).
        topn_features: How many of the highest-weighted centroid features to
            keep per cluster.

    Returns:
        A dict keyed by cluster number; each value is a dict with the keys
        ``'cluster_num'``, ``'key_features'`` and ``'books'``.
    """
    # For each centroid, feature indices ordered from largest to smallest weight.
    ranked = clustering_obj.cluster_centers_.argsort()[:, ::-1]

    summary = {}
    for label in range(num_clusters):
        top_indices = ranked[label, :topn_features]
        # Titles of the rows assigned to this cluster.
        members = book_data.loc[book_data['Cluster'] == label, 'title']
        summary[label] = {
            'cluster_num': label,
            'key_features': [feature_names[i] for i in top_indices],
            'books': members.values.tolist(),
        }
    return summary


def print_cluster_data(cluster_data):
    """Print a readable report for each cluster in ``cluster_data``.

    Args:
        cluster_data: Mapping of cluster number to a dict holding
            ``'key_features'`` and ``'books'`` (an iterable of title strings).
    """
    thin_rule = '-' * 20
    thick_rule = '=' * 40
    for label in cluster_data:
        info = cluster_data[label]
        print('Cluster {} details:'.format(label))
        print(thin_rule)
        print('Key features:', info['key_features'])
        print('book in this cluster:')
        print(', '.join(info['books']))
        print(thick_rule)