
PCA Dimensionality Reduction in Python (continued from the previous post)

# coding=utf-8
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
data = pd.read_csv("pca.csv")  # load the data
# print(type(data))
column_name = data.columns.values.tolist()  # get all the column names
# print(data.shape)
data_standard = StandardScaler().fit_transform(data)  # z-score standardisation

data_standard = pd.DataFrame(data_standard)
# print(data_standard.corr())
pca = PCA()   # keep all components
pca.fit(data_standard)
feature_vectors = pca.components_  # eigenvectors of the model, shape (n_components, n_features)
# print(feature_vectors)
pca_contribution = pca.explained_variance_ratio_  # variance ratio explained by each component (a.k.a. contribution rate)
# print(pca_contribution)
# select principal components until the cumulative explained variance ratio exceeds 0.6
pca_contribution_sum = 0    # cumulative explained variance ratio
counter = 0  # number of principal components selected
for i in range(len(pca_contribution)):
    pca_contribution_sum += pca_contribution[i]
    counter += 1
    if pca_contribution_sum >= 0.6:   # my data is not great; the threshold is usually set above 0.8
        break
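# (an equivalent, vectorised way to get the same count, assuming the 0.6
#  threshold is reached, which it must be since the ratios sum to 1:
#  counter = int(np.argmax(np.cumsum(pca_contribution) >= 0.6)) + 1)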


reduced_dimension_vector = feature_vectors[0:counter, :]  # keep the first counter eigenvectors (rows of components_)
Vector_judgment = (reduced_dimension_vector > 0.4)  # flag loadings greater than 0.4 (indicators with loadings of 0.7-1 are usually preferred)
# print(Vector_judgment.shape)
index_after_selection = []  # variables kept after dimensionality reduction
after_selection_name = []   # column names of the kept variables
# keep an original variable if any of its loadings on the retained components exceeds the threshold
for j in range(Vector_judgment.shape[1]):
    Vector = Vector_judgment[:, j]
    if Vector.any():
        index_after_selection.append(data_standard.iloc[:, j])
        after_selection_name.append(column_name[j])

after_selection_data = pd.DataFrame(index_after_selection)  # convert the list of Series to a DataFrame (one variable per row)
# print(after_selection_data.shape)
after_selection_data_T = np.transpose(after_selection_data)  # transpose so that rows are samples and columns are variables
after_selection_data_T.columns = list(after_selection_name)  # attach the column names
print(after_selection_data_T)  # print the variables kept after dimensionality reduction
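
The code above keeps a subset of the original (standardised) variables whose loadings on the retained components are large. If you instead want the conventional PCA reduction, projecting the samples onto the retained components, scikit-learn can choose the number of components itself when n_components is given as a fraction of variance to explain. Below is a minimal sketch, assuming the same pca.csv and the same 0.6 threshold as above:

# coding=utf-8
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

data = pd.read_csv("pca.csv")  # same input file as above
data_standard = StandardScaler().fit_transform(data)

# a float in (0, 1) tells scikit-learn to keep just enough components
# for the cumulative explained variance ratio to exceed that fraction
pca = PCA(n_components=0.6)
scores = pca.fit_transform(data_standard)  # projected data, shape (n_samples, n_kept_components)

print(pca.n_components_)              # number of components kept
print(pca.explained_variance_ratio_)  # their individual contribution rates
print(scores)

Note that this produces component scores (linear combinations of all the standardised variables) rather than a subset of the original columns, so it complements, rather than replaces, the loading-based selection above.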