主成分降維python程式碼實現(承接上一篇)
阿新 • • 發佈:2019-01-14
# coding=utf-8
"""Select indicators (columns) of ``pca.csv`` based on PCA loadings.

Pipeline:
  1. z-score standardise every column of the CSV;
  2. fit a PCA that keeps all components;
  3. keep the leading components whose cumulative explained-variance ratio
     first reaches ``VARIANCE_THRESHOLD``;
  4. keep every original column that has a loading greater than
     ``LOADING_THRESHOLD`` on at least one of the kept components;
  5. print the standardised data restricted to those columns.
"""
import numpy as np
import pandas as pd

# Cumulative explained-variance ratio at which we stop adding components.
# The original author notes that 0.8 is the more common choice and that 0.6
# was used only because the sample data were poor.
VARIANCE_THRESHOLD = 0.6

# A column is kept when one of its loadings exceeds this value.
# (The original comment said 0.3 while the code used 0.4; the code value is kept.)
LOADING_THRESHOLD = 0.4


def count_components(contribution, threshold=VARIANCE_THRESHOLD):
    """Return how many leading components are needed to reach *threshold*.

    Args:
        contribution: per-component explained-variance ratios, in the order
            the components are ranked.
        threshold: cumulative ratio to reach.

    Returns:
        The smallest ``k`` such that the sum of the first ``k`` ratios is
        ``>= threshold``; ``len(contribution)`` if the threshold is never
        reached (``0`` for an empty input).
    """
    cumulative = 0.0
    for count, ratio in enumerate(contribution, start=1):
        cumulative += ratio
        if cumulative >= threshold:
            return count
    return len(contribution)


def select_indicator_positions(components, n_selected,
                               loading_threshold=LOADING_THRESHOLD):
    """Return positions of the features that load strongly on kept components.

    Args:
        components: array-like of shape ``(n_components, n_features)`` whose
            rows are the eigenvectors (the layout of ``PCA.components_``).
        n_selected: number of leading components (rows) to consider.
        loading_threshold: a feature is kept when at least one of its
            loadings on the considered components is strictly greater than
            this value.

    Returns:
        A list of feature (column) positions, in ascending order.
    """
    # Rows are components, so slice ROWS to keep the first ``n_selected``
    # eigenvectors, then transpose to get one row per original feature.
    loadings = np.asarray(components)[:n_selected].T
    # NOTE(review): the comparison is signed, as in the original script. The
    # sign of an eigenvector is arbitrary, so ``abs(loadings)`` may be what
    # was intended -- confirm before changing the selection rule.
    strong = (loadings > loading_threshold).any(axis=1)
    return np.flatnonzero(strong).tolist()


def main():
    """Run the PCA-based indicator selection on ``pca.csv`` and print it."""
    # Imported here so the pure selection helpers above can be imported (and
    # unit-tested) on a machine without scikit-learn installed.
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    data = pd.read_csv("pca.csv")  # load the data
    column_name = data.columns.values.tolist()  # all column names

    # z-score standardisation
    data_standard = pd.DataFrame(StandardScaler().fit_transform(data))

    pca = PCA()  # keep all components
    pca.fit(data_standard)

    # Number of leading components whose cumulative contribution reaches the
    # variance threshold.
    counter = count_components(pca.explained_variance_ratio_,
                               VARIANCE_THRESHOLD)

    # Columns with a sufficiently large loading on any kept component.
    positions = select_indicator_positions(pca.components_, counter,
                                           LOADING_THRESHOLD)

    # ``.iloc`` replaces the removed ``.ix`` indexer; selecting the columns
    # directly avoids the list-of-Series -> DataFrame -> transpose round trip.
    after_selection_data_T = data_standard.iloc[:, positions].copy()
    after_selection_data_T.columns = [column_name[j] for j in positions]

    print(after_selection_data_T)  # the indicators kept after reduction


if __name__ == "__main__":
    main()