程式人生 > 《機器學習實戰》PCA原始碼

《機器學習實戰》PCA原始碼

#coding:utf-8

from numpy import *

"""
將資料轉換成前N個主成分的虛擬碼如下:
    去除平均值
    計算協方差矩陣
    計算協方差矩陣的特徵值和特徵向量
    將特徵值從大到小排序
    保留最上面的N個特徵向量
    將資料轉換到上述N個特徵向量構建的新空間中
"""
def loadDataSet(filename,delim=' '):
    """Load a delimited text file of numbers into a NumPy matrix.

    Args:
        filename: Path of the text file to read; one sample per line.
        delim: Separator between the values on a line (default: one space).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line and one column
        per value on that line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # ``with`` guarantees the handle is closed even when parsing fails.
    with open(filename) as fr:
        dataArr = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            if line.strip()  # skip blank lines (e.g. a trailing newline)
        ]
    # ``asmatrix`` instead of ``mat``: the ``mat`` alias was removed in NumPy 2.0.
    return asmatrix(dataArr)


def pca(dataMat,topNfeat=99999):
    """Reduce data to its leading principal components.

    Args:
        dataMat: Data matrix with one sample per row and one feature per
            column.
        topNfeat: Number of principal components to keep, i.e. the target
            dimensionality. If it is at least the number of features, all
            components are kept.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the mean-centred data projected
        onto the kept components, and the data reconstructed from that
        projection in the original feature space (mean added back).
    """
    # axis=0 averages over the rows, giving one mean per feature column.
    meanVals = mean(dataMat,axis=0)
    meanRemoved = dataMat - meanVals
    # rowvar=0: each column is a variable and each row an observation.
    covMat = cov(meanRemoved,rowvar=0)
    # The covariance matrix is symmetric, so use eigh: unlike eig it always
    # returns real eigenvalues/eigenvectors.
    eigVals,eigVects = linalg.eigh(asmatrix(covMat))
    # argsort is ascending; the reversed slice picks the topNfeat largest
    # eigenvalues, largest first.
    eigValInd = argsort(eigVals)[:-(topNfeat+1):-1]
    redEigVects = eigVects[:,eigValInd]
    # Project onto the kept eigenvectors, then map back for the reconstruction.
    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat,reconMat



if __name__ == "__main__":
    # Demo: reduce a 2-D data set to 1 dimension and plot the original
    # points against their reconstruction from the first component.
    # Raw string: "\P" and "\d" are invalid escape sequences in a normal
    # literal (SyntaxWarning on Python 3.12+); the path value is unchanged.
    Add = r"D:\PycharmProjects\PCA\dataTest.txt"
    dataMat = loadDataSet(Add)
    lowDMat,reconMat = pca(dataMat,1)  # reduce to 1 dimension
    print(shape(lowDMat))
    print(shape(reconMat))
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # .A1 flattens a matrix column into the 1-D array scatter() expects.
    # Original data: blue stars.
    ax.scatter(dataMat[:,0].A1,dataMat[:,1].A1,marker="*",s=90,c="b")
    # Reconstructed data: red circles.
    ax.scatter(reconMat[:,0].A1,reconMat[:,1].A1,marker="o",s=40,c="r")
    plt.show()

用平均值替換缺失值(NaN)的函式:

#將NaN替換成平均值
def replaceNaNWithMean(filename='secom.data',delim=' '):
    """Load a data set and replace every NaN with its column's mean.

    The original body called ``loadDataSet()`` without the required file
    name and therefore always raised ``TypeError``; the file name is now a
    parameter that is forwarded to ``loadDataSet``.

    Args:
        filename: Path of the data file to load.
            NOTE(review): the default 'secom.data' is an assumption about
            the intended data set -- confirm or pass the path explicitly.
        delim: Separator between the values on a line.

    Returns:
        The loaded matrix with each NaN entry replaced by the mean of the
        non-NaN entries of the same column.
    """
    dataMat = loadDataSet(filename,delim)
    numFeat = shape(dataMat)[1]
    for i in range(numFeat):
        nanMask = isnan(dataMat[:,i].A)
        # Row indices of the valid (non-NaN) and the missing entries.
        validRows = nonzero(~nanMask)[0]
        nanRows = nonzero(nanMask)[0]
        # Mean over the non-NaN values of this column only.
        meanVal = mean(dataMat[validRows,i])
        dataMat[nanRows,i] = meanVal
    return dataMat