1. 程式人生 > >【機器學習五】KNN

【機器學習五】KNN

程式碼如下,其中資料集 trainingDigits 可以從我的網盤上下載,提取碼:cbun(永久有效)。


#手寫識別 32x32
from numpy import *
import operator
import time
from os import listdir

def classify(inputPoint,dataSet,labels,k):
    """Classify ``inputPoint`` by majority vote among its k nearest neighbours.

    Parameters
    ----------
    inputPoint : array-like
        Feature vector to classify; must broadcast against a row of
        ``dataSet``.
    dataSet : array-like, 2-D
        Training samples, one per row.
    labels : sequence
        ``labels[i]`` is the class of row ``i`` of ``dataSet``.
    k : int
        Number of neighbours that vote. A value larger than the number of
        training rows is clamped to that number.

    Returns
    -------
    The label with the most votes. On a tie, the label met first while
    walking the neighbours from nearest to farthest wins (this relies on
    dict insertion order and the stability of ``sorted``).

    Raises
    ------
    IndexError
        If no neighbour votes (``k < 1`` or ``dataSet`` has no rows).
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]              # number of training rows
    # Broadcasting subtracts the point from every row without building a
    # tiled copy of the input.
    diffMat = dataSet - asarray(inputPoint)
    # Squared Euclidean distance per row. The square root is skipped because
    # it is monotonic and therefore does not change the neighbour ranking.
    sqDistances = (diffMat ** 2).sum(axis=1)
    sortedDistIndicies = sqDistances.argsort()  # row indices, nearest first
    # Never ask for more neighbours than there are training rows.
    # (An explicit comparison is used because ``from numpy import *`` may
    # shadow some builtins.)
    if k > dataSetSize:
        k = dataSetSize
    # Tally the labels of the k nearest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Order the (label, votes) pairs by vote count, highest first.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def img2vector(filename):
    """Flatten a 32x32 text image into a list of 1024 ints.

    The file is read line by line; the first 32 characters of each of the
    first 32 lines are converted with ``int`` and appended in row-major
    order.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of int
        The 1024 converted characters.

    Raises
    ------
    IndexError
        If one of the first 32 lines has fewer than 32 characters.
    ValueError
        If a character cannot be converted by ``int``.
    """
    returnVect = []
    # The context manager guarantees the handle is closed even if a
    # conversion below raises.
    with open(filename) as fr:
        for _ in range(32):
            lineStr = fr.readline()
            returnVect.extend(int(lineStr[j]) for j in range(32))
    return returnVect
# Parse the class digit out of a file name
def classnumCut(fileName):
    """Return the integer that precedes the first '_' in the file name's stem.

    The stem is everything before the first '.'; the part of the stem before
    the first '_' is converted with ``int``.
    """
    stem, _, _ = fileName.partition('.')
    digitText, _, _ = stem.partition('_')
    return int(digitText)
# Build the training feature matrix and the matching label list
def trainingDataSet():
    """Load every file in the 'trainingDigits' directory as a training sample.

    Returns
    -------
    tuple
        ``(hwLabels, trainingMat)`` where ``hwLabels`` is the list of class
        numbers parsed from the file names and ``trainingMat`` is an
        ``(m, 1024)`` array whose row ``i`` holds the vector read from the
        i-th listed file.
    """
    fileNames = listdir('trainingDigits')       # directory contents
    trainingMat = zeros((len(fileNames), 1024)) # one 1024-wide row per file
    hwLabels = []
    for row, name in enumerate(fileNames):
        hwLabels.append(classnumCut(name))
        trainingMat[row, :] = img2vector('trainingDigits/%s' % name)
    return hwLabels, trainingMat

# Test driver
def handwritingTest():
    """Classify every file in 'testDigits' with 3-NN and print a summary.

    Builds the training set, predicts a class for each test file, compares
    it with the class number parsed from the file name, and prints the
    number of tests, the number of errors, the error rate and the elapsed
    time.
    """
    hwLabels, trainingMat = trainingDataSet()   # training labels and matrix
    testFileList = listdir('testDigits')        # test file names
    mTest = len(testFileList)                   # number of test samples
    errorCount = 0.0                            # misclassified samples
    startTime = time.time()

    for fileNameStr in testFileList:
        expected = classnumCut(fileNameStr)
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        # Run the KNN classifier on this sample
        predicted = classify(vectorUnderTest, trainingMat, hwLabels, 3)
        if predicted != expected:
            errorCount += 1.0
    print("\nthe total number of tests is: %d" % mTest)
    print("the total number of errors is: %d" % errorCount)
    print("the total error rate is: %f" % (errorCount/float(mTest)))
    # The timer is stopped after the summary lines above have been printed.
    elapsed = time.time() - startTime
    print("Cost time: %.2fmin, %.4fs."%(elapsed//60, elapsed%60))

# Run the handwriting test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    handwritingTest()
    

最後執行的結果:

the total number of tests is: 946
the total number of errors is: 10
the total error rate is: 0.010571
Cost time: 0.00min, 44.5615s.

在測試集上效果還是很好的。