《機器學習實戰》學習總結1——K-近鄰演算法

阿新 • • 發佈：2018-11-24

新手入門學習機器學習，根據ApacheCN的視訊學習程式碼，視訊可以在bilibili線上播放。
有需要資料的可以在GitHub下載：https://github.com/RedstoneWill/MachineLearning
本文最主要的是分析程式碼的功能與實現，相應的原理大家自行查閱書本就好了。

import numpy as np
import operator
from os import listdir

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distances are Euclidean and are computed between ``inX`` and every row
    of ``dataSet``.

    Args:
        inX: Feature vector to classify; anything that broadcasts against
            the rows of ``dataSet`` (for example shape ``(n,)`` or
            ``(1, n)``).
        dataSet: Training samples, one sample per row.
        labels: Sequence holding the class label of each row of ``dataSet``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of training samples is clamped to that number.

    Returns:
        The label with the most votes. On a tie, the tied label that was
        counted first (i.e. seen at the smallest distance) wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if k < 1:
        raise ValueError("k must be at least 1")
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")

    # Broadcasting subtracts inX from every row, so no tiled copy is needed.
    # e.g. inX=[1,2,3], dataSet=[[1,2,3],[1,2,0]] -> [[0,0,0],[0,0,-3]]
    diffMat = dataSet - inX
    # Row-wise sum of squared differences, then the square root.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    # Tally the labels of the k closest samples, nearest first.
    classCount = {}
    for neighbourIndex in sortedDistIndicies[:min(k, dataSetSize)]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() keeps the first key that reaches the top count; dicts preserve
    # insertion order, so ties resolve toward the label counted first.
    return max(classCount, key=classCount.get)

def createDataSet():
    """Build the four-point toy data set used to try out the classifier.

    Returns:
        A tuple ``(group, labels)``: a 4x2 NumPy array of sample
        coordinates and a list with the class label of each row.
    """
    # Each entry pairs a made-up sample with its known class.
    labelledPoints = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    group = np.array([point for point, _ in labelledPoints])
    labels = [tag for _, tag in labelledPoints]
    return group, labels

def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Every non-blank line contributes one row: its first three fields are
    stored as floats and its last field is taken as the class label. Blank
    lines (for example a trailing empty line) are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        and a list of ``n`` labels. A label made only of digits is converted
        with ``int``; otherwise it is looked up in the name-to-number table
        below, and a name missing from that table yields ``None``.

    Raises:
        ValueError: If one of the first three fields is not a number.
    """
    love_dictionary = {'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}

    # The context manager closes the file even if reading fails.
    with open(filename) as fr:
        strippedLines = [line.strip() for line in fr]
    arrayOLines = [line for line in strippedLines if line]

    returnMat = np.zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        rawLabel = listFromLine[-1]
        if rawLabel.isdigit():
            classLabelVector.append(int(rawLabel))
        else:
            classLabelVector.append(love_dictionary.get(rawLabel))
    return returnMat, classLabelVector
"""
歸一化公式：
Y = (X-Xmin)/(Xmax-Xmin)
其中min和max分別是資料集中的最小特徵值和最大特徵值。轉化為0到1的區間
"""

def autoNorm(dataSet):
    """Min-max scale every feature column of ``dataSet`` into [0, 1].

    Each value is mapped with ``(x - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is divided
    by 1 instead, so it normalises to 0 rather than NaN.

    Args:
        dataSet: Two-dimensional array-like, one sample per row and one
            feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the scaled data, the
        per-column ``max - min`` (zero for constant columns), and the
        per-column minimum.
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Guard the division only; the returned ranges stay the true spans.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.10, k=3):
    """Hold out the first rows of the dating data and print the error rate.

    The file is loaded with ``file2matrix`` and scaled with ``autoNorm``.
    The first ``int(m * hoRatio)`` rows are classified with ``classify0``
    against the remaining rows. Each prediction is printed next to the true
    label, followed by the overall error rate and the error count.

    Args:
        filename: Data file handed to ``file2matrix``.
        hoRatio: Fraction of the rows (taken from the top) used for testing.
        k: Number of neighbours handed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    # The training part is the same for every test row, so slice it once.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat, trainingLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    # With no test rows there is nothing to divide by; report 0 errors.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is: %f" % errorRate)
    print(errorCount)

def classifyPerson():
    """Ask for three habits on the console and print the predicted liking.

    The answers are scaled with the minimum and range that ``autoNorm``
    computes for 'datingTestSet2.txt', classified with ``classify0`` using
    three neighbours, and the numeric class (1-3) is printed as text.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']

    gameShare = float(input("percentage of time spent playing video games?"))
    flierMiles = float(input("frequent flier miles earned per year?"))
    iceCreamLiters = float(input("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # NOTE(review): assumes the file's columns are miles, game time,
    # ice cream, in that order - confirm against the data file.
    sample = np.array([flierMiles, gameShare, iceCreamLiters])
    scaledSample = (sample - minVals) / ranges
    classifierResult = classify0(scaledSample, normMat, datingLabels, 3)

    verdict = resultList[classifierResult - 1]
    print("You will probably like this person: %s" % verdict)

def img2vector(filename):
    """Read a 32x32 grid of digit characters into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row.

    Args:
        filename: Path of the text file holding the grid.

    Returns:
        A NumPy array of shape ``(1, 1024)``.

    Raises:
        IndexError: If a line has fewer than 32 characters.
        ValueError: If one of the characters is not a digit.
    """
    returnVect = np.zeros((1, 1024))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            # Row i of the image fills slots 32*i .. 32*i + 31.
            returnVect[0, 32 * i:32 * (i + 1)] = [int(lineStr[j]) for j in range(32)]
    return returnVect
    
#資料集是書上例子，給我們的資料集已經將圖片轉換成了數字我們只需要識別是0還是1
def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits', k=3):
    """Classify every file in ``testDir`` against ``trainingDir`` and report.

    Each file is read with ``img2vector``. Its class label is the integer
    before the first underscore of the file name (the part before the
    first dot). Every prediction is printed next to the true label,
    followed by the number of errors and the error rate.

    Args:
        trainingDir: Directory holding the training files.
        testDir: Directory holding the files to classify.
        k: Number of neighbours handed to ``classify0``.
    """
    def labelOf(fileNameStr):
        # '0_13.txt' -> '0_13' -> 0
        return int(fileNameStr.split('.')[0].split('_')[0])

    # Load the training set: one 1024-wide row per file.
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    hwLabels = []
    trainingMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(labelOf(fileNameStr))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    # Classify every test file and count the mismatches.
    testFileList = listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = labelOf(fileNameStr)
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0

    print("\nthe total number of errors is: %d" % errorCount)
    # An empty test directory has no rate to compute; report 0.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\nthe total error rate is: %f" % errorRate)

"""
後面幾行是視訊中提及的對本程式碼的測試，可以嘗試自己新增練習函式使用
if __name__ == '__main__':
	test1()  # 自行新增的練習函式
	datingClassTest()
	handwritingClassTest()
"""

特點：
優點：精度高、對異常值不敏感
缺點：計算複雜度高、空間複雜度高（每次都要跑一遍所有資料）
適用資料範圍：數值型和標稱型
總結：
根據空間兩個點的距離來計算，關鍵是引入了K值，保證了一定的穩定性。
對資料要進行歸一化，防止數值範圍較大的特徵在距離計算中佔據主導、對結果影響過大。

《機器學習實戰》學習總結1——K-近鄰演算法

《機器學習實戰》學習總結1——K-近鄰演算法

《機器學習實戰》學習總結1——K-近鄰演算法（程式清單2-1）

《機器學習實戰》第二章——k-近鄰演算法——筆記

機器學習實戰讀書筆記(1)--k鄰近演算法

機器學習實戰筆記一：K-近鄰演算法在約會網站上的應用

機器學習實戰（一）k-近鄰演算法kNN（k-Nearest Neighbor）

Python3《機器學習實戰》筆記：K-近鄰演算法

機器學習實戰（2）—— k-近鄰演算法

程式碼註釋：機器學習實戰第2章 k-近鄰演算法

機器學習實戰（一）--k近鄰演算法

Python3《機器學習實戰》01：k-近鄰演算法（完整程式碼及註釋）

《機器學習實戰》第二章——K-近鄰演算法

《機器學習實戰》——kNN（k近鄰演算法）

《機器學習實戰》第2章閱讀筆記1 K近鄰演算法概述

機器學習實戰之第二章 k-近鄰算法

機器學習實戰（一）k-近鄰kNN（k-Nearest Neighbor）

機器學習筆記1-k近鄰演算法的實現

2.2.1 K-近鄰演算法概述

1 k-近鄰演算法

《機器學習實戰》中的程序清單2-1 k近鄰算法classify0都做了什麽

《機器學習實戰》學習總結1——K-近鄰演算法

相關推薦