1. 程式人生 > >機器學習實戰第二章----KNN

機器學習實戰第二章----KNN

BE 指定 cto 文件轉換 .sh ati subplot OS umt

  1. tile的使用方法
    tile(A,n)的功能是把A數組重復n次(可以在列方向,也可以在行方向)
  2. argsort()函數
    argsort()函數返回的是數組中值從小到大排序後對應的索引值
  3. dict.get()函數
    語法:dict.get(key, default=None)
    key----字典中要查找的鍵
    default----如果指定的鍵不存在時,返回該默認值
  4. add_subplot()基礎用法

    import matplotlib.pyplot as plt
    from numpy import *
    fig = plt.figure()
    ax = fig.add_subplot(349)
    ax.plot(x,y)

    將畫布分成三行四列,在第九個分區畫圖

from numpy import *
from os import listdir
import operator


def createDataSet():
    """Build a tiny hand-made data set for trying out the kNN classifier.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 array with one
        sample per row and ``labels`` is the list of class labels, where
        ``labels[i]`` belongs to ``group[i]``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


# 對數據進行分類
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: Feature vector to classify.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` rows closest to
        ``inX`` (Euclidean distance). On a tie the label that was met first
        among the nearest neighbours wins, because ``sorted`` is stable.
    """
    dataSetSize = dataSet.shape[0]  # number of training samples (rows)
    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    # axis=1 sums across each row, giving one squared distance per sample.
    sqDistance = sqDiffMat.sum(axis=1)
    distances = sqDistance ** 0.5
    # argsort returns the indices that order the distances ascending.
    sortedDistIndicies = distances.argsort()

    # Count how often each label appears among the k closest samples.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Read a tab-separated text file into a feature matrix and label list.

    Every line is expected to hold at least three numeric feature columns
    followed by an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(train_matrix, label_vector)`` pair: an ``(n_lines, 3)`` float
        array with the first three columns of every line, and a list with
        the integer label taken from the last column of every line.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        lines = fr.readlines()

    train_matrix = zeros((len(lines), 3))
    label_vector = []
    for index, line in enumerate(lines):
        line_list = line.strip().split('\t')
        # Columns 0, 1 and 2 are the features.
        train_matrix[index, :] = [float(value) for value in line_list[0:3]]
        # The last column is the class label.
        label_vector.append(int(line_list[-1]))
    return train_matrix, label_vector


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range 0..1.

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` triple: the rescaled data, the
        per-column range (max - min) and the per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]
    # Repeat the per-column vectors m times so they match dataSet's shape.
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals


def datingClassTest():
    """Measure the classifier's error rate on ``datingTestSet2.txt``.

    The first ``hoRatio`` share of the normalised samples is used as the
    test set and the remaining samples as training data. Each prediction
    and the final error rate are printed.
    """
    hoRatio = 0.10  # share of the samples held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errcount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(
            normMat[i, :],
            normMat[numTestVecs:m, :],
            datingLabels[numTestVecs:m],
            2,
        )
        print("the classifier came back with :%d ,the real answer is :%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errcount += 1.0
    print("the total error rate is: %f" % (errcount / float(numTestVecs)))


def img2vector(filename):
    """Convert a 32x32 text image of digit characters into a 1x1024 vector.

    Args:
        filename: Path of a text file whose first 32 lines each start with
            32 digit characters.

    Returns:
        A ``(1, 1024)`` array holding the image row by row.
    """
    returnVect = zeros((1, 1024))
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def handwritingClassTest():
    """Measure the classifier's error rate on the handwritten-digit files.

    Training images are read from ``digits/trainingDigits`` and test images
    from ``digits/testDigits``. The class of an image is the part of its
    file name before the first underscore. Each prediction, the number of
    errors and the error rate are printed.
    """
    hwlabels = []  # class label of every training image
    trainingFileList = listdir('digits/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = fileStr.split('_')[0]
        hwlabels.append(classNumStr)
        # Store the flattened image as row i of the training matrix.
        trainingMat[i, :] = img2vector('digits/trainingDigits/%s' % fileNameStr)

    testFileList = listdir('digits/testDigits')
    errcount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('digits/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwlabels, 3)
        print('the classifier came back with : %d, the real answer is %d'
              % (int(classifierResult), classNumStr))
        # Training labels are strings, so compare as integers.
        if int(classifierResult) != int(classNumStr):
            errcount += 1
    print("\nthe total number of errors is %d" % errcount)
    print("\nthe total error rate is: %f" % float(errcount / mTest))

機器學習實戰第二章----KNN