機器學習實戰之K近鄰改進的約會網站程式碼及手寫字型識別程式碼
阿新 • • 發佈:2018-12-21
import operator
import os

import numpy as np


def createDataSet():
    """Return a tiny labelled 2-D data set for trying out ``classify0``.

    Returns:
        A ``(group, labels)`` pair: a 4x2 array of points and the list of
        their class labels ('A' or 'B').
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: The vector to classify (anything that broadcasts against one
            row of ``dataSet``).
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels, ``labels[i]`` belonging to row i.
        k: Number of nearest neighbours that vote. If it exceeds the number
            of training samples, every sample votes.

    Returns:
        The label with the most votes; ties go to the label that was seen
        first when walking the neighbours from nearest to farthest.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    # Broadcasting subtracts inX from every row without materialising a
    # tiled copy of it.
    diffMat = np.asarray(dataSet) - inX
    distances = (diffMat ** 2).sum(axis=1) ** 0.5  # Euclidean distance
    sortedDistIndices = distances.argsort()
    classCount = {}
    for neighbourIndex in sortedDistIndices[:k]:
        voteLabel = labels[neighbourIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # max() returns the first maximal item, matching a stable descending sort.
    return max(classCount.items(), key=operator.itemgetter(1))[0]


def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    Each line must hold at least three numeric fields followed by an
    integer class label in the last field.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an (n, 3) float array built
        from the first three fields of every line, and a list with the
        integer label of every line.
    """
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    returnMat = np.zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly onto the range [0, 1].

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` triple, where ``ranges`` and
        ``minVals`` are the per-column ``max - min`` and ``min`` needed to
        apply the same scaling to new samples.

    Note:
        A column whose values are all equal has range 0, so dividing by it
        yields non-finite values for that column.
    """
    minVals = dataSet.min(0)
    ranges = dataSet.max(0) - minVals
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


def datingClassTest():
    """Hold out the first 10% of 'datingTestSet2.txt' and print the kNN error rate."""
    hoRatio = 0.10
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test samples; the rest is training data.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with:%d,the real answer is:%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # Avoid dividing by zero when the file is too small to hold out a sample.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is:%f" % errorRate)


# Listing 2-5: dating site prediction function
def classfyPerson():
    """Ask the user for three features and print the predicted liking level.

    The prompts were translated from the book's English into Chinese.
    Python 3's ``input()`` replaces the book's ``raw_input()``.
    """
    resultList = ['討厭', '一般喜歡', '非常喜歡']
    percentTats = float(input('打視訊遊戲所佔時間比:'))
    ffMiles = float(input('飛行常客里程數:'))
    iceCream = float(input('每週消耗的冰淇淋公升數:'))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = np.array([ffMiles, percentTats, iceCream])
    # Scale the query with the same min/range as the training data.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Labels are indexed from 1 into resultList, hence the -1.
    print('你是否會喜歡這個人:%s' % resultList[classifierResult - 1])


def img2vector(filename):
    """Flatten a 32x32 text image of digits into a 1x1024 row vector.

    Args:
        filename: Path of a text file whose first 32 lines each start with
            32 digit characters.

    Returns:
        A (1, 1024) float array holding the image in row-major order.
    """
    returnVect = np.zeros((1, 1024))
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            returnVect[0, 32 * i:32 * (i + 1)] = [int(ch) for ch in lineStr[:32]]
    return returnVect


def _classNumFromFileName(fileNameStr):
    """Return the integer class encoded before the first '_' of a file name."""
    return int(fileNameStr.split('.')[0].split('_')[0])


def handwritingClassTest():
    """Train on 'trainingDigits', classify 'testDigits', and print the errors.

    File names are expected to start with the class number followed by an
    underscore, e.g. ``0_13.txt``.
    """
    hwLabels = []
    trainingFileList = os.listdir('trainingDigits')
    trainingMat = np.zeros((len(trainingFileList), 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_classNumFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = os.listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _classNumFromFileName(fileNameStr)
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with:%d,the real answer is:%d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is:%d" % errorCount)
    # Avoid dividing by zero when the test directory is empty.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\nthe total error rate is:%f" % errorRate)


if __name__ == "__main__":
    # matplotlib is only needed for the exploratory plot below, so it is
    # imported when run as a script rather than whenever the module is imported.
    import matplotlib
    import matplotlib.pyplot as plt

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Example scatter plot of the dating data:
    # datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    # ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
    #            15.0 * np.array(datingLabels), 15.0 * np.array(datingLabels))
    # plt.show()

    handwritingClassTest()