程式人生 > Coursera 機器學習基石作業一 Python 實現

Coursera機器學習基石作業一python實現

機器學習基石作業一

(原文此處為題目截圖,圖片未載入。)下面的程式依資料檔中的原始順序反覆執行 PLA,直到完整一輪都沒有分類錯誤為止,並輸出 w 的更新總次數。

import numpy as np

def train_matrix(path="hw1_15_train.dat.txt"):
    """Load the training set and return it as NumPy arrays.

    Every non-blank line holds the feature values followed by an integer
    label.  Fields may be separated by spaces and/or tabs, so both
    ``"a b c d\\tlabel"`` and ``"a b c d label"`` are accepted.  A constant
    1 is stored in column 0 of every sample so the bias is learned as
    ``w[0]``.

    Args:
        path: File to read.  Defaults to the file name this script has
            always used, so existing ``train_matrix()`` calls are unchanged.

    Returns:
        A tuple ``(X, Y, dataNum, dataDim)``: ``X`` has shape
        ``(dataNum, dataDim)`` with a leading column of ones, ``Y`` has
        shape ``(dataNum, 1)``, ``dataNum`` is the number of samples and
        ``dataDim`` is the number of features plus one (the bias column).

    Raises:
        ValueError: If the file holds no samples, or a line does not have
            the same number of fields as the first line.
    """
    with open(path, "r") as f:
        # split() with no argument splits on any whitespace run, which
        # covers both the tab-separated and the space-separated label.
        rows = [line.split() for line in f if line.strip()]
    if not rows:
        raise ValueError("no samples found in {}".format(path))

    dataNum = len(rows)
    # n features + 1 label per line == n features + 1 bias column in X.
    dataDim = len(rows[0])

    X = np.ones((dataNum, dataDim))
    Y = np.zeros((dataNum, 1))
    for idx, fields in enumerate(rows):
        if len(fields) != dataDim:
            raise ValueError(
                "line {} has {} fields, expected {}".format(
                    idx + 1, len(fields), dataDim))
        X[idx, 1:] = [float(value) for value in fields[:-1]]
        Y[idx, 0] = int(fields[-1])
    return X, Y, dataNum, dataDim
def PLA():
    """Run the perceptron learning algorithm and count its weight updates.

    The samples returned by ``train_matrix()`` are visited in their stored
    order, pass after pass.  A sample with ``sign(w.x) != y`` (a score of
    exactly 0 counts as a mistake) triggers ``w += y * x``.  The loop only
    stops after a complete pass with no update, so it does not terminate
    unless such a pass is eventually reached.

    Returns:
        The total number of weight updates performed.
    """
    x_train, y_train, dataNum, dataDim = train_matrix()
    w = np.zeros((dataDim, 1))
    count = 0
    converged = False
    while not converged:
        converged = True
        for i in range(dataNum):
            if np.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                # reshape(-1, 1) follows the sample width instead of
                # assuming exactly five columns.
                w += y_train[i, 0] * x_train[i, :].reshape(-1, 1)
                converged = False
                count += 1
    return count

if __name__=='__main__':
    # Run the perceptron once and report how many weight updates it made.
    update_total = PLA()
    print(update_total)

(原文此處為題目截圖,圖片未載入。)下面的程式在每次執行前先隨機打亂資料順序,重複執行 PLA 2000 次,並輸出平均更新次數。

import numpy as np

def train_matrix(path="hw1_15_train.dat.txt"):
    """Load the training set and return it in a random order.

    Every non-blank line holds the feature values followed by an integer
    label.  Fields may be separated by spaces and/or tabs.  A constant 1 is
    stored in column 0 of every sample so the bias is learned as ``w[0]``.
    Samples and labels are shuffled with the same random permutation.

    Args:
        path: File to read.  Defaults to the file name this script has
            always used, so existing ``train_matrix()`` calls are unchanged.

    Returns:
        A tuple ``(X, Y, dataNum, dataDim)``: ``X`` has shape
        ``(dataNum, dataDim)`` with a leading column of ones, ``Y`` has
        shape ``(dataNum, 1)``, ``dataNum`` is the number of samples and
        ``dataDim`` is the number of features plus one (the bias column).

    Raises:
        ValueError: If the file holds no samples, or a line does not have
            the same number of fields as the first line.
    """
    with open(path, "r") as f:
        # split() with no argument splits on any whitespace run, which
        # covers both the tab-separated and the space-separated label.
        rows = [line.split() for line in f if line.strip()]
    if not rows:
        raise ValueError("no samples found in {}".format(path))

    dataNum = len(rows)
    # n features + 1 label per line == n features + 1 bias column in X.
    dataDim = len(rows[0])

    X = np.ones((dataNum, dataDim))
    Y = np.zeros((dataNum, 1))
    for idx, fields in enumerate(rows):
        if len(fields) != dataDim:
            raise ValueError(
                "line {} has {} fields, expected {}".format(
                    idx + 1, len(fields), dataDim))
        X[idx, 1:] = [float(value) for value in fields[:-1]]
        Y[idx, 0] = int(fields[-1])

    # One permutation indexes both arrays so each label stays with its row.
    permutation = np.random.permutation(dataNum)
    return X[permutation], Y[permutation], dataNum, dataDim
def PLA(eta=1.0):
    """Run the perceptron learning algorithm and count its weight updates.

    The samples returned by ``train_matrix()`` are visited in the order
    they come back, pass after pass.  A sample with ``sign(w.x) != y`` (a
    score of exactly 0 counts as a mistake) triggers ``w += eta * y * x``.
    The loop only stops after a complete pass with no update, so it does
    not terminate unless such a pass is eventually reached.

    Args:
        eta: Step size applied to every update.  The default of 1.0
            reproduces the plain ``w += y * x`` rule.

    Returns:
        The total number of weight updates performed.
    """
    x_train, y_train, dataNum, dataDim = train_matrix()
    w = np.zeros((dataDim, 1))
    count = 0
    converged = False
    while not converged:
        converged = True
        for i in range(dataNum):
            if np.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                # reshape(-1, 1) follows the sample width instead of
                # assuming exactly five columns.
                w += eta * y_train[i, 0] * x_train[i, :].reshape(-1, 1)
                converged = False
                count += 1
    return count

if __name__=='__main__':
    # Average the number of weight updates over repeated PLA runs.
    # The accumulator is not called `sum`, so the builtin stays usable.
    RUNS = 2000
    total_updates = 0
    for _ in range(RUNS):
        total_updates += PLA()
    print(total_updates / RUNS)

(原文此處為題目截圖,圖片未載入。)

只要在上一段程式碼中更新 w 的那一行(`w += ...`),把加到 w 上的增量再乘以 0.5 即可,其餘程式碼不變。

(原文此處為題目截圖,圖片未載入。)下面的程式使用 pocket 演算法(最多更新 50 次),重複 2000 次實驗,並輸出在測試集上的平均錯誤率。

import numpy
import copy


class Pocket(object):
    """Pocket perceptron evaluated on a separate test file.

    Data files hold one sample per non-blank line: ``dimension - 1``
    feature values followed by an integer label, separated by spaces
    and/or tabs.  A constant 1 is stored in column 0 of every sample so the
    bias is learned as ``w[0]``.
    """

    def __init__(self, dimension, train_count, test_count):
        """Store the expected data sizes.

        Args:
            dimension: Length of the weight vector (features plus bias).
            train_count: Number of samples in the training file.
            test_count: Number of samples in the test file.
        """
        self.__dimension = dimension
        self.__train_count = train_count
        self.__test_count = test_count

    def _load(self, path, row_count):
        """Parse ``path`` into a sample matrix and a label column.

        Returns:
            ``(samples, labels)`` with shapes ``(row_count, dimension)``
            and ``(row_count, 1)``.

        Raises:
            ValueError: If the file does not hold exactly ``row_count``
                samples, or a line does not have ``dimension`` fields.
        """
        # The with-block closes the file even if parsing fails.
        with open(path, "r") as f:
            rows = [line.split() for line in f if line.strip()]
        if len(rows) != row_count:
            raise ValueError(
                "{} holds {} samples, expected {}".format(
                    path, len(rows), row_count))

        samples = numpy.ones((row_count, self.__dimension))
        labels = numpy.zeros((row_count, 1))
        for idx, fields in enumerate(rows):
            if len(fields) != self.__dimension:
                raise ValueError(
                    "line {} of {} has {} fields, expected {}".format(
                        idx + 1, path, len(fields), self.__dimension))
            samples[idx, 1:] = [float(value) for value in fields[:-1]]
            labels[idx, 0] = int(fields[-1])
        return samples, labels

    def train_matrix(self, path):
        """Load the training file and return it in a random order.

        Returns:
            ``(samples, labels)``, both shuffled with the same permutation.
        """
        samples, labels = self._load(path, self.__train_count)
        # One permutation indexes both arrays so each label stays with its row.
        permutation = numpy.random.permutation(self.__train_count)
        return samples[permutation], labels[permutation]

    def iteration(self, path, max_updates=50):
        """Train with the pocket rule and return the best weights seen.

        Makes a single pass over the shuffled training data.  Each
        misclassified sample (a score of exactly 0 counts as a mistake)
        triggers ``w += 0.5 * y * x``; after every update the new ``w`` is
        scored on the whole training set and kept if it makes strictly
        fewer mistakes than the best so far.

        Args:
            path: Training file.
            max_updates: Stop once this many updates have been made.

        Returns:
            The ``(dimension, 1)`` weight vector with the fewest training
            mistakes; all zeros if no update ever improved on the start.
        """
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        best_w = w.copy()
        best_count = self.__train_count
        if max_updates <= 0:
            return best_w

        updates = 0
        for i in range(self.__train_count):
            if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                # reshape(-1, 1) follows the sample width instead of
                # assuming exactly five columns.
                w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(-1, 1)
                updates += 1
                # Score the candidate on every training sample at once.
                mistakes = int(numpy.count_nonzero(
                    numpy.dot(x_train, w) * y_train <= 0))
                if mistakes < best_count:
                    best_count = mistakes
                    best_w = w.copy()
                if updates >= max_updates:
                    break
        return best_w

    def test_matrix(self, test_path):
        """Load the test file in its original order.

        Returns:
            ``(samples, labels)`` with shapes ``(test_count, dimension)``
            and ``(test_count, 1)``.
        """
        return self._load(test_path, self.__test_count)

    def test_error(self, train_path, test_path, max_updates=50):
        """Train on ``train_path`` and return the error rate on ``test_path``.

        Args:
            train_path: Training file.
            test_path: Test file.
            max_updates: Update cap forwarded to ``iteration``.

        Returns:
            Fraction of test samples whose score times label is <= 0.
        """
        w = self.iteration(train_path, max_updates)
        x_test, y_test = self.test_matrix(test_path)
        wrong = numpy.count_nonzero(numpy.dot(x_test, w) * y_test <= 0)
        return float(wrong) / self.__test_count


if __name__ == '__main__':
    # Repeat the train/test experiment and print the mean test error rate.
    error_sum = 0
    for _ in range(2000):
        experiment = Pocket(5, 500, 500)
        error_sum += experiment.test_error('hw1_18_train.dat.txt', 'hw1_18_test.dat.txt')
    print(error_sum / 2000.0)

(原文此處為題目截圖,圖片未載入。)下面的程式與上一段相同,但 iteration 直接回傳最後一次更新後的 w,而不是 pocket 中保存的最佳 w。

import numpy
import copy


class Pocket(object):
    """Perceptron with a capped number of updates, scored on a test file.

    Unlike a pocket perceptron, ``iteration`` returns the weights after the
    last update rather than the best weights seen.

    Data files hold one sample per non-blank line: ``dimension - 1``
    feature values followed by an integer label, separated by spaces
    and/or tabs.  A constant 1 is stored in column 0 of every sample so the
    bias is learned as ``w[0]``.
    """

    def __init__(self, dimension, train_count, test_count):
        """Store the expected data sizes.

        Args:
            dimension: Length of the weight vector (features plus bias).
            train_count: Number of samples in the training file.
            test_count: Number of samples in the test file.
        """
        self.__dimension = dimension
        self.__train_count = train_count
        self.__test_count = test_count

    def _load(self, path, row_count):
        """Parse ``path`` into a sample matrix and a label column.

        Returns:
            ``(samples, labels)`` with shapes ``(row_count, dimension)``
            and ``(row_count, 1)``.

        Raises:
            ValueError: If the file does not hold exactly ``row_count``
                samples, or a line does not have ``dimension`` fields.
        """
        # The with-block closes the file even if parsing fails.
        with open(path, "r") as f:
            rows = [line.split() for line in f if line.strip()]
        if len(rows) != row_count:
            raise ValueError(
                "{} holds {} samples, expected {}".format(
                    path, len(rows), row_count))

        samples = numpy.ones((row_count, self.__dimension))
        labels = numpy.zeros((row_count, 1))
        for idx, fields in enumerate(rows):
            if len(fields) != self.__dimension:
                raise ValueError(
                    "line {} of {} has {} fields, expected {}".format(
                        idx + 1, path, len(fields), self.__dimension))
            samples[idx, 1:] = [float(value) for value in fields[:-1]]
            labels[idx, 0] = int(fields[-1])
        return samples, labels

    def train_matrix(self, path):
        """Load the training file and return it in a random order.

        Returns:
            ``(samples, labels)``, both shuffled with the same permutation.
        """
        samples, labels = self._load(path, self.__train_count)
        # One permutation indexes both arrays so each label stays with its row.
        permutation = numpy.random.permutation(self.__train_count)
        return samples[permutation], labels[permutation]

    def iteration(self, path, max_updates=50):
        """Train with a capped number of updates and return the final weights.

        Makes a single pass over the shuffled training data.  Each
        misclassified sample (a score of exactly 0 counts as a mistake)
        triggers ``w += 0.5 * y * x``.  No best-so-far copy is kept.

        Args:
            path: Training file.
            max_updates: Stop once this many updates have been made.

        Returns:
            The ``(dimension, 1)`` weight vector after the last update.
        """
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        if max_updates <= 0:
            return w

        updates = 0
        for i in range(self.__train_count):
            if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                # reshape(-1, 1) follows the sample width instead of
                # assuming exactly five columns.
                w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(-1, 1)
                updates += 1
                if updates >= max_updates:
                    break
        return w

    def test_matrix(self, test_path):
        """Load the test file in its original order.

        Returns:
            ``(samples, labels)`` with shapes ``(test_count, dimension)``
            and ``(test_count, 1)``.
        """
        return self._load(test_path, self.__test_count)

    def test_error(self, train_path, test_path, max_updates=50):
        """Train on ``train_path`` and return the error rate on ``test_path``.

        Args:
            train_path: Training file.
            test_path: Test file.
            max_updates: Update cap forwarded to ``iteration``.

        Returns:
            Fraction of test samples whose score times label is <= 0.
        """
        w = self.iteration(train_path, max_updates)
        x_test, y_test = self.test_matrix(test_path)
        wrong = numpy.count_nonzero(numpy.dot(x_test, w) * y_test <= 0)
        return float(wrong) / self.__test_count


if __name__ == '__main__':
    # Accumulate the test error of 2000 independent runs, then print the mean.
    TRAIN_FILE, TEST_FILE = 'hw1_18_train.dat.txt', 'hw1_18_test.dat.txt'
    accumulated = 0
    trial = 0
    while trial < 2000:
        accumulated += Pocket(5, 500, 500).test_error(TRAIN_FILE, TEST_FILE)
        trial += 1
    print(accumulated / 2000.0)

(原文此處為題目截圖,圖片未載入。)
只需要在第 18 題的程式碼上修改 count 的判斷條件,把 `count == 50` 中的 50 改成 100 即可。