Andrew Ng Machine Learning, Assignment 4: Neural Networks

This is the download link for the exercise and its solutions (only one credit point, the cheapest you'll find).

These are my notes on the neural network material from the course (no credit points needed).

0.Overview

     This neural network exercise is more comprehensive than work3part2: it covers backpropagation, visualizing the hidden layer, and gradient checking (verifying that the computed derivatives are correct), among other things.

     The full script is too long, so it is not posted here.

1.Loading and Visualizing Data

     See the explanation in work3 part 1.
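
     For reference, a minimal sketch of this step, assuming the assignment's ex4data1.mat and displayData.m are available on the path:

% Load the 5000 handwritten-digit examples (X is 5000x400, y is 5000x1)
load('ex4data1.mat');
m = size(X, 1);

% Display 100 randomly selected examples
sel = randperm(m);
displayData(X(sel(1:100), :));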

2.Loading Parameters

      Skipped.
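
      What this step does, roughly, is load the pre-trained weights shipped with the assignment (assuming the file is named ex4weights.mat as in ex4):

% Load pre-trained weights: Theta1 is 25x401, Theta2 is 10x26
load('ex4weights.mat');

% Unroll the parameters into a single vector for nnCostFunction
nn_params = [Theta1(:) ; Theta2(:)];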

3.Compute Cost (Feedforward)

      First compute J once without regularization, i.e. with lambda = 0.

%% Unregularized feedforward cost function (lambda = 0)
% Forward propagation: add the bias column of ones to the X data matrix
X = [ones(m, 1) X];
a2 = sigmoid(X * Theta1');   % hidden-layer (layer 2) activations
a2 = [ones(m, 1) a2];        % add the bias unit to layer 2
a3 = sigmoid(a2 * Theta2');

cost = Y .* log(a3) + (1 - Y) .* log(1 - a3);  % cost is an m-by-K (5000x10) matrix; sum(cost(:)) sums every entry
J = -1 / m * sum(cost(:));
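
      The cost snippets in sections 3 and 4 assume that Theta1, Theta2 and the one-hot label matrix Y already exist. Inside nnCostFunction they can be recovered roughly as follows (the reshape code matches the one shown later in section 9; the loop is just one possible way to build Y):

% Recover the weight matrices from the unrolled parameter vector
Theta1 = reshape(nn_params(1:hidden_layer_size * (input_layer_size + 1)), ...
                 hidden_layer_size, (input_layer_size + 1));
Theta2 = reshape(nn_params((1 + (hidden_layer_size * (input_layer_size + 1))):end), ...
                 num_labels, (hidden_layer_size + 1));

% One-hot encode the labels: Y(i,k) = 1 exactly when y(i) == k, so Y is m x num_labels
Y = zeros(m, num_labels);
for i = 1:m
    Y(i, y(i)) = 1;
end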

4.Implement Regularization

%% Regularized feedforward cost function (lambda = 1)
% Forward propagation: add the bias column of ones to the X data matrix
X = [ones(m, 1) X];
a2 = sigmoid(X * Theta1');   % hidden-layer (layer 2) activations
a2 = [ones(m, 1) a2];        % add the bias unit to layer 2
a3 = sigmoid(a2 * Theta2');

temp1 = [zeros(size(Theta1,1),1) Theta1(:,2:end)];   % zero out the first column (bias weights), which is not regularized
temp2 = [zeros(size(Theta2,1),1) Theta2(:,2:end)];
temp1 = sum(temp1 .^2);     % square every parameter, then sum. sum(A) sums the columns of A, sum(A,2) sums the rows, sum(A(:)) sums all elements.
temp2 = sum(temp2 .^2);

cost = Y .* log(a3) + (1 - Y) .* log(1 - a3);  % cost is an m-by-K (5000x10) matrix; sum(cost(:)) sums every entry
J = -1 / m * sum(cost(:)) + lambda/(2*m) * ( sum(temp1(:)) + sum(temp2(:)) );
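
      As a quick sanity check (assuming nn_params holds the unrolled pre-trained weights from section 2), ex4.m evaluates the cost at the loaded parameters; the assignment quotes about 0.287629 for lambda = 0 and about 0.383770 for lambda = 1:

% Sanity check against the values quoted in the assignment
lambda = 1;
J = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, ...
                   num_labels, X, y, lambda);
fprintf('Cost at loaded parameters (lambda = 1): %f (should be about 0.383770)\n', J);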

5.Sigmoid Gradient

function g = sigmoidGradient(z)
%SIGMOIDGRADIENT returns the gradient of the sigmoid function
%evaluated at z
%   g = SIGMOIDGRADIENT(z) computes the gradient of the sigmoid function
%   evaluated at z. This should work regardless if z is a matrix or a
%   vector. In particular, if z is a vector or matrix, you should return
%   the gradient for each element.

g = zeros(size(z));

% ====================== YOUR CODE HERE ======================
% Instructions: Compute the gradient of the sigmoid function evaluated at
%               each value of z (z can be a matrix, vector or scalar).


g = sigmoid(z) .* (1 - sigmoid(z));
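
      A quick way to test it: the gradient peaks at z = 0 with value 0.25 and approaches 0 for large |z|. ex4.m evaluates it on a small vector:

g = sigmoidGradient([-1 -0.5 0 0.5 1]);
% Expected (approximately): 0.196612  0.235004  0.250000  0.235004  0.196612
disp(g);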

6.Initializing Parameters

function W = randInitializeWeights(L_in, L_out)
%RANDINITIALIZEWEIGHTS Randomly initialize the weights of a layer with L_in
%incoming connections and L_out outgoing connections
%   W = RANDINITIALIZEWEIGHTS(L_in, L_out) randomly initializes the weights 
%   of a layer with L_in incoming connections and L_out outgoing 
%   connections. 
%
%   Note that W should be set to a matrix of size(L_out, 1 + L_in) as
%   the first column of W handles the "bias" terms
%

% You need to return the following variables correctly 
W = zeros(L_out, 1 + L_in);

% ====================== YOUR CODE HERE ======================
% Instructions: Initialize W randomly so that we break the symmetry while
%               training the neural network.
%
% Note: The first column of W corresponds to the parameters for the bias units
%

epsilon_init = 0.12;
W = rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init;


% =========================================================================

end
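
      In the main script this function is used to create the symmetry-breaking initial parameters that fmincg starts from in section 9, roughly as follows (input_layer_size = 400, hidden_layer_size = 25 and num_labels = 10 in this assignment):

initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size);
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels);

% Unroll into the initial_nn_params vector that is passed to fmincg
initial_nn_params = [initial_Theta1(:) ; initial_Theta2(:)];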

7.Implement Backpropagation

     This part covers two things: backpropagation and gradient checking (verifying the derivatives).

     Backpropagation

%% Compute the gradient via backpropagation
delta_1 = zeros(size(Theta1));
delta_2 = zeros(size(Theta2));

for t = 1:m
   % step 1
   a_1 = X(t,:)';              % take the t-th training example as a column vector
%        a_1 = [1 ; a_1];      % not needed here: the bias column was already added to X above
   z_2 = Theta1 * a_1;   
   a_2 = sigmoid(z_2);  
   a_2 = [1 ; a_2];            % add the bias unit to the hidden layer
   z_3 = Theta2 * a_2;
   a_3 = sigmoid(z_3);
   % step 2
   err_3 = zeros(num_labels,1);          % num_labels-by-1 (10x1) zero vector
   for k = 1:num_labels     
      err_3(k) = a_3(k) - (y(t) == k);   % output-layer error
   end
   % step 3
   err_2 = Theta2' * err_3;                          % err_2 has 26 rows (it still includes the bias error)
   err_2 = err_2(2:end) .* sigmoidGradient(z_2);     % drop the bias error; the remaining 25 rows match sigmoidGradient(z_2)
   % step 4
   delta_2 = delta_2 + err_3 * a_2';
   delta_1 = delta_1 + err_2 * a_1';
end

% step 5
Theta1_temp = [zeros(size(Theta1,1),1) Theta1(:,2:end)];   % zero out the bias column: it is not regularized
Theta2_temp = [zeros(size(Theta2,1),1) Theta2(:,2:end)];
Theta1_grad = 1 / m * delta_1 + lambda/m * Theta1_temp;
Theta2_grad = 1 / m * delta_2 + lambda/m * Theta2_temp;
      
% -------------------------------------------------------------

% =========================================================================

% Unroll gradients
grad = [Theta1_grad(:) ; Theta2_grad(:)];

     Gradient checking

      a. First build a small neural network whose parameters are generated by debugInitializeWeights; that function uses sin so the parameters of the small network are identical on every run.

      b. Use computeNumericalGradient to approximate the derivatives directly from the definition (finite differences), then compare the result with the gradients computed by backpropagation.

function checkNNGradients(lambda)
%CHECKNNGRADIENTS Creates a small neural network to check the
%backpropagation gradients
%   CHECKNNGRADIENTS(lambda) Creates a small neural network to check the
%   backpropagation gradients, it will output the analytical gradients
%   produced by your backprop code and the numerical gradients (computed
%   using computeNumericalGradient). These two gradient computations should
%   result in very similar values.
%

if ~exist('lambda', 'var') || isempty(lambda)
    lambda = 0;
end

input_layer_size = 3;
hidden_layer_size = 5;
num_labels = 3;
m = 5;

% We generate some 'random' test data
Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size);
Theta2 = debugInitializeWeights(num_labels, hidden_layer_size);
% Reusing debugInitializeWeights to generate X
X  = debugInitializeWeights(m, input_layer_size - 1);
y  = 1 + mod(1:m, num_labels)';                           % labels cycle through 1..num_labels

% Unroll parameters
nn_params = [Theta1(:) ; Theta2(:)];

% Short hand for cost function
costFunc = @(p) nnCostFunction(p, input_layer_size, hidden_layer_size, ...
                               num_labels, X, y, lambda);

[cost, grad] = costFunc(nn_params);
numgrad = computeNumericalGradient(costFunc, nn_params);

% Visually examine the two gradient computations.  The two columns
% you get should be very similar. 
disp([numgrad grad]);
fprintf(['The above two columns you get should be very similar.\n' ...
         '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n']);

% Evaluate the norm of the difference between two solutions.  
% If you have a correct implementation, and assuming you used EPSILON = 0.0001 
% in computeNumericalGradient.m, then diff below should be less than 1e-9
diff = norm(numgrad-grad)/norm(numgrad+grad);               % norm computes the Euclidean norm

fprintf(['If your backpropagation implementation is correct, then \n' ...
         'the relative difference will be small (less than 1e-9). \n' ...
         '\nRelative Difference: %g\n'], diff);

end
function W = debugInitializeWeights(fan_out, fan_in)
%DEBUGINITIALIZEWEIGHTS Initialize the weights of a layer with fan_in
%incoming connections and fan_out outgoing connections using a fixed
%strategy, this will help you later in debugging
%   W = DEBUGINITIALIZEWEIGHTS(fan_in, fan_out) initializes the weights 
%   of a layer with fan_in incoming connections and fan_out outgoing 
%   connections using a fix set of values
%
%   Note that W should be set to a matrix of size(1 + fan_in, fan_out) as
%   the first row of W handles the "bias" terms
%

% Set W to zeros
W = zeros(fan_out, 1 + fan_in);

% Initialize W using "sin", this ensures that W is always of the same
% values and will be useful for debugging
W = reshape(sin(1:numel(W)), size(W)) / 10;

% =========================================================================

end
function numgrad = computeNumericalGradient(J, theta)
%COMPUTENUMERICALGRADIENT Computes the gradient using "finite differences"
%and gives us a numerical estimate of the gradient.
%   numgrad = COMPUTENUMERICALGRADIENT(J, theta) computes the numerical
%   gradient of the function J around theta. Calling y = J(theta) should
%   return the function value at theta.

% Notes: The following code implements numerical gradient checking, and 
%        returns the numerical gradient.It sets numgrad(i) to (a numerical 
%        approximation of) the partial derivative of J with respect to the 
%        i-th input argument, evaluated at theta. (i.e., numgrad(i) should 
%        be the (approximately) the partial derivative of J with respect 
%        to theta(i).)
%                

numgrad = zeros(size(theta));
perturb = zeros(size(theta));
e = 1e-4;
for p = 1:numel(theta)              % numel(A) returns the number of elements in A
    % Set perturbation vector
    perturb(p) = e;
    loss1 = J(theta - perturb);
    loss2 = J(theta + perturb);
    % Compute Numerical Gradient
    numgrad(p) = (loss2 - loss1) / (2*e);
    perturb(p) = 0;
end

end
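
      With backpropagation in place, the unregularized check is run from the main script simply as:

% Compare the analytical and numerical gradients on the small debug network
checkNNGradients;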

8.Implement Regularization

     Add regularization to the cost function. This is the same regularized cost as in section 4; the regularization of the gradients has already been applied in step 5 of the backpropagation code above.

%% Regularized feedforward cost function (lambda = 1)
% Forward propagation: add the bias column of ones to the X data matrix
X = [ones(m, 1) X];
a2 = sigmoid(X * Theta1');   % hidden-layer (layer 2) activations
a2 = [ones(m, 1) a2];        % add the bias unit to layer 2
a3 = sigmoid(a2 * Theta2');

temp1 = [zeros(size(Theta1,1),1) Theta1(:,2:end)];   % zero out the first column (bias weights), which is not regularized
temp2 = [zeros(size(Theta2,1),1) Theta2(:,2:end)];
temp1 = sum(temp1 .^2);     % square every parameter, then sum. sum(A) sums the columns of A, sum(A,2) sums the rows, sum(A(:)) sums all elements.
temp2 = sum(temp2 .^2);

cost = Y .* log(a3) + (1 - Y) .* log(1 - a3);  % cost is an m-by-K (5000x10) matrix; sum(cost(:)) sums every entry
J = -1 / m * sum(cost(:)) + lambda/(2*m) * ( sum(temp1(:)) + sum(temp2(:)) );
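
      ex4.m also re-runs the gradient check with a non-zero lambda and prints a debugging cost value at the loaded parameters (the assignment quotes about 0.576051 for lambda = 3):

% Re-check the gradients once regularization is included
lambda = 3;
checkNNGradients(lambda);

% Debugging cost value at the loaded parameters
debug_J = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, ...
                         num_labels, X, y, lambda);
fprintf('Cost at (fixed) debugging parameters (lambda = 3): %f\n', debug_J);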

9.Training NN

     Train the neural network.

%  You have now implemented all the code necessary to train a neural 
%  network. To train your neural network, we will now use "fmincg", which
%  is a function which works similarly to "fminunc". Recall that these
%  advanced optimizers are able to train our cost functions efficiently as
%  long as we provide them with the gradient computations.
%
fprintf('\nTraining Neural Network... \n')

%  After you have completed the assignment, change the MaxIter to a larger
%  value to see how more training helps.
options = optimset('MaxIter', 400);

%  You should also try different values of lambda
lambda = 1;

% Create "short hand" for the cost function to be minimized
costFunction = @(p) nnCostFunction(p, ...
                                   input_layer_size, ...
                                   hidden_layer_size, ...
                                   num_labels, X, y, lambda);

% Now, costFunction is a function that takes in only one argument (the
% neural network parameters)
[nn_params, cost] = fmincg(costFunction, initial_nn_params, options);

% Obtain Theta1 and Theta2 back from nn_params
Theta1 = reshape(nn_params(1:hidden_layer_size * (input_layer_size + 1)), ...
                 hidden_layer_size, (input_layer_size + 1));

Theta2 = reshape(nn_params((1 + (hidden_layer_size * (input_layer_size + 1))):end), ...
                 num_labels, (hidden_layer_size + 1));

fprintf('Program paused. Press enter to continue.\n');
pause;

10.Visualize Weights 

%  You can now "visualize" what the neural network is learning by 
%  displaying the hidden units to see what features they are capturing in 
%  the data.

fprintf('\nVisualizing Neural Network... \n')

displayData(Theta1(:, 2:end));

fprintf('\nProgram paused. Press enter to continue.\n');
pause;

11.Implement Predict

      Finally, run the predictions (the accuracy is computed as shown after the predict function below).

      With 50 iterations, the accuracy is 95.60%.

      With 100 iterations, the accuracy is 97.74%.

      With 400 iterations, the accuracy is 99.54%.

function p = predict(Theta1, Theta2, X)
%PREDICT Predict the label of an input given a trained neural network
%   p = PREDICT(Theta1, Theta2, X) outputs the predicted label of X given the
%   trained weights of a neural network (Theta1, Theta2)

% Useful values
m = size(X, 1);
num_labels = size(Theta2, 1);

% You need to return the following variables correctly 
p = zeros(size(X, 1), 1);

h1 = sigmoid([ones(m, 1) X] * Theta1');
h2 = sigmoid([ones(m, 1) h1] * Theta2');
[dummy, p] = max(h2, [], 2);

% =========================================================================


end
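
      The accuracies above are obtained by comparing the predictions against the true labels, as the main script does:

pred = predict(Theta1, Theta2, X);
fprintf('\nTraining Set Accuracy: %f\n', mean(double(pred == y)) * 100);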