
YOLO Source Code Explained (4) - Back Propagation

Back propagation is one of the most important parts of a CNN. The theory is not covered here; if you are not familiar with it, the following articles are good references:
Very detailed: 零基礎入門深度學習(3) - 神經網路和反向傳播演算法 (Deep Learning from Scratch (3): Neural Networks and the Back Propagation Algorithm)
Very detailed: 零基礎入門深度學習(4) - 卷積神經網路 (Deep Learning from Scratch (4): Convolutional Neural Networks)
Very intuitive: 如何直觀的解釋back propagation演算法? (How can the back propagation algorithm be explained intuitively?)
If the theory still leaves you baffled, just read the YOLO code; once you have worked through the source it all falls into place. Let's take a look at "back propagation" in the code.

I. The main function: backward_network(network net, network_state state)

//network.c
void backward_network(network net, network_state state)
{
    int i;
    float *original_input = state.input;
    float *original_delta = state.delta;
    state.workspace = net.workspace;
    for(i = net.n-1; i >= 0; --i){
        state.index = i;
        if(i == 0){
            state.input = original_input;
            state.delta = original_delta;
        }else{
            layer prev = net.layers[i-1];
            state.input = prev.output;
            //Note: delta is a pointer, so writing through state.delta here
            //is the same as writing into the prev layer's delta
            state.delta = prev.delta;
        }
        layer l = net.layers[i];
        l.backward(l, state);
    }
}
//Nothing special about this function itself; let's go through the layers one by one, in this order:
//[detection] //[connected] //[dropout] //[local] //[convolutional] //[maxpool]
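
Every layer below leans on darknet's gemm() (gemm.c) to do the matrix work, so it is worth keeping its calling convention in mind before reading the individual backward functions. The summary below is my own reading of gemm.c; take it as a reference sketch rather than official documentation.

//gemm.c (reference sketch of the calling convention)
//Computes C = ALPHA * op(A) * op(B) + BETA * C on row-major matrices,
//where op(A) = A^T if TA != 0, otherwise A (and likewise for TB and B).
//M and N are the rows and columns of C, K is the shared inner dimension,
//and lda/ldb/ldc are the row strides (leading dimensions) of A, B and C.
void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc);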

1. Back propagation - detection layer

//detection_layer.c
void backward_detection_layer(const detection_layer l, network_state state)
{
    //Accumulate l.delta into state.delta; l.delta holds the difference between predictions and ground truth
    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
}
//blas.c
//The axpy function: y += a * x
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    int i;
    for(i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX];
}
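
A tiny usage example (mine, not from darknet) of what the detection layer's call amounts to, accumulating one delta buffer into another:

//Illustration only:
float prev_delta[3] = {0.1f, 0.2f, 0.3f};
float curr_delta[3] = {1.0f, -2.0f, 0.5f};
axpy_cpu(3, 1, curr_delta, 1, prev_delta, 1);
//prev_delta is now {1.1f, -1.8f, 0.8f}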

2. Back propagation - connected layer

//connected_layer.c
void backward_connected_layer(connected_layer l, network_state state)
{
    int i;
    //Multiply l.delta by the derivative of the activation function (evaluated at l.output)
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
    //Sum the deltas over the batch and accumulate them into the memory pointed to by bias_updates
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
    }
    //The fully connected layers here do not use batch_normalize, so this branch is not discussed
    if(l.batch_normalize){
        backward_scale_cpu(l.x_norm, l.delta, l.batch, l.outputs, 1, l.scale_updates);

        scale_bias(l.delta, l.scales, l.batch, l.outputs, 1);

        mean_delta_cpu(l.delta, l.variance, l.batch, l.outputs, 1, l.mean_delta);
        variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.outputs, 1, l.variance_delta);
        normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.outputs, 1, l.delta);
    }

    int m = l.outputs;
    int k = l.batch;
    int n = l.inputs;
    float *a = l.delta;
    float *b = state.input;
    float *c = l.weight_updates;
    //Accumulate this layer's weight gradients: weight_updates += delta^T * input
    gemm(1,0,m,n,k,1,a,m,b,n,1,c,n);

    m = l.batch;
    k = l.outputs;
    n = l.inputs;

    a = l.delta;
    b = l.weights;
    c = state.delta;
    //Propagate the error term to the previous (prev) layer: state.delta += delta * weights
    if(c) gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
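
To make the two gemm calls above concrete, here is a loop-level sketch of what they compute (my own illustration, not darknet code), assuming the row-major layouts used by the connected layer: delta is batch x outputs, input and prev_delta are batch x inputs, and weights and weight_updates are outputs x inputs.

//Illustration only: the same computation as the two gemm calls above, written as loops.
//  weight_updates[o][i] += sum over b of delta[b][o] * input[b][i]
//  prev_delta[b][i]     += sum over o of delta[b][o] * weights[o][i]
void backward_connected_naive(int batch, int inputs, int outputs,
        float *delta, float *input, float *weights,
        float *weight_updates, float *prev_delta)
{
    int b, o, i;
    for(b = 0; b < batch; ++b){
        for(o = 0; o < outputs; ++o){
            float d = delta[b*outputs + o];
            for(i = 0; i < inputs; ++i){
                weight_updates[o*inputs + i] += d * input[b*inputs + i];
                if(prev_delta) prev_delta[b*inputs + i] += d * weights[o*inputs + i];
            }
        }
    }
}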

3. Back propagation - dropout layer

//dropout_layer.c
void backward_dropout_layer(dropout_layer l, network_state state)
{
    int i;
    if(!state.delta) return;
    for(i = 0; i < l.batch * l.inputs; ++i){
        //l.rand[i] is a random number in [0, 1] drawn during the forward pass (covered in the forward-pass article)
        float r = l.rand[i];
        //As in the forward pass, zero the previous layer's delta for dropped units; otherwise scale it
        if(r < l.probability) state.delta[i] = 0;
        else state.delta[i] *= l.scale;
    }
}
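
l.scale is what keeps the expected activation unchanged despite dropping units. It is set when the layer is created in make_dropout_layer; as far as I remember it is the usual inverted-dropout factor, so treat the following as an assumption rather than a verbatim quote of the source:

//dropout_layer.c, make_dropout_layer (assumed, not copied verbatim):
//l.scale = 1./(1.-probability);
//With this choice, E[output] = (1 - probability) * scale * input = input.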

4. Back propagation - local layer

//local_layer.c
void backward_local_layer(local_layer l, network_state state)
{
    int i, j;
    int locations = l.out_w*l.out_h;
    //Multiply l.delta by the derivative of the activation function
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
    //Accumulate the bias gradients into bias_updates (summed over the batch)
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
    }

    for(i = 0; i < l.batch; ++i){
        float *input = state.input + i*l.w*l.h*l.c;
        im2col_cpu(input, l.c, l.h, l.w, 
                l.size, l.stride, l.pad, l.col_image);

        for(j = 0; j < locations; ++j){
            float *a = l.delta + i*l.outputs + j;
            float *b = l.col_image + j;
            float *c = l.weight_updates + j*l.size*l.size*l.c*l.n;
            int m = l.n;
            int n = l.size*l.size*l.c;
            int k = 1;
            //Accumulate the weight gradients: one gemm per spatial location, since the weights are not shared
            gemm(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
        }

        if(state.delta){
            for(j = 0; j < locations; ++j){ 
                float *a = l.weights + j*l.size*l.size*l.c*l.n;
                float *b = l.delta + i*l.outputs + j;
                float *c = l.col_image + j;

                int m = l.size*l.size*l.c;
                int n = 1;
                int k = l.n;
                //Compute the error term to propagate back to the previous layer (scattered via col2im below)
                gemm(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
            }

            col2im_cpu(l.col_image, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
        }
    }
}
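
Because k == 1 in the first gemm, each per-location call is nothing more than an outer product: every output location owns its own (unshared) filter, so its weight gradient only involves the im2col column of that location. In terms of the variables above (illustration only, not darknet code):

//Illustration only: the gemm(0,1,m,n,1,...) call for sample i and location j,
//written as loops over the l.n outputs and the l.size*l.size*l.c filter entries:
//  for o in [0, l.n):
//      for s in [0, l.size*l.size*l.c):
//          weight_updates[j*l.size*l.size*l.c*l.n + o*l.size*l.size*l.c + s]
//              += delta[i*l.outputs + o*locations + j] * col_image[s*locations + j];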

5. Back propagation - convolutional layer

//convolutional_layer.c
//Same idea as the local layer, but with shared weights~
void backward_convolutional_layer(convolutional_layer l, network_state state)
{
    int i;
    int m = l.n;
    int n = l.size*l.size*l.c;
    int k = convolutional_out_height(l)*
        convolutional_out_width(l);

    gradient_array(l.output, m*k*l.batch, l.activation, l.delta);
    backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);

    for(i = 0; i < l.batch; ++i){
        float *a = l.delta + i*m*k;
        float *b = state.workspace;
        float *c = l.weight_updates;

        float *im = state.input+i*l.c*l.h*l.w;

        im2col_cpu(im, l.c, l.h, l.w, 
                l.size, l.stride, l.pad, b);
        gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);

        if(state.delta){
            a = l.weights;
            b = l.delta + i*m*k;
            c = state.workspace;

            gemm(1,0,n,k,m,1,a,n,b,k,0,c,k);

            col2im_cpu(state.workspace, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
        }
    }
}
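
For a single sample i, the shapes in the calls above work out as follows (my own summary of the code, not darknet comments):

//Shapes for one sample; k = out_h*out_w:
//  delta_i         = l.delta + i*m*k   : [l.n x k]
//  im2col(input_i) = state.workspace   : [(l.size*l.size*l.c) x k]
//  gemm(0,1,...)   : weight_updates += delta_i * im2col(input_i)^T  -> [l.n x l.size*l.size*l.c]
//  gemm(1,0,...)   : workspace = weights^T * delta_i                -> [(l.size*l.size*l.c) x k]
//  col2im_cpu      : scatters that workspace back into state.delta for sample i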

6. Back propagation - maxpool layer

//maxpool_layer.c
void backward_maxpool_layer(const maxpool_layer l, network_state state)
{
    int i;
    int h = l.out_h;
    int w = l.out_w;
    int c = l.c;
    for(i = 0; i < h*w*c*l.batch; ++i){
        //l.indexes[i] is the flat index, recorded in the forward pass, of the max element
        //in the pooling window that produced output i (an index into the previous layer's output/delta)
        int index = l.indexes[i];
        state.delta[index] += l.delta[i];
    }
}
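
A concrete (made-up) example of the routing: suppose that during the forward pass a 2x2 window took its maximum from flat input index 5, so l.indexes[0] == 5. Then the loop above does exactly:

//Illustration only:
//state.delta[5] += l.delta[0];
//The other input positions in that window receive no gradient from this output,
//since only the max element contributed to the forward value.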