1. 程式人生 > >DeepLearning | 經典卷積神經網路VGG_Net

DeepLearning | 經典卷積神經網路VGG_Net

VGGNet是牛津大學計算機視覺組和Google DeepMind 公司的研究員一起研發的深度卷積神經網路。VGGNet探索了卷積神經網路的深度與效能之間的關係,通過反覆堆疊3x3的小型卷積核和2x2的最大池化層,VGGNet成功地構築了16~19層深的卷積神經網路,並取得了ILSVRC2014比賽分類專案的第二名和定位專案的第一名。 其網路結構和思路主要展示在論文 Very Deep Convolutional Networks for Large-Scale Image Recognition

一、VGGNet網路結構 VGGNet論文中全部使用了3x3的卷積核和2x2的池化核,通過不斷加深網路結構來提升效能,下圖為VGGNet各級別的網路結構圖

這裡寫圖片描述

從A到E每一級網路逐漸變深,但是網路的引數量並沒有增長很多,這是因為引數量主要都消耗在最後3個全連線層。前面的卷積部分雖然很深,但是消耗的引數量並不大,不過訓練比較耗時的部分依然是卷積,因為其計算量比較大。這其中的D, E也就是我們常說的VGGNet-16和VGGNet-19,C很有意思,相比於B多了幾個1x1的卷積層,1x1卷積的主要意義在於線性變換,而輸入的通道數和輸出通道數不變,沒有發生降維

VGGNet 擁有5段卷積,每一段內有2~3個卷積層,同時每段尾部會連線一個最大池化層用來縮小圖片尺寸。每段內的卷積核數量一樣,越靠後的段的卷積核數量越多:64-128-256-512-512。其中經常出現多個完全一樣的3x3的卷積層堆疊在一起的情況,這是一個非常有用的設計,兩個3x3的卷積層串聯相當於1個5x5的卷積層,即一個畫素會跟周圍5x5的畫素產生關聯,可以說感受野大小為5x5。而三個3x3的卷積層串聯的效果則相當於1個7x7的卷積層。除此之外,3個串聯的3x3的卷積層擁有比一個7x7的卷積層更多的非線性變化,使得CNN對特徵的學習能力更強

二、VGGNet中用到的技巧 VGGNet在訓練時有一個小的技巧,先訓練級別A的簡單網路,再複用A網路的權重來初始化後面的幾個複雜模型,這樣訓練收斂速度更快。在預測時,VGG採用Multi-Scale的方法,將影象scale到一個尺寸Q,並將圖片輸入到卷積網路計算。然後在最後一個卷積層使用劃窗的方式進行分類預測,將不同視窗的分類結果平均,再將不同尺寸Q的結果平均後得到最後結果,這樣可以提高圖片的利用率並提升預測準確率。同時在訓練中,VGGNet還使用了Multi-Scale的方法做資料增強,將原始影象縮放到不同尺寸S,然後再隨機裁剪224x224的圖片,這樣能增加很多資料量,對於防止過擬合有很不錯的效果 作者在對比各級網路時總結出以下幾個觀點 (1)LRN層作用不大 (2)越深的網路效果越好 (3)1x1的卷積也是有效果的,但是沒有3x3的卷積效果好,大一些的卷積核可以學習更大的空間特徵

三、VGGNet的tensorflow實現 用到的資料集比較大,可以在評論留郵箱,會發送給你

import tensorflow as tf
import os
import numpy as np
from PIL import Image
import pandas as pd
from sklearn import preprocessing
import cv2
from sklearn.model_selection import train_test_split

def load_Img(imgDir):
    """Load every image in *imgDir* into one float32 array.

    Files are read in lexicographic filename order so that the rows line
    up with the label file sorted the same way in make_label.

    Parameters
    ----------
    imgDir : str
        Directory containing the image files.

    Returns
    -------
    np.ndarray of shape (N, image_size, image_size, 3), dtype float32.
        ``image_size`` is the module-level constant.
    """
    # sorted() gives the same lexicographic order as the original
    # pandas sort_values round-trip, without the DataFrame detour.
    imgs = sorted(os.listdir(imgDir))
    data = np.empty((len(imgs), image_size, image_size, 3), dtype="float32")
    for i, fname in enumerate(imgs):
        # convert('RGB') normalizes grayscale ('L'), palette ('P') and
        # RGBA images to 3 channels; the old code only handled the
        # 2-D grayscale case and crashed on 4-channel input.
        img = Image.open(os.path.join(imgDir, fname)).convert('RGB')
        arr = np.asarray(img, dtype="float32")
        data[i] = cv2.resize(arr, (image_size, image_size))
    return data

def make_label(labelFile):
    """Build a one-hot label matrix from a tab-separated label file.

    The file is expected to have filename in column 0 and class name in
    column 1 (no header). Rows are sorted by filename so they align with
    the lexicographically sorted images from load_Img.

    Parameters
    ----------
    labelFile : str
        Path to the tab-separated file.

    Returns
    -------
    np.ndarray of shape (N, n_classes) — dense one-hot encoding.
    """
    label_list = pd.read_csv(labelFile, sep='\t', header=None)
    # Sort by filename to match the image loading order in load_Img.
    label_list = label_list.sort_values(by=0)
    le = preprocessing.LabelEncoder()
    # The original looped `for item in [1]` over a single column;
    # encode column 1 directly instead.
    label = le.fit_transform(label_list[1])
    onehot = preprocessing.OneHotEncoder(sparse=False)
    # reshape(-1, 1) replaces the deprecated np.mat(label).T.
    label_onehot = onehot.fit_transform(label.reshape(-1, 1))
    return label_onehot

def conv_op(input_op, name, kh, kw, n_out, dh, dw, p):
    """Convolution + bias + ReLU layer.

    Parameters
    ----------
    input_op : NHWC input tensor.
    name : name scope for the layer.
    kh, kw : kernel height and width.
    n_out : number of output channels.
    dh, dw : vertical and horizontal strides.
    p : list that collects the layer's trainable parameters.

    Returns the ReLU activation tensor.
    """
    in_channels = input_op.get_shape()[-1].value

    with tf.name_scope(name) as scope:
        kernel = tf.get_variable(
            scope + 'W',
            shape=[kh, kw, in_channels, n_out],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer_conv2d())
        conv = tf.nn.conv2d(input_op, kernel, (1, dh, dw, 1), padding='SAME')
        # Biases start at zero and are trained alongside the kernel.
        biases = tf.Variable(
            tf.constant(0, shape=[n_out], dtype=tf.float32),
            trainable=True, name='b')
        activation = tf.nn.relu(tf.nn.bias_add(conv, biases), name=scope)
        p += [kernel, biases]
        return activation

def fc_op(input_op, name, n_out, p):
    """Fully connected layer with sigmoid activation.

    Parameters
    ----------
    input_op : 2-D input tensor (batch, features).
    name : name scope for the layer.
    n_out : number of output units.
    p : list that collects the layer's trainable parameters.

    Returns the activated output tensor.
    """
    n_in = input_op.get_shape()[-1].value

    with tf.name_scope(name) as scope:
        weights = tf.Variable(
            tf.truncated_normal([n_in, n_out], dtype=tf.float32, stddev=0.01),
            name=scope + 'W')
        biases = tf.Variable(
            tf.constant(0.1, shape=[n_out], dtype=tf.float32), name='b')
        # NOTE(review): sigmoid here (instead of the ReLU used by the conv
        # layers, and also applied to the final logits layer) is unusual for
        # VGG — confirm it is intentional before changing it.
        activation = tf.nn.sigmoid(tf.matmul(input_op, weights) + biases,
                                   name='ac')
        p += [weights, biases]
        return activation

def mpool_op(input_op, name, kh, kw, dh, dw):
    """Max-pool *input_op* with a kh x kw window, dh x dw strides, SAME padding."""
    window = [1, kh, kw, 1]
    strides = [1, dh, dw, 1]
    return tf.nn.max_pool(input_op, ksize=window, strides=strides,
                          padding='SAME', name=name)

def inference_op(input_op, y, keep_prob, n_classes=190):
    """Build the VGG-16 graph plus loss, optimizer and accuracy ops.

    Parameters
    ----------
    input_op : float32 tensor (batch, H, W, 3) — input images.
    y : float32 tensor (batch, n_classes) — one-hot labels.
    keep_prob : scalar tensor — dropout keep probability for fc6/fc7.
    n_classes : int — number of output classes (default 190, matching the
        original hard-coded value, so existing callers are unaffected).

    Returns
    -------
    (accuracy, train_step, cross_entropy, fc8, p) — evaluation op,
    Adam training op, mean loss, raw logits, and the parameter list.
    """
    p = []

    # Block 1: 2 x conv3-64, then 2x2 max pool.
    conv1_1 = conv_op(input_op,name='conv1_1',kh=3,kw=3,n_out=64,dh=1,dw=1,p=p)
    conv1_2 = conv_op(conv1_1,name='conv1_2',kh=3,kw=3,n_out=64,dh=1,dw=1,p=p)
    pool1 = mpool_op(conv1_2,name='pool1',kh=2,kw=2,dw=2,dh=2)

    # Block 2: 2 x conv3-128.
    conv2_1 = conv_op(pool1,name='conv2_1',kh=3,kw=3,n_out=128,dh=1,dw=1,p=p)
    conv2_2 = conv_op(conv2_1,name='conv2_2',kh=3,kw=3,n_out=128,dh=1,dw=1,p=p)
    pool2 = mpool_op(conv2_2,name='pool2',kh=2,kw=2,dw=2,dh=2)

    # Block 3: 3 x conv3-256.
    conv3_1 = conv_op(pool2,name='conv3_1',kh=3,kw=3,n_out=256,dh=1,dw=1,p=p)
    conv3_2 = conv_op(conv3_1,name='conv3_2',kh=3,kw=3,n_out=256,dh=1,dw=1,p=p)
    conv3_3 = conv_op(conv3_2,name='conv3_3',kh=3,kw=3,n_out=256,dh=1,dw=1,p=p)
    pool3 = mpool_op(conv3_3,name='pool3',kh=2,kw=2,dh=2,dw=2)

    # Block 4: 3 x conv3-512.
    conv4_1 = conv_op(pool3,name='conv4_1',kh=3,kw=3,n_out=512,dh=1,dw=1,p=p)
    conv4_2 = conv_op(conv4_1,name='conv4_2',kh=3,kw=3,n_out=512,dh=1,dw=1,p=p)
    conv4_3 = conv_op(conv4_2,name='conv4_3',kh=3,kw=3,n_out=512,dh=1,dw=1,p=p)
    pool4 = mpool_op(conv4_3,name='pool4',kh=2,kw=2,dh=2,dw=2)

    # Block 5: 3 x conv3-512.
    conv5_1 = conv_op(pool4,name='conv5_1',kh=3,kw=3,n_out=512,dh=1,dw=1,p=p)
    conv5_2 = conv_op(conv5_1,name='conv5_2',kh=3,kw=3,n_out=512,dh=1,dw=1,p=p)
    conv5_3 = conv_op(conv5_2,name='conv5_3',kh=3,kw=3,n_out=512,dh=1,dw=1,p=p)
    pool5 = mpool_op(conv5_3,name='pool5',kh=2,kw=2,dh=2,dw=2)

    # Flatten the final feature map for the fully connected layers.
    shp = pool5.get_shape()
    flattened_shape = shp[1].value * shp[2].value * shp[3].value
    resh1 = tf.reshape(pool5, [-1, flattened_shape], name='resh1')

    fc6 = fc_op(resh1, name='fc6', n_out=4096, p=p)
    fc6_drop = tf.nn.dropout(fc6, keep_prob, name='fc6_drop')

    fc7 = fc_op(fc6_drop, name='fc7', n_out=4096, p=p)
    fc7_drop = tf.nn.dropout(fc7, keep_prob, name='fc7_drop')

    fc8 = fc_op(fc7_drop, name='fc8', n_out=n_classes, p=p)

    # Probabilities used only for the accuracy computation below.
    y_conv = tf.nn.softmax(fc8)

    # Numerically stable cross-entropy computed directly from the logits;
    # replaces the hand-rolled -sum(y * log(clip(softmax))) formulation,
    # which loses precision once the softmax saturates.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=fc8))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    # tf.argmax replaces the deprecated tf.arg_max alias.
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    return accuracy, train_step, cross_entropy, fc8, p

def run_benchmark():
    """Load the dataset, build the VGG graph, train, and report test accuracy.

    Reads the module-level constants image_size, batch_size and num_batches.
    Paths to the image directory and label file are hard-coded below.
    """
    imgDir = '/Users/zhuxiaoxiansheng/Desktop/DatasetA_train_20180813/train'
    labelFile = '/Users/zhuxiaoxiansheng/Desktop/DatasetA_train_20180813/train.txt'

    data = load_Img(imgDir)
    data = data / 255.0          # scale pixels to [0, 1]
    label = make_label(labelFile)

    traindata, testdata, trainlabel, testlabel = train_test_split(
        data, label, test_size=100, random_state=2018)
    print(traindata.shape, testdata.shape)
    # Sample indices from the actual training-set size instead of the
    # hard-coded 38121, which only matched one particular dataset.
    n_train = traindata.shape[0]

    os.environ["CUDA_VISIBLE_DEVICES"] = '0'   # use only the first GPU
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 1.0

    with tf.Graph().as_default():
        # BUG FIX: the original bound the session to a second, empty
        # tf.Graph() (my_graph) while every op below was created in this
        # default graph, so sess.run() would raise "Tensor ... is not an
        # element of this graph". The session now uses the default graph.
        sess = tf.InteractiveSession(config=config)

        keep_prob = tf.placeholder(tf.float32)
        # Feed NHWC images directly; the old flat placeholder was
        # immediately reshaped (and rebound) to this same shape anyway.
        input_op = tf.placeholder(tf.float32, [None, image_size, image_size, 3])
        y = tf.placeholder(tf.float32, [None, 190])

        accuracy, train_step, cross_entropy, fc8, p = inference_op(
            input_op, y, keep_prob)

        sess.run(tf.global_variables_initializer())

        for i in range(num_batches):
            rand_index = np.random.choice(n_train, size=batch_size)
            train_step.run(feed_dict={input_op: traindata[rand_index],
                                      y: trainlabel[rand_index],
                                      keep_prob: 0.8})

            if i % 100 == 0:
                # Periodic progress report on a fresh random sample,
                # evaluated without dropout.
                rand_index = np.random.choice(n_train, size=100)
                feed = {input_op: traindata[rand_index],
                        y: trainlabel[rand_index],
                        keep_prob: 1.0}
                train_accuracy = accuracy.eval(feed_dict=feed)
                print('step %d, training accuracy %g' % (i, train_accuracy))
                print(fc8.eval(feed_dict=feed))
                print(cross_entropy.eval(feed_dict=feed))
        print("test accuracy %g" % accuracy.eval(
            feed_dict={input_op: testdata, y: testlabel, keep_prob: 1.0}))




# Hyper-parameters read as module-level globals by load_Img and run_benchmark.
image_size = 224     # images are resized to image_size x image_size
batch_size = 64      # training mini-batch size
num_batches = 10000  # number of training iterations

if __name__ == "__main__":
    # Guard the entry point so importing this module does not start training.
    run_benchmark()