【深度學習SSD】——深刻解讀SSD tensorflow及原始碼詳解
阿新 • • 發佈:2019-01-02
- <code class="language-python"># Copyright 2016 Paul Balanca. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ==============================================================================
- """Definition of 300 VGG-based SSD network.
- This model was initially introduced in:
- SSD: Single Shot MultiBox Detector
- Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- Cheng-Yang Fu, Alexander C. Berg
- https://arxiv.org/abs/1512.02325
- Two variants of the model are defined: the 300x300 and 512x512 models, the
- latter obtaining a slightly better accuracy on Pascal VOC.
- Usage:
- with slim.arg_scope(ssd_vgg.ssd_vgg()):
- outputs, end_points = ssd_vgg.ssd_vgg(inputs)
- This network is a port of the original Caffe model. The padding in TF and Caffe
- is slightly different, and can lead to severe accuracy drop if not taken care
- in a correct way!
- In Caffe, the output size of convolution and pooling layers are computing as
- following: h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1
- Nevertheless, there is a subtle difference between both for stride > 1. In
- the case of convolution:
- top_size = floor((bottom_size + 2*pad - kernel_size) / stride) + 1
- whereas for pooling:
- top_size = ceil((bottom_size + 2*pad - kernel_size) / stride) + 1
- Hence implicitly allowing some additional padding even if pad = 0. This
- behaviour explains why pooling with stride and kernel of size 2 are behaving
- the same way in TensorFlow and Caffe.
- Nevertheless, this is not the case anymore for other kernel sizes, hence
- motivating the use of special padding layer for controlling these side-effects.
- @@ssd_vgg_300
- """
- import math
- from collections import namedtuple
- import numpy as np
- import tensorflow as tf
- import tf_extended as tfe
- from nets import custom_layers
- from nets import ssd_common
- slim = tf.contrib.slim
- # =========================================================================== #
- # SSD class definition.
- # =========================================================================== #
- #collections模組的namedtuple子類不僅可以使用item的index訪問item,還可以通過item的name進行訪問可以將namedtuple理解為c中的struct結構,其首先將各個item命名,然後對每個item賦予資料
# Bundle of every SSD hyper-parameter. A namedtuple behaves like an
# immutable C-style struct: fields are addressable by name or by index.
SSDParams = namedtuple(
    'SSDParameters',
    ['img_shape',            # Input image size, e.g. (300, 300).
     'num_classes',          # Number of classes, background included.
     'no_annotation_label',  # Label assigned to un-annotated objects.
     'feat_layers',          # Names of the feature layers used for detection.
     'feat_shapes',          # Spatial shape of each feature layer.
     'anchor_size_bounds',   # Anchor size bounds, relative to the image side.
     'anchor_sizes',         # Absolute reference anchor sizes per layer.
     'anchor_ratios',        # Anchor aspect ratios per layer.
     'anchor_steps',         # Stride of each feature layer w.r.t. the image.
     'anchor_offset',        # Relative offset of anchor centers in a cell.
     'normalizations',       # L2-normalization scale per layer (>0 enables).
     'prior_scaling',        # (y, x, h, w) scaling for box encoding/decoding.
     ])
class SSDNet(object):
    """Implementation of the SSD VGG-based 300 network.

    The default features layers with 300x300 image input are:
      conv4 ==> 38 x 38
      conv7 ==> 19 x 19
      conv8 ==> 10 x 10
      conv9 ==> 5 x 5
      conv10 ==> 3 x 3
      conv11 ==> 1 x 1
    The default image size used to train this network is 300x300.
    """
    # Default hyper-parameters; see SSDParams for the meaning of each field.
    default_params = SSDParams(
        img_shape=(300, 300),
        num_classes=21,                 # 20 Pascal VOC classes + background.
        no_annotation_label=21,
        feat_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11'],
        feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
        anchor_size_bounds=[0.15, 0.90],
        # anchor_size_bounds=[0.20, 0.90],  # Paper bounds: 0.2*300 to 0.9*300.
        # Reference anchor sizes for each of the 6 detection layers; smaller
        # anchors capture local detail, larger ones more global context.
        anchor_sizes=[(21., 45.),
                      (45., 99.),
                      (99., 153.),
                      (153., 207.),
                      (207., 261.),
                      (261., 315.)],
        # anchor_sizes=[(30., 60.),        # Alternative (paper-style) sizes.
        #               (60., 111.),
        #               (111., 162.),
        #               (162., 213.),
        #               (213., 264.),
        #               (264., 315.)],
        # Aspect ratios per layer. Together with the ratio-1 anchors derived
        # from anchor_sizes (one plain + one sqrt(w*h) anchor, see
        # ssd_anchor_one_layer), this yields 4/6/6/6/4/4 anchors per cell.
        anchor_ratios=[[2, .5],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5],
                       [2, .5]],
        # Stride of each feature layer relative to the original image.
        anchor_steps=[8, 16, 32, 64, 100, 300],
        anchor_offset=0.5,              # Anchor centers sit at cell centers.
        # L2-normalize (value > 0) only block4: as an early layer its
        # activation norms are larger than the later detection layers'.
        normalizations=[20, -1, -1, -1, -1, -1],
        # (y, x, h, w) scaling used when encoding/decoding box offsets.
        prior_scaling=[0.1, 0.1, 0.2, 0.2]
        )

    def __init__(self, params=None):
        """Init the SSD net with some parameters. Use the default ones
        if none provided.
        """
        if isinstance(params, SSDParams):
            self.params = params
        else:
            self.params = SSDNet.default_params

    # ======================================================================= #
    def net(self, inputs,
            is_training=True,
            update_feat_shapes=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
        """SSD network definition.

        Args:
          inputs: input image batch;
          is_training: build the network in training mode (enables dropout);
          update_feat_shapes: refresh self.params.feat_shapes from the actual
            prediction tensor shapes;
          dropout_keep_prob: dropout keep probability;
          prediction_fn: function mapping class logits to probabilities.
        Returns:
          The ssd_net() tuple:
          (predictions, localisations, logits, end_points).
        """
        r = ssd_net(inputs,
                    num_classes=self.params.num_classes,
                    feat_layers=self.params.feat_layers,
                    anchor_sizes=self.params.anchor_sizes,
                    anchor_ratios=self.params.anchor_ratios,
                    normalizations=self.params.normalizations,
                    is_training=is_training,
                    dropout_keep_prob=dropout_keep_prob,
                    prediction_fn=prediction_fn,
                    reuse=reuse,
                    scope=scope)
        # Update feature shapes (try at least!) — lets users plug in feature
        # layers other than the defaults and keep the params consistent.
        if update_feat_shapes:
            shapes = ssd_feat_shapes_from_net(r[0], self.params.feat_shapes)
            self.params = self.params._replace(feat_shapes=shapes)
        return r

    def arg_scope(self, weight_decay=0.0005, data_format='NHWC'):
        """Network arg_scope.

        weight_decay is the L2 regularization coefficient.
        """
        return ssd_arg_scope(weight_decay, data_format=data_format)

    def arg_scope_caffe(self, caffe_scope):
        """Caffe arg_scope used for weights importing.
        """
        return ssd_arg_scope_caffe(caffe_scope)

    # ======================================================================= #
    def update_feature_shapes(self, predictions):
        """Update feature shapes from predictions collection (Tensor or Numpy
        array).
        """
        shapes = ssd_feat_shapes_from_net(predictions, self.params.feat_shapes)
        self.params = self.params._replace(feat_shapes=shapes)

    def anchors(self, img_shape, dtype=np.float32):
        """Compute the default anchor boxes, given an image shape.

        Returns, per feature layer, the anchor centers and sizes (y, x, h, w)
        in coordinates relative to the image.
        """
        return ssd_anchors_all_layers(img_shape,
                                      self.params.feat_shapes,
                                      self.params.anchor_sizes,
                                      self.params.anchor_ratios,
                                      self.params.anchor_steps,
                                      self.params.anchor_offset,
                                      dtype)

    def bboxes_encode(self, labels, bboxes, anchors,
                      scope=None):
        """Encode labels and bounding boxes.

        Maps groundtruth boxes onto anchors, producing the regression
        targets (offsets relative to the anchors).
        """
        return ssd_common.tf_ssd_bboxes_encode(
            labels, bboxes, anchors,
            self.params.num_classes,
            self.params.no_annotation_label,
            ignore_threshold=0.5,       # IOU threshold for anchor matching.
            prior_scaling=self.params.prior_scaling,
            scope=scope)

    def bboxes_decode(self, feat_localizations, anchors,
                      scope='ssd_bboxes_decode'):
        """Encode labels and bounding boxes.

        Decodes predicted offsets back into (ymin, xmin, ymax, xmax) boxes
        using the anchors and prior_scaling.
        """
        return ssd_common.tf_ssd_bboxes_decode(
            feat_localizations, anchors,
            prior_scaling=self.params.prior_scaling,
            scope=scope)

    def detected_bboxes(self, predictions, localisations,
                        select_threshold=None, nms_threshold=0.5,
                        clipping_bbox=None, top_k=400, keep_top_k=200):
        """Get the detected bounding boxes from the SSD network output.
        """
        # Select per-class boxes above select_threshold.
        rscores, rbboxes = \
            ssd_common.tf_ssd_bboxes_select(predictions, localisations,
                                            select_threshold=select_threshold,
                                            num_classes=self.params.num_classes)
        # Keep only the top_k highest-scoring boxes.
        rscores, rbboxes = \
            tfe.bboxes_sort(rscores, rbboxes, top_k=top_k)
        # Apply NMS algorithm: drop boxes overlapping a higher-scoring box by
        # more than nms_threshold; keep at most keep_top_k.
        rscores, rbboxes = \
            tfe.bboxes_nms_batch(rscores, rbboxes,
                                 nms_threshold=nms_threshold,
                                 keep_top_k=keep_top_k)
        if clipping_bbox is not None:
            rbboxes = tfe.bboxes_clip(clipping_bbox, rbboxes)
        return rscores, rbboxes

    # Groundtruth boxes are scarce compared to anchors, so negatives vastly
    # outnumber positives. SSD therefore uses hard negative mining: negatives
    # are ranked by confidence error (the lower the predicted background
    # confidence, the larger the error) and only the hardest top-k are kept,
    # targeting a negative:positive ratio close to 3:1.
    def losses(self, logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               scope='ssd_losses'):
        """Define the SSD network losses.
        """
        return ssd_losses(logits, localisations,
                          gclasses, glocalisations, gscores,
                          match_threshold=match_threshold,
                          negative_ratio=negative_ratio,
                          alpha=alpha,
                          label_smoothing=label_smoothing,
                          scope=scope)
- # =========================================================================== #
- # SSD tools...
- # =========================================================================== #
def ssd_size_bounds_to_values(size_bounds,
                              n_feat_layers,
                              img_shape=(300, 300)):
    """Compute the reference sizes of the anchor boxes from relative bounds.

    The absolute values are measured in pixels, based on the network default
    size (300 pixels). This follows the computation performed in the original
    Caffe implementation of SSD.

    Return:
      list of list containing the absolute sizes at each scale. For each scale,
      the ratios only apply to the first value.
    """
    assert img_shape[0] == img_shape[1]
    side = img_shape[0]
    # Express the bounds as integer percentages of the image side and derive
    # the percentage step between consecutive scales.
    low_pct = int(size_bounds[0] * 100)
    high_pct = int(size_bounds[1] * 100)
    pct_step = int(math.floor((high_pct - low_pct) / (n_feat_layers - 2)))
    # The smallest scale is special-cased: half of the lower bound.
    sizes = [[side * size_bounds[0] / 2, side * size_bounds[0]]]
    for pct in range(low_pct, high_pct + 1, pct_step):
        sizes.append((side * pct / 100.,
                      side * (pct + pct_step) / 100.))
    return sizes
def ssd_feat_shapes_from_net(predictions, default_shapes=None):
    """Try to obtain the feature shapes from the prediction layers. The latter
    can be either a Tensor or Numpy ndarray.

    Return:
      list of feature shapes. Default values if predictions shape not fully
      determined.
    """
    shapes = []
    for pred in predictions:
        # Static shape: np.ndarray exposes .shape, a Tensor needs get_shape().
        if isinstance(pred, np.ndarray):
            full_shape = pred.shape
        else:
            full_shape = pred.get_shape().as_list()
        # Keep dimensions 1..3 (spatial + anchors), dropping batch and classes.
        spatial = full_shape[1:4]
        if None in spatial:
            # At least one dimension is dynamic: fall back to the defaults.
            return default_shapes
        shapes.append(spatial)
    return shapes
def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    """Compute SSD default anchor boxes for one feature layer.

    Determine the relative position grid of the centers, and the relative
    width and height.

    Arguments:
      img_shape: Image shape, used for making h/w relative to the image;
      feat_shape: Feature shape, used for computing the position grid;
      sizes: Absolute reference sizes (1 or 2 values);
      ratios: Aspect ratios to use on these features;
      step: Stride of this layer with respect to the original image;
      offset: Grid offset inside each cell.
    Return:
      y, x, h, w: Relative y and x center grids, and relative heights/widths.
    """
    # Anchor centers: one per feature-map cell, shifted to the cell center
    # (offset) and rescaled via the layer stride to [0, 1] image coordinates.
    rows, cols = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    y = (rows.astype(dtype) + offset) * step / img_shape[0]
    x = (cols.astype(dtype) + offset) * step / img_shape[1]
    # Trailing singleton dimension so the centers broadcast against (h, w).
    y = np.expand_dims(y, axis=-1)
    x = np.expand_dims(x, axis=-1)

    # Relative heights/widths, ordered as in the original SSD implementation:
    # the ratio-1 anchors first, then one anchor per explicit ratio.
    n_anchors = len(sizes) + len(ratios)
    h = np.zeros((n_anchors, ), dtype=dtype)
    w = np.zeros((n_anchors, ), dtype=dtype)
    # Anchor 0: ratio 1, smallest reference size (e.g. 21/300 on block4).
    h[0] = sizes[0] / img_shape[0]
    w[0] = sizes[0] / img_shape[1]
    base = 1
    if len(sizes) > 1:
        # Anchor 1: ratio 1, geometric mean of the two reference sizes,
        # e.g. sqrt(21 * 45) / 300 on block4.
        geo_mean = math.sqrt(sizes[0] * sizes[1])
        h[1] = geo_mean / img_shape[0]
        w[1] = geo_mean / img_shape[1]
        base += 1
    for k, ratio in enumerate(ratios):
        # Ratio r: height shrinks by sqrt(r), width grows by sqrt(r).
        root = math.sqrt(ratio)
        h[k + base] = sizes[0] / img_shape[0] / root
        w[k + base] = sizes[0] / img_shape[1] * root
    return y, x, h, w
def ssd_anchors_all_layers(img_shape,
                           layers_shape,
                           anchor_sizes,
                           anchor_ratios,
                           anchor_steps,
                           offset=0.5,
                           dtype=np.float32):
    """Compute anchor boxes for all feature layers.

    Arguments mirror ssd_anchor_one_layer, but with per-layer lists for the
    shapes, sizes, ratios and steps.
    Return:
      list with one (y, x, h, w) anchor tuple per feature layer.
    """
    # Delegate each layer to ssd_anchor_one_layer with its own parameters.
    return [ssd_anchor_one_layer(img_shape, feat_shape,
                                 anchor_sizes[i],
                                 anchor_ratios[i],
                                 anchor_steps[i],
                                 offset=offset, dtype=dtype)
            for i, feat_shape in enumerate(layers_shape)]
- # =========================================================================== #
- # Functional definition of VGG-based SSD 300.
- # =========================================================================== #
def tensor_shape(x, rank=3):
    """Returns the dimensions of a tensor.

    Args:
      x: A N-D Tensor;
      rank: Expected rank of the tensor.
    Returns:
      A list of dimensions. Dimensions that are statically known are python
      integers, otherwise they are integer scalar tensors.
    """
    static = x.get_shape()
    if static.is_fully_defined():
        return static.as_list()
    # Mix statically known dimensions with dynamic ones where unknown.
    known = static.with_rank(rank).as_list()
    dynamic = tf.unstack(tf.shape(x), rank)
    return [dim if dim is not None else dyn
            for dim, dyn in zip(known, dynamic)]
def ssd_multibox_layer(inputs,
                       num_classes,
                       sizes,
                       ratios=[1],
                       normalization=-1,
                       bn_normalization=False):
    """Construct a multibox layer, return a class and localization predictions.

    Arguments:
      inputs: feature layer to attach the head to;
      num_classes: number of classes (background included);
      sizes: reference anchor sizes of this layer;
      ratios: anchor aspect ratios of this layer;
      normalization: if > 0, L2-normalize the input along channels first.
    Return:
      (cls_pred, loc_pred): per-anchor class logits and box offsets.
    """
    net = inputs
    # Optional channel-wise L2 normalization (used on block4 only).
    if normalization > 0:
        net = custom_layers.l2_normalization(net, scaling=True)
    # Anchors attached to every cell of this feature map.
    num_anchors = len(sizes) + len(ratios)

    def _branch(scope, depth):
        # 3x3 conv predicting `depth` values per anchor, then reshape to
        # [..., num_anchors, depth] with channels moved last.
        raw = slim.conv2d(net, num_anchors * depth, [3, 3], activation_fn=None,
                          scope=scope)
        raw = custom_layers.channel_to_last(raw)
        return tf.reshape(raw,
                          tensor_shape(raw, 4)[:-1] + [num_anchors, depth])

    # Localization branch first (matches original variable creation order),
    # 4 coordinates per anchor; then the classification branch.
    loc_pred = _branch('conv_loc', 4)
    cls_pred = _branch('conv_cls', num_classes)
    return cls_pred, loc_pred
def ssd_net(inputs,
            num_classes=SSDNet.default_params.num_classes,
            feat_layers=SSDNet.default_params.feat_layers,
            anchor_sizes=SSDNet.default_params.anchor_sizes,
            anchor_ratios=SSDNet.default_params.anchor_ratios,
            normalizations=SSDNet.default_params.normalizations,
            is_training=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
    """SSD net definition.

    Builds the VGG-16 base (conv1..conv5 + pool layers), the extra SSD
    blocks (block6..block11), and a multibox head on each layer listed in
    feat_layers.

    Returns:
      predictions: per-layer class probabilities (prediction_fn of logits);
      localisations: per-layer box offset predictions;
      logits: per-layer raw class logits;
      end_points: dict of intermediate activations keyed 'block1'..'block11'.
    """
    # if data_format == 'NCHW':
    #     inputs = tf.transpose(inputs, perm=(0, 3, 1, 2))

    # End_points collect relevant activations for external use.
    end_points = {}
    with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
        # Original VGG-16 blocks.
        # Block 1: two 3x3 convs, 64 channels.
        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
        end_points['block1'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool1')
        # Block 2: two 3x3 convs, 128 channels.
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
        end_points['block2'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool2')
        # Block 3: three 3x3 convs, 256 channels.
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
        end_points['block3'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool3')
        # Block 4: three 3x3 convs, 512 channels — first detection layer.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
        end_points['block4'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool4')
        # Block 5: three 3x3 convs, 512 channels; stride-1 3x3 pooling keeps
        # the resolution for the dilated conv6 below.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
        end_points['block5'] = net
        net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')

        # Additional SSD blocks (VGG's fully-connected layers are dropped).
        # Block 6: 3x3 dilated (atrous) convolution, rate 6, 1024 channels.
        net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
        end_points['block6'] = net
        net = tf.layers.dropout(net, rate=dropout_keep_prob, training=is_training)
        # Block 7: 1x1 conv, 1024 channels, followed by dropout.
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')
        end_points['block7'] = net
        net = tf.layers.dropout(net, rate=dropout_keep_prob, training=is_training)

        # Block 8/9/10/11: 1x1 and 3x3 convolutions stride 2 (except lasts).
        # Explicit pad2d + 'VALID' padding reproduces Caffe's behaviour.
        end_point = 'block8'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block9'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block10'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block11'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net

        # Prediction and localisations layers: one multibox head per
        # detection layer, each in its own variable scope.
        predictions = []
        logits = []
        localisations = []
        for i, layer in enumerate(feat_layers):
            with tf.variable_scope(layer + '_box'):
                # p: per-anchor class logits; l: per-anchor box offsets.
                p, l = ssd_multibox_layer(end_points[layer],
                                          num_classes,
                                          anchor_sizes[i],
                                          anchor_ratios[i],
                                          normalizations[i])
            predictions.append(prediction_fn(p))
            logits.append(p)
            localisations.append(l)
        return predictions, localisations, logits, end_points
# Default training/input image size for this architecture.
ssd_net.default_image_size = 300
def ssd_arg_scope(weight_decay=0.0005, data_format='NHWC'):
    """Defines the VGG arg scope.

    Args:
      weight_decay: The l2 regularization coefficient.
      data_format: 'NHWC' or 'NCHW' tensor layout.
    Returns:
      An arg_scope.
    """
    # All conv/fc layers: ReLU activation, L2 weight regularization,
    # Xavier weight initialization, zero bias initialization.
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_regularizer=slim.l2_regularizer(weight_decay),
                        weights_initializer=tf.contrib.layers.xavier_initializer(),
                        biases_initializer=tf.zeros_initializer()):
        # Convolutions and pooling default to SAME padding.
        with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                            padding='SAME',
                            data_format=data_format):
            # Custom layers also need the data format.
            with slim.arg_scope([custom_layers.pad2d,
                                 custom_layers.l2_normalization,
                                 custom_layers.channel_to_last],
                                data_format=data_format) as sc:
                return sc
- # =========================================================================== #
- # Caffe scope: importing weights at initialization.
- # =========================================================================== #
def ssd_arg_scope_caffe(caffe_scope):
    """Caffe scope definition.

    Args:
      caffe_scope: Caffe scope object with loaded weights.
    Returns:
      An arg_scope.
    """
    # Default network arg scope: initialize conv weights/biases and the
    # L2-normalization scale from the imported Caffe model.
    with slim.arg_scope([slim.conv2d],
                        activation_fn=tf.nn.relu,
                        weights_initializer=caffe_scope.conv_weights_init(),
                        biases_initializer=caffe_scope.conv_biases_init()):
        with slim.arg_scope([slim.fully_connected],
                            activation_fn=tf.nn.relu):
            with slim.arg_scope([custom_layers.l2_normalization],
                                scale_initializer=caffe_scope.l2_norm_scale_init()):
                with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                                    padding='SAME') as sc:
                    return sc
- # =========================================================================== #
- # SSD loss function.
- # =========================================================================== #
def ssd_losses(logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               device='/cpu:0',
               scope=None):
    """Define the SSD losses and add them to the TF loss collection.

    The total objective is the weighted sum of the localization loss and the
    classification cross-entropy over positives plus hard-mined negatives.

    Arguments:
      logits: (list of) prediction logits Tensors;
      localisations: (list of) localisation Tensors;
      gclasses: (list of) groundtruth label Tensors;
      glocalisations: (list of) groundtruth localisation Tensors;
      gscores: (list of) groundtruth score (IOU) Tensors;
      match_threshold: IOU above which an anchor counts as positive;
      negative_ratio: target negatives-per-positive ratio for hard mining;
      alpha: weight of the localization loss.
    """
    with tf.name_scope(scope, 'ssd_losses'):
        lshape = tfe.get_shape(logits[0], 5)
        num_classes = lshape[-1]
        batch_size = lshape[0]

        # Flatten out all vectors!
        flogits = []
        fgclasses = []
        fgscores = []
        flocalisations = []
        fglocalisations = []
        for i in range(len(logits)):
            flogits.append(tf.reshape(logits[i], [-1, num_classes]))        # Class logits.
            fgclasses.append(tf.reshape(gclasses[i], [-1]))                 # Groundtruth classes.
            fgscores.append(tf.reshape(gscores[i], [-1]))                   # Anchor/groundtruth IOU scores.
            flocalisations.append(tf.reshape(localisations[i], [-1, 4]))    # Predicted (encoded) offsets.
            fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))  # Groundtruth (encoded) offsets.
        # And concat the crap!
        logits = tf.concat(flogits, axis=0)
        gclasses = tf.concat(fgclasses, axis=0)
        gscores = tf.concat(fgscores, axis=0)
        localisations = tf.concat(flocalisations, axis=0)
        glocalisations = tf.concat(fglocalisations, axis=0)
        dtype = logits.dtype

        # Compute positive matching mask: anchors whose IOU with a
        # groundtruth box exceeds match_threshold are positives.
        pmask = gscores > match_threshold
        fpmask = tf.cast(pmask, dtype)
        n_positives = tf.reduce_sum(fpmask)   # Number N of positive anchors.

        # Hard negative mining: rank negatives by predicted background
        # confidence (lower confidence = harder negative) and keep only
        # roughly negative_ratio times the number of positives.
        no_classes = tf.cast(pmask, tf.int32)
        predictions = slim.softmax(logits)
        nmask = tf.logical_and(tf.logical_not(pmask),
                               gscores > -0.5)
        fnmask = tf.cast(nmask, dtype)
        # Background confidence for negatives; 1.0 elsewhere so non-negatives
        # never rank among the hardest.
        nvalues = tf.where(nmask,
                           predictions[:, 0],
                           1. - fnmask)
        nvalues_flat = tf.reshape(nvalues, [-1])
        # Number of negative entries to select (capped by availability;
        # + batch_size keeps n_neg > 0 even with no positives).
        max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
        n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
        n_neg = tf.minimum(n_neg, max_neg_entries)
        # top_k of the negated confidences == the n_neg hardest negatives.
        val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
        max_hard_pred = -val[-1]
        # Final negative mask.
        nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
        fnmask = tf.cast(nmask, dtype)

        # Add cross-entropy loss over positives, normalized by batch size.
        with tf.name_scope('cross_entropy_pos'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=gclasses)
            loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        # Cross-entropy over the mined hard negatives (background class).
        with tf.name_scope('cross_entropy_neg'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=no_classes)
            loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        # Add localization loss: smooth L1, L2, ...
        with tf.name_scope('localization'):
            # Weights Tensor: positive mask + random negative.
            weights = tf.expand_dims(alpha * fpmask, axis=-1)
            # Smooth-L1 on (predicted - groundtruth) encoded offsets,
            # positives only, normalized by batch size.
            loss = custom_layers.abs_smooth(localisations - glocalisations)
            loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value')
            tf.losses.add_loss(loss)
- def ssd_losses_old(logits, localisations,
- gclasses, glocalisations, gscores,
- match_threshold=0.5,
- negative_ratio=3.,
- alpha=1.,
- label_smoothing=0.,
- device='/cpu:0',
- scope=None):
- """Loss functions for training the SSD 300 VGG network.
- This function defines the different loss components of the SSD, and
- adds them to the TF loss collection.
- Arguments:
- logits: (list of) predictions logits Tensors;
- localisations: (list of) localisations Tensors;
- gclasses: (list of) groundtruth labels Tensors;
- glocalisations: (list of) groundtruth localisations Tensors;
- gscores: (list of) groundtruth score Tensors;
- """
- with tf.device(device):
- with tf.name_scope(scope, 'ssd_losses'):
- l_cross_pos = []
- l_cross_neg = []
- l_loc = []
- for i in range(len(logits)):
- dtype = logits[i].dtype
- with tf.name_scope('block_%i' % i):
- # Sizing weight...
- wsize = tfe.get_shape(logits[i], rank=5)
- wsize = wsize[1] * wsize[2] * wsize[3]
- # Positive mask.
- pmask = gscores[i] > match_threshold
- fpmask = tf.cast(pmask, dtype)
- n_positives = tf.reduce_sum(fpmask)
- # Select some random negative entries.
- # n_entries = np.prod(gclasses[i].get_shape().as_list())
- # r_positive = n_positives / n_entries
- # r_negative = negative_ratio * n_positives / (n_entries - n_positives)
- # Negative mask.
- no_classes = tf.cast(pmask, tf.int32)
- predictions = slim.softmax(logits[i])
- nmask = tf.logical_and(tf.logical_not(pmask),
- gscores[i] > -0.5)
- fnmask = tf.cast(nmask, dtype)
- nvalues = tf.where(nmask,
- predictions[:, :, :, :, 0],
- 1. - fnmask)
- nvalues_flat = tf.reshape(nvalues, [-1])
- # Number of negative entries to select.
- n_neg = tf.cast(negative_ratio * n_positives, tf.int32)
- n_neg = tf.maximum(n_neg, tf.size(nvalues_flat) // 8)
- n_neg = tf.maximum(n_neg, tf.shape(nvalues)[0] * 4)
- max_neg_entries = 1 + tf.cast(tf.reduce_sum(fnmask), tf.int32)
- n_neg = tf.minimum(n_neg, max_neg_entries)
- val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
- max_hard_pred = -val[-1]
- # Final negative mask.
- nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
- fnmask = tf.cast(nmask, dtype)
- # Add cross-entropy loss.
- with tf.name_scope('cross_entropy_pos'):
- fpmask = wsize * fpmask
- loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i],
- labels=gclasses[i])
- loss = tf.losses.compute_weighted_loss(loss, fpmask)
- l_cross_pos.append(loss)
- with tf.name_scope('cross_entropy_neg'):
- fnmask = wsize * fnmask
- loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i],
- labels=no_classes)
- loss = tf.losses.compute_weighted_loss(loss, fnmask)
- l_cross_neg.append(loss)
- # Add localization loss: smooth L1, L2, ...
- with tf.name_scope('localization'):
- # Weights Tensor: positive mask + random negative.
- weights = tf.expand_dims(alpha * fpmask, axis=-1)
- loss = custom_layers.abs_smooth(localisations[i] - glocalisations[i])
- loss = tf.losses.compute_weighted_loss(loss, weights)
- l_loc.a