
Notes on understanding PyTorch model parameters

Model structure

import math
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable  # legacy Variable API, as in the original code


def downsample_basic_block(x, planes, stride):
    out = F.avg_pool3d(x, kernel_size=1, stride=stride)
    zero_pads = torch.Tensor(
        out.size(0), planes - out.size(1), out.size(2), out.size(3),
        out.size(4)).zero_()
    if isinstance(out.data, torch.cuda.FloatTensor):
        zero_pads = zero_pads.cuda()

    out = Variable(torch.cat([out.data, zero_pads], dim=1))

    return out


class ResNeXtBottleneck(nn.Module):
    expansion = 2

    def __init__(self, inplanes, planes, cardinality, stride=1,
                 downsample=None, conv3d_bias=True):
        super(ResNeXtBottleneck, self).__init__()
        mid_planes = cardinality * int(planes / 32)
        self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=conv3d_bias)
        self.bn1 = nn.BatchNorm3d(mid_planes)
        self.conv2 = nn.Conv3d(
            mid_planes,
            mid_planes,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=cardinality,
            bias=conv3d_bias)
        self.bn2 = nn.BatchNorm3d(mid_planes)
        self.conv3 = nn.Conv3d(
            mid_planes, planes * self.expansion, kernel_size=1, bias=conv3d_bias)
        self.bn3 = nn.BatchNorm3d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNeXt(nn.Module):

    def __init__(self,
                 block,
                 layers,
                 sample_size,
                 sample_duration,
                 shortcut_type='B',
                 cardinality=32,
                 num_classes=400,
                 conv3d_bias=True):
        super(ResNeXt, self).__init__()
        self.conv3d_bias = conv3d_bias
        self.inplanes = 64
        self.conv1 = nn.Conv3d(
            3, 64, kernel_size=7, stride=2, padding=3, bias=conv3d_bias)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
        self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type,
                                       cardinality)
        self.layer2 = self._make_layer(block, 256, layers[1], shortcut_type,
                                       cardinality, stride=2)
        self.layer3 = self._make_layer(block, 512, layers[2], shortcut_type,
                                       cardinality, stride=2)
        self.layer4 = self._make_layer(block, 1024, layers[3], shortcut_type,
                                       cardinality, stride=2)
        last_duration = int(math.ceil(sample_duration / 16))
        last_size = int(math.ceil(sample_size / 32))
        self.avgpool = nn.AvgPool3d(
            (last_duration, last_size, last_size), stride=1)
        self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out')
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, shortcut_type, cardinality,
                    stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            if shortcut_type == 'A':
                downsample = partial(
                    downsample_basic_block,
                    planes=planes * block.expansion,
                    stride=stride)
            else:
                downsample = nn.Sequential(
                    nn.Conv3d(
                        self.inplanes,
                        planes * block.expansion,
                        kernel_size=1,
                        stride=stride,
                        bias=self.conv3d_bias),
                    nn.BatchNorm3d(planes * block.expansion))

        layers = []
        layers.append(
            block(self.inplanes, planes, cardinality, stride, downsample,
                  conv3d_bias=self.conv3d_bias))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, cardinality,
                                conv3d_bias=self.conv3d_bias))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)

        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
    pretrain = torch.load(opt.pretrain_path)
    assert opt.arch == pretrain['arch']

    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in pretrain['state_dict'].items():
        print(k)
        name = k[7:]  # remove the 'module.' prefix
        print(name)
        new_state_dict[name] = v

Program output
[Screenshot omitted: the parameter names printed before and after stripping the 'module.' prefix.]
The reason we need pretrain['state_dict'] here, rather than calling model.load_state_dict(torch.load(opt.pretrain_path)) directly, is that the checkpoint stores not only the parameters but also the epoch, architecture, and other information:

        states = {
            'epoch': epoch + 1,
            'arch': opt.arch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        torch.save(states, save_file_path)
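
Restoring from such a checkpoint mirrors the save; a minimal sketch, assuming save_file_path points at a file written by the code above:

    # Sketch: resume training state from the saved dict.
    checkpoint = torch.load(save_file_path)
    begin_epoch = checkpoint['epoch']               # epoch to resume from
    assert opt.arch == checkpoint['arch']           # architecture sanity check
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])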

name = k[7:] strips the first seven characters from each parameter name. The downloaded pretrained model was trained under torch.nn.DataParallel across multiple GPUs, which prefixes every parameter name with 'module.'; since I only have a single GPU, that prefix has to be removed before loading.
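
Once the prefix is stripped, the cleaned dictionary loads into the bare single-GPU model. A minimal sketch of both options; the second one, wrapping the model in DataParallel, is an alternative I am assuming rather than what this post does:

    # Option 1: load the renamed dict into the bare model (what this post does).
    model.load_state_dict(new_state_dict)

    # Option 2 (assumed alternative): wrap the model in DataParallel first, so
    # its parameter names also carry the 'module.' prefix, then load directly.
    model = nn.DataParallel(model)
    model.load_state_dict(pretrain['state_dict'])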

optimizer.state_dict() has two keys, 'state' and 'param_groups'; the value of 'param_groups' is shown below.
[Screenshot omitted: the contents of optimizer.state_dict()['param_groups'].]
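
A minimal sketch of inspecting those two keys (the printed values are illustrative, not taken from the original run):

    opt_state = optimizer.state_dict()
    print(list(opt_state.keys()))        # ['state', 'param_groups']
    for group in opt_state['param_groups']:
        # each group stores its own hyperparameters plus the indices of its tensors
        print(group['lr'], len(group['params']))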

    for k, v in model.named_parameters():
        print(k)

[Screenshot omitted: the parameter names printed by model.named_parameters().]

    for i in model.parameters():
        print(i)

[Screenshot omitted: the parameter tensors printed by model.parameters().]
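Because model.parameters() yields the raw tensors, a common companion check is the total parameter count; a minimal sketch:

    # Sketch: count all trainable parameters.
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(n_params)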

print(model.fc)

returns

Linear(in_features=2048, out_features=90, bias=True)

print(model.layer4)

[Screenshot omitted: the printed structure of model.layer4.]
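
The 90 output features above show that the classifier head was replaced to match the target dataset. A minimal sketch of that replacement (90 classes is this post's setting; substitute your own):

    # Sketch: swap the classification head for a new number of classes.
    model.fc = nn.Linear(model.fc.in_features, 90)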
Summary: first build the computation graph and get back the model. If you only want a single uniform learning rate, just pass model.parameters() as the first argument of the optimizer; model.parameters() yields the parameters in a fixed, deterministic order, and in that case len(optimizer.param_groups) equals 1. If you want to set the learning rate (or other options) per parameter group, configure it as shown below. Because model.parameters() is ordered, the dicts {'params': v, 'lr': 0.0} appended to the list need no parameter names, and any hyperparameter left unspecified in a group, such as momentum, falls back to the defaults passed to the optimizer and is applied to every group.

    ft_module_names = []  # modules to fine-tune, from opt.ft_begin_index onward
    for i in range(opt.ft_begin_index, 5):
        ft_module_names.append('layer{}'.format(i))
    ft_module_names.append('fc')

    parameters = []
    for k, v in model.named_parameters():  # named_parameters() yields (name, tensor)
        print(k)
        for ft_module in ft_module_names:
            if ft_module in k:
                # fine-tuned layers use the optimizer's default learning rate
                parameters.append({'params': v})
                break
        else:
            # everything else is effectively frozen via a per-group lr of 0
            parameters.append({'params': v, 'lr': 0.0})
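
The resulting list then goes straight into the optimizer; a minimal sketch (the lr and momentum values below are assumptions, not taken from the post):

    # Groups without an explicit 'lr' inherit the default lr=0.1; the frozen
    # groups keep their per-group lr=0.0; momentum is filled in for every group.
    optimizer = torch.optim.SGD(parameters, lr=0.1, momentum=0.9)
    print(len(optimizer.param_groups))   # one group per appended dict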