1. 程式人生 > >Pytorch yolov3 多GPU 訓練

Pytorch yolov3 多GPU 訓練

PyTorch 多 GPU 訓練——模型定義(models.py):

# -*- coding:utf-8 -*-
from __future__ import division

import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

from PIL import Image

from utils.parse_config import *
from utils.utils import build_targets
from collections import defaultdict


def create_modules(module_defs):
    """
    Constructs module list of layer blocks from module configuration in module_defs.

    The first entry of ``module_defs`` (the network hyper-parameters) is
    popped off the list in place, so after the call ``module_defs`` holds one
    entry per layer, aligned with the returned module list.

    Returns:
        (hyperparams, module_list): the popped hyper-parameter dict and an
        ``nn.ModuleList`` with one ``nn.Sequential`` per remaining entry.
    """
    # Build the YOLOv3 network structure from the parsed cfg file
    hyperparams = module_defs.pop(0)
    # output_filters[0] is the number of input channels; the output width of
    # layer k is therefore stored at output_filters[k + 1].
    output_filters = [int(hyperparams['channels'])]
    module_list = nn.ModuleList()
    for i, module_def in enumerate(module_defs):
        modules = nn.Sequential()

        if module_def['type'] == 'convolutional':
            bn = int(module_def['batch_normalize'])
            filters = int(module_def['filters'])
            kernel_size = int(module_def['size'])
            pad = (kernel_size - 1) // 2 if int(module_def['pad']) else 0
            modules.add_module('conv_%d' % i, nn.Conv2d(in_channels=output_filters[-1],
                                                        out_channels=filters,
                                                        kernel_size=kernel_size,
                                                        stride=int(module_def['stride']),
                                                        padding=pad,
                                                        bias=not bn))
            if bn:
                modules.add_module('batch_norm_%d' % i, nn.BatchNorm2d(filters))
            if module_def['activation'] == 'leaky':
                modules.add_module('leaky_%d' % i, nn.LeakyReLU(0.1))

        elif module_def['type'] == 'upsample':
            upsample = nn.Upsample(scale_factor=int(module_def['stride']), mode='nearest')
            modules.add_module('upsample_%d' % i, upsample)

        elif module_def['type'] == 'route':
            layers = [int(x) for x in module_def["layers"].split(',')]
            # Index into output_filters[1:] so that an absolute layer index k
            # refers to layer k's output (not to the entry shifted by the
            # input-channel slot); negative, relative indices are unaffected.
            layer_filters = output_filters[1:]
            filters = sum(layer_filters[layer_i] for layer_i in layers)
            modules.add_module('route_%d' % i, EmptyLayer())

        elif module_def['type'] == 'shortcut':
            # Same indexing convention as 'route' above.
            filters = output_filters[1:][int(module_def['from'])]
            modules.add_module("shortcut_%d" % i, EmptyLayer())

        elif module_def["type"] == "yolo":
            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
            # Extract the anchors selected by this layer's mask
            all_anchors = module_def["anchors"]
            anchors = [all_anchors[idx] for idx in anchor_idxs]
            num_classes = int(module_def['classes'])
            img_height = int(hyperparams['height'])
            # Define detection layer
            yolo_layer = YOLOLayer(anchors, num_classes, img_height)
            modules.add_module('yolo_%d' % i, yolo_layer)

        # Register module list and number of output filters.
        # A 'yolo' entry does not set `filters`, so the previous layer's value
        # is recorded for it.
        module_list.append(modules)
        output_filters.append(filters)

    return hyperparams, module_list


class EmptyLayer(nn.Module):
    """Placeholder for 'route' and 'shortcut' layers"""

    def __init__(self):
        super(EmptyLayer, self).__init__()


class YOLOLayer(nn.Module):
    """Detection layer"""

    def __init__(self, anchors, num_classes, image_dim):
        """
        Args:
            anchors: sequence of (width, height) anchor pairs for this layer.
            num_classes: number of object classes.
            image_dim: input image size used to derive the grid stride.
        """
        super(YOLOLayer, self).__init__()
        self.anchors = anchors
        self.scaled_anchors = None
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        # 4 box coordinates + 1 confidence + one score per class
        self.bbox_attrs = 5 + num_classes
        self.image_dim = image_dim
        self.ignore_thres = 0.5
        self.coord_scale = 1
        self.noobject_scale = 1
        self.object_scale = 5
        self.class_scale = 1
        self.seen = 0

        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCELoss()
        # self.bce_logits_loss = nn.BCEWithLogitsLoss()

    def forward(self, x, targets=None):
        """
        Decode the raw feature map and either compute the loss or return detections.

        Args:
            x: feature map; it is viewed as
                (batch, num_anchors, bbox_attrs, grid, grid) below.
            targets: ground-truth tensor passed through to ``build_targets``;
                when ``None`` the layer runs in inference mode.

        Returns:
            With targets: ``(loss, loss_x, loss_y, loss_w, loss_h, loss_conf,
            loss_cls, recall)`` where only ``loss`` is a tensor.
            Without targets: a tensor of shape (batch, -1, 5 + num_classes)
            with box coordinates multiplied by the grid stride.
        """
        bs = x.size(0)
        g_dim = x.size(2)
        stride = self.image_dim / g_dim
        # Tensors for cuda support
        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
        LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor

        prediction = x.view(bs, self.num_anchors, self.bbox_attrs, g_dim, g_dim).permute(0, 1, 3, 4, 2).contiguous()

        # Get outputs
        x = torch.sigmoid(prediction[..., 0])  # Center x
        y = torch.sigmoid(prediction[..., 1])  # Center y
        w = prediction[..., 2]  # Width
        h = prediction[..., 3]  # Height
        conf = torch.sigmoid(prediction[..., 4])  # Conf
        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

        # Calculate offsets for each grid
        grid_x = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).repeat(bs*self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
        grid_y = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).t().repeat(bs*self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)
        # Anchors expressed in grid-cell units
        scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]
        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
        anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(w.shape)
        anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(h.shape)

        # Add offset and scale with anchors
        pred_boxes = FloatTensor(prediction[..., :4].shape)
        pred_boxes[..., 0] = x.data + grid_x
        pred_boxes[..., 1] = y.data + grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * anchor_h

        self.seen += prediction.size(0)

        # Training
        if targets is not None:
            if x.is_cuda:
                self.mse_loss = self.mse_loss.cuda()
                self.bce_loss = self.bce_loss.cuda()

            # build_targets is a project helper (utils.utils); the meaning of
            # its outputs is taken from how they are used below.
            nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes.cpu().data,
                                                                                                        targets.cpu().data,
                                                                                                        scaled_anchors,
                                                                                                        self.num_anchors,
                                                                                                        self.num_classes,
                                                                                                        g_dim,
                                                                                                        self.ignore_thres)

            # nProposals = int((conf > 0.25).sum().item())
            recall = float(nCorrect / nGT) if nGT else 1

            tx = Variable(tx.type(FloatTensor), requires_grad=False)
            ty = Variable(ty.type(FloatTensor), requires_grad=False)
            tw = Variable(tw.type(FloatTensor), requires_grad=False)
            th = Variable(th.type(FloatTensor), requires_grad=False)
            tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
            tcls = Variable(tcls[cls_mask == 1].type(FloatTensor), requires_grad=False)
            coord_mask = Variable(coord_mask.type(FloatTensor), requires_grad=False)
            conf_mask = Variable(conf_mask.type(FloatTensor), requires_grad=False)

            loss_x = self.coord_scale * self.mse_loss(x[coord_mask == 1], tx[coord_mask == 1]) / 2
            loss_y = self.coord_scale * self.mse_loss(y[coord_mask == 1], ty[coord_mask == 1]) / 2
            loss_w = self.coord_scale * self.mse_loss(w[coord_mask == 1], tw[coord_mask == 1]) / 2
            loss_h = self.coord_scale * self.mse_loss(h[coord_mask == 1], th[coord_mask == 1]) / 2
            loss_conf = self.bce_loss(conf[conf_mask == 1], tconf[conf_mask == 1])
            loss_cls = self.class_scale * self.bce_loss(pred_cls[cls_mask == 1], tcls)
            loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
            return loss, loss_x.item(), loss_y.item(), loss_w.item(), loss_h.item(), loss_conf.item(), loss_cls.item(),recall

        else:
            # If not in training phase return predictions
            output = torch.cat((pred_boxes.view(bs, -1, 4) * stride, conf.view(bs, -1, 1), pred_cls.view(bs, -1, self.num_classes)), -1)
            return output.data


class Darknet(nn.Module):
    """YOLOv3 object detection model"""

    def __init__(self, module_defs, img_size=416):
        """
        Args:
            module_defs: parsed model configuration; its first entry (the
                hyper-parameters) is removed in place by ``create_modules``.
            img_size: stored on the instance as ``img_size``.
        """
        super(Darknet, self).__init__()
        self.module_defs = module_defs
        # Build the network from the config; returns the hyper-parameters and
        # the torch module list
        self.hyperparams, self.module_list = create_modules(self.module_defs)
        # print("module",self.module_list)
        self.img_size = img_size
        # Defaults so save_weights() also works on a model that never went
        # through load_weights().
        self.seen = 0
        self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)
        self.loss_names = ['x', 'y', 'w', 'h', 'conf', 'cls', 'recall']
        self.losses = defaultdict(float)

    def forward(self, x, targets=None):
        """
        Run the network.

        Returns:
            With targets: the summed detection-layer losses as a 1-element
            tensor (``view(-1)``), which lets ``nn.DataParallel`` gather one
            entry per replica. Per-component values are accumulated in
            ``self.losses``.
            Without targets: detections of all 'yolo' layers concatenated
            along dimension 1.
        """
        is_training = targets is not None
        output = []
        for name in self.loss_names:
            self.losses[name] = 0
        layer_outputs = []
        for module_def, module in zip(self.module_defs, self.module_list):
            if module_def['type'] in ['convolutional', 'upsample']:
                x = module(x)
            elif module_def['type'] == 'route':
                layer_i = [int(idx) for idx in module_def['layers'].split(',')]
                x = torch.cat([layer_outputs[idx] for idx in layer_i], 1)
            elif module_def['type'] == 'shortcut':
                layer_i = int(module_def['from'])
                x = layer_outputs[-1] + layer_outputs[layer_i]
            elif module_def['type'] == 'yolo':
                # Train phase: get loss
                if is_training:
                    x, *losses = module[0](x, targets)
                    for name, loss in zip(self.loss_names, losses):
                        self.losses[name] += loss
                # Test phase: Get detections
                else:
                    x = module(x)
                output.append(x)
            layer_outputs.append(x)

        # Average recall over however many detection layers the model has
        # (three for the stock YOLOv3 config).
        if output:
            self.losses['recall'] /= len(output)
        if is_training:
            return sum(output).view(-1, )
        else:
            return torch.cat(output, 1)
        # return sum(output) if is_training else torch.cat(output, 1)

    def load_weights(self, weights_path, is_training=False):
        """
        Parses and loads the weights stored in 'weights_path'.

        The file is read as five int32 header values followed by float32
        weights, consumed layer by layer for every 'convolutional' entry.

        Args:
            weights_path: path of the weights file.
            is_training: accepted for call compatibility; not used here.
        """
        # Open the weights file; the context manager closes it even on error
        with open(weights_path, "rb") as fp:
            header = np.fromfile(fp, dtype=np.int32, count=5)  # First five are header values
            weights = np.fromfile(fp, dtype=np.float32)  # The rest are weights

        # Needed to write header when saving weights
        self.header_info = header
        self.seen = header[3]

        ptr = 0

        def _consume(target):
            """Copy the next ``target.numel()`` values into ``target``."""
            nonlocal ptr
            count = target.numel()
            values = torch.from_numpy(weights[ptr:ptr + count]).view_as(target)
            target.data.copy_(values)
            ptr += count

        for module_def, module in zip(self.module_defs, self.module_list):
            if module_def['type'] != 'convolutional':
                continue
            conv_layer = module[0]
            # The flag may be the string '0'/'1' straight from the cfg file,
            # so convert before testing it (a non-empty '0' is truthy).
            if int(module_def['batch_normalize']):
                # Load BN bias, weights, running mean and running variance
                bn_layer = module[1]
                _consume(bn_layer.bias)
                _consume(bn_layer.weight)
                _consume(bn_layer.running_mean)
                _consume(bn_layer.running_var)
            else:
                # Load conv. bias
                _consume(conv_layer.bias)
            # Load conv. weights
            _consume(conv_layer.weight)

    def save_weights(self, path, cutoff=-1):
        """
        Write the header and the convolutional/batch-norm weights to 'path'.

        Args:
            path: path of the new weights file.
            cutoff: save layers between 0 and cutoff (cutoff = -1 -> all are
                saved).
        """
        # A plain [:-1] slice would silently drop the last layer, so map the
        # documented "-1 means everything" sentinel to an open-ended slice.
        stop = None if cutoff == -1 else cutoff

        self.header_info[3] = self.seen
        with open(path, 'wb') as fp:
            self.header_info.tofile(fp)

            # Iterate through layers (zip stops at the truncated defs list)
            for module_def, module in zip(self.module_defs[:stop], self.module_list):
                if module_def['type'] != 'convolutional':
                    continue
                conv_layer = module[0]
                # If batch norm, write bn first (same order load_weights reads)
                if int(module_def['batch_normalize']):
                    bn_layer = module[1]
                    bn_layer.bias.data.cpu().numpy().tofile(fp)
                    bn_layer.weight.data.cpu().numpy().tofile(fp)
                    bn_layer.running_mean.data.cpu().numpy().tofile(fp)
                    bn_layer.running_var.data.cpu().numpy().tofile(fp)
                # Otherwise write the conv bias
                else:
                    conv_layer.bias.data.cpu().numpy().tofile(fp)
                # Write conv weights
                conv_layer.weight.data.cpu().numpy().tofile(fp)

train 程式碼:多 GPU 訓練的關鍵語句如下(其後附上訓練迴圈片段與完整腳本):

optimizer.module.zero_grad()

model.module.save_weights

loss = model(imgs, targets)

torch.sum(loss).backward()

optimizer.module.step()

# Training-loop excerpt. `opt`, `dataloader`, `model`, `optimizer`, `Tensor`,
# `total_loss`, `last_total_loss` and `adjust_learning_rate` come from the
# surrounding training script.
for epoch in range(opt.epochs):
    for batch_i, (_, imgs, targets) in enumerate(dataloader):
        imgs = Variable(imgs.type(Tensor))
        targets = Variable(targets.type(Tensor), requires_grad=False)
        optimizer.module.zero_grad()
        # Under DataParallel the model returns one loss entry per replica;
        # reduce them to a single scalar before back-propagating.
        loss = model(imgs, targets)
        batch_loss = torch.sum(loss)
        batch_loss.backward()
        optimizer.module.step()
        # Book-keeping uses a plain float so the running total does not keep
        # references to every batch's autograd graph.
        loss_value = batch_loss.item()
        now = datetime.datetime.now()
        strftime = now.strftime("%H:%M:%S")
        print(strftime, epoch, opt.epochs, batch_i, len(dataloader), loss)
        if batch_i % 40 == 39:
            # Every 40th batch: halve the learning rate when the accumulated
            # loss grew by more than 1% over the previous window.
            if last_total_loss > 0 and total_loss > last_total_loss * 1.01:
                print("total_loss", total_loss)
                adjust_learning_rate(optimizer)
            else:
                print("total_loss", total_loss, last_total_loss)
                last_total_loss = total_loss
            total_loss = loss_value
        elif batch_i == 0:
            total_loss = loss_value
        else:
            total_loss += loss_value

        print('%s [Epoch %d/%d, Batch %d/%d Losse s: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f, recall: %.5f]' %
              (strftime, epoch, opt.epochs, batch_i, len(dataloader),
               model.module.losses['x'], model.module.losses['y'], model.module.losses['w'],
               model.module.losses['h'], model.module.losses['conf'], model.module.losses['cls'],
               loss_value, model.module.losses['recall']))

    if epoch % opt.checkpoint_interval == 0:
        model.module.save_weights('%s/%d.weights' % (opt.checkpoint_dir, epoch))
# -*- coding:utf-8 -*-
from __future__ import division

from models import *
from utils.utils import *
from utils.datasets import *
from utils.parse_config import *
from logger import Logger
import os
import sys
import time
import datetime
import argparse

import torch
from torch.utils.data import DataLoader

from torch.autograd import Variable
import torch.optim as optim

# Command-line options for the training script.
parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=2001, help='number of epochs')
parser.add_argument('--image_folder', type=str, default='data/samples', help='path to dataset')
parser.add_argument('--batch_size', type=int, default=4, help='size of each image batch')
parser.add_argument('--learning_rate', type=float, default=0.01, help='learning_rate')
parser.add_argument('--train_dir', type=str, default=r'E:\team-CV\dataset\tiny_data\VOC2007/',help='train_dir')
parser.add_argument('--model_config_path', type=str, default='config/yolov3_2cls.cfg', help='path to model config file')
parser.add_argument('--data_config_path', type=str, default='config/coco.data', help='path to data config file')
parser.add_argument('--weights_path', type=str, default='weights/yolov3.weights', help='path to weights file')
# parser.add_argument('--weights_path', type=str, default='checkpoints/40.weights', help='path to weights file')
parser.add_argument('--class_path', type=str, default='data/coco_2cls.names', help='path to class label file')
parser.add_argument('--conf_thres', type=float, default=0.8, help='object confidence threshold')
parser.add_argument('--nms_thres', type=float, default=0.4, help='iou thresshold for non-maximum suppression')
parser.add_argument('--n_cpu', type=int, default=0, help='number of cpu threads to use during batch generation')
parser.add_argument('--img_size', type=int, default=416, help='size of each image dimension')
parser.add_argument('--checkpoint_interval', type=int, default=4, help='interval between saving model weights')
parser.add_argument('--checkpoint_dir', type=str, default='checkpoints', help='directory where model checkpoints are saved')
opt = parser.parse_args()
print(opt)

os.makedirs('output', exist_ok=True)
# Checkpoints are written to opt.checkpoint_dir, so create that directory
# rather than a hard-coded 'checkpoints' (identical when the default is used).
os.makedirs(opt.checkpoint_dir, exist_ok=True)
def adjust_learning_rate(optimizer, decay_rate=0.5):
    """Multiply the learning rate of every parameter group by ``decay_rate``.

    Groups whose learning rate is already at or below 1e-8 are left
    untouched, so repeated calls cannot shrink the rate indefinitely.

    Args:
        optimizer: either an optimizer exposing ``param_groups`` directly, or
            a wrapper object that exposes the optimizer as ``.module``.
        decay_rate: multiplicative factor applied to each learning rate.
    """
    # Unwrap when the optimizer is held behind a ``.module`` attribute;
    # otherwise use the object as given.
    target = getattr(optimizer, 'module', optimizer)
    for param_group in target.param_groups:
        if param_group['lr'] > 1e-8:
            param_group['lr'] = param_group['lr'] * decay_rate
    print(target)
# torch.cuda.is_available is a function: without the call the expression
# tests the function object itself, which is always truthy.
cuda = torch.cuda.is_available()
classes = load_classes(opt.class_path)

module_defs = parse_model_config(opt.model_config_path)
# The first parsed entry holds the network hyper-parameters.
hyperparams = module_defs[0]
# The anchor list is read from the hyper-parameters as a flat
# comma-separated string and regrouped into (width, height) pairs.
anchors = hyperparams["anchors"]
anchors = [int(x) for x in anchors.split(",")]
anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
# Hand the full anchor list to every detection layer. For the stock YOLOv3
# config these are entries 83, 95 and 107; matching on the layer type keeps
# this working for configs with a different layout.
for module_def in module_defs[1:]:
    if module_def['type'] == 'yolo':
        module_def["anchors"] = anchors
batch_size = opt.batch_size  # int(hyperparams['batch'])
subdivisions = int(hyperparams['subdivisions'])
sub_batch = batch_size // subdivisions
learning_rate = opt.learning_rate
momentum = float(hyperparams['momentum'])
decay = float(hyperparams['decay'])
burn_in = int(hyperparams['burn_in'])
# The command-line image size overrides whatever the cfg file specifies.
hyperparams['height'] = hyperparams['width'] = opt.img_size

if __name__ == '__main__':
    from types import SimpleNamespace

    dataloader = torch.utils.data.DataLoader(
        ListDataset(opt.train_dir, img_size=opt.img_size, is_training=1, data_size=10000),
        batch_size=batch_size, shuffle=True, num_workers=opt.n_cpu)

    model = Darknet(module_defs, img_size=opt.img_size)
    model.load_weights(opt.weights_path, is_training=True)
    # model.apply(weights_init_normal)

    # Pick the device from what is actually available instead of assuming a
    # fixed GPU count.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # Always wrap the model: DataParallel spreads each batch over every
    # visible GPU and simply calls the wrapped module when there is one GPU
    # or none, so `model.module` below is valid in every configuration.
    model = torch.nn.DataParallel(model).to(device)

    model.train()
    Tensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
    # optimizer = optim.SGD(model.parameters(), lr=learning_rate/batch_size, momentum=momentum, dampening=0, weight_decay=decay*batch_size)
    base_optimizer = optim.Adam(model.parameters(), lr=learning_rate/batch_size, weight_decay=decay*batch_size)
    # The loop and adjust_learning_rate() reach the optimizer through
    # `.module`. nn.DataParallel is designed for nn.Module instances (it
    # calls `module.to()` when exactly one GPU is visible, which an
    # optimizer does not provide), so a plain attribute holder is used.
    optimizer = SimpleNamespace(module=base_optimizer)
    print("subdivisions", subdivisions)
    logger = Logger('./logs')
    total_loss = 0
    last_total_loss = 0
    for epoch in range(opt.epochs):
        for batch_i, (_, imgs, targets) in enumerate(dataloader):
            imgs = Variable(imgs.type(Tensor))
            targets = Variable(targets.type(Tensor), requires_grad=False)
            optimizer.module.zero_grad()
            # One loss entry per replica comes back from DataParallel;
            # reduce to a scalar before back-propagating.
            loss = model(imgs, targets)
            batch_loss = torch.sum(loss)
            batch_loss.backward()
            optimizer.module.step()
            # Keep only a Python float for the running total so it does not
            # hold on to each batch's autograd graph.
            loss_value = batch_loss.item()
            strftime = datetime.datetime.now().strftime("%H:%M:%S")
            # print(strftime, epoch, opt.epochs, batch_i, len(dataloader), loss)
            if batch_i % 40 == 39:
                # Every 40th batch: halve the learning rate when the
                # accumulated loss grew by more than 1% over the last window.
                if last_total_loss > 0 and total_loss > last_total_loss * 1.01:
                    print("total_loss", total_loss)
                    adjust_learning_rate(optimizer)
                else:
                    last_total_loss = total_loss
                total_loss = loss_value
            elif batch_i == 0:
                total_loss = loss_value
            else:
                total_loss += loss_value

            print('%s [Epoch %d/%d, Batch %d/%d Losses: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f, recall: %.5f]' %
                  (strftime, epoch, opt.epochs, batch_i, len(dataloader),
                   model.module.losses['x'], model.module.losses['y'], model.module.losses['w'],
                   model.module.losses['h'], model.module.losses['conf'], model.module.losses['cls'],
                   loss_value, model.module.losses['recall']))

        if epoch % opt.checkpoint_interval == 0:
            model.module.save_weights('%s/%d.weights' % (opt.checkpoint_dir, epoch))