
Project: awesome

FCN, UNet, DeepLabv1, DeepLabv2, DeepLabv3, and DeepLabv3+ networks

Image segmentation based on the DeepLabv3+ architecture (with introductions to v1, v2, v3)

What are some tricks of the trade for semantic segmentation?

Deep learning methods for semantic segmentation, explained: from FCN and SegNet to the DeepLab family

[Summary] FCN and CRF for image semantic segmentation

Image semantic segmentation (Semantic segmentation) survey

 

During training, PyTorch emits the following warning if `lr_scheduler.step()` is called before `optimizer.step()`:

UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at .html#how-to-adjust-learning-rate

The fix is simply to swap the two calls:

self.optimizer.step()
self.lr_scheduler.step()
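
As a minimal, self-contained illustration of the corrected order (the model, optimizer, and scheduler here are illustrative, not from the project):

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

for step in range(30):
    loss = model(torch.randn(8, 4)).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()    # update the weights first...
    scheduler.step()    # ...then advance the learning-rate schedule (PyTorch >= 1.1.0 order)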

Running deeplabv3_plus in this project, you will hit a JPU-related error; the workaround is to drop the jpu argument when constructing the model:

        self.model = get_segmentation_model(model=args.model, dataset=args.dataset, backbone=args.backbone,
                                            aux=args.aux, norm_layer=BatchNorm2d).to(self.device)
        # aux=args.aux, jpu=args.jpu, norm_layer=BatchNorm2d).to(self.device)

What is JPU? Joint Pyramid Upsampling (JPU), the joint-upsampling module introduced by FastFCN (listed under the implemented models below).
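
For intuition, here is a schematic, self-contained sketch of a JPU-style module. This is an assumption based on the FastFCN paper, not this project's implementation; channel widths and dilation rates are illustrative:

import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleJPU(nn.Module):
    """Schematic JPU sketch: fuse the last three backbone stages at one
    resolution, then extract multi-scale context with dilated separable convs."""

    def __init__(self, in_channels=(512, 1024, 2048), width=512):
        super(SimpleJPU, self).__init__()
        # project each backbone stage to a common width
        self.convs = nn.ModuleList([
            nn.Sequential(nn.Conv2d(c, width, 3, padding=1, bias=False),
                          nn.BatchNorm2d(width), nn.ReLU(True))
            for c in in_channels])
        # parallel depthwise-separable convs with increasing dilation
        self.dilated = nn.ModuleList([
            nn.Sequential(nn.Conv2d(3 * width, 3 * width, 3, padding=d, dilation=d,
                                    groups=3 * width, bias=False),
                          nn.Conv2d(3 * width, width, 1, bias=False),
                          nn.BatchNorm2d(width), nn.ReLU(True))
            for d in (1, 2, 4, 8)])

    def forward(self, c3, c4, c5):
        feats = [conv(c) for conv, c in zip(self.convs, (c3, c4, c5))]
        size = feats[0].shape[2:]  # upsample everything to the c3 resolution
        feats = [feats[0]] + [F.interpolate(f, size, mode='bilinear', align_corners=True)
                              for f in feats[1:]]
        x = torch.cat(feats, dim=1)
        return torch.cat([d(x) for d in self.dilated], dim=1)  # 4 * width channels

jpu = SimpleJPU()
out = jpu(torch.randn(1, 512, 60, 60), torch.randn(1, 1024, 30, 30), torch.randn(1, 2048, 15, 15))
print(out.shape)  # torch.Size([1, 2048, 60, 60])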

You will also run into this error:

Traceback (most recent call last):
  File "train_new.py", line 352, in <module>
    trainer = Trainer(args)
  File "train_new.py", line 171, in __init__
    aux=args.aux, norm_layer=BatchNorm2d).to(self.device)
  File "/train/results/ynh_copy/semantic-segmentation-bak/core/models/model_zoo.py", line 122, in get_segmentation_model
    return models[model](**kwargs)
  File "/train/results/ynh_copy/semantic-segmentation-bak/core/models/deeplabv3_plus.py", line 127, in get_deeplabv3_plus
    model = DeepLabV3Plus(datasets[dataset].NUM_CLASS, backbone=backbone, pretrained_base=pretrained_base, **kwargs)
  File "/train/results/ynh_copy/semantic-segmentation-bak/core/models/deeplabv3_plus.py", line 38, in __init__
    self.pretrained = get_xception(pretrained=pretrained_base, output_stride=output_stride, **kwargs)
  File "/train/results/ynh_copy/semantic-segmentation-bak/core/models/base_models/xception.py", line 390, in get_xception
    model.load_state_dict(torch.load(get_model_file('xception', root=root)))
  File "/train/results/ynh_copy/semantic-segmentation-bak/core/models/model_store.py", line 68, in get_model_file
    raise ValueError('Model file is not found. Downloading or trainning.')
ValueError: Model file is not found. Downloading or trainning.

The pretrained backbone cannot be loaded, because the pretrained weights are not provided:

 deeplabv3_plus.py

    from ..data.dataloader import datasets
    model = DeepLabV3Plus(datasets[dataset].NUM_CLASS, backbone=backbone, pretrained_base=pretrained_base, **kwargs)
    if pretrained:
        from .model_store import get_model_file
        device = torch.device(kwargs['local_rank'])
        model.load_state_dict(torch.load(get_model_file('deeplabv3_plus_%s_%s' % (backbone, acronyms[dataset]), root=root),
                                         map_location=device))
    return model

 deeplabv3_plus.py

class DeepLabV3Plus(nn.Module):
    r"""DeepLabV3Plus

    Parameters
    ----------
    nclass : int
        Number of categories for the training dataset.
    backbone : string
        Pre-trained dilated backbone network type (default: 'xception').
    norm_layer : object
        Normalization layer used in backbone network (default: :class:`nn.BatchNorm`;
        for Synchronized Cross-GPU BatchNormalization).
    aux : bool
        Auxiliary loss.

    Reference:
        Chen, Liang-Chieh, et al. "Encoder-Decoder with Atrous Separable Convolution for Semantic
        Image Segmentation."
    """

    def __init__(self, nclass, backbone='xception', aux=True, pretrained_base=True, dilated=True, **kwargs):
        super(DeepLabV3Plus, self).__init__()
        self.aux = aux
        self.nclass = nclass
        output_stride = 8 if dilated else 32
        self.pretrained = get_xception(pretrained=pretrained_base, output_stride=output_stride, **kwargs)
        # deeplabv3 plus
        self.head = _DeepLabHead(nclass, **kwargs)
        if aux:
            self.auxlayer = _FCNHead(728, nclass, **kwargs)
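
The class's forward pass is not quoted above. For orientation, here is a hypothetical sketch of what it typically looks like in this family of models; base_forward and the head's call signature are assumptions, not code from the project:

import torch.nn.functional as F

def forward(self, x):
    # Hypothetical sketch; not quoted from the project.
    size = x.size()[2:]
    c1, _, c3, c4 = self.base_forward(x)   # shallow + deep backbone features (assumed helper)
    outputs = []
    x = self.head(c4, c1)                  # decoder fuses deep context with shallow detail
    x = F.interpolate(x, size, mode='bilinear', align_corners=True)
    outputs.append(x)
    if self.aux:
        auxout = self.auxlayer(c3)
        auxout = F.interpolate(auxout, size, mode='bilinear', align_corners=True)
        outputs.append(auxout)
    return tuple(outputs)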

xception.py 

# Constructor
def get_xception(pretrained=False, root='~/.torch/models', **kwargs):
    model = Xception65(**kwargs)
    if pretrained:
        from ..model_store import get_model_file
        model.load_state_dict(torch.load(get_model_file('xception', root=root)))
    return model

For a sample of how download URLs are organized, see:

.6/site-packages/gluoncv/model_zoo/model_store.py

Let's improve it:

xception.py 

# Constructor
def get_xception(pretrained=False, root='~/.torch/models', **kwargs):
    model = Xception65(**kwargs)
    # if pretrained:
    #     from ..model_store import get_model_file
    #     model.load_state_dict(torch.load(get_model_file('xception', root=root)))
    # return model
    from ..model_store import get_resnet_file
    if pretrained:
        model.load_state_dict(get_resnet_file('xception'))
    return model

model_store.py

Model file /home/spple/.torch/models/resnet152-0d43d698.pth is not found. Downloading.
Downloading /home/spple/.torch/models/resnet152-0d43d698.zip from .zip...  
4%|▎         | 7776/218951 [00:11<05:29, 641.52KB/s]
"""Model store which provides pretrained models."""
from __future__ import print_functionimport os
import zipfilefrom ..utils.download import download, check_sha1__all__ = ['get_model_file', 'get_resnet_file']_model_sha1 = {name: checksum for checksum, name in [('25c4b50959ef024fcc050213a06b614899f94b3d', 'resnet50'),('2a57e44de9c853fa015b172309a1ee7e2d0e4e2a', 'resnet101'),('0d43d698c66aceaa2bc0309f55efdd7ff4b143af', 'resnet152'),('0d43d698c66aceaa2bc0309f55efdd7ff4b143af', 'resnet152'),('37c1c90b56800303a66934487fbf017bca8bba00', 'xception'),
]}encoding_repo_url = '/'
_url_format = '{repo_url}encoding/models/{file_name}.zip'
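
The body of get_resnet_file is not quoted in this post. Here is a minimal sketch of what such a helper could look like, assuming the download/check_sha1/zipfile pattern visible above and the file naming visible in the download log (everything beyond the names shown above is an assumption):

import os
import zipfile
import torch

def get_resnet_file(name, root='~/.torch/models'):
    # Hypothetical sketch only -- the real body is not quoted in this post.
    # Builds '<name>-<8-char sha1 prefix>.pth', downloads and unzips it if
    # missing or corrupt, then returns the loaded state dict.
    file_name = '{name}-{short_hash}'.format(name=name, short_hash=_model_sha1[name][:8])
    root = os.path.expanduser(root)
    file_path = os.path.join(root, file_name + '.pth')
    sha1_hash = _model_sha1[name]
    if not (os.path.exists(file_path) and check_sha1(file_path, sha1_hash)):
        if not os.path.exists(root):
            os.makedirs(root)
        zip_path = os.path.join(root, file_name + '.zip')
        # download the zipped checkpoint and unpack it next to file_path
        download(_url_format.format(repo_url=encoding_repo_url, file_name=file_name),
                 path=zip_path, overwrite=True)
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(root)
        os.remove(zip_path)
        if not check_sha1(file_path, sha1_hash):
            raise ValueError('Downloaded file has a different hash. Please try again.')
    return torch.load(file_path)  # state dict, ready for model.load_state_dict()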

However, I could not find a download URL for the xception weights online.

So, in deeplabv3_plus.py, change the default pretrained_base=True to pretrained_base=False:

def get_deeplabv3_plus(dataset='pascal_voc', backbone='xception', pretrained=False, root='~/.torch/models',
                       pretrained_base=False, **kwargs):
    acronyms = {
        'pascal_voc': 'pascal_voc',
        'pascal_aug': 'pascal_aug',
        'ade20k': 'ade',
        'coco': 'coco',
        'citys': 'citys',
    }
    from ..data.dataloader import datasets
    model = DeepLabV3Plus(datasets[dataset].NUM_CLASS, backbone=backbone, pretrained_base=pretrained_base, **kwargs)
    if pretrained:
        from .model_store import get_model_file
        device = torch.device(kwargs['local_rank'])
        model.load_state_dict(torch.load(get_model_file('deeplabv3_plus_%s_%s' % (backbone, acronyms[dataset]), root=root),
                                         map_location=device))
    return model

In deeplabv3.py, give norm_kwargs a default value of None:

class _ASPP(nn.Module):
    def __init__(self, in_channels, atrous_rates, norm_layer, norm_kwargs=None, **kwargs):
        super(_ASPP, self).__init__()
        out_channels = 256
        self.b0 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 1, bias=False),
            norm_layer(out_channels, **({} if norm_kwargs is None else norm_kwargs)),
            nn.ReLU(True)
        )
        rate1, rate2, rate3 = tuple(atrous_rates)
        self.b1 = _ASPPConv(in_channels, out_channels, rate1, norm_layer, norm_kwargs)
        self.b2 = _ASPPConv(in_channels, out_channels, rate2, norm_layer, norm_kwargs)
        self.b3 = _ASPPConv(in_channels, out_channels, rate3, norm_layer, norm_kwargs)
        self.b4 = _AsppPooling(in_channels, out_channels, norm_layer=norm_layer, norm_kwargs=norm_kwargs)
        self.project = nn.Sequential(
            nn.Conv2d(5 * out_channels, out_channels, 1, bias=False),
            norm_layer(out_channels, **({} if norm_kwargs is None else norm_kwargs)),
            nn.ReLU(True),
            nn.Dropout(0.5)
        )

    def forward(self, x):
        feat1 = self.b0(x)
        feat2 = self.b1(x)
        feat3 = self.b2(x)
        feat4 = self.b3(x)
        feat5 = self.b4(x)
        x = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
        x = self.project(x)
        return x
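
A quick usage sketch showing why the None default matters: _ASPP can now be constructed without passing norm_kwargs at all. This assumes it runs inside deeplabv3.py, where _ASPPConv and _AsppPooling are defined; the shapes and atrous rates are illustrative:

import torch
import torch.nn as nn

# norm_kwargs defaults to None, so norm_layer(out_channels) gets no extra kwargs
aspp = _ASPP(in_channels=2048, atrous_rates=[12, 24, 36], norm_layer=nn.BatchNorm2d)
y = aspp(torch.randn(2, 2048, 33, 33))
print(y.shape)  # torch.Size([2, 256, 33, 33])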

If you have four GPUs, with GPUs 0 and 1 running one job and GPUs 2 and 3 running another:

The first job is:

#nohup 
/root/train/results/ynh_copy/anaconda3_py3.7/bin/python train_new.py \
--model deeplabv3 \
--backbone resnet50 \
--dataset pascal_voc \
--lr 0.0001 \
--epochs 200 \
--gpu-ids 0,1 \
--save-dir ../save_model \
--batch-size 6 #>out.log 2>&1 &

Then, for the second job, hard-code the device in the code:

    if not args.no_cuda and torch.cuda.is_available():
        # cudnn.benchmark = True
        # args.device = "cuda"
        args.device = "cuda:2"
    else:
        args.distributed = False
        args.device = "cpu"

class Trainer(object):
    def __init__(self, args):
        self.args = args
        self.device = torch.device(args.device)
        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
#nohup 
/root/train/results/ynh_copy/anaconda3_py3.7/bin/python train_new.py \
--model deeplabv3 \
--backbone resnet101 \
--dataset pascal_voc \
--lr 0.0001 \
--epochs 200 \
--gpu-ids 2,3 \
--local_rank 2 \
--save-dir ../save_model \
--batch-size 4 #>out.log 2>&1 &
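
An alternative that avoids editing the source: launch the second job with the environment variable CUDA_VISIBLE_DEVICES=2,3, so the process only sees GPUs 2 and 3 (exposed to it as cuda:0 and cuda:1) and --gpu-ids 0,1 can be used unchanged.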

Some references:

.WORK/blob/master/semantic_segmentation.md


The implemented models are as follows:

FCN
ENet
PSPNet
ICNet
DeepLabv3
DeepLabv3+
DenseASPP
EncNet
BiSeNet
PSANet
DANet
OCNet
CGNet
ESPNetv2
CCNet
DUNet(DUpsampling)
FastFCN(JPU)
LEDNet
Fast-SCNN
LightSeg
DFANet

The project's default save path:

    parser.add_argument('--save-dir', default='~/.torch/models',
                        help='Directory for saving checkpoint models')
    parser.add_argument('--save-epoch', type=int, default=10,
                        help='save model every checkpoint-epoch')
    parser.add_argument('--log-dir', default='../runs/logs/',
                        help='Directory for saving checkpoint models')

The default number of training epochs is 50, but we change it to 80:

    parser.add_argument('--epochs', type=int, default=80, metavar='N',
                        help='number of epochs to train (default: 50)')

To train on several GPUs with nn.DataParallel, the following line is added:

self.model = nn.DataParallel(self.model, device_ids=args.gpu_ids)

The full train_new.py:
import argparse
import time
import datetime
import os
import shutil
import sys

cur_path = os.path.abspath(os.path.dirname(__file__))
root_path = os.path.split(cur_path)[0]
sys.path.append(root_path)

import torch
import torch.nn as nn
import torch.utils.data as data
import torch.backends.cudnn as cudnn

from torchvision import transforms
from core.data.dataloader import get_segmentation_dataset
from core.models.model_zoo import get_segmentation_model
from core.utils.loss import get_segmentation_loss
from core.utils.distributed import *
from core.utils.logger import setup_logger
from core.utils.lr_scheduler import WarmupPolyLR
from core.utils.score import SegmentationMetric


def parse_args():
    parser = argparse.ArgumentParser(description='Semantic Segmentation Training With Pytorch')
    # model and dataset
    parser.add_argument('--model', type=str, default='fcn',
                        choices=['fcn32s', 'fcn16s', 'fcn8s', 'fcn', 'psp', 'deeplabv3', 'deeplabv3_plus',
                                 'danet', 'denseaspp', 'bisenet', 'encnet', 'dunet', 'icnet',
                                 'enet', 'ocnet', 'ccnet', 'psanet', 'cgnet', 'espnet', 'lednet', 'dfanet'],
                        help='model name (default: fcn32s)')
    parser.add_argument('--backbone', type=str, default='resnet50',
                        choices=['vgg16', 'resnet18', 'resnet50', 'resnet101', 'resnet152',
                                 'densenet121', 'densenet161', 'densenet169', 'densenet201'],
                        help='backbone name (default: vgg16)')
    parser.add_argument('--dataset', type=str, default='pascal_voc',
                        choices=['pascal_voc', 'pascal_aug', 'ade20k', 'citys', 'sbu'],
                        help='dataset name (default: pascal_voc)')
    parser.add_argument('--base-size', type=int, default=520,
                        help='base image size')
    parser.add_argument('--crop-size', type=int, default=480,
                        help='crop image size')
    parser.add_argument('--workers', '-j', type=int, default=4,
                        metavar='N', help='dataloader threads')
    # training hyper params
    parser.add_argument('--jpu', action='store_true', default=False,
                        help='JPU')
    parser.add_argument('--use-ohem', type=bool, default=False,
                        help='OHEM Loss for cityscapes dataset')
    parser.add_argument('--aux', action='store_true', default=False,
                        help='Auxiliary loss')
    parser.add_argument('--aux-weight', type=float, default=0.4,
                        help='auxiliary loss weight')
    parser.add_argument('--batch-size', type=int, default=4, metavar='N',
                        help='input batch size for training (default: 8)')
    parser.add_argument('--start_epoch', type=int, default=0,
                        metavar='N', help='start epochs (default: 0)')
    parser.add_argument('--epochs', type=int, default=50, metavar='N',
                        help='number of epochs to train (default: 50)')
    parser.add_argument('--lr', type=float, default=1e-4, metavar='LR',
                        help='learning rate (default: 1e-4)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                        help='momentum (default: 0.9)')
    parser.add_argument('--weight-decay', type=float, default=1e-4, metavar='M',
                        help='w-decay (default: 5e-4)')
    parser.add_argument('--warmup-iters', type=int, default=0,
                        help='warmup iters')
    parser.add_argument('--warmup-factor', type=float, default=1.0 / 3,
                        help='lr = warmup_factor * lr')
    parser.add_argument('--warmup-method', type=str, default='linear',
                        help='method of warmup')
    # cuda setting
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--local_rank', type=int, default=0)
    # checkpoint and log
    parser.add_argument('--resume', type=str, default=None,
                        help='put the path to resuming file if needed')
    parser.add_argument('--save-dir', default='~/.torch/models',
                        help='Directory for saving checkpoint models')
    parser.add_argument('--save-epoch', type=int, default=10,
                        help='save model every checkpoint-epoch')
    parser.add_argument('--log-dir', default='../runs/logs/',
                        help='Directory for saving checkpoint models')
    parser.add_argument('--log-iter', type=int, default=10,
                        help='print log every log-iter')
    # evaluation only
    parser.add_argument('--val-epoch', type=int, default=1,
                        help='run validation every val-epoch')
    parser.add_argument('--skip-val', action='store_true', default=False,
                        help='skip validation during training')
    parser.add_argument('--gpu-ids', type=str, default='0,1,2,3',
                        help='use which gpu to train, must be a comma-separated list of integers only (default=0)')
    args = parser.parse_args()
    # default settings for epochs, batch_size and lr
    if args.epochs is None:
        epoches = {'coco': 30, 'pascal_aug': 80, 'pascal_voc': 50, 'pcontext': 80,
                   'ade20k': 160, 'citys': 120, 'sbu': 160}
        args.epochs = epoches[args.dataset.lower()]
    if args.lr is None:
        lrs = {'coco': 0.004, 'pascal_aug': 0.001, 'pascal_voc': 0.0001, 'pcontext': 0.001,
               'ade20k': 0.01, 'citys': 0.01, 'sbu': 0.001}
        args.lr = lrs[args.dataset.lower()] / 8 * args.batch_size
    return args


class Trainer(object):
    def __init__(self, args):
        self.args = args
        self.device = torch.device(args.device)
        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
        # dataset and dataloader
        data_kwargs = {'transform': input_transform, 'base_size': args.base_size, 'crop_size': args.crop_size}
        train_dataset = get_segmentation_dataset(args.dataset, split='train', mode='train', **data_kwargs)
        val_dataset = get_segmentation_dataset(args.dataset, split='val', mode='val', **data_kwargs)
        args.iters_per_epoch = len(train_dataset) // (args.num_gpus * args.batch_size)
        args.max_iters = args.epochs * args.iters_per_epoch
        if args.mutilgpu:
            args.batch_size = args.batch_size * len(args.gpu_ids)
        train_sampler = make_data_sampler(train_dataset, shuffle=True, distributed=args.distributed)
        train_batch_sampler = make_batch_data_sampler(train_sampler, args.batch_size, args.max_iters)
        val_sampler = make_data_sampler(val_dataset, False, args.distributed)
        val_batch_sampler = make_batch_data_sampler(val_sampler, args.batch_size)
        self.train_loader = data.DataLoader(dataset=train_dataset,
                                            batch_sampler=train_batch_sampler,
                                            num_workers=args.workers,
                                            pin_memory=True)
        self.val_loader = data.DataLoader(dataset=val_dataset,
                                          batch_sampler=val_batch_sampler,
                                          num_workers=args.workers,
                                          pin_memory=True)
        # create network
        BatchNorm2d = nn.SyncBatchNorm if args.distributed else nn.BatchNorm2d
        self.model = get_segmentation_model(model=args.model, dataset=args.dataset, backbone=args.backbone,
                                            aux=args.aux, jpu=args.jpu, norm_layer=BatchNorm2d).to(self.device)
        # resume checkpoint if needed
        if args.resume:
            if os.path.isfile(args.resume):
                name, ext = os.path.splitext(args.resume)
                assert ext == '.pkl' or '.pth', 'Sorry only .pth and .pkl files supported.'
                print('Resuming training, loading {}...'.format(args.resume))
                self.model.load_state_dict(torch.load(args.resume, map_location=lambda storage, loc: storage))
        # create criterion
        self.criterion = get_segmentation_loss(args.model, use_ohem=args.use_ohem, aux=args.aux,
                                               aux_weight=args.aux_weight, ignore_index=-1).to(self.device)
        # optimizer, for model just includes pretrained, head and auxlayer
        params_list = list()
        if hasattr(self.model, 'pretrained'):
            params_list.append({'params': self.model.pretrained.parameters(), 'lr': args.lr})
        if hasattr(self.model, 'exclusive'):
            for module in self.model.exclusive:
                params_list.append({'params': getattr(self.model, module).parameters(), 'lr': args.lr * 10})
        self.optimizer = torch.optim.SGD(params_list,
                                         lr=args.lr,
                                         momentum=args.momentum,
                                         weight_decay=args.weight_decay)
        # lr scheduling
        self.lr_scheduler = WarmupPolyLR(self.optimizer,
                                         max_iters=args.max_iters,
                                         power=0.9,
                                         warmup_factor=args.warmup_factor,
                                         warmup_iters=args.warmup_iters,
                                         warmup_method=args.warmup_method)
        if args.distributed:
            self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[args.local_rank],
                                                             output_device=args.local_rank)
        if args.mutilgpu:
            self.model = nn.DataParallel(self.model, device_ids=args.gpu_ids)
        # evaluation metrics
        self.metric = SegmentationMetric(train_dataset.num_class)
        self.best_pred = 0.0

    def train(self):
        save_to_disk = get_rank() == 0
        epochs, max_iters = self.args.epochs, self.args.max_iters
        log_per_iters, val_per_iters = self.args.log_iter, self.args.val_epoch * self.args.iters_per_epoch
        save_per_iters = self.args.save_epoch * self.args.iters_per_epoch
        start_time = time.time()
        logger.info('Start training, Total Epochs: {:d} = Total Iterations {:d}'.format(epochs, max_iters))
        self.model.train()
        for iteration, (images, targets, _) in enumerate(self.train_loader):
            iteration = iteration + 1
            images = images.to(self.device)
            targets = targets.to(self.device)
            outputs = self.model(images)
            loss_dict = self.criterion(outputs, targets)
            losses = sum(loss for loss in loss_dict.values())
            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            self.optimizer.zero_grad()
            losses.backward()
            self.optimizer.step()
            self.lr_scheduler.step()
            eta_seconds = ((time.time() - start_time) / iteration) * (max_iters - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            if iteration % log_per_iters == 0 and save_to_disk:
                logger.info(
                    "Iters: {:d}/{:d} || Lr: {:.6f} || Loss: {:.4f} || Cost Time: {} || Estimated Time: {}".format(
                        iteration, max_iters, self.optimizer.param_groups[0]['lr'], losses_reduced.item(),
                        str(datetime.timedelta(seconds=int(time.time() - start_time))), eta_string))
            if iteration % save_per_iters == 0 and save_to_disk:
                save_checkpoint(self.model, self.args, is_best=False)
            if not self.args.skip_val and iteration % val_per_iters == 0:
                self.validation()
                self.model.train()
        save_checkpoint(self.model, self.args, is_best=False)
        total_training_time = time.time() - start_time
        total_training_str = str(datetime.timedelta(seconds=total_training_time))
        logger.info("Total training time: {} ({:.4f}s / it)".format(total_training_str,
                                                                    total_training_time / max_iters))

    def validation(self):
        # total_inter, total_union, total_correct, total_label = 0, 0, 0, 0
        is_best = False
        self.metric.reset()
        if self.args.distributed:
            model = self.model.module
        else:
            model = self.model
        torch.cuda.empty_cache()  # TODO check if it helps
        model.eval()
        for i, (image, target, filename) in enumerate(self.val_loader):
            image = image.to(self.device)
            target = target.to(self.device)
            with torch.no_grad():
                outputs = model(image)
            self.metric.update(outputs[0], target)
            pixAcc, mIoU = self.metric.get()
            logger.info("Sample: {:d}, Validation pixAcc: {:.3f}, mIoU: {:.3f}".format(i + 1, pixAcc, mIoU))
        new_pred = (pixAcc + mIoU) / 2
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
        save_checkpoint(self.model, self.args, is_best)
        synchronize()


def save_checkpoint(model, args, is_best=False):
    """Save Checkpoint"""
    directory = os.path.expanduser(args.save_dir)
    if not os.path.exists(directory):
        os.makedirs(directory)
    filename = '{}_{}_{}.pth'.format(args.model, args.backbone, args.dataset)
    filename = os.path.join(directory, filename)
    if args.distributed:
        model = model.module
    if args.mutilgpu:
        model = model.module
    torch.save(model.state_dict(), filename)
    if is_best:
        best_filename = '{}_{}_{}_best_model.pth'.format(args.model, args.backbone, args.dataset)
        best_filename = os.path.join(directory, best_filename)
        shutil.copyfile(filename, best_filename)


if __name__ == '__main__':
    args = parse_args()
    if args.no_cuda is False:
        try:
            args.gpu_ids = [int(s) for s in args.gpu_ids.split(',')]
        except ValueError:
            raise ValueError('Argument --gpu_ids must be a comma-separated list of integers only')
    # reference maskrcnn-benchmark
    # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    # args.num_gpus = num_gpus
    num_gpus = len(args.gpu_ids)
    args.num_gpus = num_gpus
    # args.distributed = num_gpus > 1
    args.distributed = False
    args.mutilgpu = num_gpus > 1
    if not args.no_cuda and torch.cuda.is_available():
        cudnn.benchmark = True
        args.device = "cuda"
    else:
        args.distributed = False
        args.device = "cpu"
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()
    args.lr = args.lr * num_gpus
    logger = setup_logger("semantic_segmentation", args.log_dir, get_rank(),
                          filename='{}_{}_{}_log.txt'.format(args.model, args.backbone, args.dataset))
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)
    trainer = Trainer(args)
    trainer.train()
    torch.cuda.empty_cache()
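
Note that the script scales the learning rate by the number of GPUs (args.lr = args.lr * num_gpus), so for example --lr 0.01 on four GPUs gives an effective base learning rate of 0.04. Training is then launched with: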
nohup /root/train/results/ynh_copy/anaconda3_py3.7/bin/python train_new.py \
--model deeplabv3 \
--backbone resnet50 \
--dataset pascal_voc \
--lr 0.01 \
--epochs 80 \
--gpu-ids 0,1,2,3 \
--batch-size 16 #>out.log 2>&1 &

#nohup 
/root/train/results/ynh_copy/anaconda3_py3.7/bin/python train_new.py \
--model deeplabv3 \
--backbone resnet50 \
--dataset pascal_voc \
--lr 0.0001 \
--epochs 80 \
--gpu-ids 0,1,2,3 \
--save-dir ../save_model \
--batch-size 8 #>out.log 2>&1 &

 
