Comments (9)
import argparse
import time
import datetime
import os
import shutil
import sys
cur_path = os.path.abspath(os.path.dirname(__file__))
root_path = os.path.split(cur_path)[0]
sys.path.append(root_path)
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.backends.cudnn as cudnn
from torchvision import transforms
from core.data.dataloader import get_segmentation_dataset
from core.models.model_zoo import get_segmentation_model
from core.utils.loss import get_segmentation_loss
from core.utils.distributed import *
from core.utils.logger import setup_logger
from core.utils.lr_scheduler import WarmupPolyLR
from core.utils.score import SegmentationMetric
def _str2bool(value):
    """Convert a command-line token such as ``true``/``False``/``1`` to a bool.

    ``type=bool`` must not be used with argparse: ``bool('False')`` is ``True``
    because every non-empty string is truthy.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got {!r}'.format(value))


def parse_args(argv=None):
    """Parse the training command-line options.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        argparse.Namespace holding every option declared below.
    """
    parser = argparse.ArgumentParser(description='Semantic Segmentation Training With Pytorch')
    # model and dataset
    parser.add_argument('--model', type=str, default='fcn',
                        choices=['fcn32s', 'fcn16s', 'fcn8s',
                                 'fcn', 'psp', 'deeplabv3', 'deeplabv3_plus',
                                 'danet', 'denseaspp', 'bisenet',
                                 'encnet', 'dunet', 'icnet',
                                 'enet', 'ocnet', 'ccnet', 'psanet',
                                 'cgnet', 'espnet', 'lednet', 'dfanet'],
                        help='model name (default: fcn)')
    parser.add_argument('--backbone', type=str, default='resnet50',
                        choices=['vgg16', 'resnet18', 'resnet50',
                                 'resnet101', 'resnet152', 'densenet121',
                                 'densenet161', 'densenet169', 'densenet201'],
                        help='backbone name (default: resnet50)')
    parser.add_argument('--dataset', type=str, default='pascal_voc',
                        choices=['pascal_voc', 'pascal_aug', 'ade20k',
                                 'citys', 'sbu'],
                        help='dataset name (default: pascal_voc)')
    parser.add_argument('--base-size', type=int, default=520,
                        help='base image size')
    parser.add_argument('--crop-size', type=int, default=480,
                        help='crop image size')
    parser.add_argument('--workers', '-j', type=int, default=4,
                        metavar='N', help='dataloader threads')
    # training hyper params
    parser.add_argument('--jpu', action='store_true', default=False,
                        help='JPU')
    # Accepts "--use-ohem", "--use-ohem True" and "--use-ohem False".
    parser.add_argument('--use-ohem', type=_str2bool, nargs='?', const=True, default=False,
                        help='OHEM Loss for cityscapes dataset')
    parser.add_argument('--aux', action='store_true', default=False,
                        help='Auxiliary loss')
    parser.add_argument('--aux-weight', type=float, default=0.4,
                        help='auxiliary loss weight')
    parser.add_argument('--batch-size', type=int, default=4, metavar='N',
                        help='input batch size for training (default: 4)')
    parser.add_argument('--start_epoch', type=int, default=0,
                        metavar='N', help='start epochs (default:0)')
    parser.add_argument('--epochs', type=int, default=50, metavar='N',
                        help='number of epochs to train (default: 50)')
    parser.add_argument('--lr', type=float, default=1e-4, metavar='LR',
                        help='learning rate (default: 1e-4)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                        help='momentum (default: 0.9)')
    parser.add_argument('--weight-decay', type=float, default=1e-4, metavar='M',
                        help='w-decay (default: 1e-4)')
    parser.add_argument('--warmup-iters', type=int, default=0,
                        help='warmup iters')
    parser.add_argument('--warmup-factor', type=float, default=1.0 / 3,
                        help='lr = warmup_factor * lr')
    parser.add_argument('--warmup-method', type=str, default='linear',
                        help='method of warmup')
    # cuda setting
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--local_rank', type=int, default=0)
    # checkpoint and log
    parser.add_argument('--resume', type=str, default=None,
                        help='put the path to resuming file if needed')
    parser.add_argument('--save-dir', default='~/.torch/models',
                        help='Directory for saving checkpoint models')
    parser.add_argument('--save-epoch', type=int, default=10,
                        help='save model every checkpoint-epoch')
    parser.add_argument('--log-dir', default='../runs/logs/',
                        help='Directory for saving training logs')
    parser.add_argument('--log-iter', type=int, default=10,
                        help='print log every log-iter')
    # evaluation only
    parser.add_argument('--val-epoch', type=int, default=1,
                        help='run validation every val-epoch')
    parser.add_argument('--skip-val', action='store_true', default=False,
                        help='skip validation during training')
    parser.add_argument('--gpu-ids', type=str, default='0,1,2,3',
                        help='use which gpu to train, must be a '
                             'comma-separated list of integers only (default: 0,1,2,3)')
    args = parser.parse_args(argv)

    # Per-dataset fallbacks for epochs and lr.
    # NOTE(review): both options currently have non-None defaults, so these
    # branches only run if a caller sets the attribute to None explicitly.
    if args.epochs is None:
        epoches = {
            'coco': 30,
            'pascal_aug': 80,
            'pascal_voc': 50,
            'pcontext': 80,
            'ade20k': 160,
            'citys': 120,
            'sbu': 160,
        }
        args.epochs = epoches[args.dataset.lower()]
    if args.lr is None:
        lrs = {
            'coco': 0.004,
            'pascal_aug': 0.001,
            'pascal_voc': 0.0001,
            'pcontext': 0.001,
            'ade20k': 0.01,
            'citys': 0.01,
            'sbu': 0.001,
        }
        # Table values are scaled linearly from a reference batch size of 8.
        args.lr = lrs[args.dataset.lower()] / 8 * args.batch_size
    return args
class Trainer(object):
    """Wires up data loaders, model, loss, optimizer and LR schedule, then trains.

    ``train`` and ``validation`` log through a module-level ``logger`` that the
    script entry point creates before the trainer is constructed.
    """

    # File extensions accepted for the ``--resume`` checkpoint.
    _CHECKPOINT_EXTS = ('.pkl', '.pth')

    @staticmethod
    def _is_supported_checkpoint(path):
        """Return True when *path* ends in one of the accepted checkpoint extensions."""
        return os.path.splitext(path)[1] in Trainer._CHECKPOINT_EXTS

    def __init__(self, args):
        """Build every training component from the parsed options.

        Args:
            args: Namespace from ``parse_args`` extended by the entry point with
                ``device``, ``num_gpus``, ``distributed`` and ``mutilgpu``.
                This constructor also writes ``iters_per_epoch`` and
                ``max_iters`` onto it and, in multi-GPU mode, rescales
                ``batch_size``.

        Raises:
            ValueError: if ``args.resume`` names an existing file whose
                extension is neither ``.pth`` nor ``.pkl``.
        """
        self.args = args
        self.device = torch.device(args.device)

        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])

        # dataset and dataloader
        data_kwargs = {'transform': input_transform, 'base_size': args.base_size, 'crop_size': args.crop_size}
        train_dataset = get_segmentation_dataset(args.dataset, split='train', mode='train', **data_kwargs)
        val_dataset = get_segmentation_dataset(args.dataset, split='val', mode='val', **data_kwargs)
        # Computed before batch_size is rescaled below, so this is the number
        # of global (all-GPU) batches per epoch.
        args.iters_per_epoch = len(train_dataset) // (args.num_gpus * args.batch_size)
        args.max_iters = args.epochs * args.iters_per_epoch
        if args.mutilgpu:
            # DataParallel splits each batch across the GPUs, so the loader
            # must deliver the per-GPU batch size times the GPU count.
            args.batch_size = args.batch_size * len(args.gpu_ids)

        train_sampler = make_data_sampler(train_dataset, shuffle=True, distributed=args.distributed)
        train_batch_sampler = make_batch_data_sampler(train_sampler, args.batch_size, args.max_iters)
        val_sampler = make_data_sampler(val_dataset, False, args.distributed)
        val_batch_sampler = make_batch_data_sampler(val_sampler, args.batch_size)

        self.train_loader = data.DataLoader(dataset=train_dataset,
                                            batch_sampler=train_batch_sampler,
                                            num_workers=args.workers,
                                            pin_memory=True)
        self.val_loader = data.DataLoader(dataset=val_dataset,
                                          batch_sampler=val_batch_sampler,
                                          num_workers=args.workers,
                                          pin_memory=True)

        # create network
        BatchNorm2d = nn.SyncBatchNorm if args.distributed else nn.BatchNorm2d
        self.model = get_segmentation_model(model=args.model, dataset=args.dataset, backbone=args.backbone,
                                            aux=args.aux, jpu=args.jpu, norm_layer=BatchNorm2d).to(self.device)

        # resume checkpoint if needed
        if args.resume:
            if os.path.isfile(args.resume):
                # The previous `assert ext == '.pkl' or '.pth'` was always true
                # ('.pth' is a truthy string), so nothing was ever validated.
                if not self._is_supported_checkpoint(args.resume):
                    raise ValueError('Sorry only .pth and .pkl files supported.')
                print('Resuming training, loading {}...'.format(args.resume))
                # Load onto CPU storage first; load_state_dict copies the
                # values into the model's own parameters.
                self.model.load_state_dict(torch.load(args.resume, map_location=lambda storage, loc: storage))
            else:
                # Say so instead of silently training from scratch.
                print('Resume file {} not found, training without it.'.format(args.resume))

        # create criterion
        self.criterion = get_segmentation_loss(args.model, use_ohem=args.use_ohem, aux=args.aux,
                                               aux_weight=args.aux_weight, ignore_index=-1).to(self.device)

        # optimizer, for model just includes pretrained, head and auxlayer
        params_list = list()
        if hasattr(self.model, 'pretrained'):
            params_list.append({'params': self.model.pretrained.parameters(), 'lr': args.lr})
        if hasattr(self.model, 'exclusive'):
            # Modules named in `exclusive` train at 10x the base learning rate.
            for module in self.model.exclusive:
                params_list.append({'params': getattr(self.model, module).parameters(), 'lr': args.lr * 10})
        self.optimizer = torch.optim.SGD(params_list,
                                         lr=args.lr,
                                         momentum=args.momentum,
                                         weight_decay=args.weight_decay)

        # lr scheduling
        self.lr_scheduler = WarmupPolyLR(self.optimizer,
                                         max_iters=args.max_iters,
                                         power=0.9,
                                         warmup_factor=args.warmup_factor,
                                         warmup_iters=args.warmup_iters,
                                         warmup_method=args.warmup_method)

        if args.distributed:
            self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[args.local_rank],
                                                             output_device=args.local_rank)
        if args.mutilgpu:
            self.model = nn.DataParallel(self.model, device_ids=args.gpu_ids)

        # evaluation metrics
        self.metric = SegmentationMetric(train_dataset.num_class)
        self.best_pred = 0.0

    def train(self):
        """Run the training loop over ``self.train_loader``.

        Logs every ``log_iter`` iterations, checkpoints every ``save_epoch``
        epochs, validates every ``val_epoch`` epochs unless ``skip_val`` is
        set, and saves one more checkpoint after the loop ends.
        """
        save_to_disk = get_rank() == 0
        epochs, max_iters = self.args.epochs, self.args.max_iters
        log_per_iters, val_per_iters = self.args.log_iter, self.args.val_epoch * self.args.iters_per_epoch
        save_per_iters = self.args.save_epoch * self.args.iters_per_epoch
        start_time = time.time()
        logger.info('Start training, Total Epochs: {:d} = Total Iterations {:d}'.format(epochs, max_iters))

        self.model.train()
        for iteration, (images, targets, _) in enumerate(self.train_loader):
            # 1-based, so the modulo checks fire at the end of each period.
            iteration = iteration + 1

            images = images.to(self.device)
            targets = targets.to(self.device)

            outputs = self.model(images)
            loss_dict = self.criterion(outputs, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            self.optimizer.zero_grad()
            losses.backward()
            self.optimizer.step()
            self.lr_scheduler.step()

            eta_seconds = ((time.time() - start_time) / iteration) * (max_iters - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

            if iteration % log_per_iters == 0 and save_to_disk:
                logger.info(
                    "Iters: {:d}/{:d} || Lr: {:.6f} || Loss: {:.4f} || Cost Time: {} || Estimated Time: {}".format(
                        iteration, max_iters, self.optimizer.param_groups[0]['lr'], losses_reduced.item(),
                        str(datetime.timedelta(seconds=int(time.time() - start_time))), eta_string))

            if iteration % save_per_iters == 0 and save_to_disk:
                save_checkpoint(self.model, self.args, is_best=False)

            if not self.args.skip_val and iteration % val_per_iters == 0:
                self.validation()
                # validation() switches to eval mode; switch back.
                self.model.train()

        save_checkpoint(self.model, self.args, is_best=False)
        total_training_time = time.time() - start_time
        total_training_str = str(datetime.timedelta(seconds=total_training_time))
        # max(..., 1) avoids ZeroDivisionError when the dataset is smaller
        # than one global batch and max_iters is therefore 0.
        logger.info(
            "Total training time: {} ({:.4f}s / it)".format(
                total_training_str, total_training_time / max(max_iters, 1)))

    def validation(self):
        """Evaluate on ``self.val_loader`` and checkpoint the model.

        Tracks the mean of pixel accuracy and mIoU in ``self.best_pred`` and
        marks the checkpoint as best when that score improves.
        """
        is_best = False
        self.metric.reset()
        if self.args.distributed:
            model = self.model.module
        else:
            model = self.model
        torch.cuda.empty_cache()  # TODO check if it helps
        model.eval()
        # Defaults keep the score computation below well-defined when the
        # validation loader yields no batches.
        pixAcc, mIoU = 0.0, 0.0
        for i, (image, target, _) in enumerate(self.val_loader):
            image = image.to(self.device)
            target = target.to(self.device)

            with torch.no_grad():
                outputs = model(image)
            # Only the first output is scored.
            self.metric.update(outputs[0], target)
            pixAcc, mIoU = self.metric.get()
            logger.info("Sample: {:d}, Validation pixAcc: {:.3f}, mIoU: {:.3f}".format(i + 1, pixAcc, mIoU))

        new_pred = (pixAcc + mIoU) / 2
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
        save_checkpoint(self.model, self.args, is_best)
        synchronize()
def save_checkpoint(model, args, is_best=False):
    """Save the model's state dict under ``args.save_dir``.

    The file is named ``{model}_{backbone}_{dataset}.pth``. When *is_best* is
    true the file is also copied to ``{model}_{backbone}_{dataset}_best_model.pth``.

    Args:
        model: The (possibly wrapped) network to save.
        args: Options namespace; reads ``save_dir``, ``model``, ``backbone``,
            ``dataset``, ``distributed`` and ``mutilgpu``.
        is_best: Whether to also write the "best model" copy.
    """
    directory = os.path.expanduser(args.save_dir)
    # exist_ok avoids the check-then-create race when several processes
    # save at the same moment.
    os.makedirs(directory, exist_ok=True)

    filename = '{}_{}_{}.pth'.format(args.model, args.backbone, args.dataset)
    filename = os.path.join(directory, filename)

    # Strip one wrapper per enabled parallel mode so the saved keys carry no
    # "module." prefix. The trainer applies both wrappers when both flags are set.
    if args.distributed:
        model = model.module
    if args.mutilgpu:
        model = model.module
    torch.save(model.state_dict(), filename)

    if is_best:
        best_filename = '{}_{}_{}_best_model.pth'.format(args.model, args.backbone, args.dataset)
        best_filename = os.path.join(directory, best_filename)
        shutil.copyfile(filename, best_filename)
if __name__ == '__main__':
    # Script entry point: resolve the device setup, create the module-level
    # logger that Trainer logs through, then train.
    args = parse_args()

    # Decide once whether CUDA is really used. Previously --no-cuda left
    # args.gpu_ids as the raw string, so len('0,1,2,3') reported 7 "GPUs"
    # and DataParallel received a string instead of a list of ids.
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    if use_cuda:
        try:
            args.gpu_ids = [int(s) for s in args.gpu_ids.split(',')]
        except ValueError:
            raise ValueError('Argument --gpu-ids must be a comma-separated list of integers only')
    else:
        args.gpu_ids = []

    # reference maskrcnn-benchmark
    # At least 1 so the iterations-per-epoch division and the lr scaling
    # below stay valid on CPU.
    num_gpus = max(len(args.gpu_ids), 1)
    args.num_gpus = num_gpus
    # Distributed (one process per GPU) training is disabled in this script;
    # multi-GPU runs use nn.DataParallel inside a single process instead.
    args.distributed = False
    args.mutilgpu = use_cuda and len(args.gpu_ids) > 1
    if use_cuda:
        cudnn.benchmark = True
        # nn.DataParallel requires the module's parameters and buffers on
        # device_ids[0]; plain "cuda" resolves to cuda:0 and fails with
        # "module must have its parameters and buffers on device cuda:N"
        # whenever --gpu-ids does not start with 0.
        args.device = "cuda:{}".format(args.gpu_ids[0])
    else:
        args.device = "cpu"
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        # In distributed mode each process owns the GPU given by its local rank.
        args.device = "cuda:{}".format(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()
    # Scale the learning rate linearly with the number of GPUs.
    args.lr = args.lr * num_gpus

    logger = setup_logger("semantic_segmentation", args.log_dir, get_rank(), filename='{}_{}_{}_log.txt'.format(
        args.model, args.backbone, args.dataset))
    logger.info("Using {} GPUs".format(len(args.gpu_ids)))
    logger.info(args)

    trainer = Trainer(args)
    trainer.train()
    torch.cuda.empty_cache()
from awesome-semantic-segmentation-pytorch.
did you guys get any error like this?
subprocess.CalledProcessError: Command '['/home/user/anaconda3/envs/test-python3/bin/python', '-u', 'scripts/train.py', '--local_rank=1', '--model', 'icnet', '--backbone', 'resnet152', '--dataset', 'citys', '--lr', '0.0001', '--epochs', '50']' returned non-zero exit status 1
from awesome-semantic-segmentation-pytorch.
I modified the code to make multi-GPU parallel training possible by using:
self.model = nn.DataParallel(self.model, device_ids=args.gpu_ids)
from awesome-semantic-segmentation-pytorch.
I'm also running into this issue @Tramac. Any help would be greatly appreciated!
from awesome-semantic-segmentation-pytorch.
did you guys get any error like this?
subprocess.CalledProcessError: Command '['/home/user/anaconda3/envs/test-python3/bin/python', '-u', 'scripts/train.py', '--local_rank=1', '--model', 'icnet', '--backbone', 'resnet152', '--dataset', 'citys', '--lr', '0.0001', '--epochs', '50']' returned non-zero exit status 1
我得到了跟你一样的错误,请问你解决了吗?
from awesome-semantic-segmentation-pytorch.
1
from awesome-semantic-segmentation-pytorch.
pytorch1.1.0 Ubuntu16.04 me too
from awesome-semantic-segmentation-pytorch.
nohup /root/train/results/ynh_copy/anaconda3_py3.7/bin/python train_new.py \
--model deeplabv3 \
--backbone resnet50 \
--dataset pascal_voc \
--lr 0.01 \
--epochs 80 \
--gpu-ids 0,1,2,3 \
--batch_size 16 #>out.log 2>&1 &
from awesome-semantic-segmentation-pytorch.
Traceback (most recent call last):
File "train_new.py", line 352, in
trainer.train()
File "train_new.py", line 231, in train
outputs = self.model(images)
File "/home/renshasha/anaconda3/envs/aaa/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in call
result = self.forward(*input, **kwargs)
File "/home/renshasha/anaconda3/envs/aaa/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 146, in forward
"them on device: {}".format(self.src_device_obj, t.device))
RuntimeError: module must have its parameters and buffers on device cuda:4 (device_ids[0]) but found one of them on device: cuda:0
有人遇到这种错误码?
from awesome-semantic-segmentation-pytorch.
Related Issues (20)
- How do you fix the gap between tensorflow and pytorch?
- Sorry, Where is the Pretrained Model Link?
- > There might be something wrong with the version of `pillow`. Try this : `border=(0, 0, padw, padh)` -> `border=20` HOT 3
- RuntimeError: CUDA out of memory. HOT 1
- RuntimeError: Given groups=1, weight of size [128, 512, 1, 1], expected input[4, 2048, 1, 1] to have 512 channels, but got 2048 channels instead HOT 4
- Errors occured when running with multi-GPUs
- 为什么只能训练fcn系列的
- why " iteration=iteration+1" in train.py ?
- 有人能把模型跑到论文中的精度吗 HOT 1
- Why the performance dropped by half when use pytorch1.5 or the newer version?
- Hello! Is there any group number?
- 执行demo程序发生如下报错TypeError: __init__() got an unexpected keyword argument 'local_rank'怎么解决 HOT 2
- DenseASPP与官方代码似乎实现的不太一样(different implemantation with official code in DenseASPP)
- 有训练好的checkpoint吗
- 请问一下这个fcn32s_vgg16_pascal_voc.pth在哪里获取啊?根据readme的指示跑demo.py出现ValueError: Model file is not found. Downloading or trainning.
- bisenet cityscapes the miou is low
- VOC_aug里的mat格式标签怎么得到
- bisenet我只能跑到64左右,离论文里面的76差了很多,有人跑到哪个精度吗
- TypeError: init() got an unexpected keyword argument 'local_rank' HOT 2
- DeepLabV3+, the backbone can only use Xception, and other backbones cannot be selected HOT 1
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from awesome-semantic-segmentation-pytorch.