diff --git a/dygraph/benchmark/hrnet_w18_benchmark.py b/dygraph/benchmark/hrnet_w18_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a6ce872686bbbec8c4f51ead028830b248512f3
--- /dev/null
+++ b/dygraph/benchmark/hrnet_w18_benchmark.py
@@ -0,0 +1,200 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.fluid.io import DataLoader
+from paddle.incubate.hapi.distributed import DistributedBatchSampler
+
+from datasets import OpticDiscSeg, Cityscapes
+import transforms as T
+from models import MODELS
+import utils.logging as logging
+from utils import get_environ_info
+from utils import load_pretrained_model
+from utils import resume
+from utils import Timer, calculate_eta
+from core import train, evaluate
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Model training')
+
+    # params of model
+    parser.add_argument(
+        '--model_name',
+        dest='model_name',
+        help='Model type for training, which is one of {}'.format(
+            str(list(MODELS.keys()))),
+        type=str,
+        default='UNet')
+
+    # params of dataset
+    parser.add_argument(
+        '--dataset',
+        dest='dataset',
+        help=
+        "The dataset to train on, which is one of ('OpticDiscSeg', 'Cityscapes')",
+        type=str,
+        default='Cityscapes')
+
+    # params of training
+    parser.add_argument(
+        "--input_size",
+        dest="input_size",
+        help="The image size for net inputs.",
+        nargs=2,
+        default=[1024, 512],
+        type=int)
+    parser.add_argument(
+        '--num_epochs',
+        dest='num_epochs',
+        help='Number of epochs for training',
+        type=int,
+        default=500)
+    parser.add_argument(
+        '--batch_size',
+        dest='batch_size',
+        help='Mini-batch size per GPU or CPU',
+        type=int,
+        default=2)
+    parser.add_argument(
+        '--learning_rate',
+        dest='learning_rate',
+        help='Learning rate',
+        type=float,
+        default=0.01)
+    parser.add_argument(
+        '--pretrained_model',
+        dest='pretrained_model',
+        help='The path of the pretrained model',
+        type=str,
+        default=None)
+    parser.add_argument(
+        '--resume_model',
+        dest='resume_model',
+        help='The path of the model to resume from',
+        type=str,
+        default=None)
+    parser.add_argument(
+        '--save_interval_epochs',
+        dest='save_interval_epochs',
+        help='The epoch interval for saving a model snapshot',
+        type=int,
+        default=5)
+    parser.add_argument(
+        '--save_dir',
+        dest='save_dir',
+        help='The directory for saving the model snapshot',
+        type=str,
+        default='./output')
+    parser.add_argument(
+        '--num_workers',
+        dest='num_workers',
+        help='Number of workers for the data loader',
+        type=int,
+        default=2)
+    parser.add_argument(
+        '--do_eval',
+        dest='do_eval',
+        help='Whether to evaluate while training',
+        action='store_true')
+    parser.add_argument(
+        '--log_steps',
+        dest='log_steps',
+        help='Display logging information every log_steps steps',
+        default=10,
+        type=int)
+    parser.add_argument(
+        '--use_vdl',
+        dest='use_vdl',
+        help='Whether to record the data to VisualDL during training',
+        action='store_true')
+
+    return parser.parse_args()
+
+
+def main(args):
+    env_info = get_environ_info()
+    places = fluid.CUDAPlace(ParallelEnv().dev_id) \
+        if env_info['place'] == 'cuda' and fluid.is_compiled_with_cuda() \
+        else fluid.CPUPlace()
+
+    if args.dataset.lower() == 'opticdiscseg':
+        dataset = OpticDiscSeg
+    elif args.dataset.lower() == 'cityscapes':
+        dataset = Cityscapes
+    else:
+        raise Exception(
+            "--dataset is invalid. It should be one of ('OpticDiscSeg', 'Cityscapes')"
+        )
+
+    with fluid.dygraph.guard(places):
+        # Create the dataset reader
+        train_transforms = T.Compose([
+            T.RandomHorizontalFlip(0.5),
+            T.ResizeStepScaling(0.5, 2.0, 0.25),
+            T.RandomPaddingCrop(args.input_size),
+            T.RandomDistort(),
+            T.Normalize(),
+        ])
+        train_dataset = dataset(transforms=train_transforms, mode='train')
+
+        eval_dataset = None
+        if args.do_eval:
+            eval_transforms = T.Compose([T.Normalize()])
+            eval_dataset = dataset(transforms=eval_transforms, mode='eval')
+
+        if args.model_name not in MODELS:
+            raise Exception(
+                '--model_name is invalid. It should be one of {}'.format(
+                    str(list(MODELS.keys()))))
+        model = MODELS[args.model_name](num_classes=train_dataset.num_classes)
+
+        # Create the optimizer
+        # TODO: this may be one less than len(loader)
+        num_steps_each_epoch = len(train_dataset) // (
+            args.batch_size * ParallelEnv().nranks)
+        decay_step = args.num_epochs * num_steps_each_epoch
+        lr_decay = fluid.layers.polynomial_decay(
+            args.learning_rate, decay_step, end_learning_rate=0, power=0.9)
+        optimizer = fluid.optimizer.Momentum(
+            lr_decay,
+            momentum=0.9,
+            parameter_list=model.parameters(),
+            regularization=fluid.regularizer.L2Decay(regularization_coeff=5e-4))
+        train(
+            model,
+            train_dataset,
+            places=places,
+            eval_dataset=eval_dataset,
+            optimizer=optimizer,
+            save_dir=args.save_dir,
+            num_epochs=args.num_epochs,
+            batch_size=args.batch_size,
+            pretrained_model=args.pretrained_model,
+            resume_model=args.resume_model,
+            save_interval_epochs=args.save_interval_epochs,
+            log_steps=args.log_steps,
+            num_classes=train_dataset.num_classes,
+            num_workers=args.num_workers,
+            use_vdl=args.use_vdl)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
diff --git a/dygraph/core/train.py b/dygraph/core/train.py
index 9563f0c3a2e712746ee85253b86a999deeb89b41..a9b5a79953a880b24e2a00eea9c6eacf5981b6c1 100644
--- a/dygraph/core/train.py
+++ b/dygraph/core/train.py
@@ -74,15 +74,18 @@ def train(model,
         log_writer = LogWriter(save_dir)
 
     timer = Timer()
-    timer.start()
     avg_loss = 0.0
     steps_per_epoch = len(batch_sampler)
     total_steps = steps_per_epoch * (num_epochs - start_epoch)
     num_steps = 0
     best_mean_iou = -1.0
     best_model_epoch = -1
+    train_reader_cost = 0.0
+    train_batch_cost = 0.0
     for epoch in range(start_epoch, num_epochs):
+        timer.start()
         for step, data in enumerate(loader):
+            train_reader_cost += timer.elapsed_time()
             images = data[0]
             labels = data[1].astype('int64')
             if nranks > 1:
@@ -99,22 +102,29 @@ def train(model,
             avg_loss += loss.numpy()[0]
             lr = optimizer.current_step_lr()
             num_steps += 1
+            train_batch_cost += timer.elapsed_time()
             if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0:
                 avg_loss /= log_steps
-                time_step = timer.elapsed_time() / log_steps
+                avg_train_reader_cost = train_reader_cost / log_steps
+                avg_train_batch_cost = train_batch_cost / log_steps
+                train_reader_cost = 0.0
+                train_batch_cost = 0.0
                 remain_steps = total_steps - num_steps
+                eta = calculate_eta(remain_steps, avg_train_batch_cost)
                 logging.info(
-                    "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, sec/step={:.4f} | ETA {}"
+                    "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
                     .format(epoch + 1, num_epochs, step + 1, steps_per_epoch,
-                            avg_loss * nranks, lr, time_step,
-                            calculate_eta(remain_steps, time_step)))
+                            avg_loss * nranks, lr, avg_train_batch_cost,
+                            avg_train_reader_cost, eta))
                 if use_vdl:
                     log_writer.add_scalar('Train/loss', avg_loss, num_steps)
                     log_writer.add_scalar('Train/lr', lr, num_steps)
-                    log_writer.add_scalar('Train/time_step', time_step,
-                                          num_steps)
+                    log_writer.add_scalar('Train/batch_cost',
+                                          avg_train_batch_cost, num_steps)
+                    log_writer.add_scalar('Train/reader_cost',
+                                          avg_train_reader_cost, num_steps)
                 avg_loss = 0.0
-                timer.restart()
+            timer.restart()
 
         if ((epoch + 1) % save_interval_epochs == 0
                 or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0:
@@ -128,7 +138,7 @@ def train(model,
                                os.path.join(current_save_dir, 'model'))
 
             if eval_dataset is not None:
-                mean_iou, mean_acc = evaluate(
+                mean_iou, avg_acc = evaluate(
                     model,
                     eval_dataset,
                     model_dir=current_save_dir,
@@ -146,10 +156,8 @@
                         .format(best_model_epoch, best_mean_iou))
 
                 if use_vdl:
-                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
-                                          epoch + 1)
-                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
-                                          epoch + 1)
+                    log_writer.add_scalar('Evaluate/mIoU', mean_iou, epoch + 1)
+                    log_writer.add_scalar('Evaluate/aAcc', avg_acc, epoch + 1)
             model.train()
     if use_vdl:
         log_writer.close()
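
Note for reviewers: the core of the train.py change is splitting per-step timing into reader_cost (time the step spends blocked waiting on the DataLoader) and batch_cost (the full step time: batch fetch plus forward/backward/update), each averaged over log_steps and then reset. What follows is a minimal, framework-agnostic sketch of that measurement pattern, using plain time.time() in place of the repo's Timer helper; the function and variable names here are illustrative only and do not appear in the patch.

import time

def timed_train_loop(loader, run_step, log_steps=10):
    # Illustrative sketch of the reader_cost/batch_cost pattern.
    # reader_cost: time spent waiting for the next batch from the loader.
    # batch_cost: full step time, so it always includes reader_cost.
    train_reader_cost = 0.0
    train_batch_cost = 0.0
    tic = time.time()  # plays the role of timer.start() at the epoch top
    for step, data in enumerate(loader):
        train_reader_cost += time.time() - tic
        run_step(data)  # forward, backward, optimizer update
        train_batch_cost += time.time() - tic
        if (step + 1) % log_steps == 0:
            print('batch_cost={:.4f} s, reader_cost={:.4f} s'.format(
                train_batch_cost / log_steps, train_reader_cost / log_steps))
            train_reader_cost = 0.0
            train_batch_cost = 0.0
        tic = time.time()  # plays the role of the per-step timer.restart()

Because batch_cost includes reader_cost, a reader_cost that approaches batch_cost signals an input-pipeline bottleneck; raising --num_workers in the benchmark script is the usual first remedy.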