diff --git a/dygraph/ppdet/engine/callbacks.py b/dygraph/ppdet/engine/callbacks.py index 16d85b4b5fefcbd7aa904d0f3fb9424c1d2df67a..60cde185e109f1f41f57ab9520bb5bcdfbf08e40 100644 --- a/dygraph/ppdet/engine/callbacks.py +++ b/dygraph/ppdet/engine/callbacks.py @@ -79,7 +79,8 @@ class LogPrinter(Callback): def on_step_end(self, status): if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0: - if self.model.mode == 'train': + mode = status['mode'] + if mode == 'train': epoch_id = status['epoch_id'] step_id = status['step_id'] steps_per_epoch = status['steps_per_epoch'] @@ -88,8 +89,8 @@ class LogPrinter(Callback): data_time = status['data_time'] epoches = self.model.cfg.epoch - batch_size = self.model.cfg['{}Reader'.format( - self.model.mode.capitalize())]['batch_size'] + batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( + ))]['batch_size'] logs = training_staus.log() space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd' @@ -119,14 +120,15 @@ class LogPrinter(Callback): dtime=str(data_time), ips=ips) logger.info(fmt) - if self.model.mode == 'eval': + if mode == 'eval': step_id = status['step_id'] if step_id % 100 == 0: logger.info("Eval iter: {}".format(step_id)) def on_epoch_end(self, status): if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0: - if self.model.mode == 'eval': + mode = status['mode'] + if mode == 'eval': sample_num = status['sample_num'] cost_time = status['cost_time'] logger.info('Total sample number: {}, averge FPS: {}'.format( @@ -147,8 +149,11 @@ class Checkpointer(Callback): self.ema.update(self.model.model) def on_epoch_end(self, status): - assert self.model.mode == 'train', \ - "Checkpointer can only be set during training" + # Checkpointer only performed during training + mode = status['mode'] + if mode != 'train': + return + if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0: epoch_id = status['epoch_id'] end_epoch = self.model.cfg.epoch diff --git a/dygraph/ppdet/engine/env.py b/dygraph/ppdet/engine/env.py index ad6d8d521c925a64da8c75e4875616184ec912cb..5b59f8acc423c9b7c9ed949ee2f91b31c2f794be 100644 --- a/dygraph/ppdet/engine/env.py +++ b/dygraph/ppdet/engine/env.py @@ -35,8 +35,7 @@ def init_parallel_env(): random.seed(local_seed) np.random.seed(local_seed) - if ParallelEnv().nranks > 1: - paddle.distributed.init_parallel_env() + paddle.distributed.init_parallel_env() def set_random_seed(seed): diff --git a/dygraph/ppdet/engine/trainer.py b/dygraph/ppdet/engine/trainer.py index 62f23038ae17285b4ee42fcbf8be6a3cf401c0c1..32aff4e2522fd9551b7bbd6c2bc339f5acc16330 100644 --- a/dygraph/ppdet/engine/trainer.py +++ b/dygraph/ppdet/engine/trainer.py @@ -60,15 +60,19 @@ class Trainer(object): slim = create(cfg.slim) slim(self.model) - if ParallelEnv().nranks > 1: - self.model = paddle.DataParallel(self.model) - # build data loader self.dataset = cfg['{}Dataset'.format(self.mode.capitalize())] - # TestDataset build after user set images, skip loader creation here - if self.mode != 'test': + if self.mode == 'train': self.loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, cfg.worker_num) + # EvalDataset build with BatchSampler to evaluate in single device + # TODO: multi-device evaluate + if self.mode == 'eval': + self._eval_batch_sampler = paddle.io.BatchSampler( + self.dataset, batch_size=self.cfg.EvalReader['batch_size']) + self.loader = create('{}Reader'.format(self.mode.capitalize()))( + self.dataset, cfg.worker_num, self._eval_batch_sampler) + # TestDataset build after user set images, skip loader creation here # build optimizer in train mode if self.mode == 'train': @@ -77,6 +81,9 @@ class Trainer(object): self.optimizer = create('OptimizerBuilder')(self.lr, self.model.parameters()) + self._nranks = ParallelEnv().nranks + self._local_rank = ParallelEnv().local_rank + self.status = {} self.start_epoch = 0 @@ -103,21 +110,18 @@ class Trainer(object): self._compose_callback = None def _init_metrics(self): - if self.mode == 'eval': - if self.cfg.metric == 'COCO': - self._metrics = [COCOMetric(anno_file=self.dataset.get_anno())] - elif self.cfg.metric == 'VOC': - self._metrics = [ - VOCMetric( - anno_file=self.dataset.get_anno(), - class_num=self.cfg.num_classes, - map_type=self.cfg.map_type) - ] - else: - logger.warn("Metric not support for metric type {}".format( - self.cfg.metric)) - self._metrics = [] + if self.cfg.metric == 'COCO': + self._metrics = [COCOMetric(anno_file=self.dataset.get_anno())] + elif self.cfg.metric == 'VOC': + self._metrics = [ + VOCMetric( + anno_file=self.dataset.get_anno(), + class_num=self.cfg.num_classes, + map_type=self.cfg.map_type) + ] else: + logger.warn("Metric not support for metric type {}".format( + self.cfg.metric)) self._metrics = [] def _reset_metrics(self): @@ -154,14 +158,16 @@ class Trainer(object): weight_type, weights)) self._weights_loaded = True - def train(self): + def train(self, validate=False): assert self.mode == 'train', "Model not in 'train' mode" - self.model.train() # if no given weights loaded, load backbone pretrain weights as default if not self._weights_loaded: self.load_weights(self.cfg.pretrain_weights) + if self._nranks > 1: + model = paddle.DataParallel(self.model) + self.status.update({ 'epoch_id': self.start_epoch, 'step_id': 0, @@ -175,9 +181,11 @@ class Trainer(object): self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) for epoch_id in range(self.start_epoch, self.cfg.epoch): + self.status['mode'] = 'train' self.status['epoch_id'] = epoch_id self._compose_callback.on_epoch_begin(self.status) self.loader.dataset.set_epoch(epoch_id) + model.train() iter_tic = time.time() for step_id, data in enumerate(self.loader): self.status['data_time'].update(time.time() - iter_tic) @@ -185,7 +193,7 @@ class Trainer(object): self._compose_callback.on_step_begin(self.status) # model forward - outputs = self.model(data) + outputs = model(data) loss = outputs['loss'] # model backward @@ -196,23 +204,42 @@ class Trainer(object): self.optimizer.clear_grad() self.status['learning_rate'] = curr_lr - if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0: + if self._nranks < 2 or self._local_rank == 0: self.status['training_staus'].update(outputs) self.status['batch_time'].update(time.time() - iter_tic) self._compose_callback.on_step_end(self.status) iter_tic = time.time() + self._compose_callback.on_epoch_end(self.status) - def evaluate(self): + if validate and (self._nranks < 2 or self._local_rank == 0) \ + and (epoch_id % self.cfg.snapshot_epoch == 0 \ + or epoch_id == self.end_epoch - 1): + if not hasattr(self, '_eval_loader'): + # build evaluation dataset and loader + self._eval_dataset = self.cfg.EvalDataset + self._eval_batch_sampler = \ + paddle.io.BatchSampler( + self._eval_dataset, + batch_size=self.cfg.EvalReader['batch_size']) + self._eval_loader = create('EvalReader')( + self._eval_dataset, + self.cfg.worker_num, + batch_sampler=self._eval_batch_sampler) + with paddle.no_grad(): + self._eval_with_loader(self._eval_loader) + + def _eval_with_loader(self, loader): sample_num = 0 tic = time.time() self._compose_callback.on_epoch_begin(self.status) - for step_id, data in enumerate(self.loader): + self.status['mode'] = 'eval' + self.model.eval() + for step_id, data in enumerate(loader): self.status['step_id'] = step_id self._compose_callback.on_step_begin(self.status) # forward - self.model.eval() outs = self.model(data) # update metrics @@ -233,6 +260,9 @@ class Trainer(object): # reset metric states for metric may performed multiple times self._reset_metrics() + def evaluate(self): + self._eval_with_loader(self.loader) + def predict(self, images, draw_threshold=0.5, output_dir='output'): self.dataset.set_images(images) loader = create('TestReader')(self.dataset, 0) @@ -242,11 +272,12 @@ class Trainer(object): anno_file = self.dataset.get_anno() clsid2catid, catid2name = get_categories(self.cfg.metric, anno_file) - # Run Infer + # Run Infer + self.status['mode'] = 'test' + self.model.eval() for step_id, data in enumerate(loader): self.status['step_id'] = step_id # forward - self.model.eval() outs = self.model(data) for key in ['im_shape', 'scale_factor', 'im_id']: outs[key] = data[key] @@ -301,6 +332,8 @@ class Trainer(object): if image_shape is None: image_shape = [3, None, None] + self.model.eval() + # Save infer cfg _dump_infer_config(self.cfg, os.path.join(save_dir, 'infer_cfg.yml'), image_shape, diff --git a/dygraph/setup.py b/dygraph/setup.py index bb60cf0d3275206e3686ea9c340f46e3336929de..2ecdc8c478e0cd148e84b0f37a84ad4547265767 100644 --- a/dygraph/setup.py +++ b/dygraph/setup.py @@ -51,6 +51,7 @@ packages = [ 'ppdet.core', 'ppdet.data', 'ppdet.engine', + 'ppdet.metrics', 'ppdet.modeling', 'ppdet.model_zoo', 'ppdet.py_op', diff --git a/dygraph/tools/eval.py b/dygraph/tools/eval.py index eb7e60af450cc8271ffb16357db327babfd50953..998710bff9378ea212a431f555edbba3e52c5e89 100755 --- a/dygraph/tools/eval.py +++ b/dygraph/tools/eval.py @@ -32,7 +32,7 @@ from paddle.distributed import ParallelEnv from ppdet.core.workspace import load_config, merge_config from ppdet.utils.check import check_gpu, check_version, check_config from ppdet.utils.cli import ArgsParser -from ppdet.engine import Trainer +from ppdet.engine import Trainer, init_parallel_env from ppdet.utils.logger import setup_logger logger = setup_logger('eval') @@ -60,6 +60,9 @@ def parse_args(): def run(FLAGS, cfg): + # init parallel environment if nranks > 1 + init_parallel_env() + # build trainer trainer = Trainer(cfg, mode='eval') diff --git a/dygraph/tools/train.py b/dygraph/tools/train.py index b00b8575cde9746f4ad019fa11131f7dce235111..c6318264c9c06205fd864a5a7ef78d9085dea82d 100755 --- a/dygraph/tools/train.py +++ b/dygraph/tools/train.py @@ -84,7 +84,7 @@ def run(FLAGS, cfg): trainer.load_weights(cfg.pretrain_weights, FLAGS.weight_type) # training - trainer.train() + trainer.train(FLAGS.eval) def main():