callbacks.py 21.1 KB
Newer Older
K
Kaipeng Deng 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
#   
# Licensed under the Apache License, Version 2.0 (the "License");   
# you may not use this file except in compliance with the License.  
# You may obtain a copy of the License at   
#   
#     http://www.apache.org/licenses/LICENSE-2.0    
#   
# Unless required by applicable law or agreed to in writing, software   
# distributed under the License is distributed on an "AS IS" BASIS, 
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
# See the License for the specific language governing permissions and   
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
20
import sys
K
Kaipeng Deng 已提交
21
import datetime
22
import six
23 24
import copy
import json
K
Kaipeng Deng 已提交
25

26
import paddle
W
wangguanzhong 已提交
27
import paddle.distributed as dist
K
Kaipeng Deng 已提交
28 29

from ppdet.utils.checkpoint import save_model
30
from ppdet.metrics import get_infer_results
K
Kaipeng Deng 已提交
31 32

from ppdet.utils.logger import setup_logger
33
# Module-level logger used by the callbacks defined in this file.
logger = setup_logger('ppdet.engine')
K
Kaipeng Deng 已提交
34

S
shangliang Xu 已提交
35 36 37 38
# Names exported by a star-import of this module.
# NOTE(review): WiferFaceEval and WandbCallback are defined in this file but
# are not listed here, so they must be imported by name -- confirm intended.
__all__ = [
    'Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer',
    'VisualDLWriter', 'SniperProposalsGenerator'
]
K
Kaipeng Deng 已提交
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56


class Callback(object):
    """Base class for hooks; every hook is a no-op.

    Subclasses override the hooks they need. Each hook receives ``status``,
    a dict provided by whoever drives the callbacks.

    Args:
        model: Object stored as ``self.model`` for use by subclasses.
    """

    def __init__(self, model):
        self.model = model

    def on_step_begin(self, status):
        """Step-begin hook; does nothing by default."""
        pass

    def on_step_end(self, status):
        """Step-end hook; does nothing by default."""
        pass

    def on_epoch_begin(self, status):
        """Epoch-begin hook; does nothing by default."""
        pass

    def on_epoch_end(self, status):
        """Epoch-end hook; does nothing by default."""
        pass

    def on_train_begin(self, status):
        """Train-begin hook; does nothing by default."""
        pass

    def on_train_end(self, status):
        """Train-end hook; does nothing by default."""
        pass

K
Kaipeng Deng 已提交
63 64 65

class ComposeCallback(object):
    """Forward each hook call to a list of callbacks, in list order.

    Args:
        callbacks (iterable): Callback instances. ``None`` entries are
            dropped, every remaining entry must be a ``Callback``.

    Raises:
        AssertionError: If a non-``None`` entry is not a ``Callback``.
    """

    def __init__(self, callbacks):
        # None entries are filtered out before validation.
        callbacks = [c for c in callbacks if c is not None]
        for c in callbacks:
            assert isinstance(
                c, Callback), "callback should be subclass of Callback"
        self._callbacks = callbacks

    def _dispatch(self, hook_name, status):
        """Call the hook named ``hook_name`` on every callback with ``status``."""
        for c in self._callbacks:
            getattr(c, hook_name)(status)

    def on_step_begin(self, status):
        """Run ``on_step_begin`` on every callback."""
        self._dispatch('on_step_begin', status)

    def on_step_end(self, status):
        """Run ``on_step_end`` on every callback."""
        self._dispatch('on_step_end', status)

    def on_epoch_begin(self, status):
        """Run ``on_epoch_begin`` on every callback."""
        self._dispatch('on_epoch_begin', status)

    def on_epoch_end(self, status):
        """Run ``on_epoch_end`` on every callback."""
        self._dispatch('on_epoch_end', status)

    def on_train_begin(self, status):
        """Run ``on_train_begin`` on every callback."""
        self._dispatch('on_train_begin', status)

    def on_train_end(self, status):
        """Run ``on_train_end`` on every callback."""
        self._dispatch('on_train_end', status)

K
Kaipeng Deng 已提交
96 97 98 99 100 101

class LogPrinter(Callback):
    """Callback that writes training progress and eval throughput to ``logger``.

    Output is limited to single-process runs or to rank 0 of a distributed
    run.
    """

    def __init__(self, model):
        super(LogPrinter, self).__init__(model)

    def on_step_end(self, status):
        """Log a progress line for the current step.

        In ``'train'`` mode a line is emitted whenever ``step_id`` is a
        multiple of ``cfg.log_iter``; in ``'eval'`` mode every 100 steps.

        Args:
            status (dict): Reads ``'mode'``; in eval mode ``'step_id'``; in
                train mode ``'epoch_id'``, ``'step_id'``,
                ``'steps_per_epoch'``, ``'training_staus'``, ``'batch_time'``,
                ``'data_time'`` and, when a line is logged,
                ``'learning_rate'``.
        """
        if dist.get_world_size() < 2 or dist.get_rank() == 0:
            mode = status['mode']
            if mode == 'train':
                epoch_id = status['epoch_id']
                step_id = status['step_id']
                steps_per_epoch = status['steps_per_epoch']
                # 'training_staus' (sic) is the key spelling used in status.
                training_staus = status['training_staus']
                batch_time = status['batch_time']
                data_time = status['data_time']

                epoches = self.model.cfg.epoch
                # Reader config key is derived from the mode: 'TrainReader'.
                reader_key = '{}Reader'.format(mode.capitalize())
                batch_size = self.model.cfg[reader_key]['batch_size']

                logs = training_staus.log()
                # Pad the step index to the width of steps_per_epoch.
                space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd'
                if step_id % self.model.cfg.log_iter == 0:
                    # Remaining steps over all remaining epochs.
                    eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id
                    eta_sec = eta_steps * batch_time.global_avg
                    eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
                    ips = float(batch_size) / batch_time.avg
                    fmt = ' '.join([
                        'Epoch: [{}]',
                        '[{' + space_fmt + '}/{}]',
                        'learning_rate: {lr:.6f}',
                        '{meters}',
                        'eta: {eta}',
                        'batch_cost: {btime}',
                        'data_cost: {dtime}',
                        'ips: {ips:.4f} images/s',
                    ])
                    fmt = fmt.format(
                        epoch_id,
                        step_id,
                        steps_per_epoch,
                        lr=status['learning_rate'],
                        meters=logs,
                        eta=eta_str,
                        btime=str(batch_time),
                        dtime=str(data_time),
                        ips=ips)
                    logger.info(fmt)
            elif mode == 'eval':
                step_id = status['step_id']
                if step_id % 100 == 0:
                    logger.info("Eval iter: {}".format(step_id))

    def on_epoch_end(self, status):
        """Log sample count and throughput at the end of an eval epoch.

        Args:
            status (dict): Reads ``'mode'``; in eval mode also
                ``'sample_num'`` and ``'cost_time'``.
        """
        if dist.get_world_size() < 2 or dist.get_rank() == 0:
            mode = status['mode']
            if mode == 'eval':
                sample_num = status['sample_num']
                cost_time = status['cost_time']
                # NOTE(review): the 'averge' spelling is kept as-is in case
                # log consumers match on this line -- confirm before fixing.
                logger.info('Total sample number: {}, averge FPS: {}'.format(
                    sample_num, sample_num / cost_time))


class Checkpointer(Callback):
    """Callback that saves model weights through ``save_model``.

    Snapshots are written at the end of training epochs, and a
    ``best_model`` checkpoint at the end of eval epochs when requested by
    ``status['save_best_model']``. Saving happens only in single-process
    runs or on rank 0.
    """

    def __init__(self, model):
        super(Checkpointer, self).__init__(model)
        # Sentinel lower than any expected metric value.
        self.best_ap = -1000.
        self.save_dir = os.path.join(self.model.cfg.save_dir,
                                     self.model.cfg.filename)
        # Prefer the student sub-model when the wrapped model exposes one.
        if hasattr(self.model.model, 'student_model'):
            self.weight = self.model.model.student_model
        else:
            self.weight = self.model.model

    def on_epoch_end(self, status):
        """Decide whether a checkpoint is due and write it.

        Args:
            status (dict): Reads ``'mode'`` and ``'epoch_id'``; optionally
                ``'save_best_model'`` and ``'exchange_save_model'``; and
                ``'weight'`` when ``self.model.use_ema`` is set.
        """
        mode = status['mode']
        epoch_id = status['epoch_id']
        weight = None
        save_name = None

        # Only the main process writes checkpoints.
        if not (dist.get_world_size() < 2 or dist.get_rank() == 0):
            return

        if mode == 'train':
            end_epoch = self.model.cfg.epoch
            is_last_epoch = epoch_id == end_epoch - 1
            if (epoch_id + 1
                ) % self.model.cfg.snapshot_epoch == 0 or is_last_epoch:
                save_name = "model_final" if is_last_epoch else str(epoch_id)
                weight = self.weight.state_dict()
        elif mode == 'eval':
            if 'save_best_model' in status and status['save_best_model']:
                for metric in self.model._metrics:
                    map_res = metric.get_results()
                    eval_func = "ap"
                    if 'pose3d' in map_res:
                        key = 'pose3d'
                        eval_func = "mpjpe"
                    elif 'bbox' in map_res:
                        key = 'bbox'
                    elif 'keypoint' in map_res:
                        key = 'keypoint'
                    else:
                        key = 'mask'
                    if key not in map_res:
                        logger.warning("Evaluation results empty, this may be due to " \
                                    "training iterations being too few or not " \
                                    "loading the correct weights.")
                        return
                    if map_res[key][0] >= self.best_ap:
                        self.best_ap = map_res[key][0]
                        save_name = 'best_model'
                        weight = self.weight.state_dict()
                    logger.info("Best test {} {} is {:0.3f}.".format(
                        key, eval_func, abs(self.best_ap)))

        if not weight:
            return

        if not self.model.use_ema:
            save_model(weight, self.model.optimizer, self.save_dir,
                       save_name, epoch_id + 1)
            return

        if status.get('exchange_save_model', False):
            # save model(student model) and ema_model(teacher model)
            # in DenseTeacher SSOD, the teacher model will be higher,
            # so exchange when saving pdparams
            primary, ema = weight, status['weight']
        else:
            # save model and ema_model
            primary, ema = status['weight'], weight
        save_model(
            primary,
            self.model.optimizer,
            self.save_dir,
            save_name,
            epoch_id + 1,
            ema_model=ema)
241 242 243 244 245 246 247 248 249 250 251 252


class WiferFaceEval(Callback):
    """Callback that updates every metric with the model, then exits."""

    def __init__(self, model):
        super(WiferFaceEval, self).__init__(model)

    def on_epoch_begin(self, status):
        """Feed the wrapped model to each metric and terminate the process.

        Raises:
            AssertionError: If ``self.model.mode`` is not ``'eval'``.
        """
        in_eval_mode = self.model.mode == 'eval'
        assert in_eval_mode, "WiferFaceEval can only be set during evaluation"
        for evaluator in self.model._metrics:
            evaluator.update(self.model.model)
        # Ends the interpreter via SystemExit once all metrics are updated.
        sys.exit()
253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269


class VisualDLWriter(Callback):
    """
    Use VisualDL to log data or image

    Scalars and images are written through a ``visualdl.LogWriter`` created
    in ``__init__``; writes happen only in single-process runs or on rank 0.
    """

    def __init__(self, model):
        super(VisualDLWriter, self).__init__(model)

        assert six.PY3, "VisualDL requires Python >= 3.5"
        try:
            from visualdl import LogWriter
        except Exception:
            logger.error('visualdl not found, please install visualdl. '
                         'for example: `pip install visualdl`.')
            # Bare raise keeps the original traceback.
            raise
        self.vdl_writer = LogWriter(
            model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar'))
        # Independent step counters for each kind of record.
        self.vdl_loss_step = 0
        self.vdl_mAP_step = 0
        self.vdl_image_step = 0
        self.vdl_image_frame = 0

    def on_step_end(self, status):
        """Write per-step training scalars or test images.

        Args:
            status (dict): Reads ``'mode'``; in train mode
                ``'training_staus'``; in test mode ``'original_image'`` and
                ``'result_image'``.
        """
        mode = status['mode']
        if dist.get_world_size() < 2 or dist.get_rank() == 0:
            if mode == 'train':
                # 'training_staus' (sic) is the key spelling used in status.
                training_staus = status['training_staus']
                for loss_name, loss_value in training_staus.get().items():
                    self.vdl_writer.add_scalar(loss_name, loss_value,
                                               self.vdl_loss_step)
                self.vdl_loss_step += 1
            elif mode == 'test':
                ori_image = status['original_image']
                result_image = status['result_image']
                self.vdl_writer.add_image(
                    "original/frame_{}".format(self.vdl_image_frame),
                    ori_image, self.vdl_image_step)
                self.vdl_writer.add_image(
                    "result/frame_{}".format(self.vdl_image_frame),
                    result_image, self.vdl_image_step)
                self.vdl_image_step += 1
                # each frame can display ten pictures at most.
                if self.vdl_image_step % 10 == 0:
                    self.vdl_image_step = 0
                    self.vdl_image_frame += 1

    def on_epoch_end(self, status):
        """Write the first value of every metric result as a ``<key>-mAP`` scalar.

        Args:
            status (dict): Reads ``'mode'``; only ``'eval'`` is handled.
        """
        mode = status['mode']
        if dist.get_world_size() < 2 or dist.get_rank() == 0:
            if mode == 'eval':
                for metric in self.model._metrics:
                    for key, map_value in metric.get_results().items():
                        self.vdl_writer.add_scalar("{}-mAP".format(key),
                                                   map_value[0],
                                                   self.vdl_mAP_step)
                self.vdl_mAP_step += 1
311

312

313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331
class WandbCallback(Callback):
    """Callback that logs metrics and checkpoint artifacts to Weights & Biases.

    The wandb run is created (or an already active one is reused) only in
    single-process runs or on rank 0.
    """

    def __init__(self, model):
        super(WandbCallback, self).__init__(model)

        try:
            import wandb
            self.wandb = wandb
        except Exception:
            logger.error('wandb not found, please install wandb. '
                         'Use: `pip install wandb`.')
            # Bare raise keeps the original traceback.
            raise

        self.wandb_params = model.cfg.get('wandb', None)
        self.save_dir = os.path.join(self.model.cfg.save_dir,
                                     self.model.cfg.filename)
        if self.wandb_params is None:
            self.wandb_params = {}
        # Top-level "wandb_<name>" config entries become wandb.init kwargs.
        # Slice the prefix off: str.lstrip would strip a *character set* and
        # mangle keys such as "wandb_name".
        prefix = "wandb_"
        for k, v in model.cfg.items():
            if k.startswith(prefix):
                self.wandb_params[k[len(prefix):]] = v

        self._run = None
        if dist.get_world_size() < 2 or dist.get_rank() == 0:
            _ = self.run  # force creation of the run on the main process
            self.run.config.update(self.model.cfg)
            self.run.define_metric("epoch")
            self.run.define_metric("eval/*", step_metric="epoch")

        self.best_ap = -1000.
        # Per-step throughput samples, averaged and reset each train epoch.
        self.fps = []

    @property
    def run(self):
        """The wandb run, created on first access.

        Reuses ``wandb.run`` when one is already active, otherwise calls
        ``wandb.init(**self.wandb_params)``.
        """
        if self._run is None:
            if self.wandb.run is not None:
                logger.info(
                    "There is an ongoing wandb run which will be used "
                    "for logging. Please use `wandb.finish()` to end that "
                    "if the behaviour is not intended")
                self._run = self.wandb.run
            else:
                self._run = self.wandb.init(**self.wandb_params)
        return self._run

    def save_model(self,
                   optimizer,
                   save_dir,
                   save_name,
                   last_epoch,
                   ema_model=None,
                   ap=None,
                   fps=None,
                   tags=None):
        """Log checkpoint files found under ``save_dir`` as wandb artifacts.

        Args:
            optimizer: Unused; kept for signature compatibility.
            save_dir (str): Directory containing the checkpoint files.
            save_name (str): Checkpoint file name without extension.
            last_epoch (int): Stored in the artifact metadata.
            ema_model: Selects which artifacts are logged (see note below).
            ap: Stored in the metadata when truthy.
            fps: Stored in the metadata when truthy.
            tags (list): Artifact aliases.
        """
        if dist.get_world_size() < 2 or dist.get_rank() == 0:
            model_path = os.path.join(save_dir, save_name)
            metadata = {}
            metadata["last_epoch"] = last_epoch
            if ap:
                metadata["ap"] = ap

            if fps:
                metadata["fps"] = fps

            # NOTE(review): the callers in this class pass the boolean
            # `use_ema`, so this branch is only reached when ema_model is
            # omitted -- confirm whether the condition is inverted.
            if ema_model is None:
                ema_artifact = self.wandb.Artifact(
                    name="ema_model-{}".format(self.run.id),
                    type="model",
                    metadata=metadata)
                model_artifact = self.wandb.Artifact(
                    name="model-{}".format(self.run.id),
                    type="model",
                    metadata=metadata)

                ema_artifact.add_file(model_path + ".pdema", name="model_ema")
                model_artifact.add_file(model_path + ".pdparams", name="model")

                self.run.log_artifact(ema_artifact, aliases=tags)
                self.run.log_artifact(model_artifact, aliases=tags)
            else:
                model_artifact = self.wandb.Artifact(
                    name="model-{}".format(self.run.id),
                    type="model",
                    metadata=metadata)
                model_artifact.add_file(model_path + ".pdparams", name="model")
                self.run.log_artifact(model_artifact, aliases=tags)

    def on_step_end(self, status):
        """Log training scalars plus ips / data_cost / batch_cost for a step.

        Args:
            status (dict): Reads ``'mode'``; in train mode also
                ``'training_staus'``, ``'batch_time'`` and ``'data_time'``.
        """
        mode = status['mode']
        if dist.get_world_size() < 2 or dist.get_rank() == 0:
            if mode == 'train':
                # 'training_staus' (sic) is the key spelling used in status.
                training_status = status['training_staus'].get()
                metrics = {
                    "train/" + k: float(v)
                    for k, v in training_status.items()
                }

                # calculate ips, data_cost, batch_cost
                batch_time = status['batch_time']
                data_time = status['data_time']
                reader_key = '{}Reader'.format(mode.capitalize())
                batch_size = self.model.cfg[reader_key]['batch_size']

                ips = float(batch_size) / float(batch_time.avg)
                data_cost = float(data_time.avg)
                batch_cost = float(batch_time.avg)

                metrics["train/ips"] = ips
                metrics["train/data_cost"] = data_cost
                metrics["train/batch_cost"] = batch_cost

                self.fps.append(ips)
                self.run.log(metrics)

    def on_epoch_end(self, status):
        """Log checkpoints (train) or eval metrics and best model (eval).

        Args:
            status (dict): Reads ``'mode'`` and ``'epoch_id'``; in eval mode
                also ``'sample_num'``, ``'cost_time'`` and optionally
                ``'save_best_model'``.
        """
        mode = status['mode']
        epoch_id = status['epoch_id']
        save_name = None

        # Only the main process talks to wandb.
        if not (dist.get_world_size() < 2 or dist.get_rank() == 0):
            return

        if mode == 'train':
            # Guard against an epoch in which no step was recorded.
            fps = sum(self.fps) / len(self.fps) if self.fps else None
            self.fps = []

            end_epoch = self.model.cfg.epoch
            is_last_epoch = epoch_id == end_epoch - 1
            if (epoch_id + 1
                ) % self.model.cfg.snapshot_epoch == 0 or is_last_epoch:
                save_name = "model_final" if is_last_epoch else str(epoch_id)
                tags = ["latest", "epoch_{}".format(epoch_id)]
                self.save_model(
                    self.model.optimizer,
                    self.save_dir,
                    save_name,
                    epoch_id + 1,
                    self.model.use_ema,
                    fps=fps,
                    tags=tags)

        if mode == 'eval':
            sample_num = status['sample_num']
            cost_time = status['cost_time']

            fps = sample_num / cost_time

            merged_dict = {}
            for metric in self.model._metrics:
                for key, map_value in metric.get_results().items():
                    merged_dict["eval/{}-mAP".format(key)] = map_value[0]
            merged_dict["epoch"] = status["epoch_id"]
            merged_dict["eval/fps"] = fps

            self.run.log(merged_dict)

            if 'save_best_model' in status and status['save_best_model']:
                for metric in self.model._metrics:
                    map_res = metric.get_results()
                    if 'pose3d' in map_res:
                        key = 'pose3d'
                    elif 'bbox' in map_res:
                        key = 'bbox'
                    elif 'keypoint' in map_res:
                        key = 'keypoint'
                    else:
                        key = 'mask'
                    if key not in map_res:
                        logger.warning("Evaluation results empty, this may be due to " \
                                    "training iterations being too few or not " \
                                    "loading the correct weights.")
                        return
                    if map_res[key][0] >= self.best_ap:
                        self.best_ap = map_res[key][0]
                        save_name = 'best_model'
                        tags = ["best", "epoch_{}".format(epoch_id)]

                        self.save_model(
                            self.model.optimizer,
                            self.save_dir,
                            save_name,
                            last_epoch=epoch_id + 1,
                            ema_model=self.model.use_ema,
                            ap=abs(self.best_ap),
                            fps=fps,
                            tags=tags)

    def on_train_end(self, status):
        """Finish the wandb run on the process that owns it."""
        # Reading `self.run` creates a run lazily, so other ranks must not
        # touch it here.
        if dist.get_world_size() < 2 or dist.get_rank() == 0:
            self.run.finish()

501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558

class SniperProposalsGenerator(Callback):
    """Callback that runs inference at train end and dumps boxes as JSON.

    A deep copy of the model's dataset is re-cropped through its
    ``anno_cropper``; the resulting detections are written to
    ``cfg.proposals_path``.
    """

    def __init__(self, model):
        super(SniperProposalsGenerator, self).__init__(model)
        ori_dataset = self.model.dataset
        self.dataset = self._create_new_dataset(ori_dataset)
        self.loader = self.model.loader
        self.cfg = self.model.cfg
        self.infer_model = self.model.model

    def _create_new_dataset(self, ori_dataset):
        """Return a deep copy of ``ori_dataset`` whose roidbs are inference crops."""
        dataset = copy.deepcopy(ori_dataset)
        # init anno_cropper
        dataset.init_anno_cropper()
        # generate infer roidbs
        ori_roidbs = dataset.get_ori_roidbs()
        roidbs = dataset.anno_cropper.crop_infer_anno_records(ori_roidbs)
        # set new roidbs
        dataset.set_roidbs(roidbs)

        return dataset

    def _eval_with_loader(self, loader):
        """Run ``self.infer_model`` over ``loader`` and collect the outputs.

        Each output dict additionally gets ``'im_shape'``, ``'scale_factor'``
        and ``'im_id'`` copied from the batch, and any value exposing
        ``numpy()`` is converted through it.

        Returns:
            list: One output dict per batch.
        """
        results = []
        with paddle.no_grad():
            self.infer_model.eval()
            for data in loader:
                outs = self.infer_model(data)
                for key in ['im_shape', 'scale_factor', 'im_id']:
                    outs[key] = data[key]
                for key, value in outs.items():
                    if hasattr(value, 'numpy'):
                        outs[key] = value.numpy()

                results.append(outs)

        return results

    def on_train_end(self, status):
        """Generate proposals with the model and save them to disk.

        Args:
            status (dict): Unused.
        """
        self.loader.dataset = self.dataset
        results = self._eval_with_loader(self.loader)
        results = self.dataset.anno_cropper.aggregate_chips_detections(results)
        # sniper
        proposals = []
        clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()}
        for outs in results:
            batch_res = get_infer_results(outs, clsid2catid)
            start = 0
            for i, _ in enumerate(outs['im_id']):
                bbox_num = outs['bbox_num']
                end = start + bbox_num[i]
                bbox_res = batch_res['bbox'][start:end] \
                    if 'bbox' in batch_res else None
                if bbox_res:
                    proposals += bbox_res
                # Advance to the next image's slice; without this every
                # image after the first would re-read boxes from offset 0.
                start = end
        logger.info("save proposals in {}".format(self.cfg.proposals_path))
        with open(self.cfg.proposals_path, 'w') as f:
            json.dump(proposals, f)