trainer.py 21.1 KB
Newer Older
F
Feng Ni 已提交
1 2 3 4 5 6 7 8 9 10 11 12
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
K
Kaipeng Deng 已提交
13 14 15 16 17 18 19 20 21 22 23 24 25 26
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import random
import datetime
import numpy as np
from PIL import Image

import paddle
W
wangguanzhong 已提交
27 28
import paddle.distributed as dist
from paddle.distributed import fleet
29
from paddle import amp
K
Kaipeng Deng 已提交
30
from paddle.static import InputSpec
31
from ppdet.optimizer import ModelEMA
K
Kaipeng Deng 已提交
32 33 34

from ppdet.core.workspace import create
from ppdet.utils.checkpoint import load_weight, load_pretrain_weight
C
cnn 已提交
35
from ppdet.utils.visualizer import visualize_results, save_result
G
George Ni 已提交
36
from ppdet.metrics import JDEDetMetric, JDEReIDMetric
37
from ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results, KeyPointTopDownCOCOEval
K
Kaipeng Deng 已提交
38
from ppdet.data.source.category import get_categories
K
Kaipeng Deng 已提交
39 40
import ppdet.utils.stats as stats

41
from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter
K
Kaipeng Deng 已提交
42 43 44
from .export_utils import _dump_infer_config

from ppdet.utils.logger import setup_logger
45
logger = setup_logger('ppdet.engine')
K
Kaipeng Deng 已提交
46 47 48 49 50 51 52 53 54 55

__all__ = ['Trainer']


class Trainer(object):
    def __init__(self, cfg, mode='train'):
        self.cfg = cfg
        assert mode.lower() in ['train', 'eval', 'test'], \
                "mode should be 'train', 'eval' or 'test'"
        self.mode = mode.lower()
56
        self.optimizer = None
57
        self.is_loaded_weights = False
K
Kaipeng Deng 已提交
58

G
George Ni 已提交
59 60 61 62 63 64 65 66 67 68
        # build data loader
        self.dataset = cfg['{}Dataset'.format(self.mode.capitalize())]
        if self.mode == 'train':
            self.loader = create('{}Reader'.format(self.mode.capitalize()))(
                self.dataset, cfg.worker_num)

        if cfg.architecture == 'JDE' and self.mode == 'train':
            cfg['JDEEmbeddingHead'][
                'num_identifiers'] = self.dataset.total_identities

K
Kaipeng Deng 已提交
69
        # build model
70 71 72 73 74
        if 'model' not in self.cfg:
            self.model = create(cfg.architecture)
        else:
            self.model = self.cfg.model
            self.is_loaded_weights = True
75

76 77 78 79 80
        self.use_ema = ('use_ema' in cfg and cfg['use_ema'])
        if self.use_ema:
            self.ema = ModelEMA(
                cfg['ema_decay'], self.model, use_thres_step=True)

K
Kaipeng Deng 已提交
81 82 83 84 85 86 87 88
        # EvalDataset build with BatchSampler to evaluate in single device
        # TODO: multi-device evaluate
        if self.mode == 'eval':
            self._eval_batch_sampler = paddle.io.BatchSampler(
                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
            self.loader = create('{}Reader'.format(self.mode.capitalize()))(
                self.dataset, cfg.worker_num, self._eval_batch_sampler)
        # TestDataset build after user set images, skip loader creation here
K
Kaipeng Deng 已提交
89 90 91 92 93 94 95 96

        # build optimizer in train mode
        if self.mode == 'train':
            steps_per_epoch = len(self.loader)
            self.lr = create('LearningRate')(steps_per_epoch)
            self.optimizer = create('OptimizerBuilder')(self.lr,
                                                        self.model.parameters())

W
wangguanzhong 已提交
97 98
        self._nranks = dist.get_world_size()
        self._local_rank = dist.get_rank()
K
Kaipeng Deng 已提交
99

K
Kaipeng Deng 已提交
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
        self.status = {}

        self.start_epoch = 0
        self.end_epoch = cfg.epoch

        # initial default callbacks
        self._init_callbacks()

        # initial default metrics
        self._init_metrics()
        self._reset_metrics()

    def _init_callbacks(self):
        if self.mode == 'train':
            self._callbacks = [LogPrinter(self), Checkpointer(self)]
115
            if 'use_vdl' in self.cfg and self.cfg.use_vdl:
116
                self._callbacks.append(VisualDLWriter(self))
K
Kaipeng Deng 已提交
117 118 119
            self._compose_callback = ComposeCallback(self._callbacks)
        elif self.mode == 'eval':
            self._callbacks = [LogPrinter(self)]
120 121
            if self.cfg.metric == 'WiderFace':
                self._callbacks.append(WiferFaceEval(self))
K
Kaipeng Deng 已提交
122
            self._compose_callback = ComposeCallback(self._callbacks)
123
        elif self.mode == 'test' and 'use_vdl' in self.cfg and self.cfg.use_vdl:
124 125
            self._callbacks = [VisualDLWriter(self)]
            self._compose_callback = ComposeCallback(self._callbacks)
K
Kaipeng Deng 已提交
126 127 128 129
        else:
            self._callbacks = []
            self._compose_callback = None

K
Kaipeng Deng 已提交
130 131
    def _init_metrics(self, validate=False):
        if self.mode == 'test' or (self.mode == 'train' and not validate):
G
Guanghua Yu 已提交
132 133
            self._metrics = []
            return
134
        classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False
K
Kaipeng Deng 已提交
135
        if self.cfg.metric == 'COCO':
W
wangxinxin08 已提交
136
            # TODO: bias should be unified
137
            bias = self.cfg['bias'] if 'bias' in self.cfg else 0
S
shangliang Xu 已提交
138 139
            output_eval = self.cfg['output_eval'] \
                if 'output_eval' in self.cfg else None
140 141
            save_prediction_only = self.cfg['save_prediction_only'] \
                if 'save_prediction_only' in self.cfg else False
142 143 144

            # pass clsid2catid info to metric instance to avoid multiple loading
            # annotation file
K
Kaipeng Deng 已提交
145 146
            clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \
                                if self.mode == 'eval' else None
147 148 149 150 151 152 153 154 155

            # when do validation in train, annotation file should be get from
            # EvalReader instead of self.dataset(which is TrainReader)
            anno_file = self.dataset.get_anno()
            if self.mode == 'train' and validate:
                eval_dataset = self.cfg['EvalDataset']
                eval_dataset.check_or_download_dataset()
                anno_file = eval_dataset.get_anno()

156
            IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox'
W
wangxinxin08 已提交
157 158
            self._metrics = [
                COCOMetric(
159
                    anno_file=anno_file,
K
Kaipeng Deng 已提交
160
                    clsid2catid=clsid2catid,
161
                    classwise=classwise,
S
shangliang Xu 已提交
162
                    output_eval=output_eval,
163
                    bias=bias,
164
                    IouType=IouType,
165
                    save_prediction_only=save_prediction_only)
W
wangxinxin08 已提交
166
            ]
K
Kaipeng Deng 已提交
167 168 169
        elif self.cfg.metric == 'VOC':
            self._metrics = [
                VOCMetric(
170
                    label_list=self.dataset.get_label_list(),
K
Kaipeng Deng 已提交
171
                    class_num=self.cfg.num_classes,
172 173
                    map_type=self.cfg.map_type,
                    classwise=classwise)
K
Kaipeng Deng 已提交
174
            ]
175 176 177 178 179 180 181 182 183
        elif self.cfg.metric == 'WiderFace':
            multi_scale = self.cfg.multi_scale_eval if 'multi_scale_eval' in self.cfg else True
            self._metrics = [
                WiderFaceMetric(
                    image_dir=os.path.join(self.dataset.dataset_dir,
                                           self.dataset.image_dir),
                    anno_file=self.dataset.get_anno(),
                    multi_scale=multi_scale)
            ]
184 185 186 187 188 189 190 191 192
        elif self.cfg.metric == 'KeyPointTopDownCOCOEval':
            eval_dataset = self.cfg['EvalDataset']
            eval_dataset.check_or_download_dataset()
            anno_file = eval_dataset.get_anno()
            self._metrics = [
                KeyPointTopDownCOCOEval(anno_file,
                                        len(eval_dataset), self.cfg.num_joints,
                                        self.cfg.save_dir)
            ]
G
George Ni 已提交
193 194 195 196
        elif self.cfg.metric == 'MOTDet':
            self._metrics = [JDEDetMetric(), ]
        elif self.cfg.metric == 'ReID':
            self._metrics = [JDEReIDMetric(), ]
K
Kaipeng Deng 已提交
197
        else:
K
Kaipeng Deng 已提交
198 199
            logger.warn("Metric not support for metric type {}".format(
                self.cfg.metric))
K
Kaipeng Deng 已提交
200 201 202 203 204 205 206
            self._metrics = []

    def _reset_metrics(self):
        for metric in self._metrics:
            metric.reset()

    def register_callbacks(self, callbacks):
        """Append extra callbacks and rebuild the composed callback.

        Args:
            callbacks: Iterable of ``Callback`` instances; ``None`` entries
                are ignored.

        Raises:
            AssertionError: If an entry is not a ``Callback`` instance.
        """
        callbacks = [c for c in callbacks if c is not None]
        for c in callbacks:
            # message previously referred to metrics by mistake
            assert isinstance(c, Callback), \
                    "callbacks should be instances of subclass of Callback"
        self._callbacks.extend(callbacks)
        self._compose_callback = ComposeCallback(self._callbacks)

    def register_metrics(self, metrics):
        metrics = [m for m in list(metrics) if m is not None]
        for m in metrics:
            assert isinstance(m, Metric), \
                    "metrics shoule be instances of subclass of Metric"
        self._metrics.extend(metrics)

K
Kaipeng Deng 已提交
221
    def load_weights(self, weights):
222 223
        if self.is_loaded_weights:
            return
K
Kaipeng Deng 已提交
224
        self.start_epoch = 0
G
George Ni 已提交
225 226 227 228
        if hasattr(self.model, 'detector'):
            load_pretrain_weight(self.model.detector, weights)
        else:
            load_pretrain_weight(self.model, weights)
K
Kaipeng Deng 已提交
229 230 231
        logger.debug("Load weights {} to start training".format(weights))

    def resume_weights(self, weights):
        """Resume from a checkpoint and set ``start_epoch`` from it.

        ``self.start_epoch`` is set to whatever ``load_weight`` returns. For
        a model exposing ``student_model`` (distillation), the checkpoint is
        loaded into that sub-model.

        Args:
            weights: Passed through to ``load_weight``.
        """
        # support Distill resume weights
        if hasattr(self.model, 'student_model'):
            target = self.model.student_model
        else:
            target = self.model
        self.start_epoch = load_weight(target, weights, self.optimizer)
        logger.debug("Resume weights of epoch {}".format(self.start_epoch))
K
Kaipeng Deng 已提交
239

K
Kaipeng Deng 已提交
240
    def train(self, validate=False):
K
Kaipeng Deng 已提交
241 242
        assert self.mode == 'train', "Model not in 'train' mode"

K
Kaipeng Deng 已提交
243 244 245 246 247
        # if validation in training is enabled, metrics should be re-init
        if validate:
            self._init_metrics(validate=validate)
            self._reset_metrics()

248
        model = self.model
249 250 251 252 253
        if self.cfg.fleet:
            model = fleet.distributed_model(model)
            self.optimizer = fleet.distributed_optimizer(
                self.optimizer).user_defined_optimizer
        elif self._nranks > 1:
G
George Ni 已提交
254 255 256 257
            find_unused_parameters = self.cfg[
                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
            model = paddle.DataParallel(
                self.model, find_unused_parameters=find_unused_parameters)
258 259 260 261 262

        # initial fp16
        if self.cfg.fp16:
            scaler = amp.GradScaler(
                enable=self.cfg.use_gpu, init_loss_scaling=1024)
K
Kaipeng Deng 已提交
263

K
Kaipeng Deng 已提交
264 265 266 267 268 269 270 271 272 273 274 275 276
        self.status.update({
            'epoch_id': self.start_epoch,
            'step_id': 0,
            'steps_per_epoch': len(self.loader)
        })

        self.status['batch_time'] = stats.SmoothedValue(
            self.cfg.log_iter, fmt='{avg:.4f}')
        self.status['data_time'] = stats.SmoothedValue(
            self.cfg.log_iter, fmt='{avg:.4f}')
        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

        for epoch_id in range(self.start_epoch, self.cfg.epoch):
K
Kaipeng Deng 已提交
277
            self.status['mode'] = 'train'
K
Kaipeng Deng 已提交
278 279 280
            self.status['epoch_id'] = epoch_id
            self._compose_callback.on_epoch_begin(self.status)
            self.loader.dataset.set_epoch(epoch_id)
K
Kaipeng Deng 已提交
281
            model.train()
K
Kaipeng Deng 已提交
282 283 284 285 286 287
            iter_tic = time.time()
            for step_id, data in enumerate(self.loader):
                self.status['data_time'].update(time.time() - iter_tic)
                self.status['step_id'] = step_id
                self._compose_callback.on_step_begin(self.status)

288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
                if self.cfg.fp16:
                    with amp.auto_cast(enable=self.cfg.use_gpu):
                        # model forward
                        outputs = model(data)
                        loss = outputs['loss']

                    # model backward
                    scaled_loss = scaler.scale(loss)
                    scaled_loss.backward()
                    # in dygraph mode, optimizer.minimize is equal to optimizer.step
                    scaler.minimize(self.optimizer, scaled_loss)
                else:
                    # model forward
                    outputs = model(data)
                    loss = outputs['loss']
                    # model backward
                    loss.backward()
                    self.optimizer.step()
K
Kaipeng Deng 已提交
306 307 308 309 310 311

                curr_lr = self.optimizer.get_lr()
                self.lr.step()
                self.optimizer.clear_grad()
                self.status['learning_rate'] = curr_lr

K
Kaipeng Deng 已提交
312
                if self._nranks < 2 or self._local_rank == 0:
K
Kaipeng Deng 已提交
313 314 315 316
                    self.status['training_staus'].update(outputs)

                self.status['batch_time'].update(time.time() - iter_tic)
                self._compose_callback.on_step_end(self.status)
317 318
                if self.use_ema:
                    self.ema.update(self.model)
F
Feng Ni 已提交
319
                iter_tic = time.time()
K
Kaipeng Deng 已提交
320

321 322 323 324 325
            # apply ema weight on model
            if self.use_ema:
                weight = self.model.state_dict()
                self.model.set_dict(self.ema.apply())

K
Kaipeng Deng 已提交
326 327
            self._compose_callback.on_epoch_end(self.status)

K
Kaipeng Deng 已提交
328
            if validate and (self._nranks < 2 or self._local_rank == 0) \
G
Guanghua Yu 已提交
329
                    and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \
K
Kaipeng Deng 已提交
330 331 332 333 334 335 336 337 338 339 340 341 342
                             or epoch_id == self.end_epoch - 1):
                if not hasattr(self, '_eval_loader'):
                    # build evaluation dataset and loader
                    self._eval_dataset = self.cfg.EvalDataset
                    self._eval_batch_sampler = \
                        paddle.io.BatchSampler(
                            self._eval_dataset,
                            batch_size=self.cfg.EvalReader['batch_size'])
                    self._eval_loader = create('EvalReader')(
                        self._eval_dataset,
                        self.cfg.worker_num,
                        batch_sampler=self._eval_batch_sampler)
                with paddle.no_grad():
343
                    self.status['save_best_model'] = True
K
Kaipeng Deng 已提交
344 345
                    self._eval_with_loader(self._eval_loader)

346 347 348 349
            # restore origin weight on model
            if self.use_ema:
                self.model.set_dict(weight)

K
Kaipeng Deng 已提交
350
    def _eval_with_loader(self, loader):
K
Kaipeng Deng 已提交
351 352 353
        sample_num = 0
        tic = time.time()
        self._compose_callback.on_epoch_begin(self.status)
K
Kaipeng Deng 已提交
354 355 356
        self.status['mode'] = 'eval'
        self.model.eval()
        for step_id, data in enumerate(loader):
K
Kaipeng Deng 已提交
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375
            self.status['step_id'] = step_id
            self._compose_callback.on_step_begin(self.status)
            # forward
            outs = self.model(data)

            # update metrics
            for metric in self._metrics:
                metric.update(data, outs)

            sample_num += data['im_id'].numpy().shape[0]
            self._compose_callback.on_step_end(self.status)

        self.status['sample_num'] = sample_num
        self.status['cost_time'] = time.time() - tic

        # accumulate metric to log out
        for metric in self._metrics:
            metric.accumulate()
            metric.log()
376
        self._compose_callback.on_epoch_end(self.status)
K
Kaipeng Deng 已提交
377 378 379
        # reset metric states for metric may performed multiple times
        self._reset_metrics()

K
Kaipeng Deng 已提交
380
    def evaluate(self):
        """Evaluate on ``self.loader`` with gradient tracking disabled."""
        with paddle.no_grad():
            eval_loader = self.loader
            self._eval_with_loader(eval_loader)
K
Kaipeng Deng 已提交
383

C
cnn 已提交
384 385 386 387 388
    def predict(self,
                images,
                draw_threshold=0.5,
                output_dir='output',
                save_txt=False):
        """Run inference on ``images`` and save the visualized results.

        For every image the drawn result is written to ``output_dir`` under
        the source file name; with ``save_txt`` a ``.txt`` result file with
        the same stem is written next to it.

        Args:
            images: Passed to ``self.dataset.set_images``.
            draw_threshold (float): Forwarded to ``visualize_results`` and
                ``save_result``.
            output_dir (str): Directory for the output files; created if
                missing.
            save_txt (bool): Also save bbox / keypoint results as text.
        """
        self.dataset.set_images(images)
        loader = create('TestReader')(self.dataset, 0)

        imid2path = self.dataset.get_imid2path()

        anno_file = self.dataset.get_anno()
        clsid2catid, catid2name = get_categories(
            self.cfg.metric, anno_file=anno_file)

        # Run Infer
        self.status['mode'] = 'test'
        self.model.eval()
        for step_id, data in enumerate(loader):
            self.status['step_id'] = step_id
            # forward
            outs = self.model(data)

            for key in ['im_shape', 'scale_factor', 'im_id']:
                outs[key] = data[key]
            # convert anything exposing .numpy() so results are plain arrays
            for key, value in outs.items():
                if hasattr(value, 'numpy'):
                    outs[key] = value.numpy()

            batch_res = get_infer_results(outs, clsid2catid)
            bbox_num = outs['bbox_num']

            # results of the whole batch are concatenated; bbox_num[i] gives
            # the length of image i's slice
            start = 0
            for i, im_id in enumerate(outs['im_id']):
                image_path = imid2path[int(im_id)]
                image = Image.open(image_path).convert('RGB')
                self.status['original_image'] = np.array(image.copy())

                end = start + bbox_num[i]
                bbox_res, mask_res, segm_res, keypoint_res = [
                    batch_res[key][start:end] if key in batch_res else None
                    for key in ('bbox', 'mask', 'segm', 'keypoint')
                ]

                # use this image's id: int() on the whole batch's im_id
                # array fails as soon as a batch holds more than one image
                image = visualize_results(
                    image, bbox_res, mask_res, segm_res, keypoint_res,
                    int(im_id), catid2name, draw_threshold)
                self.status['result_image'] = np.array(image.copy())
                if self._compose_callback:
                    self._compose_callback.on_step_end(self.status)
                # save image with detection
                save_name = self._get_save_image_name(output_dir, image_path)
                logger.info("Detection bbox results save in {}".format(
                    save_name))
                image.save(save_name, quality=95)
                if save_txt:
                    save_path = os.path.splitext(save_name)[0] + '.txt'
                    results = {}
                    results["im_id"] = im_id
                    if bbox_res:
                        results["bbox_res"] = bbox_res
                    if keypoint_res:
                        results["keypoint_res"] = keypoint_res
                    save_result(save_path, results, catid2name, draw_threshold)
                start = end

    def _get_save_image_name(self, output_dir, image_path):
        """
        Get save image name from source image path.
        """
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        image_name = os.path.split(image_path)[-1]
        name, ext = os.path.splitext(image_name)
        return os.path.join(output_dir, "{}".format(name)) + ext

    def export(self, output_dir='output_inference'):
        """Export the model and its inference config.

        Files are written to ``output_dir/<config file stem>``: an
        ``infer_cfg.yml`` and the saved model under the ``model`` prefix.

        Args:
            output_dir (str): Root directory for exported models.
        """
        # switch to eval mode once, before anything is traced or saved
        self.model.eval()
        model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]
        save_dir = os.path.join(output_dir, model_name)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        image_shape = None
        if 'inputs_def' in self.cfg['TestReader']:
            inputs_def = self.cfg['TestReader']['inputs_def']
            image_shape = inputs_def.get('image_shape', None)
        # set image_shape=[3, -1, -1] as default
        if image_shape is None:
            image_shape = [3, -1, -1]

        if hasattr(self.model, 'deploy'):
            self.model.deploy = True

        # Save infer cfg
        _dump_infer_config(self.cfg,
                           os.path.join(save_dir, 'infer_cfg.yml'), image_shape,
                           self.model)

        input_spec = [{
            "image": InputSpec(
                shape=[None] + image_shape, name='image'),
            "im_shape": InputSpec(
                shape=[None, 2], name='im_shape'),
            "scale_factor": InputSpec(
                shape=[None, 2], name='scale_factor')
        }]

        # dy2st and save model
        # NOTE(review): the else branch runs only when cfg['slim'] == 'QAT'
        # yet calls a method on cfg.slim — confirm what cfg['slim'] holds.
        if 'slim' not in self.cfg or self.cfg['slim'] != 'QAT':
            static_model = paddle.jit.to_static(
                self.model, input_spec=input_spec)
            # NOTE: dy2st do not pruned program, but jit.save will prune program
            # input spec, prune input spec here and save with pruned input spec
            pruned_input_spec = self._prune_input_spec(
                input_spec, static_model.forward.main_program,
                static_model.forward.outputs)
            paddle.jit.save(
                static_model,
                os.path.join(save_dir, 'model'),
                input_spec=pruned_input_spec)
            logger.info("Export model and saved in {}".format(save_dir))
        else:
            self.cfg.slim.save_quantized_model(
                self.model,
                os.path.join(save_dir, 'model'),
                input_spec=input_spec)
513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529

    def _prune_input_spec(self, input_spec, program, targets):
        """Keep only the input specs whose variable survives program pruning.

        Args:
            input_spec: One-element list holding a dict of name -> spec.
            program: Static program; it is cloned, so the caller's program
                is not pruned in place.
            targets: Passed to the program's ``_prune`` as ``targets``.

        Returns:
            list: One-element list with a dict containing the subset of
            ``input_spec[0]`` whose names can be looked up in the pruned
            program's global block.
        """
        # try to prune static program to figure out pruned input spec
        # so we perform following operations in static mode
        paddle.enable_static()
        try:
            pruned_input_spec = [{}]
            program = program.clone()
            program = program._prune(targets=targets)
            global_block = program.global_block()
            for name, spec in input_spec[0].items():
                try:
                    global_block.var(name)
                except Exception:
                    # best-effort: a failed lookup is taken to mean the
                    # input was pruned away, so it is left out of the spec
                    continue
                pruned_input_spec[0][name] = spec
        finally:
            # always leave static mode, even if clone/prune raised
            paddle.disable_static()
        return pruned_input_spec