train.py
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import paddle.fluid as fluid
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.fluid.io import DataLoader
from paddle.incubate.hapi.distributed import DistributedBatchSampler

import utils.logging as logging
from utils import load_pretrained_model
from utils import resume
from utils import Timer, calculate_eta
from .val import evaluate


def train(model,
          train_dataset,
          places=None,
          eval_dataset=None,
          optimizer=None,
          save_dir='output',
          num_epochs=100,
          batch_size=2,
          pretrained_model=None,
          resume_model=None,
          save_interval_epochs=1,
          log_steps=10,
          num_classes=None,
          num_workers=8,
          use_vdl=False):
    ignore_index = model.ignore_index
    nranks = ParallelEnv().nranks

    start_epoch = 0
    if resume_model is not None:
        start_epoch = resume(model, optimizer, resume_model)
    elif pretrained_model is not None:
        load_pretrained_model(model, pretrained_model)

    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

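    # With multiple GPUs, prepare a parallel context and wrap the model in
    # DataParallel so gradients can be synchronized across devices.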
    if nranks > 1:
        strategy = fluid.dygraph.prepare_context()
        ddp_model = fluid.dygraph.DataParallel(model, strategy)

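    # DistributedBatchSampler shards batches across ranks; drop_last keeps
    # per-step batch sizes identical on every device.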
    batch_sampler = DistributedBatchSampler(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    loader = DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        places=places,
        num_workers=num_workers,
        return_list=True,
    )

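    # Optionally log training scalars to VisualDL for later inspection.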
    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

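    # Bookkeeping for step counting, loss averaging, timing and ETA estimates.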
    timer = Timer()
    avg_loss = 0.0
    steps_per_epoch = len(batch_sampler)
    total_steps = steps_per_epoch * (num_epochs - start_epoch)
    num_steps = 0
    best_mean_iou = -1.0
    best_model_epoch = -1
    train_reader_cost = 0.0
    train_batch_cost = 0.0
    for epoch in range(start_epoch, num_epochs):
        timer.start()
        for step, data in enumerate(loader):
            train_reader_cost += timer.elapsed_time()
            images = data[0]
            labels = data[1].astype('int64')
            if nranks > 1:
                loss = ddp_model(images, labels)
                # apply_collective_grads sums gradients over multiple GPUs.
                loss = ddp_model.scale_loss(loss)
                loss.backward()
                ddp_model.apply_collective_grads()
            else:
                loss = model(images, labels)
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()
            avg_loss += loss.numpy()[0]
            lr = optimizer.current_step_lr()
            num_steps += 1
            train_batch_cost += timer.elapsed_time()
            if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0:
                avg_loss /= log_steps
                avg_train_reader_cost = train_reader_cost / log_steps
                avg_train_batch_cost = train_batch_cost / log_steps
                train_reader_cost = 0.0
                train_batch_cost = 0.0
                remain_steps = total_steps - num_steps
                eta = calculate_eta(remain_steps, avg_train_batch_cost)
                logging.info(
                    "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
                    .format(epoch + 1, num_epochs, step + 1, steps_per_epoch,
                            avg_loss * nranks, lr, avg_train_batch_cost,
                            avg_train_reader_cost, eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss * nranks,
                                          num_steps)
                    log_writer.add_scalar('Train/lr', lr, num_steps)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, num_steps)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, num_steps)
                avg_loss = 0.0
            timer.restart()

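        # Checkpoint on rank 0 every `save_interval_epochs` epochs and at the
        # final epoch; model and optimizer states share the 'model' prefix.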
        if ((epoch + 1) % save_interval_epochs == 0
                or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0:
            current_save_dir = os.path.join(save_dir,
                                            "epoch_{}".format(epoch + 1))
            if not os.path.isdir(current_save_dir):
                os.makedirs(current_save_dir)
            fluid.save_dygraph(model.state_dict(),
                               os.path.join(current_save_dir, 'model'))
            fluid.save_dygraph(optimizer.state_dict(),
                               os.path.join(current_save_dir, 'model'))

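            # Evaluate the fresh checkpoint and keep the best epoch by mIoU.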
            if eval_dataset is not None:
                mean_iou, avg_acc = evaluate(
                    model,
                    eval_dataset,
                    model_dir=current_save_dir,
                    num_classes=num_classes,
                    ignore_index=ignore_index,
                    epoch_id=epoch + 1)
                if mean_iou > best_mean_iou:
                    best_mean_iou = mean_iou
                    best_model_epoch = epoch + 1
                    best_model_dir = os.path.join(save_dir, "best_model")
                    fluid.save_dygraph(model.state_dict(),
                                       os.path.join(best_model_dir, 'model'))
                logging.info(
                    'Current best model on eval_dataset is epoch_{}, miou={:.4f}'
                    .format(best_model_epoch, best_mean_iou))

                if use_vdl:
                    log_writer.add_scalar('Evaluate/mIoU', mean_iou, epoch + 1)
                    log_writer.add_scalar('Evaluate/aAcc', avg_acc, epoch + 1)
                model.train()
    if use_vdl:
        log_writer.close()
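
# A minimal usage sketch (illustrative only; `MyModel`, `MyDataset` and the
# import path are hypothetical placeholders, not a confirmed API of this repo):
#
#     import paddle.fluid as fluid
#     from core import train  # adjust to wherever this module lives
#
#     with fluid.dygraph.guard():
#         model = MyModel(num_classes=2)  # must expose an `ignore_index` attribute
#         train_dataset = MyDataset(mode='train')
#         optimizer = fluid.optimizer.AdamOptimizer(
#             learning_rate=0.01, parameter_list=model.parameters())
#         train.train(model,
#                     train_dataset,
#                     optimizer=optimizer,
#                     num_classes=2,
#                     num_epochs=10)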