train.py 15.6 KB
Newer Older
B
Bai Yifan 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import logging
import paddle
import argparse
import functools
import math
import time
G
Guanghua Yu 已提交
27
import random
B
Bai Yifan 已提交
28 29 30 31
import numpy as np
from paddle.distributed import ParallelEnv
from paddle.static import load_program_state
from paddle.vision.models import mobilenet_v1
32
import paddle.vision.transforms as T
B
Bai Yifan 已提交
33 34 35 36 37 38 39 40
from paddleslim.common import get_logger
from paddleslim.dygraph.quant import QAT

# Make the sibling helper modules importable regardless of the directory the
# script is launched from. The previous code passed the *string* "__file__" to
# os.path.dirname(), which always yields "" and therefore resolved every path
# against the current working directory instead of this file's location.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(_SCRIPT_DIR)
from optimizer import create_optimizer
# Two levels up from this script; `utility` is imported from there.
sys.path.append(os.path.join(_SCRIPT_DIR, os.path.pardir, os.path.pardir))
from utility import add_arguments, print_arguments
41
from models.dygraph.mobilenet_v3 import MobileNetV3_large_x1_0
B
Bai Yifan 已提交
42 43 44 45 46 47

_logger = get_logger(__name__, level=logging.INFO)

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
# Command-line options as (name, type, default, help) rows. They are handed to
# `add_arg` one by one, in exactly this order.
_ARG_TABLE = (
    ('batch_size',       int,   128,                                      "Single Card Minibatch size."),
    ('use_gpu',          bool,  True,                                     "Whether to use GPU or not."),
    ('model',            str,   "mobilenet_v3",                           "The target model."),
    ('pretrained_model', str,   "MobileNetV3_large_x1_0_ssld_pretrained", "Whether to use pretrained model."),
    ('lr',               float, 0.0001,                                   "The learning rate used to fine-tune pruned model."),
    ('lr_strategy',      str,   "piecewise_decay",                        "The learning rate decay strategy."),
    ('l2_decay',         float, 3e-5,                                     "The l2_decay parameter."),
    ('ls_epsilon',       float, 0.0,                                      "Label smooth epsilon."),
    ('use_pact',         bool,  False,                                    "Whether to use PACT method."),
    ('ce_test',          bool,  False,                                    "Whether to CE test."),
    ('onnx_format',      bool,  False,                                    "Whether to export the quantized model with format of ONNX."),
    ('momentum_rate',    float, 0.9,                                      "The value of momentum_rate."),
    ('num_epochs',       int,   1,                                        "The number of total epochs."),
    ('total_images',     int,   1281167,                                  "The number of total training images."),
    ('data',             str,   "imagenet",                               "Which data to use. 'cifar10' or 'imagenet'"),
    ('log_period',       int,   10,                                       "Log period in batches."),
    ('model_save_dir',   str,   "./output_models",                        "model save directory."),
)
for _arg_row in _ARG_TABLE:
    add_arg(*_arg_row)
# The list-valued option is registered on the parser directly.
parser.add_argument(
    '--step_epochs',
    nargs='+',
    type=int,
    default=[10, 20, 30],
    help="piecewise decay step")
# yapf: enable


def load_dygraph_pretrain(model, path=None, load_static_weights=False):
    """Load pretrained weights from ``path`` into ``model`` in place.

    Args:
        model: Object exposing ``state_dict()`` and ``set_dict()``.
        path: Checkpoint location. Either a directory, or a prefix such that
            ``path + '.pdparams'`` exists.
        load_static_weights: When true, the weights are read with
            ``load_program_state`` and matched to the model by each
            parameter's ``name`` attribute; parameters without a match keep
            their current value. When false, ``path + '.pdparams'`` is read
            with ``paddle.load`` and applied as-is.

    Raises:
        ValueError: If ``path`` is ``None``, or if it is neither a directory
            nor the prefix of an existing ``.pdparams`` file.
    """
    # `path` defaults to None; check it explicitly so the caller gets the
    # documented ValueError rather than a TypeError from os.path.isdir(None).
    if path is None or not (os.path.isdir(path) or
                            os.path.exists(path + '.pdparams')):
        raise ValueError("Model pretrain path {} does not "
                         "exists.".format(path))

    if load_static_weights:
        pre_state_dict = load_program_state(path)
        param_state_dict = {}
        for key, param in model.state_dict().items():
            weight_name = param.name
            if weight_name in pre_state_dict:
                print('Load weight: {}, shape: {}'.format(
                    weight_name, pre_state_dict[weight_name].shape))
                param_state_dict[key] = pre_state_dict[weight_name]
            else:
                # No pretrained value available: keep the current parameter.
                param_state_dict[key] = param
        model.set_dict(param_state_dict)
        return

    # NOTE(review): a bare directory passes the check above, but this branch
    # always reads `path + ".pdparams"` -- confirm that is intended.
    param_state_dict = paddle.load(path + ".pdparams")
    model.set_dict(param_state_dict)
    return


def _build_datasets(args):
    """Create the datasets selected by ``args.data``.

    Returns:
        tuple: ``(train_dataset, val_dataset, class_dim)``.

    Raises:
        ValueError: If ``args.data`` is neither ``"cifar10"`` nor
            ``"imagenet"``.

    Note:
        For ``"cifar10"`` this overwrites ``args.total_images`` with 50000.
    """
    if args.data == "cifar10":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(
            mode="train", backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.Cifar10(
            mode="test", backend="cv2", transform=transform)
        args.total_images = 50000
        return train_dataset, val_dataset, 10
    if args.data == "imagenet":
        # Imported lazily so the reader module is only needed for ImageNet.
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        return train_dataset, val_dataset, 1000
    raise ValueError("{} is not supported.".format(args.data))


def _build_model(args, class_dim):
    """Instantiate the network named by ``args.model``.

    Pretrained weights are requested only when training on ImageNet.

    Raises:
        ValueError: If ``args.model`` is not ``"mobilenet_v1"`` or
            ``"mobilenet_v3"``.
    """
    pretrain = args.data == "imagenet"
    if args.model == "mobilenet_v1":
        return mobilenet_v1(pretrained=pretrain, num_classes=class_dim)
    if args.model == "mobilenet_v3":
        net = MobileNetV3_large_x1_0(class_dim=class_dim)
        if pretrain:
            load_dygraph_pretrain(net, args.pretrained_model, True)
        return net
    raise ValueError("{} is not supported.".format(args.model))


def _build_quant_config(args):
    """Return the QAT configuration dict, honouring the PACT/ONNX flags."""
    quant_config = {
        # weight preprocess type, default is None and no preprocessing is performed.
        'weight_preprocess_type': None,
        # activation preprocess type, default is None and no preprocessing is performed.
        'activation_preprocess_type': None,
        # weight quantize type, default is 'channel_wise_abs_max'
        'weight_quantize_type': 'channel_wise_abs_max',
        # activation quantize type, default is 'moving_average_abs_max'
        'activation_quantize_type': 'moving_average_abs_max',
        # weight quantize bit num, default is 8
        'weight_bits': 8,
        # activation quantize bit num, default is 8
        'activation_bits': 8,
        # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8'
        'dtype': 'int8',
        # window size for 'range_abs_max' quantization. default is 10000
        'window_size': 10000,
        # The decay coefficient of moving average, default is 0.9
        'moving_rate': 0.9,
        # for dygraph quantization, layers of type in quantizable_layer_type will be quantized
        'quantizable_layer_type': ['Conv2D', 'Linear'],
        # Whether to export the quantized model with format of ONNX.
        'onnx_format': args.onnx_format,
    }
    if args.use_pact:
        quant_config['activation_preprocess_type'] = 'PACT'
    return quant_config


def compress(args):
    """Run quantization-aware training and export the quantized model.

    Steps: build the datasets and model, quantize the model with ``QAT``,
    train for ``args.num_epochs`` epochs (evaluating and checkpointing after
    each one), then on rank 0 reload the best checkpoint and save the
    quantized inference model under
    ``<args.model_save_dir>/inference_model/qat_model``.

    Args:
        args: Parsed command-line namespace. ``args.total_images`` is
            overwritten when ``args.data == "cifar10"``.

    Raises:
        ValueError: If ``args.data`` or ``args.model`` is not supported.
    """
    num_workers = 4
    shuffle = True
    if args.ce_test:
        # Fixed seeds, no worker processes and no shuffling for the CE test.
        seed = 111
        paddle.seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        num_workers = 0
        shuffle = False

    train_dataset, val_dataset, class_dim = _build_datasets(args)

    trainer_num = paddle.distributed.get_world_size()
    use_data_parallel = trainer_num != 1

    place = paddle.set_device('gpu' if args.use_gpu else 'cpu')
    # model definition
    if use_data_parallel:
        paddle.distributed.init_parallel_env()

    net = _build_model(args, class_dim)
    _logger.info("Origin model summary:")
    paddle.summary(net, (1, 3, 224, 224))

    ############################################################################################################
    # 1. quantization configs
    ############################################################################################################
    quant_config = _build_quant_config(args)

    ############################################################################################################
    # 2. Quantize the model with QAT (quant aware training)
    ############################################################################################################

    quanter = QAT(config=quant_config)
    quanter.quantize(net)

    _logger.info("QAT model summary:")
    paddle.summary(net, (1, 3, 224, 224))

    opt, lr = create_optimizer(net, trainer_num, args)

    if use_data_parallel:
        net = paddle.DataParallel(net)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=shuffle,
        drop_last=True)
    train_loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=train_batch_sampler,
        places=place,
        return_list=True,
        num_workers=num_workers)

    valid_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False,
        return_list=True,
        num_workers=num_workers)

    @paddle.no_grad()
    def test(epoch, net):
        """Evaluate ``net`` on the validation loader; return mean top-1."""
        net.eval()
        batch_id = 0
        acc_top1_ns = []
        acc_top5_ns = []

        eval_reader_cost = 0.0
        eval_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for data in valid_loader():
            eval_reader_cost += time.time() - reader_start
            image = data[0]
            label = data[1]
            if args.data == "cifar10":
                label = paddle.reshape(label, [-1, 1])

            eval_start = time.time()

            out = net(image)
            acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
            acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

            eval_run_cost += time.time() - eval_start
            batch_size = image.shape[0]
            total_samples += batch_size

            if batch_id % args.log_period == 0:
                # Only one batch has been accumulated when batch 0 is logged.
                log_period = 1 if batch_id == 0 else args.log_period
                _logger.info(
                    "Eval epoch[{}] batch[{}] - top1: {:.6f}; top5: {:.6f}; avg_reader_cost: {:.6f} s, avg_batch_cost: {:.6f} s, avg_samples: {}, avg_ips: {:.3f} images/s".
                    format(epoch, batch_id,
                           np.mean(acc_top1.numpy()),
                           np.mean(acc_top5.numpy()), eval_reader_cost /
                           log_period, (eval_reader_cost + eval_run_cost) /
                           log_period, total_samples / log_period, total_samples
                           / (eval_reader_cost + eval_run_cost)))
                eval_reader_cost = 0.0
                eval_run_cost = 0.0
                total_samples = 0
            acc_top1_ns.append(np.mean(acc_top1.numpy()))
            acc_top5_ns.append(np.mean(acc_top5.numpy()))
            batch_id += 1
            reader_start = time.time()

        # NOTE(review): this is an unweighted mean over batches; with
        # drop_last=False a smaller final batch counts as much as a full one.
        final_top1 = np.mean(np.array(acc_top1_ns))
        _logger.info(
            "Final eval epoch[{}] - acc_top1: {:.6f}; acc_top5: {:.6f}".format(
                epoch, final_top1, np.mean(np.array(acc_top5_ns))))
        return final_top1

    def cross_entropy(input, target, ls_epsilon):
        """Mean cross-entropy, with label smoothing when ``ls_epsilon > 0``."""
        if ls_epsilon > 0:
            # Smoothing needs one-hot targets; convert index labels first.
            if target.shape[-1] != class_dim:
                target = paddle.nn.functional.one_hot(target, class_dim)
            target = paddle.nn.functional.label_smooth(
                target, epsilon=ls_epsilon)
            target = paddle.reshape(target, shape=[-1, class_dim])
            input = -paddle.nn.functional.log_softmax(input, axis=-1)
            cost = paddle.sum(target * input, axis=-1)
        else:
            cost = paddle.nn.functional.cross_entropy(input=input, label=target)
        avg_cost = paddle.mean(cost)
        return avg_cost

    def train(epoch, net):
        """Train ``net`` for one pass over the training loader."""
        net.train()
        batch_id = 0

        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for data in train_loader():
            train_reader_cost += time.time() - reader_start

            image = data[0]
            label = data[1]
            if args.data == "cifar10":
                label = paddle.reshape(label, [-1, 1])

            train_start = time.time()
            out = net(image)
            avg_cost = cross_entropy(out, label, args.ls_epsilon)

            acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
            acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)
            avg_cost.backward()
            opt.step()
            opt.clear_grad()
            # The learning-rate schedule is advanced once per batch.
            lr.step()

            loss_n = np.mean(avg_cost.numpy())
            acc_top1_n = np.mean(acc_top1.numpy())
            acc_top5_n = np.mean(acc_top5.numpy())

            train_run_cost += time.time() - train_start
            batch_size = image.shape[0]
            total_samples += batch_size

            if batch_id % args.log_period == 0:
                # Only one batch has been accumulated when batch 0 is logged.
                log_period = 1 if batch_id == 0 else args.log_period
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {:.6f}; top1: {:.6f}; top5: {:.6f}; avg_reader_cost: {:.6f} s, avg_batch_cost: {:.6f} s, avg_samples: {}, avg_ips: {:.3f} images/s".
                    format(epoch, batch_id,
                           lr.get_lr(), loss_n, acc_top1_n, acc_top5_n,
                           train_reader_cost / log_period, (
                               train_reader_cost + train_run_cost) / log_period,
                           total_samples / log_period, total_samples / (
                               train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            batch_id += 1
            reader_start = time.time()

    def save_checkpoint(tag):
        """Save model and optimizer state as ``<model_save_dir>/<tag>.*``."""
        model_prefix = os.path.join(args.model_save_dir, tag)
        paddle.save(net.state_dict(), model_prefix + ".pdparams")
        paddle.save(opt.state_dict(), model_prefix + ".pdopt")

    ############################################################################################################
    # train loop
    ############################################################################################################
    # Only rank 0 writes checkpoints and exports the final model.
    is_main_process = paddle.distributed.get_rank() == 0
    # Start below any reachable accuracy so the first evaluated epoch always
    # produces a "best_model" checkpoint, even if its accuracy is 0.0.
    best_acc1 = -1.0
    best_epoch = None
    for i in range(args.num_epochs):
        train(i, net)
        acc1 = test(i, net)
        if is_main_process:
            save_checkpoint("epoch_" + str(i))

        if acc1 > best_acc1:
            best_acc1 = acc1
            best_epoch = i
            if is_main_process:
                save_checkpoint("best_model")

    ############################################################################################################
    # 3. Save quant aware model
    ############################################################################################################
    if is_main_process:
        if best_epoch is None:
            # No epoch improved on the initial value (e.g. num_epochs == 0),
            # so no best_model was written in this run: export the current
            # weights instead of failing on a missing checkpoint.
            _logger.warning(
                "No best_model checkpoint was saved in this run; "
                "exporting the current weights.")
        else:
            _logger.info("Best top1: {:.6f} at epoch {}".format(best_acc1,
                                                               best_epoch))
            # load best model
            load_dygraph_pretrain(net,
                                  os.path.join(args.model_save_dir,
                                               "best_model"))

        path = os.path.join(args.model_save_dir, "inference_model", 'qat_model')
        quanter.save_quantized_model(
            net,
            path,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None, 3, 224, 224], dtype='float32')
            ])
B
Bai Yifan 已提交
366 367 368 369 370 371 372 373 374 375


def main():
    """Script entry point: parse the CLI, echo the options, run `compress`."""
    cli_options = parser.parse_args()
    print_arguments(cli_options)
    compress(cli_options)


if __name__ == "__main__":
    main()