# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

import argparse
import ast

import paddle
import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid import framework

import math
import sys
import time

IMAGENET1000 = 1281167
base_lr = 0.1
momentum_rate = 0.9
l2_decay = 1e-4
34 35


36
def parse_args():
D
Divano 已提交
37
    parser = argparse.ArgumentParser("Training for Resnet.")
38 39 40 41
    parser.add_argument(
        "--use_data_parallel",
        type=ast.literal_eval,
        default=False,
C
chengduo 已提交
42 43 44 45 46 47
        help="The flag indicating whether to use data parallel mode to train the model."
    )
    parser.add_argument(
        "-e", "--epoch", default=120, type=int, help="set epoch")
    parser.add_argument(
        "-b", "--batch_size", default=32, type=int, help="set epoch")
D
Divano 已提交
48
    parser.add_argument("--ce", action="store_true", help="run ce")
H
hysunflower 已提交
49 50 51
   
    # NOTE:used in benchmark
    parser.add_argument("--max_iter", default=0, type=int, help="the max iters to train, used in benchmark")
52 53 54 55 56
    args = parser.parse_args()
    return args


args = parse_args()
D
Divano 已提交
57
batch_size = args.batch_size
58

C
chengduo 已提交
59

60
def optimizer_setting(parameter_list=None):
H
Hongyu Liu 已提交
61 62 63 64 65 66 67 68 69 70

    total_images = IMAGENET1000

    step = int(math.ceil(float(total_images) / batch_size))

    epochs = [30, 60, 90]
    bd = [step * e for e in epochs]

    lr = []
    lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
71 72 73 74 75 76 77 78 79 80 81 82 83
    if fluid.in_dygraph_mode():
        optimizer = fluid.optimizer.Momentum(
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=bd, values=lr),
            momentum=momentum_rate,
            regularization=fluid.regularizer.L2Decay(l2_decay),
            parameter_list=parameter_list)
    else:
        optimizer = fluid.optimizer.Momentum(
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=bd, values=lr),
            momentum=momentum_rate,
            regularization=fluid.regularizer.L2Decay(l2_decay))
H
Hongyu Liu 已提交
84 85

    return optimizer
86 87 88 89 90 91 92 93 94 95


class ConvBNLayer(fluid.dygraph.Layer):
    def __init__(self,
                 num_channels,
                 num_filters,
                 filter_size,
                 stride=1,
                 groups=1,
                 act=None):
96
        super(ConvBNLayer, self).__init__()
97 98

        self._conv = Conv2D(
99
            num_channels=num_channels,
100 101 102 103 104 105
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
X
xiaoting 已提交
106
            bias_attr=False)
107

108
        self._batch_norm = BatchNorm(num_filters, act=act)
109 110 111 112 113 114 115 116 117

    def forward(self, inputs):
        y = self._conv(inputs)
        y = self._batch_norm(y)

        return y


class BottleneckBlock(fluid.dygraph.Layer):
118
    def __init__(self, num_channels, num_filters, stride, shortcut=True):
119
        super(BottleneckBlock, self).__init__()
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165

        self.conv0 = ConvBNLayer(
            num_channels=num_channels,
            num_filters=num_filters,
            filter_size=1,
            act='relu')
        self.conv1 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu')
        self.conv2 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None)

        if not shortcut:
            self.short = ConvBNLayer(
                num_channels=num_channels,
                num_filters=num_filters * 4,
                filter_size=1,
                stride=stride)

        self.shortcut = shortcut

        self._num_channels_out = num_filters * 4

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)

        y = fluid.layers.elementwise_add(x=short, y=conv2)

        layer_helper = LayerHelper(self.full_name(), act='relu')
        return layer_helper.append_activation(y)


class ResNet(fluid.dygraph.Layer):
166 167
    def __init__(self, layers=50, class_dim=102):
        super(ResNet, self).__init__()
168 169 170 171 172 173 174 175 176 177 178 179

        self.layers = layers
        supported_layers = [50, 101, 152]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(supported_layers, layers)

        if layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
180
        num_channels = [64, 256, 512, 1024]
181 182 183
        num_filters = [64, 128, 256, 512]

        self.conv = ConvBNLayer(
184
            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
185
        self.pool2d_max = Pool2D(
186
            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
187 188 189 190 191 192 193 194

        self.bottleneck_block_list = []
        for block in range(len(depth)):
            shortcut = False
            for i in range(depth[block]):
                bottleneck_block = self.add_sublayer(
                    'bb_%d_%d' % (block, i),
                    BottleneckBlock(
195 196
                        num_channels=num_channels[block]
                        if i == 0 else num_filters[block] * 4,
197 198 199 200 201 202 203
                        num_filters=num_filters[block],
                        stride=2 if i == 0 and block != 0 else 1,
                        shortcut=shortcut))
                self.bottleneck_block_list.append(bottleneck_block)
                shortcut = True

        self.pool2d_avg = Pool2D(
204 205 206
            pool_size=7, pool_type='avg', global_pooling=True)

        self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 4 * 1 * 1
207 208 209 210

        import math
        stdv = 1.0 / math.sqrt(2048 * 1.0)

211 212 213 214 215 216
        self.out = Linear(
            self.pool2d_avg_output,
            class_dim,
            act='softmax',
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Uniform(-stdv, stdv)))
217 218 219 220 221 222 223

    def forward(self, inputs):
        y = self.conv(inputs)
        y = self.pool2d_max(y)
        for bottleneck_block in self.bottleneck_block_list:
            y = bottleneck_block(y)
        y = self.pool2d_avg(y)
224
        y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
225 226 227 228
        y = self.out(y)
        return y


def reader_decorator(reader):
    def __reader__():
        for item in reader():
            img = np.array(item[0]).astype('float32').reshape(3, 224, 224)
            label = np.array(item[1]).astype('int64').reshape(1)
            yield img, label

    return __reader__


def eval(model, data):

    model.eval()
    total_loss = 0.0
    total_acc1 = 0.0
    total_acc5 = 0.0
    total_sample = 0
    for batch_id, data in enumerate(data()):
247 248
        img = data[0]
        label = data[1]
249
        label.stop_gradient = True
H
Hongyu Liu 已提交
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266

        out = model(img)
        #loss = fluid.layers.cross_entropy(input=out, label=label)
        #avg_loss = fluid.layers.mean(x=loss)

        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

        #dy_out = avg_loss.numpy()

        #total_loss += dy_out
        total_acc1 += acc_top1.numpy()
        total_acc5 += acc_top5.numpy()
        total_sample += 1

        # print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out))
        if batch_id % 10 == 0:
267 268
            print("test | batch step %d, acc1 %0.3f acc5 %0.3f" % \
                  ( batch_id, total_acc1 / total_sample, total_acc5 / total_sample))
D
Divano 已提交
269 270 271
    if args.ce:
        print("kpis\ttest_acc1\t%0.3f" % (total_acc1 / total_sample))
        print("kpis\ttest_acc5\t%0.3f" % (total_acc5 / total_sample))
272 273
    print("final eval acc1 %0.3f acc5 %0.3f" % \
          (total_acc1 / total_sample, total_acc5 / total_sample))


def train_resnet():
D
Divano 已提交
277
    epoch = args.epoch
278 279 280
    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
        if args.use_data_parallel else fluid.CUDAPlace(0)
    with fluid.dygraph.guard(place):
D
Divano 已提交
281 282 283 284 285 286 287
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

288 289 290
        if args.use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()

291 292
        resnet = ResNet()
        optimizer = optimizer_setting(parameter_list=resnet.parameters())
293 294 295 296

        if args.use_data_parallel:
            resnet = fluid.dygraph.parallel.DataParallel(resnet, strategy)

297
        train_reader = paddle.batch(
298 299 300 301
            reader_decorator(paddle.dataset.flowers.train(use_xmap=True)),
            batch_size=batch_size,
            drop_last=True)

302
        if args.use_data_parallel:
303 304
            train_reader = fluid.contrib.reader.distributed_batch_reader(
                train_reader)
H
Hongyu Liu 已提交
305 306

        test_reader = paddle.batch(
307 308 309 310 311 312 313 314 315
            reader_decorator(paddle.dataset.flowers.test(use_xmap=True)),
            batch_size=batch_size,
            drop_last=True)

        train_loader = fluid.io.DataLoader.from_generator(capacity=10)
        train_loader.set_sample_list_generator(train_reader, places=place)

        test_loader = fluid.io.DataLoader.from_generator(capacity=10)
        test_loader.set_sample_list_generator(test_reader, places=place)
H
Hongyu Liu 已提交
316 317 318

        #file_name = './model/epoch_0.npz'
        #model_data = np.load( file_name )
319

H
hysunflower 已提交
320 321 322
        #NOTE: used in benchmark 
        total_batch_num = 0

323
        for eop in range(epoch):
H
Hongyu Liu 已提交
324 325 326 327 328 329 330 331 332 333 334 335 336

            resnet.train()
            total_loss = 0.0
            total_acc1 = 0.0
            total_acc5 = 0.0
            total_sample = 0

            #dict_state = resnet.state_dict()

            #resnet.load_dict( model_data )

            print("load finished")

337
            for batch_id, data in enumerate(train_loader()):
H
hysunflower 已提交
338 339 340 341 342
                #NOTE: used in benchmark
                if args.max_iter and total_batch_num == args.max_iter:
                    return
                batch_start = time.time()

343
                img, label = data
344
                label.stop_gradient = True
345

346 347 348
                out = resnet(img)
                loss = fluid.layers.cross_entropy(input=out, label=label)
                avg_loss = fluid.layers.mean(x=loss)
349

H
Hongyu Liu 已提交
350 351 352
                acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
                acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

353
                dy_out = avg_loss.numpy()
354 355 356 357 358 359 360

                if args.use_data_parallel:
                    avg_loss = resnet.scale_loss(avg_loss)
                    avg_loss.backward()
                    resnet.apply_collective_grads()
                else:
                    avg_loss.backward()
361

362 363
                optimizer.minimize(avg_loss)
                resnet.clear_gradients()
364

H
hysunflower 已提交
365 366
                batch_end = time.time()
                train_batch_cost = batch_end - batch_start
H
Hongyu Liu 已提交
367 368 369 370
                total_loss += dy_out
                total_acc1 += acc_top1.numpy()
                total_acc5 += acc_top5.numpy()
                total_sample += 1
H
hysunflower 已提交
371
                total_batch_num = total_batch_num + 1 #this is for benchmark
H
Hongyu Liu 已提交
372 373
                #print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out))
                if batch_id % 10 == 0:
H
hysunflower 已提交
374
                    print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f, batch cost: %.5f" % \
H
Hongyu Liu 已提交
375
                           ( eop, batch_id, total_loss / total_sample, \
H
hysunflower 已提交
376
                             total_acc1 / total_sample, total_acc5 / total_sample, train_batch_cost))
H
Hongyu Liu 已提交
377

D
Divano 已提交
378 379 380 381
            if args.ce:
                print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
                print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
                print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
H
Hongyu Liu 已提交
382 383 384 385
            print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \
                  (eop, batch_id, total_loss / total_sample, \
                   total_acc1 / total_sample, total_acc5 / total_sample))
            resnet.eval()
386
            eval(resnet, test_loader)
C
chengduo 已提交
387 388 389 390 391

            save_parameters = (not args.use_data_parallel) or (
                args.use_data_parallel and
                fluid.dygraph.parallel.Env().local_rank == 0)
            if save_parameters:
392
                fluid.save_dygraph(resnet.state_dict(), 'resnet_params')


if __name__ == '__main__':
396
    train_resnet()