#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

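"""Tests that training TSM-ResNet yields the same losses in imperative
(dygraph) mode and after dygraph-to-static translation."""
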
import argparse
import math
import os
import random
import sys
import unittest

import numpy as np
from tsm_config_utils import merge_configs, parse_config, print_configs

import paddle
from paddle import fluid
from paddle.fluid.dygraph import to_variable
from paddle.jit.api import to_static
from paddle.nn import BatchNorm, Linear

random.seed(0)
np.random.seed(0)


def parse_args():
    parser = argparse.ArgumentParser("Paddle Video train script")
    parser.add_argument(
        '--config',
        type=str,
        default='tsm.yaml',
        help='path to config file of model',
    )
    parser.add_argument(
        '--use_gpu',
        type=bool,
        default=fluid.is_compiled_with_cuda(),
        help='whether to use GPU; defaults to True when compiled with CUDA.',
    )
    args = parser.parse_args(['--config', 'tsm.yaml'])
    return args


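# Conv2D followed by BatchNorm (with optional activation): the basic
# building block shared by every layer of the ResNet backbone.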
class ConvBNLayer(paddle.nn.Layer):
    def __init__(
        self,
        num_channels,
        num_filters,
        filter_size,
        stride=1,
        groups=1,
        act=None,
    ):
        super().__init__()

        self._conv = paddle.nn.Conv2D(
            in_channels=num_channels,
            out_channels=num_filters,
            kernel_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            weight_attr=fluid.param_attr.ParamAttr(),
            bias_attr=False,
        )

        self._batch_norm = BatchNorm(
            num_filters,
            act=act,
            param_attr=fluid.param_attr.ParamAttr(),
            bias_attr=fluid.param_attr.ParamAttr(),
        )

    def forward(self, inputs):
        y = self._conv(inputs)
        y = self._batch_norm(y)

        return y


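# Standard ResNet bottleneck (1x1 -> 3x3 -> 1x1 convolutions) that applies a
# temporal shift to its input first, which is what makes it a TSM block.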
class BottleneckBlock(paddle.nn.Layer):
    def __init__(
        self, num_channels, num_filters, stride, shortcut=True, seg_num=8
    ):
        super().__init__()

        self.conv0 = ConvBNLayer(
            num_channels=num_channels,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
        )
        self.conv1 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
        )
        self.conv2 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
        )

        if not shortcut:
            self.short = ConvBNLayer(
                num_channels=num_channels,
                num_filters=num_filters * 4,
                filter_size=1,
                stride=stride,
            )
        self.shortcut = shortcut
        self.seg_num = seg_num
        self._num_channels_out = int(num_filters * 4)

    def forward(self, inputs):
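        # Shift a fraction (1/8) of the channels along the segment (temporal)
        # dimension so each frame exchanges information with its neighbors,
        # at zero parameter cost.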
        shifts = paddle.nn.functional.temporal_shift(
            inputs, self.seg_num, 1.0 / 8
        )
        y = self.conv0(shifts)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)
        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.nn.functional.relu(paddle.add(x=short, y=conv2))
        return y


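# TSM-ResNet-50: a 2D ResNet over individual frames whose bottleneck blocks
# perform temporal shifts, with per-segment features averaged before the
# final classifier.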
class TSM_ResNet(paddle.nn.Layer):
    def __init__(self, name_scope, config, mode):
        super().__init__(name_scope)

        self.layers = config.MODEL.num_layers
        self.seg_num = config.MODEL.seg_num
        self.class_dim = config.MODEL.num_classes
        self.reshape_list = [
            config.MODEL.seglen * 3,
            config[mode.upper()]['target_size'],
            config[mode.upper()]['target_size'],
        ]

        if self.layers == 50:
            depth = [3, 4, 6, 3]
        else:
            raise NotImplementedError(
                'only ResNet-50 (num_layers=50) is supported in this test'
            )
        num_filters = [64, 128, 256, 512]

        self.conv = ConvBNLayer(
            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu'
        )
        self.pool2d_max = paddle.nn.MaxPool2D(
            kernel_size=3, stride=2, padding=1
        )

        self.bottleneck_block_list = []
        num_channels = 64

        for block in range(len(depth)):
            shortcut = False
            for i in range(depth[block]):
                bottleneck_block = self.add_sublayer(
                    'bb_%d_%d' % (block, i),
                    BottleneckBlock(
                        num_channels=num_channels,
                        num_filters=num_filters[block],
                        stride=2 if i == 0 and block != 0 else 1,
                        shortcut=shortcut,
                        seg_num=self.seg_num,
                    ),
                )
                num_channels = int(bottleneck_block._num_channels_out)
                self.bottleneck_block_list.append(bottleneck_block)
                shortcut = True
        self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1)

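        # Initialize the classifier uniformly within +/- 1/sqrt(fan_in),
        # where fan_in is the 2048-dim pooled feature.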
        stdv = 1.0 / math.sqrt(2048 * 1.0)

        self.out = Linear(
            2048,
            self.class_dim,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Uniform(-stdv, stdv)
            ),
            bias_attr=paddle.ParamAttr(
                learning_rate=2.0, regularizer=paddle.regularizer.L1Decay()
            ),
        )

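    # forward is decorated for dygraph-to-static translation; whether the
    # translation is actually applied is toggled per-run in train() via
    # paddle.jit.enable_to_static.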
    @to_static
    def forward(self, inputs):
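        # Fold the segment dimension into the batch dimension so the 2D
        # backbone processes every frame independently.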
        y = paddle.reshape(inputs, [-1] + self.reshape_list)
        y = self.conv(y)
        y = self.pool2d_max(y)
        for bottleneck_block in self.bottleneck_block_list:
            y = bottleneck_block(y)
        y = self.pool2d_avg(y)
        y = paddle.nn.functional.dropout(y, p=0.5)
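        # Un-fold the segment dimension and average the features over
        # segments (segment consensus).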
        y = paddle.reshape(y, [-1, self.seg_num, y.shape[1]])
        y = paddle.mean(y, axis=1)
        y = paddle.reshape(y, shape=[-1, 2048])
        y = self.out(y)
        y = paddle.nn.functional.softmax(y)
        return y


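# Generates a small, fixed set of random clips and labels matching the
# configured shapes, so the test runs without a real dataset.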
class FakeDataReader:
    def __init__(self, mode, cfg):
        self.format = cfg.MODEL.format
        self.num_classes = cfg.MODEL.num_classes
        self.seg_num = cfg.MODEL.seg_num
        self.seglen = cfg.MODEL.seglen

        self.target_size = cfg[mode.upper()]['target_size']
        self.img_mean = (
            np.array(cfg.MODEL.image_mean).reshape([3, 1, 1]).astype(np.float32)
        )
        self.img_std = (
            np.array(cfg.MODEL.image_std).reshape([3, 1, 1]).astype(np.float32)
        )

        self.batch_size = (
            1
            if sys.platform == 'darwin' or os.name == 'nt'
            else cfg[mode.upper()]['batch_size']
        )
        self.generator_out = []
        self.total_iter = 3
        for i in range(self.total_iter):
            batch_out = []
            for j in range(self.batch_size):
                label = np.int64(random.randint(0, self.num_classes - 1))
                random_mean = self.img_mean[0][0][0]
                random_std = self.img_std[0][0][0]
                imgs = np.random.normal(
                    random_mean,
                    random_std,
                    [
                        self.seg_num,
                        self.seglen * 3,
                        self.target_size,
                        self.target_size,
                    ],
                ).astype(np.float32)
                batch_out.append((imgs, label))
            self.generator_out.append(batch_out)

    def create_reader(self):
        def batch_reader():
            for i in range(self.total_iter):
                yield self.generator_out[i]

        return batch_reader


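# Builds momentum SGD with piecewise learning-rate decay (stepped at the
# configured epochs) and L2 weight decay.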
def create_optimizer(cfg, params):
    total_videos = cfg.total_videos
    batch_size = (
        1 if sys.platform == 'darwin' or os.name == 'nt' else cfg.batch_size
    )
    step = int(total_videos / batch_size + 1)
    bd = [e * step for e in cfg.decay_epochs]
    base_lr = cfg.learning_rate
    lr_decay = cfg.learning_rate_decay
    lr = [base_lr, base_lr * lr_decay, base_lr * lr_decay * lr_decay]
    l2_weight_decay = cfg.l2_weight_decay
    momentum = cfg.momentum

    optimizer = fluid.optimizer.Momentum(
        learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr),
        momentum=momentum,
        regularization=paddle.regularizer.L2Decay(l2_weight_decay),
        parameter_list=params,
    )

    return optimizer


def train(args, fake_data_reader, to_static):
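    # Toggle dygraph-to-static translation globally; the identical training
    # loop below is reused for both the dygraph and the static run.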
    paddle.jit.enable_to_static(to_static)

    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    valid_config = merge_configs(config, 'valid', vars(args))
    print_configs(train_config, 'Train')

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    random.seed(0)
    np.random.seed(0)
    with fluid.dygraph.guard(place):
        paddle.seed(1000)
        paddle.framework.random._manual_program_seed(1000)

        video_model = TSM_ResNet("TSM", train_config, 'Train')

        optimizer = create_optimizer(
            train_config.TRAIN, video_model.parameters()
        )

        train_reader = fake_data_reader.create_reader()

        ret = []
        for epoch in range(train_config.TRAIN.epoch):
            video_model.train()
            total_loss = 0.0
            total_acc1 = 0.0
            total_acc5 = 0.0
            total_sample = 0
            for batch_id, data in enumerate(train_reader()):
                x_data = np.array([item[0] for item in data])
                y_data = np.array([item[1] for item in data]).reshape([-1, 1])

                imgs = to_variable(x_data)
                labels = to_variable(y_data)
                labels.stop_gradient = True
                outputs = video_model(imgs)
                loss = paddle.nn.functional.cross_entropy(
                    input=outputs,
                    label=labels,
                    ignore_index=-1,
                    reduction='none',
                    use_softmax=False,
                )
                avg_loss = paddle.mean(loss)
                acc_top1 = paddle.static.accuracy(
                    input=outputs, label=labels, k=1
                )
                acc_top5 = paddle.static.accuracy(
                    input=outputs, label=labels, k=5
                )

                avg_loss.backward()
                optimizer.minimize(avg_loss)
                video_model.clear_gradients()

                total_loss += float(avg_loss)
                total_acc1 += float(acc_top1)
                total_acc5 += float(acc_top5)
                total_sample += 1

                print(
                    'TRAIN Epoch {}, iter {}, loss = {}, acc1 {}, acc5 {}'.format(
                        epoch,
                        batch_id,
                        float(avg_loss),
                        float(acc_top1),
                        float(acc_top5),
                    )
                )
                ret.extend(
                    [
                        float(avg_loss),
                        float(acc_top1),
                        float(acc_top5),
                    ]
                )

            print(
                'TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}'.format(
                    epoch,
                    total_loss / total_sample,
                    total_acc1 / total_sample,
                    total_acc5 / total_sample,
                )
            )
        return ret


class TestTsm(unittest.TestCase):
    def test_dygraph_static_same_loss(self):
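        # Deterministic cuDNN kernels are needed for the dygraph and static
        # runs to produce closely matching losses.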
        if fluid.is_compiled_with_cuda():
            fluid.set_flags({"FLAGS_cudnn_deterministic": True})
        args = parse_args()
        fake_data_reader = FakeDataReader("train", parse_config(args.config))
        dygraph_loss = train(args, fake_data_reader, to_static=False)
        static_loss = train(args, fake_data_reader, to_static=True)
        np.testing.assert_allclose(dygraph_loss, static_loss, rtol=1e-05)


if __name__ == '__main__':
    unittest.main()