test_tsm.py 12.2 KB
Newer Older
1 2
#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
3 4 5
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
6 7 8
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
9 10 11 12 13
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 15 16 17 18 19

import argparse
import os
import random
import sys
import unittest
20 21 22 23

import numpy as np
from tsm_config_utils import merge_configs, parse_config, print_configs

L
Leo Chen 已提交
24
import paddle
25
from paddle import fluid
26
from paddle.fluid.dygraph import to_variable
H
hjyp 已提交
27
from paddle.jit.api import to_static
28
from paddle.nn import BatchNorm, Linear
29 30 31 32 33 34 35

# Seed Python's and NumPy's global RNGs with 0 as soon as the module is
# imported, so the fake data generated below is reproducible.
random.seed(0)
np.random.seed(0)


def parse_args():
    """Build the fixed argument namespace used by this test.

    The process command line is not consulted: an explicit argument list is
    handed to the parser, so options meant for the surrounding test runner
    are never interpreted as model options.

    Returns:
        argparse.Namespace: ``config`` is the path of the ``tsm.yaml`` file
        located in the same directory as this file; ``use_gpu`` is whatever
        ``fluid.is_compiled_with_cuda()`` reports.
    """
    parser = argparse.ArgumentParser("Paddle Video train script")
    parser.add_argument(
        '--config',
        type=str,
        default='tsm.yaml',
        help='path to config file of model',
    )
    # NOTE(review): ``type=bool`` maps every non-empty string (including
    # "False") to True. Harmless here because the flag is never passed in
    # the explicit argument list below, so only the default is used.
    parser.add_argument(
        '--use_gpu',
        type=bool,
        default=fluid.is_compiled_with_cuda(),
        help='default use gpu.',
    )
    # Resolve the config next to this file with os.path so the lookup also
    # works when __file__ has no '/' in it (relative invocation, or Windows
    # paths that use backslashes).
    config_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'tsm.yaml'
    )
    return parser.parse_args(['--config', config_path])


54
class ConvBNLayer(paddle.nn.Layer):
    """A bias-free 2-D convolution followed by batch normalization.

    Args:
        num_channels: number of input channels of the convolution.
        num_filters: number of output channels of the convolution; also the
            channel count handed to the batch norm.
        filter_size: kernel size; the padding is ``(filter_size - 1) // 2``.
        stride: convolution stride.
        groups: number of convolution groups.
        act: activation name forwarded to ``BatchNorm`` (``None`` for none).
    """

    def __init__(
        self,
        num_channels,
        num_filters,
        filter_size,
        stride=1,
        groups=1,
        act=None,
    ):
        super().__init__()

        self._conv = paddle.nn.Conv2D(
            in_channels=num_channels,
            out_channels=num_filters,
            kernel_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            # Honour the caller's value instead of a hard-coded 1; the
            # default keeps the previous behaviour for existing callers.
            groups=groups,
            weight_attr=fluid.param_attr.ParamAttr(),
            bias_attr=False,
        )

        self._batch_norm = BatchNorm(
            num_filters,
            act=act,
            param_attr=fluid.param_attr.ParamAttr(),
            bias_attr=fluid.param_attr.ParamAttr(),
        )

    def forward(self, inputs):
        """Apply the convolution and then the batch norm to ``inputs``."""
        y = self._conv(inputs)
        y = self._batch_norm(y)

        return y


91
class BottleneckBlock(paddle.nn.Layer):
    """Three stacked ConvBNLayers (1x1, 3x3, 1x1) with a residual connection.

    The input is passed through ``temporal_shift`` before the first
    convolution. The residual path is the raw input when ``shortcut`` is
    true, otherwise a strided 1x1 ConvBNLayer projection of it.

    Args:
        num_channels: channel count of the block input.
        num_filters: channel count of the first two convolutions; the block
            outputs ``num_filters * 4`` channels.
        stride: stride of the 3x3 convolution and of the projection.
        shortcut: whether the input is added back unchanged.
        seg_num: segment count handed to ``temporal_shift``.
    """

    def __init__(
        self, num_channels, num_filters, stride, shortcut=True, seg_num=8
    ):
        super().__init__()

        widened = num_filters * 4

        # Sublayers are created in the same order as before (conv0, conv1,
        # conv2, optional projection) so parameter creation order is kept.
        self.conv0 = ConvBNLayer(
            num_channels=num_channels,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
        )
        self.conv1 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
        )
        self.conv2 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=widened,
            filter_size=1,
            act=None,
        )

        needs_projection = not shortcut
        if needs_projection:
            self.short = ConvBNLayer(
                num_channels=num_channels,
                num_filters=widened,
                filter_size=1,
                stride=stride,
            )

        self.shortcut = shortcut
        self.seg_num = seg_num
        self._num_channels_out = int(widened)

    def forward(self, inputs):
        """Return ``relu(residual + conv2(conv1(conv0(shifted input))))``."""
        shifted = paddle.nn.functional.temporal_shift(
            inputs, self.seg_num, 1.0 / 8
        )
        main_branch = self.conv2(self.conv1(self.conv0(shifted)))

        if not self.shortcut:
            residual = self.short(inputs)
        else:
            residual = inputs

        return paddle.nn.functional.relu(
            paddle.add(x=residual, y=main_branch)
        )


143
class TSM_ResNet(paddle.nn.Layer):
    """Stem convolution, four stages of BottleneckBlocks and a linear head.

    Args:
        name_scope: forwarded to the ``paddle.nn.Layer`` constructor.
        config: configuration object; reads ``MODEL.num_layers``,
            ``MODEL.seg_num``, ``MODEL.num_classes``, ``MODEL.seglen`` and
            ``config[mode.upper()]['target_size']``.
        mode: name of the config section to read; it is upper-cased first.

    Raises:
        NotImplementedError: if ``MODEL.num_layers`` is not 50.
    """

    def __init__(self, name_scope, config, mode):
        super().__init__(name_scope)

        model_cfg = config.MODEL
        target_size = config[mode.upper()]['target_size']

        self.layers = model_cfg.num_layers
        self.seg_num = model_cfg.seg_num
        self.class_dim = model_cfg.num_classes
        # Per-frame shape appended after the leading -1 in forward().
        self.reshape_list = [model_cfg.seglen * 3, target_size, target_size]

        # Only the 50-layer layout is provided.
        if self.layers != 50:
            raise NotImplementedError
        depth = [3, 4, 6, 3]
        num_filters = [64, 128, 256, 512]

        self.conv = ConvBNLayer(
            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu'
        )
        self.pool2d_max = paddle.nn.MaxPool2D(
            kernel_size=3, stride=2, padding=1
        )

        self.bottleneck_block_list = []
        in_channels = 64

        for stage, (repeats, filters) in enumerate(zip(depth, num_filters)):
            for idx in range(repeats):
                is_first = idx == 0
                registered = self.add_sublayer(
                    'bb_%d_%d' % (stage, idx),
                    BottleneckBlock(
                        num_channels=in_channels,
                        num_filters=filters,
                        # Downsample at the first block of every stage
                        # except the very first stage.
                        stride=2 if is_first and stage != 0 else 1,
                        # The first block of a stage uses the projection
                        # path; the remaining ones add the input as-is.
                        shortcut=not is_first,
                        seg_num=self.seg_num,
                    ),
                )
                in_channels = int(registered._num_channels_out)
                self.bottleneck_block_list.append(registered)

        self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1)

        import math

        stdv = 1.0 / math.sqrt(2048 * 1.0)

        head_weight = paddle.ParamAttr(
            initializer=paddle.nn.initializer.Uniform(-stdv, stdv)
        )
        head_bias = paddle.ParamAttr(
            learning_rate=2.0, regularizer=paddle.regularizer.L1Decay()
        )
        self.out = Linear(
            2048,
            self.class_dim,
            weight_attr=head_weight,
            bias_attr=head_bias,
        )

    @to_static
    def forward(self, inputs):
        """Return the softmax output of the linear head for ``inputs``.

        The input is reshaped to ``[-1] + self.reshape_list``, run through
        the stem, the bottleneck blocks, average pooling and dropout, then
        averaged over the ``seg_num`` axis before the linear head.
        """
        feat = paddle.reshape(inputs, [-1] + self.reshape_list)
        feat = self.pool2d_max(self.conv(feat))
        for block in self.bottleneck_block_list:
            feat = block(feat)
        feat = self.pool2d_avg(feat)
        feat = paddle.nn.functional.dropout(feat, p=0.5)
        feat = paddle.reshape(feat, [-1, self.seg_num, feat.shape[1]])
        feat = paddle.mean(feat, axis=1)
        feat = paddle.reshape(feat, shape=[-1, 2048])
        logits = self.out(feat)
        return paddle.nn.functional.softmax(logits)


221
class FakeDataReader:
    """Reader that serves a fixed set of pre-generated random batches.

    All batches are created in the constructor and stored in
    ``generator_out``; ``create_reader`` only replays them.

    Args:
        mode: name of the config section holding ``target_size`` and
            ``batch_size``; it is upper-cased before the lookup.
        cfg: configuration object; reads ``MODEL.format``,
            ``MODEL.num_classes``, ``MODEL.seg_num``, ``MODEL.seglen``,
            ``MODEL.image_mean`` and ``MODEL.image_std``.
    """

    def __init__(self, mode, cfg):
        model_cfg = cfg.MODEL
        self.format = model_cfg.format
        self.num_classes = model_cfg.num_classes
        self.seg_num = model_cfg.seg_num
        self.seglen = model_cfg.seglen

        mode_cfg = cfg[mode.upper()]
        self.target_size = mode_cfg['target_size']

        mean = np.array(model_cfg.image_mean)
        std = np.array(model_cfg.image_std)
        self.img_mean = mean.reshape([3, 1, 1]).astype(np.float32)
        self.img_std = std.reshape([3, 1, 1]).astype(np.float32)

        # On macOS and Windows the batch size is forced to 1; elsewhere it
        # comes from the config section selected by ``mode``.
        if sys.platform == 'darwin' or os.name == 'nt':
            self.batch_size = 1
        else:
            self.batch_size = mode_cfg['batch_size']

        self.generator_out = []
        self.total_iter = 3
        for _ in range(self.total_iter):
            samples = []
            for _ in range(self.batch_size):
                samples.append(self._make_sample())
            self.generator_out.append(samples)

    def _make_sample(self):
        """Draw one ``(imgs, label)`` pair from the global RNGs."""
        # The label is drawn before the image, matching the order in which
        # the two global RNGs have always been consumed.
        label = np.int64(random.randint(0, self.num_classes - 1))
        shape = [
            self.seg_num,
            self.seglen * 3,
            self.target_size,
            self.target_size,
        ]
        # Only the first entry of the mean/std arrays parameterizes the
        # normal distribution.
        imgs = np.random.normal(
            self.img_mean[0, 0, 0], self.img_std[0, 0, 0], shape
        ).astype(np.float32)
        return imgs, label

    def create_reader(self):
        """Return a callable that yields the stored batches in order."""

        def batch_reader():
            yield from (
                self.generator_out[step] for step in range(self.total_iter)
            )

        return batch_reader


def create_optimizer(cfg, params):
    """Build a Momentum optimizer with a piecewise-constant learning rate.

    Args:
        cfg: training config section; reads ``total_videos``, ``batch_size``,
            ``decay_epochs``, ``learning_rate``, ``learning_rate_decay``,
            ``l2_weight_decay`` and ``momentum``.
        params: parameters handed to the optimizer as ``parameter_list``.

    Returns:
        The configured ``fluid.optimizer.Momentum`` instance.
    """
    # The batch size is forced to 1 on macOS and Windows.
    single_sample_platform = sys.platform == 'darwin' or os.name == 'nt'
    batch_size = 1 if single_sample_platform else cfg.batch_size

    steps_per_epoch = int(cfg.total_videos / batch_size + 1)
    boundaries = [epoch * steps_per_epoch for epoch in cfg.decay_epochs]

    # One learning rate per interval, i.e. len(boundaries) + 1 values, each
    # obtained by multiplying the previous one by the decay factor. For two
    # decay epochs this is exactly [lr, lr * d, lr * d * d]; other lengths
    # no longer produce a values/boundaries length mismatch.
    lr_values = [cfg.learning_rate]
    for _ in boundaries:
        lr_values.append(lr_values[-1] * cfg.learning_rate_decay)

    optimizer = fluid.optimizer.Momentum(
        learning_rate=fluid.layers.piecewise_decay(
            boundaries=boundaries, values=lr_values
        ),
        momentum=cfg.momentum,
        regularization=paddle.regularizer.L2Decay(cfg.l2_weight_decay),
        parameter_list=params,
    )

    return optimizer


def train(args, fake_data_reader, to_static):
    """Train the TSM model on fake data and collect per-iteration metrics.

    Args:
        args: namespace providing ``config`` (path of the config file) and
            ``use_gpu``; it is also merged into the parsed config.
        fake_data_reader: object whose ``create_reader()`` returns a callable
            yielding batches, each a sequence of ``(imgs, label)`` pairs.
        to_static: value forwarded to ``paddle.jit.enable_to_static``.

    Returns:
        list[float]: flat list ``[loss, acc1, acc5, loss, acc1, acc5, ...]``
        with one triple per training iteration, in execution order.
    """
    paddle.jit.enable_to_static(to_static)

    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    # The 'valid' merge is still performed in case it has side effects on
    # the config, but its result is not needed in this function.
    merge_configs(config, 'valid', vars(args))
    print_configs(train_config, 'Train')

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    # Re-seed everything so consecutive calls start from the same state.
    random.seed(0)
    np.random.seed(0)
    with fluid.dygraph.guard(place):
        paddle.seed(1000)
        paddle.framework.random._manual_program_seed(1000)

        video_model = TSM_ResNet("TSM", train_config, 'Train')

        optimizer = create_optimizer(
            train_config.TRAIN, video_model.parameters()
        )

        train_reader = fake_data_reader.create_reader()

        ret = []
        for epoch in range(train_config.TRAIN.epoch):
            video_model.train()
            total_loss = 0.0
            total_acc1 = 0.0
            total_acc5 = 0.0
            total_sample = 0
            for batch_id, data in enumerate(train_reader()):
                x_data = np.array([item[0] for item in data])
                y_data = np.array([item[1] for item in data]).reshape([-1, 1])

                imgs = to_variable(x_data)
                labels = to_variable(y_data)
                labels.stop_gradient = True
                outputs = video_model(imgs)
                # The model already applies softmax, hence use_softmax=False.
                loss = paddle.nn.functional.cross_entropy(
                    input=outputs,
                    label=labels,
                    ignore_index=-1,
                    reduction='none',
                    use_softmax=False,
                )
                avg_loss = paddle.mean(loss)
                acc_top1 = paddle.static.accuracy(
                    input=outputs, label=labels, k=1
                )
                acc_top5 = paddle.static.accuracy(
                    input=outputs, label=labels, k=5
                )

                avg_loss.backward()
                optimizer.minimize(avg_loss)
                video_model.clear_gradients()

                # Convert each metric tensor to a Python float exactly once
                # and reuse the value for totals, logging and the result.
                loss_value = float(avg_loss)
                acc1_value = float(acc_top1)
                acc5_value = float(acc_top5)

                total_loss += loss_value
                total_acc1 += acc1_value
                total_acc5 += acc5_value
                total_sample += 1

                print(
                    'TRAIN Epoch {}, iter {}, loss = {}, acc1 {}, acc5 {}'.format(
                        epoch,
                        batch_id,
                        loss_value,
                        acc1_value,
                        acc5_value,
                    )
                )
                ret.extend([loss_value, acc1_value, acc5_value])

            # Avoid a ZeroDivisionError in the summary when the reader
            # produced no batch at all; the totals are 0.0 in that case.
            denominator = max(total_sample, 1)
            print(
                'TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}'.format(
                    epoch,
                    total_loss / denominator,
                    total_acc1 / denominator,
                    total_acc5 / denominator,
                )
            )
        return ret


class TestTsm(unittest.TestCase):
    """Compares TSM training metrics with and without to_static."""

    def test_dygraph_static_same_loss(self):
        """Both runs over the same fake data must agree within rtol=1e-05."""
        # Request deterministic cuDNN kernels on CUDA builds.
        if fluid.is_compiled_with_cuda():
            fluid.set_flags({"FLAGS_cudnn_deterministic": True})

        args = parse_args()
        reader = FakeDataReader("train", parse_config(args.config))

        # Dygraph run first, then the to_static run, on the same reader.
        eager_metrics, static_metrics = (
            train(args, reader, to_static=flag) for flag in (False, True)
        )

        np.testing.assert_allclose(eager_metrics, static_metrics, rtol=1e-05)
393 394 395


if __name__ == '__main__':
    # Executed as a script: hand control to the unittest runner.
    unittest.main()