# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.fluid.initializer import MSRA
import paddle.fluid as fluid


# get init parameters for conv layer
def get_conv_init(fan_out):
    return MSRA(uniform=False, fan_in=fan_out)


"""Video models."""


class BottleneckTransform(fluid.dygraph.Layer):
    """
    Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of the
    temporal kernel.
    """
    def __init__(
            self,
            dim_in,
            dim_out,
            temp_kernel_size,
            stride,
            dim_inner,
            num_groups,
            stride_1x1=False,
            inplace_relu=True,
            eps=1e-5,
            dilation=1,
    ):
        """
        Args:
            dim_in (int): the channel dimensions of the input.
            dim_out (int): the channel dimension of the output.
            temp_kernel_size (int): the temporal kernel size of the middle
                convolution in the bottleneck.
            stride (int): the stride of the bottleneck.
            dim_inner (int): the inner dimension of the block.
            num_groups (int): number of groups for the convolution. num_groups=1
                is for standard ResNet like networks, and num_groups>1 is for
                ResNeXt like networks.
            stride_1x1 (bool): if True, apply stride to the 1x1 conv, otherwise
                apply stride to the 3x3 conv.
            inplace_relu (bool): if True, calculate the relu on the original
                input without allocating new memory.
            eps (float): epsilon for batch norm.
            dilation (int): size of dilation.
        """
        super(BottleneckTransform, self).__init__()
        self.temp_kernel_size = temp_kernel_size
        self._inplace_relu = inplace_relu
        self._eps = eps
        self._stride_1x1 = stride_1x1
        self._construct(dim_in, dim_out, stride, dim_inner, num_groups,
                        dilation)

    def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups,
                   dilation):
        str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride)

        # Tx1x1, BN, ReLU.
        fan = (dim_inner) * (self.temp_kernel_size * 1 * 1)
        initializer_tmp = get_conv_init(fan)
        batchnorm_weight = 1.0

        self.a = fluid.dygraph.nn.Conv3D(
            num_channels=dim_in,
            num_filters=dim_inner,
            filter_size=[self.temp_kernel_size, 1, 1],
            stride=[1, str1x1, str1x1],
            padding=[int(self.temp_kernel_size // 2), 0, 0],
            param_attr=fluid.ParamAttr(initializer=initializer_tmp),
            bias_attr=False)
        self.a_bn = fluid.dygraph.BatchNorm(
            num_channels=dim_inner,
            epsilon=self._eps,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(batchnorm_weight),
                regularizer=fluid.regularizer.L2Decay(
                    regularization_coeff=0.0)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(0.0),
                regularizer=fluid.regularizer.L2Decay(
                    regularization_coeff=0.0)))

        # 1x3x3, BN, ReLU.
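        # The 1x3x3 convolution below carries the spatial stride (unless
        # stride_1x1 is set), the channel grouping (num_groups > 1 gives a
        # ResNeXt-style block), and the spatial dilation for this transform.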
        fan = (dim_inner) * (1 * 3 * 3)
        initializer_tmp = get_conv_init(fan)
        batchnorm_weight = 1.0

        self.b = fluid.dygraph.nn.Conv3D(
            num_channels=dim_inner,
            num_filters=dim_inner,
            filter_size=[1, 3, 3],
            stride=[1, str3x3, str3x3],
            padding=[0, dilation, dilation],
            groups=num_groups,
            dilation=[1, dilation, dilation],
            param_attr=fluid.ParamAttr(initializer=initializer_tmp),
            bias_attr=False)
        self.b_bn = fluid.dygraph.BatchNorm(
            num_channels=dim_inner,
            epsilon=self._eps,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(batchnorm_weight),
                regularizer=fluid.regularizer.L2Decay(
                    regularization_coeff=0.0)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(0.0),
                regularizer=fluid.regularizer.L2Decay(
                    regularization_coeff=0.0)))

        # 1x1x1, BN.
        fan = (dim_out) * (1 * 1 * 1)
        initializer_tmp = get_conv_init(fan)
        batchnorm_weight = 0.0

        self.c = fluid.dygraph.nn.Conv3D(
            num_channels=dim_inner,
            num_filters=dim_out,
            filter_size=[1, 1, 1],
            stride=[1, 1, 1],
            padding=[0, 0, 0],
            param_attr=fluid.ParamAttr(initializer=initializer_tmp),
            bias_attr=False)
        self.c_bn = fluid.dygraph.BatchNorm(
            num_channels=dim_out,
            epsilon=self._eps,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(batchnorm_weight),
                regularizer=fluid.regularizer.L2Decay(
                    regularization_coeff=0.0)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(0.0),
                regularizer=fluid.regularizer.L2Decay(
                    regularization_coeff=0.0)))

    def forward(self, x):
        # Branch2a.
        x = self.a(x)
        x = self.a_bn(x)
        x = fluid.layers.relu(x)

        # Branch2b.
        x = self.b(x)
        x = self.b_bn(x)
        x = fluid.layers.relu(x)

        # Branch2c.
        x = self.c(x)
        x = self.c_bn(x)
        return x


class ResBlock(fluid.dygraph.Layer):
    """
    Residual block.
    """
    def __init__(
            self,
            dim_in,
            dim_out,
            temp_kernel_size,
            stride,
            dim_inner,
            num_groups=1,
            stride_1x1=False,
            inplace_relu=True,
            eps=1e-5,
            dilation=1,
    ):
        """
        ResBlock class constructs residual blocks. More details can be found in:
            Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.
            "Deep residual learning for image recognition."
            https://arxiv.org/abs/1512.03385
        Args:
            dim_in (int): the channel dimensions of the input.
            dim_out (int): the channel dimension of the output.
            temp_kernel_size (int): the temporal kernel size of the middle
                convolution in the bottleneck.
            stride (int): the stride of the bottleneck.
            dim_inner (int): the inner dimension of the block.
            num_groups (int): number of groups for the convolution. num_groups=1
                is for standard ResNet like networks, and num_groups>1 is for
                ResNeXt like networks.
            stride_1x1 (bool): if True, apply stride to the 1x1 conv, otherwise
                apply stride to the 3x3 conv.
            inplace_relu (bool): if True, calculate the relu on the original
                input without allocating new memory.
            eps (float): epsilon for batch norm.
            dilation (int): size of dilation.
        """
        super(ResBlock, self).__init__()
        self._inplace_relu = inplace_relu
        self._eps = eps
        self._construct(
            dim_in,
            dim_out,
            temp_kernel_size,
            stride,
            dim_inner,
            num_groups,
            stride_1x1,
            inplace_relu,
            dilation,
        )

    def _construct(
            self,
            dim_in,
            dim_out,
            temp_kernel_size,
            stride,
            dim_inner,
            num_groups,
            stride_1x1,
            inplace_relu,
            dilation,
    ):
        # Use skip connection with projection if dim or res change.
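        # branch1 is the projection shortcut (1x1x1 conv + BN). It is only
        # created when the channel count or resolution changes, so forward()
        # can detect its presence with hasattr(self, "branch1").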
        if (dim_in != dim_out) or (stride != 1):
            fan = (dim_out) * (1 * 1 * 1)
            initializer_tmp = get_conv_init(fan)
            batchnorm_weight = 1.0

            self.branch1 = fluid.dygraph.nn.Conv3D(
                num_channels=dim_in,
                num_filters=dim_out,
                filter_size=1,
                stride=[1, stride, stride],
                padding=0,
                param_attr=fluid.ParamAttr(initializer=initializer_tmp),
                bias_attr=False,
                dilation=1)
            self.branch1_bn = fluid.dygraph.BatchNorm(
                num_channels=dim_out,
                epsilon=self._eps,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(batchnorm_weight),
                    regularizer=fluid.regularizer.L2Decay(
                        regularization_coeff=0.0)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(0.0),
                    regularizer=fluid.regularizer.L2Decay(
                        regularization_coeff=0.0)))

        self.branch2 = BottleneckTransform(
            dim_in,
            dim_out,
            temp_kernel_size,
            stride,
            dim_inner,
            num_groups,
            stride_1x1=stride_1x1,
            inplace_relu=inplace_relu,
            dilation=dilation,
        )

    def forward(self, x):
        if hasattr(self, "branch1"):
            x1 = self.branch1(x)
            x1 = self.branch1_bn(x1)
            x2 = self.branch2(x)
            x = fluid.layers.elementwise_add(x=x1, y=x2)
        else:
            x2 = self.branch2(x)
            x = fluid.layers.elementwise_add(x=x, y=x2)
        x = fluid.layers.relu(x)
        return x


class ResStage(fluid.dygraph.Layer):
    """
    Stage of 3D ResNet. It expects to have one or more tensors as input for
        multi-pathway (SlowFast) cases. More details can be found here:
        Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.
        "SlowFast networks for video recognition."
        https://arxiv.org/pdf/1812.03982.pdf
    """
    def __init__(
            self,
            dim_in,
            dim_out,
            stride,
            temp_kernel_sizes,
            num_blocks,
            dim_inner,
            num_groups,
            num_block_temp_kernel,
            dilation,
            stride_1x1=False,
            inplace_relu=True,
    ):
        """
        The `__init__` method of any subclass should also contain these
        arguments. ResStage builds p streams, where p can be greater than or
        equal to one.
        Args:
            dim_in (list): list of the p channel dimensions of the input.
                Different channel dimensions control the input dimension of
                different pathways.
            dim_out (list): list of the p channel dimensions of the output.
                Different channel dimensions control the output dimension of
                different pathways.
            temp_kernel_sizes (list): list of the p temporal kernel sizes of
                the convolution in the bottleneck. Different temp_kernel_sizes
                control different pathways.
            stride (list): list of the p strides of the bottleneck. Different
                strides control different pathways.
            num_blocks (list): list of the p numbers of blocks for each of the
                pathways.
            dim_inner (list): list of the p inner channel dimensions of the
                input. Different channel dimensions control the input
                dimension of different pathways.
            num_groups (list): list of the p numbers of groups for the
                convolution. num_groups=1 is for standard ResNet like
                networks, and num_groups>1 is for ResNeXt like networks.
            num_block_temp_kernel (list): extend the temp_kernel_sizes to
                num_block_temp_kernel blocks, then fill a temporal kernel size
                of 1 for the rest of the layers.
            dilation (list): size of dilation for each pathway.
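            stride_1x1 (bool): if True, apply stride to the 1x1 conv, otherwise
                apply stride to the 3x3 conv.
            inplace_relu (bool): if True, calculate the relu on the original
                input without allocating new memory.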
""" super(ResStage, self).__init__() assert all((num_block_temp_kernel[i] <= num_blocks[i] for i in range(len(temp_kernel_sizes)))) self.num_blocks = num_blocks self.temp_kernel_sizes = [(temp_kernel_sizes[i] * num_blocks[i] )[:num_block_temp_kernel[i]] + [1] * (num_blocks[i] - num_block_temp_kernel[i]) for i in range(len(temp_kernel_sizes))] assert (len({ len(dim_in), len(dim_out), len(temp_kernel_sizes), len(stride), len(num_blocks), len(dim_inner), len(num_groups), len(num_block_temp_kernel), }) == 1) self.num_pathways = len(self.num_blocks) self._construct( dim_in, dim_out, stride, dim_inner, num_groups, stride_1x1, inplace_relu, dilation, ) def _construct( self, dim_in, dim_out, stride, dim_inner, num_groups, stride_1x1, inplace_relu, dilation, ): for pathway in range(self.num_pathways): for i in range(self.num_blocks[pathway]): res_block = ResBlock( dim_in[pathway] if i == 0 else dim_out[pathway], dim_out[pathway], self.temp_kernel_sizes[pathway][i], stride[pathway] if i == 0 else 1, dim_inner[pathway], num_groups[pathway], stride_1x1=stride_1x1, inplace_relu=inplace_relu, dilation=dilation[pathway], ) self.add_sublayer("pathway{}_res{}".format(pathway, i), res_block) def forward(self, inputs): output = [] for pathway in range(self.num_pathways): x = inputs[pathway] for i in range(self.num_blocks[pathway]): m = getattr(self, "pathway{}_res{}".format(pathway, i)) x = m(x) output.append(x) return output """ResNe(X)t Head helper.""" class ResNetBasicHead(fluid.dygraph.Layer): """ ResNe(X)t 3D head. This layer performs a fully-connected projection during training, when the input size is 1x1x1. It performs a convolutional projection during testing when the input size is larger than 1x1x1. If the inputs are from multiple different pathways, the inputs will be concatenated after pooling. """ def __init__( self, dim_in, num_classes, pool_size, dropout_rate=0.0, ): """ ResNetBasicHead takes p pathways as input where p in [1, infty]. Args: dim_in (list): the list of channel dimensions of the p inputs to the ResNetHead. num_classes (int): the channel dimensions of the p outputs to the ResNetHead. pool_size (list): the list of kernel sizes of p spatial temporal poolings, temporal pool kernel size, spatial pool kernel size, spatial pool kernel size in order. dropout_rate (float): dropout rate. If equal to 0.0, perform no dropout. """ super(ResNetBasicHead, self).__init__() assert (len({len(pool_size), len(dim_in)}) == 1 ), "pathway dimensions are not consistent." self.num_pathways = len(pool_size) self.pool_size = pool_size self.dropout_rate = dropout_rate fc_init_std = 0.01 initializer_tmp = fluid.initializer.NormalInitializer( loc=0.0, scale=fc_init_std) self.projection = fluid.dygraph.Linear( input_dim=sum(dim_in), output_dim=num_classes, param_attr=fluid.ParamAttr(initializer=initializer_tmp), bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(0.0)), ) def forward(self, inputs, training): assert ( len(inputs) == self.num_pathways ), "Input tensor does not contain {} pathway".format(self.num_pathways) pool_out = [] for pathway in range(self.num_pathways): tmp_out = fluid.layers.pool3d( input=inputs[pathway], pool_type="avg", pool_size=self.pool_size[pathway], pool_stride=1, data_format="NCDHW") pool_out.append(tmp_out) x = fluid.layers.concat(input=pool_out, axis=1, name=None) x = fluid.layers.transpose(x=x, perm=(0, 2, 3, 4, 1)) # Perform dropout. 
        if self.dropout_rate > 0.0:
            x = fluid.layers.dropout(
                x,
                dropout_prob=self.dropout_rate,
                dropout_implementation='upscale_in_train')

        x = self.projection(x)

        # Perform fully convolutional inference.
        if not training:
            x = fluid.layers.softmax(x, axis=4)
            x = fluid.layers.reduce_mean(x, dim=[1, 2, 3])

        x = fluid.layers.reshape(x, shape=(x.shape[0], -1))

        return x
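

# The block below is not part of the original module; it is a minimal,
# hedged smoke test showing how the layers above compose into a two-pathway
# (SlowFast-style) stage followed by the head. All shapes and hyper-parameters
# here are illustrative assumptions, not values taken from any config, and it
# assumes a paddle.fluid 1.x dygraph environment.
if __name__ == "__main__":
    import numpy as np

    with fluid.dygraph.guard():
        # Two pathways in NCDHW layout: a "slow" one (more channels, fewer
        # frames) and a "fast" one (fewer channels, more frames).
        slow = fluid.dygraph.to_variable(
            np.random.randn(1, 16, 4, 8, 8).astype("float32"))
        fast = fluid.dygraph.to_variable(
            np.random.randn(1, 8, 16, 8, 8).astype("float32"))

        stage = ResStage(
            dim_in=[16, 8],
            dim_out=[32, 16],
            stride=[1, 1],
            temp_kernel_sizes=[[3], [3]],
            num_blocks=[1, 1],
            dim_inner=[8, 4],
            num_groups=[1, 1],
            num_block_temp_kernel=[1, 1],
            dilation=[1, 1])
        # pool_size covers the full [T, H, W] extent of each pathway output,
        # so the head sees a 1x1x1 feature per pathway.
        head = ResNetBasicHead(
            dim_in=[32, 16],
            num_classes=10,
            pool_size=[[4, 8, 8], [16, 8, 8]])

        feats = stage([slow, fast])
        logits = head(feats, training=False)
        print(logits.shape)  # expected: [1, 10]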