diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f6b4348d250a9d99541932fcfefd386584d0d1e5..3a4f93929f51b2181482267735f9c1145622f928 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -39,13 +39,16 @@ __all__ = [ 'chunk_eval', 'sequence_conv', 'conv2d', + 'conv3d', 'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', + 'pool3d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', + 'conv3d_transpose', 'sequence_expand', 'lstm_unit', 'reduce_sum', @@ -1385,13 +1388,12 @@ def conv3d(input, The convolution3D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are in NCHW format. Where N is batch size, C is the number of - channels, H is the height of the feature, and W is the width of the feature. - The details of convolution layer, please refer UFLDL's `convolution, - `_ . - If bias attribution and activation type are provided, bias is added to the - output of the convolution, and the corresponding activation function is - applied to the final result. + Output(Output) are in NCDHW format. Where N is batch size C is the number of + channels, D is the depth of the feature, H is the height of the feature, + and W is the width of the feature. Convlution3D is similar with Convlution2D + but adds one dimension(depth). If bias attribution and activation type are + provided, bias is added to the output of the convolution, and the + corresponding activation function is applied to the final result. For each input :math:`X`, the equation is: @@ -1401,8 +1403,8 @@ def conv3d(input, In the above equation: - * :math:`X`: Input value, a tensor with NCHW format. - * :math:`W`: Filter value, a tensor with MCHW format. + * :math:`X`: Input value, a tensor with NCDHW format. + * :math:`W`: Filter value, a tensor with MCDHW format. * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`\\sigma`: Activation function. @@ -1433,16 +1435,16 @@ def conv3d(input, num_filters(int): The number of filter. It is as same as the output image channel. filter_size (int|tuple|None): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_D, filter_size_H, filter_size_W). + it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). Otherwise, the filter will be a square. stride (int|tuple): The stride size. If stride is a tuple, it must - contain two integers, (stride_D, stride_H, stride_W). Otherwise, the + contain three integers, (stride_D, stride_H, stride_W). Otherwise, the stride_D = stride_H = stride_W = stride. Default: stride = 1. padding (int|tuple): The padding size. If padding is a tuple, it must - contain two integers, (padding_D, padding_H, padding_W). Otherwise, the + contain three integers, (padding_D, padding_H, padding_W). Otherwise, the padding_D = padding_H = padding_W = padding. Default: padding = 0. dilation (int|tuple): The dilation size. If dilation is a tuple, it must - contain two integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1. groups (int): The groups number of the Conv3d Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, @@ -1528,7 +1530,7 @@ def conv3d(input, 'use_mkldnn': use_mkldnn }) - pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=3) + pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) return helper.append_activation(pre_act) @@ -1720,12 +1722,84 @@ def pool2d(input, if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") - helper = LayerHelper('pool2d', **locals()) + l_type = 'conv2d' + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_tmp_variable(dtype) + + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "global_pooling": global_pooling, + "strides": pool_stride, + "paddings": pool_padding, + "use_cudnn": use_cudnn, + "ceil_mode": ceil_mode, + "use_mkldnn": use_mkldnn + }) + + return pool_out + + +def pool3d(input, + pool_size=-1, + pool_type="max", + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + use_mkldnn=False, + name=None): + """ + This function adds the operator for pooling in 3-dimensions, using the + pooling configurations mentioned in input parameters. + + Args: + input (Variable): ${input_comment} + pool_size (int): ${ksize_comment} + pool_type (str): ${pooling_type_comment} + pool_stride (int): stride of the pooling layer. + pool_padding (int): padding size. + global_pooling (bool): ${global_pooling_comment} + use_cudnn (bool): ${use_cudnn_comment} + ceil_mode (bool): ${ceil_mode_comment} + use_mkldnn (bool): ${use_mkldnn_comment} + name (str): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: output of pool3d layer. + """ + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + str(pool_type)) + + if global_pooling is False and pool_size == -1: + raise ValueError( + "When the global_pooling is False, pool_size must be passed " + "and be a valid value. Received pool_size: " + str(pool_size)) + + pool_size = utils.convert_to_list(pool_size, 3, 'pool_size') + pool_padding = utils.convert_to_list(pool_padding, 3, 'pool_padding') + pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride') + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + + l_type = "pool3d" + helper = LayerHelper(l_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_tmp_variable(dtype) helper.append_op( - type="pool2d", + type=l_type, inputs={"X": input}, outputs={"Out": pool_out}, attrs={ @@ -2146,6 +2220,173 @@ def conv2d_transpose(input, return out +def conv3d_transpose(input, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None): + """ + **Convlution3D transpose layer** + + The convolution3D transpose layer calculates the output based on the input, + filter, and dilations, strides, paddings. Input(Input) and output(Output) + are in NCDHW format. Where N is batch size, C is the number of channels, + D is the depth of the feature, H is the height of the feature, and W + is the width of the feature. Parameters(dilations, strides, paddings) are + two elements. These two elements represent height and width, respectively. + The details of convolution transpose layer, please refer to the following + explanation and references `therein `_. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = W \\ast X + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW format. + * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`\\ast` : Convolution transpose operation. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be + different. + + Example: + + - Input: + + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + + Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$ + + - Output: + + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + + Where + + .. math:: + + D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\ + H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\ + W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 + + Args: + input(Variable): The input image with [N, C, D, H, W] format. + num_filters(int): The number of the filter. It is as same as the output + image channel. + output_size(int|tuple|None): The output image size. If output size is a + tuple, it must contain three integers, (image_D, image_H, image_W). This + parameter only works when filter_size is None. + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). + Otherwise, the filter will be a square. None if use output size to + calculate filter_size. + padding(int|tuple): The padding size. If padding is a tuple, it must + contain three integers, (padding_D, padding_H, padding_W). Otherwise, the + padding_D = padding_H = padding_W = padding. Default: padding = 0. + stride(int|tuple): The stride size. If stride is a tuple, it must + contain three integers, (stride_D, stride_H, stride_W). Otherwise, the + stride_D = stride_H = stride_W = stride. Default: stride = 1. + dilation(int|tuple): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1. + groups(int): The groups number of the Conv3d transpose layer. Inspired by + grouped convolution in Alex Krizhevsky's Deep CNN paper, in which + when group=2, the first half of the filters is only connected to the + first half of the input channels, while the second half of the + filters is only connected to the second half of the input channels. + Default: groups=1 + param_attr(ParamAttr): The parameters to the Conv3d_transpose Layer. + Default: None + bias_attr(ParamAttr): Bias parameter for the Conv3d layer. Default: None + use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act(str): Activation type. Default: None + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The tensor variable storing the convolution transpose result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + + Examples: + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 12, 32, 32], dtype='float32') + conv2d_transpose = fluid.layers.conv3d_transpose( + input=data, num_filters=2, filter_size=3) + """ + l_type = "conv3d_transpose" + helper = LayerHelper(l_type, **locals()) + if not isinstance(input, Variable): + raise TypeError("Input of conv3d_transpose must be Variable") + input_channel = input.shape[1] + + padding = utils.convert_to_list(padding, 3, 'padding') + stride = utils.convert_to_list(stride, 3, 'stride') + dilation = utils.convert_to_list(dilation, 3, 'dilation') + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + + if filter_size is None: + if output_size is None: + raise ValueError("output_size must be set when filter_size is None") + if isinstance(output_size, int): + output_size = [output_size, output_size] + + d_in = input.shape[2] + h_in = input.shape[3] + w_in = input.shape[4] + + filter_size_d = (output_size[0] - (d_in - 1) * stride[0] + 2 * + padding[0] - 1) / dilation[0] + 1 + filter_size_h = (output_size[1] - (h_in - 1) * stride[1] + 2 * + padding[1] - 1) / dilation[1] + 1 + filter_size_w = (output_size[2] - (w_in - 1) * stride[2] + 2 * + padding[2] - 1) / dilation[2] + 1 + filter_size = [filter_size_d, filter_size_h, filter_size_w] + else: + filter_size = utils.convert_to_list(filter_size, 3, + 'conv3d_transpose.filter_size') + + groups = 1 if groups is None else groups + filter_shape = [input_channel, num_filters / groups] + filter_size + img_filter = helper.create_parameter( + dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) + + pre_bias = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type=l_type, + inputs={'Input': [input], + 'Filter': [img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn + }) + + pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) + out = helper.append_activation(pre_act) + return out + + def sequence_expand(x, y, ref_level=-1, name=None): """Sequence Expand Layer. This layer will expand the input variable **x** according to specified level lod of **y**. Please note that lod level of