diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc
index 849d0dabab77969d74d5f7cf094579026cbb96c9..d09de5be84c358d7ba0d474beca4fe7732ebeef2 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc
@@ -179,7 +179,8 @@ void InplaceAddToOpPass::Run(Graph *graph) const {
                           out_var_ptr->GeneratedOp());
 
     // NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy
-    if (right_generated_op->Name() != "conv2d_grad") {
+    if (right_generated_op->Name() != "conv2d_grad" &&
+        right_generated_op->Name() != "resnet_unit_grad") {
       continue;
     }
 
@@ -224,11 +225,13 @@ static bool IsValidConv2DGradDataGradNode(const Node &node) {
   if (node.inputs.empty()) return false;
   auto *generated_op = node.inputs[0];
   auto *op_desc = generated_op->Op();
-  if (op_desc == nullptr || op_desc->Type() != "conv2d_grad") {
+  if (op_desc == nullptr || (op_desc->Type() != "conv2d_grad" &&
+                             op_desc->Type() != "resnet_unit_grad")) {
     return false;
   }
   const auto &outputs = op_desc->Outputs();
-  auto iter = outputs.find(GradVarName("Input"));
+  std::string grad_var_name = op_desc->Type() == "conv2d_grad" ? "Input" : "X";
+  auto iter = outputs.find(GradVarName(grad_var_name));
   return iter != outputs.end() && !iter->second.empty() &&
          iter->second[0] == node.Name() &&
          !op_desc->GetAttrIfExists<bool>("use_addto");
diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc
index 062fd3f1cf40884701fb9e05f191491a1e963164..d2ac089d4d1d21cb7767292d41035d86b79cc4f6 100644
--- a/paddle/fluid/operators/fused/resnet_unit_op.cc
+++ b/paddle/fluid/operators/fused/resnet_unit_op.cc
@@ -232,13 +232,14 @@ class ResNetUnitOpMaker : public framework::OpProtoAndCheckerMaker {
                 "(bool, default false) Set to true for inference only, false "
                 "for training. Some layers may run faster when this is true.")
         .SetDefault(false);
+    AddAttr<bool>("use_addto", "").SetDefault(false);
     AddAttr<std::string>("act_type", "The activation type to be fused.")
         .SetDefault("relu");
     AddComment(R"DOC(
-Fusion op of the basic unit of resnet block.
+Fusion op of the basic unit of resnet block. The implementation is based on the latest fusion op interface in cuDNN v8.0.
-For more details:
+For more details: https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnFusedOps_t
 )DOC");
diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu
index a0126e5a9d4283a718741e18754b15e26e56a28c..b121864f80e4d970911d8bd68afa1966b9c08dc7 100644
--- a/paddle/fluid/operators/fused/resnet_unit_op.cu
+++ b/paddle/fluid/operators/fused/resnet_unit_op.cu
@@ -55,7 +55,7 @@ class ResNetUnitKernel : public framework::OpKernel<T> {
     int padding = ctx.Attr<int>("padding");
     int stride = ctx.Attr<int>("stride");
     int stride_z = ctx.Attr<int>("stride_z");
-    int dilate = ctx.Attr<int>("dilate");
+    int dilation = ctx.Attr<int>("dilation");
     int group = ctx.Attr<int>("group");
     double eps = static_cast<double>(ctx.Attr<float>("epsilon"));
     double momentum = static_cast<double>(ctx.Attr<float>("momentum"));
@@ -87,7 +87,7 @@ class ResNetUnitKernel : public framework::OpKernel<T> {
     sum_x.Resize(param_dims);
     sum_of_squares_x.Resize(param_dims);
     CudnnNormConvolution<T> conv_x_op(dev_ctx, input_x_shape, filter_x_shape,
-                                      output_shape, padding, stride, dilate,
+                                      output_shape, padding, stride, dilation,
                                       group);
     conv_x_op.Forward(dev_ctx, *input_x, *filter_x, conv_out_x, &sum_x,
                       &sum_of_squares_x);
@@ -129,8 +129,8 @@ class ResNetUnitKernel : public framework::OpKernel<T> {
       sum_z.Resize(param_dims);
       sum_of_squares_z.Resize(param_dims);
       CudnnNormConvolution<T> conv_z_op(dev_ctx, input_z_shape, filter_z_shape,
-                                        output_shape, padding, stride_z, dilate,
-                                        group);
+                                        output_shape, padding, stride_z,
+                                        dilation, group);
       conv_z_op.Forward(dev_ctx, *input_z, *filter_z, conv_out_z, &sum_z,
                         &sum_of_squares_z);
@@ -189,7 +189,7 @@ class ResNetUnitGradKernel : public framework::OpKernel<T> {
     int padding = ctx.Attr<int>("padding");
     int stride = ctx.Attr<int>("stride");
     int stride_z = ctx.Attr<int>("stride_z");
-    int dilate = ctx.Attr<int>("dilate");
+    int dilation = ctx.Attr<int>("dilation");
     int group = ctx.Attr<int>("group");
     double eps = static_cast<double>(ctx.Attr<float>("epsilon"));
     double momentum = static_cast<double>(ctx.Attr<float>("momentum"));
@@ -263,7 +263,7 @@ class ResNetUnitGradKernel : public framework::OpKernel<T> {
       auto filter_z_shape = framework::vectorize(filter_z->dims());
       CudnnNormConvolutionGrad<T> conv_z_op(dev_ctx, z_shape, filter_z_shape,
                                             output_shape, padding, stride_z,
-                                            dilate, group);
+                                            dilation, group);
       conv_z_op.Backward(dev_ctx, *z, *filter_z, conv_out_z_grad, z_grad,
                          filter_z_grad);
     } else {
@@ -278,11 +278,12 @@ class ResNetUnitGradKernel : public framework::OpKernel<T> {
     }
 
     // 2. Backward of Conv for x, get x_grad and filter_x_grad
+    bool use_addto = ctx.Attr<bool>("use_addto");
     CudnnNormConvolutionGrad<T> conv_x_op(dev_ctx, x_shape, filter_x_shape,
-                                          output_shape, padding, stride, dilate,
-                                          group);
+                                          output_shape, padding, stride,
+                                          dilation, group);
     conv_x_op.Backward(dev_ctx, *x, *filter_x, conv_out_x_grad, x_grad,
-                       filter_x_grad);
+                       filter_x_grad, use_addto);
   }
 };
diff --git a/python/paddle/incubate/operators/__init__.py b/python/paddle/incubate/operators/__init__.py
index 694cde4f28624b76e8da1af880f530e40789ef68..9a6710d0950974d76f7f96bef9e3d5e9a3e06b2a 100644
--- a/python/paddle/incubate/operators/__init__.py
+++ b/python/paddle/incubate/operators/__init__.py
@@ -14,3 +14,4 @@
 from .softmax_mask_fuse_upper_triangle import softmax_mask_fuse_upper_triangle  # noqa: F401
 from .softmax_mask_fuse import softmax_mask_fuse  # noqa: F401
+from .resnet_unit import ResNetUnit  # noqa: F401
diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py
new file mode 100644
index 0000000000000000000000000000000000000000..cba1d4863cbd4323c985ba2a29a01203bbeef6a6
--- /dev/null
+++ b/python/paddle/incubate/operators/resnet_unit.py
@@ -0,0 +1,269 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import collections
+import itertools
+import six
+import math
+import sys
+import warnings
+from functools import partial, reduce
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle import framework
+from paddle.device import get_device, get_cudnn_version
+from paddle.nn import initializer as I
+from paddle.nn import Layer, LayerList
+from paddle.fluid.layers import utils
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
+from paddle.fluid.data_feeder import convert_dtype
+from paddle.fluid.param_attr import ParamAttr
+from paddle import _C_ops
+
+__all__ = ['resnet_unit', 'ResNetUnit']
+
+
+def resnet_unit(x, filter_x, scale_x, bias_x, mean_x, var_x, z, filter_z,
+                scale_z, bias_z, mean_z, var_z, stride, stride_z, padding,
+                dilation, groups, momentum, eps, data_format, fuse_add,
+                has_shortcut, use_global_stats, is_test, act):
+
+    helper = LayerHelper('resnet_unit', **locals())
+    bn_param_dtype = fluid.core.VarDesc.VarType.FP32
+    bit_mask_dtype = fluid.core.VarDesc.VarType.INT32
+    out = helper.create_variable_for_type_inference(x.dtype)
+    bit_mask = helper.create_variable_for_type_inference(
+        dtype=bit_mask_dtype, stop_gradient=True)
+    # intermediate_out for x
+    conv_x = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True)
+    saved_mean_x = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True)
+    saved_invstd_x = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True)
+    running_mean_x = mean_x
+    running_var_x = var_x
+    # intermediate_out for z
+    conv_z = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True)
+    saved_mean_z = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True)
+    saved_invstd_z = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True)
+    running_mean_z = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True) if mean_z is None else mean_z
+    running_var_z = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True) if var_z is None else var_z
+
+    inputs = {
+        'X': x,
+        'FilterX': filter_x,
+        'ScaleX': scale_x,
+        'BiasX': bias_x,
+        'MeanX': mean_x,
+        'VarX': var_x,
+        'Z': z,
+        'FilterZ': filter_z,
+        'ScaleZ': scale_z,
+        'BiasZ': bias_z,
+        'MeanZ': mean_z,
+        'VarZ': var_z
+    }
+
+    attrs = {
+        'stride': stride,
+        'stride_z': stride_z,
+        'padding': padding,
+        'dilation': dilation,
+        'group': groups,
+        'momentum': momentum,
+        'epsilon': eps,
+        'data_format': data_format,
+        'fuse_add': fuse_add,
+        'has_shortcut': has_shortcut,
+        'use_global_stats': use_global_stats,
+        'is_test': is_test,
+        'act_type': act
+    }
+
+    outputs = {
+        'Y': out,
+        'BitMask': bit_mask,
+        'ConvX': conv_x,
+        'SavedMeanX': saved_mean_x,
+        'SavedInvstdX': saved_invstd_x,
+        'RunningMeanX': running_mean_x,
+        'RunningVarX': running_var_x,
+        'ConvZ': conv_z,
+        'SavedMeanZ': saved_mean_z,
+        'SavedInvstdZ': saved_invstd_z,
+        'RunningMeanZ': running_mean_z,
+        'RunningVarZ': running_var_z,
+    }
+
+    helper.append_op(
+        type='resnet_unit', inputs=inputs, outputs=outputs, attrs=attrs)
+
+    return out
+
+
+class ResNetUnit(Layer):
+    r"""
+    ******Temporary version******.
+    ResNetUnit is designed to optimize performance by using the cuDNN v8 API.
+    """
+
+    def __init__(self,
+                 num_channels_x,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 momentum=0.9,
+                 eps=1e-5,
+                 data_format='NHWC',
+                 act='relu',
+                 fuse_add=False,
+                 has_shortcut=False,
+                 use_global_stats=False,
+                 is_test=False,
+                 filter_x_attr=None,
+                 scale_x_attr=None,
+                 bias_x_attr=None,
+                 moving_mean_x_name=None,
+                 moving_var_x_name=None,
+                 num_channels_z=1,
+                 stride_z=1,
+                 filter_z_attr=None,
+                 scale_z_attr=None,
+                 bias_z_attr=None,
+                 moving_mean_z_name=None,
+                 moving_var_z_name=None):
+        super(ResNetUnit, self).__init__()
+        self._stride = stride
+        self._stride_z = stride_z
+        self._dilation = 1
+        self._kernel_size = utils.convert_to_list(filter_size, 2, 'kernel_size')
+        self._padding = (filter_size - 1) // 2
+        self._groups = 1
+        self._momentum = momentum
+        self._eps = eps
+        self._data_format = data_format
+        self._act = act
+        self._fuse_add = fuse_add
+        self._has_shortcut = has_shortcut
+        self._use_global_stats = use_global_stats
+        self._is_test = is_test
+
+        # check format
+        valid_format = {'NHWC'}
+        if data_format not in valid_format:
+            raise ValueError(
+                "data_format must be one of {}, but got data_format='{}'".
+                format(valid_format, data_format))
+
+        def _get_default_param_initializer(channels):
+            filter_elem_num = np.prod(self._kernel_size) * channels
+            std = (2.0 / filter_elem_num)**0.5
+            return I.Normal(0.0, std)
+
+        # initial filter
+        bn_param_dtype = fluid.core.VarDesc.VarType.FP32
+        bn_param_shape = [1, 1, 1, num_filters]
+        filter_x_shape = [num_filters, filter_size, filter_size, num_channels_x]
+        filter_z_shape = [num_filters, filter_size, filter_size, num_channels_z]
+
+        self.filter_x = self.create_parameter(
+            shape=filter_x_shape,
+            attr=filter_x_attr,
+            default_initializer=_get_default_param_initializer(num_channels_x))
+        self.scale_x = self.create_parameter(
+            shape=bn_param_shape,
+            attr=scale_x_attr,
+            dtype=bn_param_dtype,
+            default_initializer=I.Constant(1.0))
+        self.bias_x = self.create_parameter(
+            shape=bn_param_shape,
+            attr=bias_x_attr,
+            dtype=bn_param_dtype,
+            is_bias=True)
+        self.mean_x = self.create_parameter(
+            attr=ParamAttr(
+                name=moving_mean_x_name,
+                initializer=I.Constant(0.0),
+                trainable=False),
+            shape=bn_param_shape,
+            dtype=bn_param_dtype)
+        self.mean_x.stop_gradient = True
+        self.var_x = self.create_parameter(
+            attr=ParamAttr(
+                name=moving_var_x_name,
+                initializer=I.Constant(1.0),
+                trainable=False),
+            shape=bn_param_shape,
+            dtype=bn_param_dtype)
+        self.var_x.stop_gradient = True
+        if has_shortcut:
+            self.filter_z = self.create_parameter(
+                shape=filter_z_shape,
+                attr=filter_z_attr,
+                default_initializer=_get_default_param_initializer(
+                    num_channels_z))
+            self.scale_z = self.create_parameter(
+                shape=bn_param_shape,
+                attr=scale_z_attr,
+                dtype=bn_param_dtype,
+                default_initializer=I.Constant(1.0))
+            self.bias_z = self.create_parameter(
+                shape=bn_param_shape,
+                attr=bias_z_attr,
+                dtype=bn_param_dtype,
+                is_bias=True)
+            self.mean_z = self.create_parameter(
+                attr=ParamAttr(
+                    name=moving_mean_z_name,
+                    initializer=I.Constant(0.0),
+                    trainable=False),
+                shape=bn_param_shape,
+                dtype=bn_param_dtype)
+            self.mean_z.stop_gradient = True
+            self.var_z = self.create_parameter(
+                attr=ParamAttr(
+                    name=moving_var_z_name,
+                    initializer=I.Constant(1.0),
+                    trainable=False),
+                shape=bn_param_shape,
+                dtype=bn_param_dtype)
+            self.var_z.stop_gradient = True
+        else:
+            self.filter_z = None
+            self.scale_z = None
+            self.bias_z = None
+            self.mean_z = None
+            self.var_z = None
+
+    def forward(self, x, z=None):
+        if self._fuse_add and z is None:
+            raise ValueError("z cannot be None when fuse_add is True")
+
+        out = resnet_unit(
+            x, self.filter_x, self.scale_x, self.bias_x, self.mean_x,
+            self.var_x, z, self.filter_z, self.scale_z, self.bias_z,
+            self.mean_z, self.var_z, self._stride, self._stride_z,
+            self._padding, self._dilation, self._groups, self._momentum,
+            self._eps, self._data_format, self._fuse_add, self._has_shortcut,
+            self._use_global_stats, self._is_test, self._act)
+        return out
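
A minimal dygraph sketch of how the new ResNetUnit layer is intended to be called. It assumes a CUDA build of Paddle with cuDNN >= 8.0 (the only configuration the fused kernel targets), float16 NHWC inputs, and the paddle.incubate.operators export added above; the shapes and hyperparameters are illustrative only, not taken from the patch.

import numpy as np
import paddle
from paddle.incubate.operators import ResNetUnit

paddle.set_device('gpu')             # resnet_unit only has a CUDA kernel
paddle.set_default_dtype('float16')  # the fused kernel operates on FP16 data

# One fused Conv + BN + ReLU block: 8 input channels, 16 3x3 filters, NHWC layout.
unit = ResNetUnit(num_channels_x=8, num_filters=16, filter_size=3,
                  stride=1, act='relu', data_format='NHWC')

x = paddle.to_tensor(np.random.randn(2, 56, 56, 8).astype('float16'))
y = unit(x)  # with stride 1 and "same"-style padding: shape [2, 56, 56, 16]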
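The inplace_addto_op_pass change only takes effect when the addto strategy is enabled at build time; once enabled, the X gradient of resnet_unit_grad can be accumulated in place by the conv backward (use_addto) instead of through a separate grad-sum op. A rough static-graph sketch of switching that strategy on, assuming the pre-existing BuildStrategy.enable_addto option and the FLAGS_max_inplace_grad_add flag; whether resnet_unit_grad actually takes the in-place path still depends on how gradients are accumulated in the surrounding graph.

import os
import paddle

# Inplace gradient accumulation is gated by this flag in addition to the pass.
os.environ['FLAGS_max_inplace_grad_add'] = '8'

paddle.enable_static()

build_strategy = paddle.static.BuildStrategy()
build_strategy.enable_addto = True  # runs InplaceAddToOpPass on the graph

# The strategy is then handed to the compiled program used for training, e.g.:
# compiled_prog = paddle.static.CompiledProgram(main_prog,
#                                               build_strategy=build_strategy)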