From 767050d934222464e866a8dc73cafeed3e943c69 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Wed, 29 Sep 2021 10:37:16 +0800 Subject: [PATCH] Implement the grad and enhance the cache of norm_convolution fusion ops. (#36168) --- .../fluid/framework/operator_kernel_configs.h | 2 + .../operators/fused/cudnn_fusion_helper.h | 65 +-- .../operators/fused/cudnn_norm_conv.cu.h | 357 ++++++++++---- .../operators/fused/cudnn_norm_conv_test.cc | 459 ++++++++++++------ 4 files changed, 630 insertions(+), 253 deletions(-) diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h index 68edb7c89dd..ab812a30981 100644 --- a/paddle/fluid/framework/operator_kernel_configs.h +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include "glog/logging.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h index 4434681e60b..fcd354df938 100644 --- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -14,10 +14,8 @@ limitations under the License. */ #pragma once -#include #include -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/enforce.h" @@ -41,12 +39,9 @@ class CudnnFusionOp { } ~CudnnFusionOp() { - // New 'fused op' descriptor destruction - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_)); + dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_); + dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_); + dynload::cudnnDestroyFusedOpsPlan(op_); } // Execute fused op @@ -121,41 +116,49 @@ class CudnnFusionOp { // Get the workspace, which is required before Execute(). 
size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) { - size_t workspace_bytes = 0U; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( - cudnn_handle, op_, op_const_params_, &workspace_bytes)); - plan_created_ = true; - return workspace_bytes; + if (!plan_created_) { + workspace_bytes_ = 0U; + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( + cudnn_handle, op_, op_const_params_, &workspace_bytes_)); + plan_created_ = true; + } + return workspace_bytes_; } private: bool plan_created_; + size_t workspace_bytes_; cudnnFusedOpsPlan_t op_; cudnnFusedOpsConstParamPack_t op_const_params_; cudnnFusedOpsVariantParamPack_t op_variant_params_; }; -static inline std::vector GetStrides(const std::vector &shape) { - if (shape.size() < 1) { - return {}; +class CudnnFusionOpCache { + public: + static CudnnFusionOpCache &Instance() { + static CudnnFusionOpCache instance; + return instance; + } + + framework::AlgorithmsCache *GetForward() { + return &forward_cache_; } - int dim = static_cast(shape.size()); - std::vector pro_shape(shape); - std::vector strides(dim); - int temp = pro_shape[1]; - pro_shape.erase(pro_shape.begin() + 1); - pro_shape.push_back(temp); - strides.back() = 1; - for (int i = dim - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * pro_shape[i + 1]; + framework::AlgorithmsCache *GetBackward() { + return &backward_cache_; } - strides.pop_back(); - strides.insert(strides.begin() + 1, 1); - return strides; -} -static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } + private: + CudnnFusionOpCache() {} + ~CudnnFusionOpCache() { + // Need to delete the memory of cache. + } + CudnnFusionOpCache(const CudnnFusionOpCache &) {} + + private: + framework::AlgorithmsCache forward_cache_; + framework::AlgorithmsCache backward_cache_; +}; #endif // CUDNN_VERSION >= 8000 } // namespace operators diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 1ead78b8b64..1a73281cb8d 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -15,125 +15,320 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; namespace dynload = platform::dynload; +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; + #if CUDNN_VERSION >= 8000 + +static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; } + template -class CudnnNormConvolutionOp { +struct NormConvolutionArgs { + NormConvolutionArgs() { + dtype = platform::CudnnDataType::type; + format = CUDNN_TENSOR_NHWC; + compute_type = platform::CudnnDataType::type; + } + + void Set(const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, int padding, int stride, + int dilation, int group) { + PADDLE_ENFORCE_EQ( + input_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of input_shape is expected to 4. But recieved " + "input_shape's size is %d, input_shape is [%s].", + input_shape.size(), framework::make_ddim(input_shape))); + PADDLE_ENFORCE_EQ( + filter_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of filter_shape is expected to 4. 
But recieved " + "filter_shape's size is %d, filter_shape is [%s].", + filter_shape.size(), framework::make_ddim(filter_shape))); + PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] && + (filter_shape[1] == 1 || filter_shape[1] == 3), + true, + platform::errors::InvalidArgument( + "The filter_shape is expected to store as nhwc, and " + "h = w = 1 or 3. But recieved filter_shape is [%s].", + framework::make_ddim(filter_shape))); + PADDLE_ENFORCE_EQ( + output_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of output_shape is expected to 4. But recieved " + "filter_shape's size is %d, filter_shape is [%s].", + output_shape.size(), framework::make_ddim(output_shape))); + + for (size_t i = 0; i < input_shape.size(); ++i) { + in_dims.push_back(input_shape[i]); + } + for (size_t i = 0; i < filter_shape.size(); ++i) { + filter_dims.push_back(filter_shape[i]); + } + paddings = {padding, padding}; + strides = {stride, stride}; + dilations = {dilation, dilation}; + + in_desc.set(input_shape, format, dtype); + filter_desc.set(filter_shape, format, dtype, group); + out_desc.set(output_shape, format, dtype); + + int output_channel = filter_shape[0]; + std::vector stats_shape = {1, 1, 1, output_channel}; + out_stats_desc.set(stats_shape, format, compute_type); + + conv_desc.set(dtype, paddings, strides, dilations, false, group); + } + + cudnnDataType_t dtype; + cudnnTensorFormat_t format; + cudnnDataType_t compute_type; + + std::vector in_dims; + std::vector filter_dims; + std::vector strides; + std::vector paddings; + std::vector dilations; + + platform::TensorDescriptor in_desc; + platform::FilterDescriptor filter_desc; + platform::TensorDescriptor out_desc; + platform::TensorDescriptor out_stats_desc; + platform::ConvolutionDescriptor conv_desc; +}; + +template +class CudnnNormConvolution { public: - CudnnNormConvolutionOp() - : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS) {} - ~CudnnNormConvolutionOp() {} - - void Init(const platform::CUDADeviceContext &ctx, - const std::vector &input_shape, - const std::vector &filter_shape, - const std::vector &output_shape, const int &pad, - const int &stride, const int &dilate, const int &group) { - cudnn_fwd_compute_type_ = platform::CudnnDataType::type; - dtype_ = platform::CudnnDataType::type; - format_ = CUDNN_TENSOR_NHWC; - - InitDescriptors(ctx, input_shape, filter_shape, output_shape, pad, stride, - dilate, group); - GetWorkspaceSize(ctx); + CudnnNormConvolution(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, const int &padding, + const int &stride, const int &dilation, + const int &group) { + args_.Set(input_shape, filter_shape, output_shape, padding, stride, + dilation, group); } + ~CudnnNormConvolution() {} void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr, T *filter_ptr, T *output_ptr, float *sum_ptr, float *sum_of_squares_ptr) { - auto handle = ctx.cudnn_handle(); - auto workspace_handle = ctx.cudnn_workspace_handle(); + auto cudnn_handle = ctx.cudnn_handle(); + + CudnnFusionOp *fwd_op = GetForwardOp(ctx); + size_t workspace_size = RoundUp( + static_cast(fwd_op->GetWorkspaceSizeInBytes(cudnn_handle)), + 512); + // Set variant_param // input ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr); - fwd_op_.SetOpVariantParamAttrPtr( - CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); + 
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr); + fwd_op->SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size); + // output ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); - workspace_handle.RunFunc( + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); + + ctx.cudnn_workspace_handle().RunFunc( [&](void *workspace_ptr) { // workspace ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); // fused op execute - fwd_op_.Execute(handle); + fwd_op->Execute(cudnn_handle); }, - fwd_workspace_byte_); + workspace_size); } - // TBD - void Backward(const platform::CUDADeviceContext &ctx) {} + private: + CudnnFusionOp *GetForwardOp(const platform::CUDADeviceContext &ctx) { + framework::AlgorithmsCache &cache = + *(CudnnFusionOpCache::Instance().GetForward()); + + CudnnFusionOp *fwd_op = cache.GetAlgorithm( + args_.in_dims, args_.filter_dims, args_.strides, args_.paddings, + args_.dilations, 0, static_cast(args_.dtype), [&]() { + CudnnFusionOp *fwd_op = + new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS); + + // Set constant_param + fwd_op->SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER, + CUDNN_PARAM_YDATA_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + fwd_op->SetOpConstParamAttr( + {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + + // conv desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, + args_.conv_desc.desc()); + // input desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + // filter desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_WDESC, + args_.filter_desc.desc()); + // output desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc()); + // output_stats desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC, + args_.out_stats_desc.desc()); + // batch_norm mode + fwd_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + + // Make cudnn fused ops plan + fwd_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle()); + return fwd_op; + }); + return fwd_op; + } private: - void InitDescriptors(const platform::CUDADeviceContext &ctx, - const std::vector &input_shape, - const std::vector &filter_shape, - const std::vector &output_shape, const int &pad, - const int &stride, const int &dilate, const int &group) { - // Set constant_param - fwd_op_.SetOpConstParamAttr( - {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER, - CUDNN_PARAM_YDATA_PLACEHOLDER}, - CUDNN_PTR_16B_ALIGNED); - fwd_op_.SetOpConstParamAttr( - {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER}, - CUDNN_PTR_16B_ALIGNED); - - std::vector pad_vec = {pad, pad}; - std::vector stride_vec = {stride, stride}; - std::vector dilate_vec = {dilate, dilate}; - int output_channel = filter_shape[0]; - std::vector stats_shape = {1, 1, 1, output_channel}; + NormConvolutionArgs args_; +}; - // set conv desc - conv_desc_.set(dtype_, pad_vec, stride_vec, dilate_vec, false, group); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, conv_desc_.desc()); +template 
+class CudnnNormConvolutionGrad { + public: + CudnnNormConvolutionGrad(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, + const int &padding, const int &stride, + const int &dilation, const int &group) { + args_.Set(input_shape, filter_shape, output_shape, padding, stride, + dilation, group); + dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } + ~CudnnNormConvolutionGrad() {} - // set input desc - in_desc_.set(input_shape, format_, dtype_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, in_desc_.desc()); + void Backward(const platform::CUDADeviceContext &ctx, T *input_ptr, + T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, + T *filter_grad_ptr, bool use_addto = false) { + if (filter_grad_ptr) { + BackwardFilter(ctx, input_ptr, output_grad_ptr, filter_ptr, + filter_grad_ptr); + } + if (input_grad_ptr) { + BackwardData(ctx, input_ptr, output_grad_ptr, filter_ptr, input_grad_ptr, + use_addto); + } + } - // set filter desc - filter_desc_.set(filter_shape, format_, dtype_, group); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_WDESC, filter_desc_.desc()); + private: + void BackwardFilter(const platform::CUDADeviceContext &ctx, T *input_ptr, + T *output_grad_ptr, T *filter_ptr, T *filter_grad_ptr) { + auto cudnn_handle = ctx.cudnn_handle(); - // set output desc - out_desc_.set(output_shape, format_, dtype_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, out_desc_.desc()); + CudnnFusionOp *wgrad_op = GetBackwardFilterOp(ctx); + size_t workspace_size = RoundUp( + static_cast(wgrad_op->GetWorkspaceSizeInBytes(cudnn_handle)), + 512); - // set output_stats desc - out_stats_desc_.set(stats_shape, format_, cudnn_fwd_compute_type_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC, - out_stats_desc_.desc()); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, output_grad_ptr); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DWDATA, filter_grad_ptr); + wgrad_op->SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size); - fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, CUDNN_BATCHNORM_SPATIAL); + ctx.cudnn_workspace_handle().RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + workspace_ptr); + // fused op execute + wgrad_op->Execute(cudnn_handle); + }, + workspace_size); } - void GetWorkspaceSize(const platform::CUDADeviceContext &ctx) { - auto handle = ctx.cudnn_handle(); - fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle); + void BackwardData(const platform::CUDADeviceContext &ctx, T *input_ptr, + T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, + bool use_addto = false) { + auto cudnn_handle = ctx.cudnn_handle(); + size_t workspace_size = GetWorkspaceSizeBwdData(ctx); + + // Convolution dgrad followed optionally by batchnorm dgrad + ScalingParamType alpha = 1.0f; + ScalingParamType beta = use_addto ? 
1.0f : 0.0f; + ctx.cudnn_workspace_handle().RunFunc( + [&](void *cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnConvolutionBackwardData( + cudnn_handle, &alpha, args_.filter_desc.desc(), filter_ptr, + args_.out_desc.desc(), output_grad_ptr, + args_.conv_desc.desc(), dgrad_algo_, cudnn_workspace_ptr, + workspace_size, &beta, args_.in_desc.desc(), input_grad_ptr)); + }, + workspace_size); } - size_t fwd_workspace_byte_ = 0; + CudnnFusionOp *GetBackwardFilterOp(const platform::CUDADeviceContext &ctx) { + framework::AlgorithmsCache &cache = + *(CudnnFusionOpCache::Instance().GetBackward()); + + CudnnFusionOp *wgrad_op = cache.GetAlgorithm( + args_.in_dims, args_.filter_dims, args_.strides, args_.paddings, + args_.dilations, 0, static_cast(args_.dtype), [&]() { + CudnnFusionOp *wgrad_op = + new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD); + + wgrad_op->SetOpConstParamAttr( + {CUDNN_PARAM_DYDATA_PLACEHOLDER, CUDNN_PARAM_XDATA_PLACEHOLDER, + CUDNN_PARAM_DWDATA_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); - cudnnDataType_t dtype_; - cudnnDataType_t cudnn_fwd_compute_type_; - platform::TensorDescriptor in_desc_; - platform::FilterDescriptor filter_desc_; - platform::TensorDescriptor out_desc_; - platform::TensorDescriptor out_stats_desc_; - platform::ConvolutionDescriptor conv_desc_; - cudnnTensorFormat_t format_; + // conv desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, + args_.conv_desc.desc()); + // input desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, + args_.in_desc.desc()); + // filter desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DWDESC, + args_.filter_desc.desc()); + // output desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DYDESC, + args_.out_desc.desc()); + wgrad_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); - CudnnFusionOp fwd_op_; + // Make cudnn fused ops plan + wgrad_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle()); + return wgrad_op; + }); + return wgrad_op; + } + + size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) { + size_t workspace_size = 0U; + auto handle = ctx.cudnn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, args_.filter_desc.desc(), args_.out_desc.desc(), + args_.conv_desc.desc(), args_.in_desc.desc(), dgrad_algo_, + &workspace_size)); + return RoundUp(workspace_size, 512); + } + + private: + NormConvolutionArgs args_; + cudnnConvolutionBwdDataAlgo_t dgrad_algo_; }; + #endif } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 125ed856422..fff7b327f3f 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + #include #include @@ -29,23 +30,80 @@ namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; USE_OP(conv2d); +USE_OP(conv2d_grad); USE_OP_DEVICE_KERNEL(conv2d, CUDNN); +USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN); + +template +void InitRandomTensor(const std::vector &dims, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + + std::default_random_engine random(0); + std::uniform_real_distribution dis(0.0, 1.0); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = static_cast(dis(random)); + } +} -// get paddle conv2d op results as baseline template -void Conv2DForwardCompute(const Tensor &x, const Tensor &w, Tensor *y, - const platform::CUDADeviceContext &ctx) { +void TransposeNchwToNhwc(const framework::Tensor &cpu_in, + framework::Tensor *cpu_out) { + auto in_dims = cpu_in.dims(); + EXPECT_EQ(cpu_in.dims().size(), 4); + + const T *cpu_in_ptr = cpu_in.data(); + T *cpu_out_ptr = cpu_out->mutable_data( + {in_dims[0], in_dims[2], in_dims[3], in_dims[1]}, platform::CPUPlace()); + + int64_t n = in_dims[0]; + int64_t c = in_dims[1]; + int64_t hw = in_dims[2] * in_dims[3]; + for (int i = 0; i < n; ++i) { + for (int j = 0; j < hw; ++j) { + for (int k = 0; k < c; ++k) { + int dst_idx = i * hw * c + j * c + k; + int src_idx = i * c * hw + k * hw + j; + cpu_out_ptr[dst_idx] = cpu_in_ptr[src_idx]; + } + } + } +} + +template +void CheckOutput(const framework::Tensor &cpu_res, + const framework::Tensor &cpu_base, float diff, + bool is_relative_atol = false) { + EXPECT_EQ(cpu_res.dims(), cpu_base.dims()); + + const T *cpu_res_ptr = cpu_res.data(); + const T *cpu_base_ptr = cpu_base.data(); + for (int i = 0; i < cpu_res.numel(); ++i) { + if (is_relative_atol) { + EXPECT_LT(static_cast(std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / + cpu_base_ptr[i])), + diff); + } else { + EXPECT_LT(static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])), + diff); + } + } +} + +// Use Paddle conv2d op results as baseline +template +void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_input, const Tensor &cpu_filter, + Tensor *cpu_output) { framework::Scope scope; - auto var_x = scope.Var("Input"); - auto tensor_x = var_x->GetMutable(); - auto var_w = scope.Var("Filter"); - auto tensor_w = var_w->GetMutable(); - auto var_y = scope.Var("Output"); - auto tensor_y = var_y->GetMutable(); + auto *input = scope.Var("Input")->GetMutable(); + auto *filter = scope.Var("Filter")->GetMutable(); + auto *output = scope.Var("Output")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(x, place, tensor_x); - TensorCopySync(w, place, tensor_w); + TensorCopySync(cpu_input, place, input); + TensorCopySync(cpu_filter, place, filter); framework::AttributeMap attrs; bool use_cudnn = true; @@ -60,25 +118,94 @@ void Conv2DForwardCompute(const Tensor &x, const Tensor &w, Tensor *y, {{"Output", {"Output"}}}, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*tensor_y, place, y); - ctx.Wait(); + TensorCopySync(*output, platform::CPUPlace(), cpu_output); } +// Use Paddle conv2d_grad op results as baseline template -class TestCudnnNormConvOpForward { - public: - TestCudnnNormConvOpForward() { - batch_size_ = 2; - height_ = 8; - width_ = 8; - input_channels_ = 8; - output_channels_ = 32; - kernel_size_ = 1; - stride_ = 1; - pad_ = 0; +void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_input, const Tensor &cpu_filter, + const Tensor &cpu_output_grad, + 
framework::Tensor *cpu_input_grad, + framework::Tensor *cpu_filter_grad, int stride, + int padding, int dilation) { + framework::Scope scope; + auto *input = scope.Var("Input")->GetMutable(); + auto *filter = scope.Var("Filter")->GetMutable(); + auto *output_grad = + scope.Var("Output@GRAD")->GetMutable(); + auto *input_grad = + scope.Var("Input@GRAD")->GetMutable(); + auto *filter_grad = + scope.Var("Filter@GRAD")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input, place, input); + TensorCopySync(cpu_filter, place, filter); + TensorCopySync(cpu_output_grad, place, output_grad); + + framework::AttributeMap attrs; + bool use_cudnn = true; + std::string data_format = "NHWC"; + std::string padding_algorithm = "SAME"; + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector dilations = {dilation, dilation}; + int groups = 1; + bool exhaustive_search = false; + bool use_addto = false; + attrs.insert({"use_cudnn", use_cudnn}); + attrs.insert({"data_format", data_format}); + attrs.insert({"padding_algorithm", padding_algorithm}); + attrs.insert({"strides", strides}); + attrs.insert({"paddings", paddings}); + attrs.insert({"dilations", dilations}); + attrs.insert({"groups", groups}); + attrs.insert({"exhaustive_search", exhaustive_search}); + attrs.insert({"use_addto", use_addto}); + + auto op = framework::OpRegistry::CreateOp( + "conv2d_grad", {{"Input", {"Input"}}, + {"Filter", {"Filter"}}, + {"Output@GRAD", {"Output@GRAD"}}}, + {{"Input@GRAD", {"Input@GRAD"}}, {"Filter@GRAD", {"Filter@GRAD"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*input_grad, platform::CPUPlace(), cpu_input_grad); + TensorCopySync(*filter_grad, platform::CPUPlace(), cpu_filter_grad); +} + +template +void ComputeSumAndSquareSum(const framework::Tensor &cpu_out, + framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + auto dims = cpu_out.dims(); + int64_t c = dims[3]; + + const T *cpu_out_ptr = cpu_out.data(); + float *cpu_sum_ptr = + cpu_sum->mutable_data({1, 1, 1, c}, platform::CPUPlace()); + float *cpu_sum_square_ptr = cpu_sum_of_square->mutable_data( + {1, 1, 1, c}, platform::CPUPlace()); + + for (int j = 0; j < c; ++j) { + float tmp_sum = 0.0f; + float tmp_sum_of_squares = 0.0f; + for (int i = 0; i < cpu_out.numel() / c; ++i) { + float tmp_out = static_cast(cpu_out_ptr[i * c + j]); + tmp_sum += tmp_out; + tmp_sum_of_squares += tmp_out * tmp_out; + } + cpu_sum_ptr[j] = tmp_sum; + cpu_sum_square_ptr[j] = tmp_sum_of_squares; } +} - TestCudnnNormConvOpForward(int batch_size, int height, int width, +template +class CudnnNormConvolutionTester { + public: + CudnnNormConvolutionTester(int batch_size, int height, int width, int input_channels, int output_channels, int kernel_size, int stride) { batch_size_ = batch_size; @@ -88,133 +215,183 @@ class TestCudnnNormConvOpForward { output_channels_ = output_channels; kernel_size_ = kernel_size; stride_ = stride; - pad_ = (kernel_size_ - 1) / 2; + padding_ = (kernel_size_ - 1) / 2; + SetUp(); } - ~TestCudnnNormConvOpForward() {} + ~CudnnNormConvolutionTester() {} + + void CheckForward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_output_base; + framework::Tensor cpu_sum_base; + framework::Tensor cpu_sum_of_square_base; + BaselineForward(*ctx, &cpu_output_base, &cpu_sum_base, + &cpu_sum_of_square_base); + + framework::Tensor 
cpu_output; + framework::Tensor cpu_sum; + framework::Tensor cpu_sum_of_square; + FusedForward(*ctx, &cpu_output, &cpu_sum, &cpu_sum_of_square); + + // Check forward correctness between baseline and results of normconv. + CheckOutput(cpu_output, cpu_output_base, diff, is_relative_atol); + CheckOutput(cpu_sum, cpu_sum_base, diff, is_relative_atol); + CheckOutput(cpu_sum_of_square, cpu_sum_of_square_base, diff, + is_relative_atol); + } + + void CheckBackward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_input_grad_base; + framework::Tensor cpu_filter_nchw_grad_base; + framework::Tensor cpu_filter_nhwc_grad_base; + BaselineBackward(*ctx, &cpu_input_grad_base, &cpu_filter_nchw_grad_base); + TransposeNchwToNhwc(cpu_filter_nchw_grad_base, + &cpu_filter_nhwc_grad_base); + + framework::Tensor cpu_input_grad; + framework::Tensor cpu_filter_nhwc_grad; + FusedBackward(*ctx, &cpu_input_grad, &cpu_filter_nhwc_grad); + + // Check backward correctness between baseline and results of normconv. + CheckOutput(cpu_input_grad, cpu_input_grad_base, diff, is_relative_atol); + CheckOutput(cpu_filter_nhwc_grad, cpu_filter_nhwc_grad_base, diff, + is_relative_atol); + } + private: void SetUp() { - input_size_ = batch_size_ * height_ * width_ * input_channels_; - filter_size_ = - output_channels_ * input_channels_ * kernel_size_ * kernel_size_; - output_size_ = batch_size_ * height_ * width_ * output_channels_; - param_size_ = output_channels_; - - input_vec_.resize(input_size_); - filter_raw_vec_.resize(filter_size_); - filter_pro_vec_.resize(filter_size_); - - std::default_random_engine random(0); - std::uniform_real_distribution dis(0.0, 1.0); - for (int i = 0; i < input_size_; ++i) { - input_vec_[i] = static_cast(dis(random)); - } - for (int i = 0; i < filter_size_; ++i) { - filter_raw_vec_[i] = static_cast(dis(random)); - } - // transpoes for filter - // NCHW->NHWC - for (int oc = 0; oc < output_channels_; ++oc) { - for (int kh = 0; kh < kernel_size_; ++kh) { - for (int kw = 0; kw < kernel_size_; ++kw) { - for (int ic = 0; ic < input_channels_; ++ic) { - int dst_idx = oc * kernel_size_ * kernel_size_ * input_channels_ + - kh * kernel_size_ * input_channels_ + - kw * input_channels_ + ic; - int src_idx = oc * kernel_size_ * kernel_size_ * input_channels_ + - ic * kernel_size_ * kernel_size_ + kh * kernel_size_ + - kw; - filter_pro_vec_[dst_idx] = filter_raw_vec_[src_idx]; - } - } - } - } + InitRandomTensor({batch_size_, height_, width_, input_channels_}, + &cpu_input_); + InitRandomTensor( + {output_channels_, input_channels_, kernel_size_, kernel_size_}, + &cpu_filter_nchw_); + // transpoes for filter, NCHW -> NHWC + TransposeNchwToNhwc(cpu_filter_nchw_, &cpu_filter_nhwc_); + InitRandomTensor({batch_size_, height_, width_, output_channels_}, + &cpu_output_grad_); + } - framework::TensorFromVector(input_vec_, *ctx_, &input_); - input_.Resize({batch_size_, height_, width_, input_channels_}); - framework::TensorFromVector(filter_raw_vec_, *ctx_, &filter_raw_); - filter_raw_.Resize( - {output_channels_, input_channels_, kernel_size_, kernel_size_}); - framework::TensorFromVector(filter_pro_vec_, *ctx_, &filter_pro_); - filter_pro_.Resize( - {output_channels_, kernel_size_, kernel_size_, input_channels_}); - output_.Resize({batch_size_, height_, width_, output_channels_}); - base_output_.Resize({batch_size_, height_, width_, output_channels_}); - sum_.Resize({1, 1, 
1, output_channels_}); - sum_of_squares_.Resize({1, 1, 1, output_channels_}); - ctx_->Wait(); + void BaselineForward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_output_base, + framework::Tensor *cpu_sum_base, + framework::Tensor *cpu_sum_of_square_base) { + ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base); + ComputeSumAndSquareSum(*cpu_output_base, cpu_sum_base, + cpu_sum_of_square_base); } - void BaselineForward() { - Conv2DForwardCompute(input_, filter_raw_, &base_output_, *ctx_); - ctx_->Wait(); + void BaselineBackward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_input_grad_base, + framework::Tensor *cpu_filter_grad_base) { + ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, + cpu_output_grad_, cpu_input_grad_base, + cpu_filter_grad_base, stride_, padding_, + dilation_); } // get forward results of cudnn_norm_conv - void FusedForward() { - auto input_shape = framework::vectorize(input_.dims()); - auto filter_shape = framework::vectorize(filter_pro_.dims()); - auto output_shape = framework::vectorize(output_.dims()); - T *input_ptr = input_.data(); - T *filter_ptr = filter_pro_.data(); - T *output_ptr = output_.mutable_data(place_); - float *sum_ptr = sum_.mutable_data(place_); - float *sum_of_squares_ptr = sum_of_squares_.mutable_data(place_); - - std::shared_ptr> conv_op( - new op::CudnnNormConvolutionOp()); - conv_op->Init(*ctx_, input_shape, filter_shape, output_shape, pad_, stride_, - dilate_, group_); - conv_op->Forward(*ctx_, input_ptr, filter_ptr, output_ptr, sum_ptr, - sum_of_squares_ptr); - ctx_->Wait(); - } + void FusedForward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_output, framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + framework::Tensor input; + framework::Tensor filter_nhwc; + framework::Tensor output; + framework::Tensor sum; + framework::Tensor sum_of_square; - void Run() { - SetUp(); - BaselineForward(); - FusedForward(); + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input_, place, &input); + TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + + T *input_ptr = input.data(); + T *filter_ptr = filter_nhwc.data(); + T *output_ptr = output.mutable_data( + {batch_size_, height_, width_, output_channels_}, place); + float *sum_ptr = + sum.mutable_data({1, 1, 1, output_channels_}, place); + float *sum_of_square_ptr = + sum_of_square.mutable_data({1, 1, 1, output_channels_}, place); + + auto input_shape = framework::vectorize(input.dims()); + auto filter_shape = framework::vectorize(filter_nhwc.dims()); + auto output_shape = framework::vectorize(output.dims()); + op::CudnnNormConvolution conv_op(ctx, input_shape, filter_shape, + output_shape, padding_, stride_, + dilation_, group_); + conv_op.Forward(ctx, input_ptr, filter_ptr, output_ptr, sum_ptr, + sum_of_square_ptr); + + TensorCopySync(output, platform::CPUPlace(), cpu_output); + TensorCopySync(sum, platform::CPUPlace(), cpu_sum); + TensorCopySync(sum_of_square, platform::CPUPlace(), cpu_sum_of_square); } - // check forward correctness between baseline and results of normconv. 
- void CheckOut(const T diff, bool is_relative_atol = false) { - std::vector base_output_vec, output_vec; - output_vec.resize(output_size_); - base_output_vec.resize(output_size_); - TensorToVector(base_output_, *ctx_, &base_output_vec); - TensorToVector(output_, *ctx_, &output_vec); - ctx_->Wait(); - - for (int i = 0; i < output_size_; ++i) { - if (is_relative_atol) { - EXPECT_LT( - std::abs((output_vec[i] - base_output_vec[i]) / base_output_vec[i]), - diff); - } else { - EXPECT_LT(std::abs(output_vec[i] - base_output_vec[i]), diff); - } - } + void FusedBackward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_input_grad, + framework::Tensor *cpu_filter_grad) { + framework::Tensor input; + framework::Tensor filter_nhwc; + framework::Tensor output_grad; + framework::Tensor input_grad; + framework::Tensor filter_grad; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input_, place, &input); + TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + TensorCopySync(cpu_output_grad_, place, &output_grad); + + T *input_ptr = input.data(); + T *filter_ptr = filter_nhwc.data(); + T *output_grad_ptr = output_grad.data(); + T *input_grad_ptr = input_grad.mutable_data(input.dims(), place); + T *filter_grad_ptr = filter_grad.mutable_data(filter_nhwc.dims(), place); + + auto input_shape = framework::vectorize(input.dims()); + auto filter_shape = framework::vectorize(filter_nhwc.dims()); + auto output_shape = framework::vectorize(output_grad.dims()); + op::CudnnNormConvolutionGrad conv_grad_op(ctx, input_shape, filter_shape, + output_shape, padding_, + stride_, dilation_, group_); + conv_grad_op.Backward(ctx, input_ptr, output_grad_ptr, filter_ptr, + input_grad_ptr, filter_grad_ptr); + + TensorCopySync(input_grad, platform::CPUPlace(), cpu_input_grad); + TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad); } private: - int batch_size_, height_, width_, input_channels_, output_channels_; - int kernel_size_, stride_, pad_; - const int dilate_ = 1; + int batch_size_; + int height_; + int width_; + int input_channels_; + int output_channels_; + int kernel_size_; + int stride_; + int padding_; + const int dilation_ = 1; const int group_ = 1; - int input_size_, filter_size_, output_size_, param_size_; - framework::Tensor input_, filter_raw_, filter_pro_, output_, base_output_; - framework::Tensor sum_, sum_of_squares_; - std::vector input_vec_, filter_raw_vec_, filter_pro_vec_; + // Forward input + framework::Tensor cpu_input_; + framework::Tensor cpu_filter_nchw_; + framework::Tensor cpu_filter_nhwc_; - platform::CUDAPlace place_ = platform::CUDAPlace(0); - platform::CUDADeviceContext *ctx_ = - static_cast( - platform::DeviceContextPool::Instance().Get(place_)); + // Backward input + framework::Tensor cpu_output_grad_; }; // test for fp16, kernel = 1, output_channels = input_channels -TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) { +TEST(CudnnNormConvFp16, K1S1) { int batch_size = 4; int height = 56; int width = 56; @@ -222,15 +399,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) { int output_channels = 32; int kernel_size = 1; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); } // test for fp16, kernel = 3, output_channels = input_channels -TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) { 
+TEST(CudnnNormConvFp16, K3S1) { int batch_size = 4; int height = 56; int width = 56; @@ -238,15 +415,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) { int output_channels = 32; int kernel_size = 3; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); } // test for fp16, kernel = 1, output_channels = input_channels * 4 -TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) { +TEST(CudnnNormConvFp16, K1S1O4) { int batch_size = 4; int height = 56; int width = 56; @@ -254,9 +431,9 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) { int output_channels = 128; int kernel_size = 1; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); } -- GitLab
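Usage sketch (illustrative, not part of the patch): the snippet below mirrors the flow of cudnn_norm_conv_test.cc and shows how the two classes introduced in cudnn_norm_conv.cu.h are driven. The shapes, the helper name RunFusedNormConv, and the surrounding setup are assumptions for illustration; it presumes CUDNN_VERSION >= 8000, an initialized platform::CUDADeviceContext, and NHWC device buffers that have already been allocated.

#include <vector>
#include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h"

namespace paddle {
namespace operators {

// Hypothetical helper: runs the fused forward and backward passes once.
template <typename T>
void RunFusedNormConv(const platform::CUDADeviceContext &ctx, T *input_ptr,
                      T *filter_ptr, T *output_ptr, float *sum_ptr,
                      float *sum_of_squares_ptr, T *output_grad_ptr,
                      T *input_grad_ptr, T *filter_grad_ptr) {
  // Example NHWC shapes: N=4, H=W=56, C_in=C_out=32, 3x3 filter (OHWI layout).
  std::vector<int> input_shape = {4, 56, 56, 32};
  std::vector<int> filter_shape = {32, 3, 3, 32};
  std::vector<int> output_shape = {4, 56, 56, 32};
  int padding = 1, stride = 1, dilation = 1, group = 1;

  // Forward: convolution output plus per-channel sum and sum-of-squares,
  // which later feed the batch-norm statistics computation.
  CudnnNormConvolution<T> conv_op(ctx, input_shape, filter_shape, output_shape,
                                  padding, stride, dilation, group);
  conv_op.Forward(ctx, input_ptr, filter_ptr, output_ptr, sum_ptr,
                  sum_of_squares_ptr);

  // Backward: the filter gradient goes through the cached
  // CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD plan, the data gradient through
  // cudnnConvolutionBackwardData. Passing nullptr for either gradient pointer
  // skips that branch.
  CudnnNormConvolutionGrad<T> conv_grad_op(ctx, input_shape, filter_shape,
                                           output_shape, padding, stride,
                                           dilation, group);
  conv_grad_op.Backward(ctx, input_ptr, output_grad_ptr, filter_ptr,
                        input_grad_ptr, filter_grad_ptr, /*use_addto=*/false);
}

}  // namespace operators
}  // namespace paddle

Both classes look up their fused-ops plan through CudnnFusionOpCache::Instance(), which keys the AlgorithmsCache on the input/filter dims, strides, paddings, dilations and dtype, so constructing them repeatedly with the same configuration reuses the already-built plan and its cached workspace size.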