diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
index 95916746d6fcb528d26a8f8bb39980b55c4f3704..b96992ef8514abe0f71dbf23d38abb626f6c4a5b 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
-USE_OP(conv2d);
+USE_OP_ITSELF(conv2d);
 USE_OP(conv2d_transpose);
 
 namespace paddle {
diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
index 3bbb284ca821b8576f2752446555f146c16bb189..4e6fda3d09a071f59c97c87315619d126497a756 100644
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 
 namespace paddle {
 namespace operators {
@@ -53,12 +54,11 @@ static inline void GetNCDHW(const framework::DDim& dims,
 }
 
 template <typename T, size_t D>
-static void RemovePaddingSlice(const framework::ExecutionContext& context,
+static void RemovePaddingSlice(const phi::GPUContext& context,
                                const Tensor* input, Tensor* out,
                                const std::vector<int>& starts,
                                const std::vector<int>& axes) {
-  auto& place =
-      *context.template device_context<platform::CUDADeviceContext>().eigen_device();
+  auto& place = *context.eigen_device();
   auto in_dims = input->dims();
   auto new_out_dims = out->dims();
   auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
@@ -171,11 +171,10 @@ void ChooseAlgo(const std::vector<PerfType>& perf_results,
 
 using framework::ConvSearchCache;
 
-static void SetConvMathType(const framework::ExecutionContext& ctx,
-                            cudnnDataType_t dtype,
+static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
                             const platform::ConvolutionDescriptor& cdesc) {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-  auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  auto& dev_ctx = ctx;
   if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
     PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
         cdesc.desc(), CUDNN_TENSOR_OP_MATH));
@@ -231,8 +230,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
-                     const framework::ExecutionContext& ctx) {
+                     bool deterministic, const phi::GPUContext& ctx) {
     auto dtype = platform::CudnnDataType<T>::type;
     bool has_got_workspace_size = true;
     size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
@@ -284,8 +282,7 @@
     } else if (deterministic) {
       algo = static_cast<algo_t>(1);
     } else {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
+      auto& dev_ctx = ctx;
       auto workspace_handle = dev_ctx.cudnn_workspace_handle();
       AlgorithmsCache<algo_t>& algo_cache =
@@ -346,8 +343,7 @@
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
-                     const framework::ExecutionContext& ctx) {
+                     bool deterministic, const phi::GPUContext& ctx) {
     auto dtype = platform::CudnnDataType<T>::type;
     size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
     size_t workspace_size = 0;
@@ -413,8 +409,7 @@
     } else if (deterministic) {
       return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    } else {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
+      auto& dev_ctx = ctx;
       auto workspace_handle = dev_ctx.cudnn_workspace_handle();
       AlgorithmsCache<algo_t>& algo_cache =
@@ -478,8 +473,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
-                     const framework::ExecutionContext& ctx) {
+                     bool deterministic, const phi::GPUContext& ctx) {
     platform::CUDAGraphCaptureModeGuard guard;
     auto dtype = platform::CudnnDataType<T>::type;
     size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
@@ -534,8 +528,7 @@
     } else if (deterministic) {
       return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
     } else {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
+      auto& dev_ctx = ctx;
       auto workspace_handle = dev_ctx.cudnn_workspace_handle();
       AlgorithmsCache<algo_t>& algo_cache =
           *(framework::ConvSearchCache::Instance().GetBackwardFilter());
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu
deleted file mode 100644
index 2055bf560e69ca0ed354aadd00cdca331c22c76e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ /dev/null
@@ -1,1478 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/memory/memory.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" -#else -#include "paddle/fluid/operators/conv_cudnn_helper.h" -#endif -#include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" -#include "paddle/phi/kernels/funcs/padding.h" - -DECLARE_bool(cudnn_deterministic); -DECLARE_uint64(conv_workspace_size_limit); -DECLARE_bool(cudnn_exhaustive_search); - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; -using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; -using DataLayout = platform::DataLayout; - -static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) { - return dev_ctx.GetComputeCapability() >= 70; -} - -template -class CUDNNConvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - const Tensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - // Tensor Core introduced from Volta GPUs supports more faster conv op - // with FP16 in NHWC data format. - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - // We will only do data format conversion from NHWC to NCHW. - // cudnn will convert NCHW to NHWC automatically on Tensor Core. - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? 
"NHWC" : "NCHW"); - - // ------------ transformed tensor ----------- - Tensor transformed_input_channel(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_filter_channel(filter->type()); - T* output_data = nullptr; - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst(ctx, output, - &transformed_output); - - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output.ShareDataWith(*output); - } - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - } else { - transformed_filter_channel.ShareDataWith(*filter); - } - output_data = transformed_output.data(); - - // update padding and dilation - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - - Tensor transformed_input; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - transformed_input_channel.dims()[data_dim + 1]; - } - - std::vector input_pad(transformed_input_channel.dims().size() * 2, - 0); - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, 
transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* filter_data = transformed_filter_channel.data(); - - // ------------------- cudnn descriptors --------------------- - ConvArgs args{&transformed_input, - &transformed_filter_channel, - &transformed_output, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_format = GetCudnnTensorFormat(layout); - - args.handle = handle; - -#ifdef PADDLE_WITH_HIP - // MIOPEN need to set groups in cdesc in miopen_desc.h - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), groups); -#else - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn()); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // cudnn 7 can support groups, no need to do it manually - // FIXME(typhoonzero): find a better way to disable groups - // rather than setting it to 1. - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( - args.cdesc.desc(), groups)); - groups = 1; -#endif -#ifdef PADDLE_WITH_HIP - // MIOPEN do not set groups in wdesc after set groups in cdesc - groups = 1; -#endif - args.idesc.set(transformed_input, layout_format); - args.wdesc.set(transformed_filter_channel, layout_format, groups); - args.odesc.set(transformed_output, layout_format); - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d, - &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; - // ------------------- cudnn conv workspace --------------------- - size_t workspace_size = 0; // final workspace to allocate. 
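The deleted kernel drives cuDNN through a search-then-run sequence: SearchAlgorithm::Find selects an algorithm (honouring FLAGS_cudnn_exhaustive_search and FLAGS_cudnn_deterministic, which the kernel refuses to enable together), GetWorkspaceSize sizes the scratch buffer, and the convolution launches inside a workspace callback. A minimal sketch of that sequence using the new helper signatures from the conv_cudnn_helper.h hunks above (after this PR, Find takes the resolved phi::GPUContext rather than the op's ExecutionContext; per-group offsets omitted):

```cpp
using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
// Pick the forward algorithm, then ask cuDNN how much scratch space it needs.
cudnnConvolutionFwdAlgo_t algo =
    search::Find<T>(args, exhaustive_search, deterministic, dev_ctx);
size_t workspace_size = search::GetWorkspaceSize(args, algo);
// Launch the convolution inside a temporary-workspace callback.
dev_ctx.cudnn_workspace_handle().RunFunc(
    [&](void* workspace_ptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward(
          handle, &alpha, args.idesc.desc(), input_data, args.wdesc.desc(),
          filter_data, args.cdesc.desc(), algo, workspace_ptr, workspace_size,
          &beta, args.odesc.desc(), output_data));
    },
    workspace_size);
```

(The full kernel additionally loops over groups, offsetting the input, filter and output pointers by group_offset_in/filter/out.)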
-// ------------------- cudnn conv algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t algo{}; - using search = SearchAlgorithm; - workspace_size = search::GetWorkspaceSize(args); - algo = search::Find(args, exhaustive_search, deterministic, - workspace_size, ctx); -#else - cudnnConvolutionFwdAlgo_t algo{}; - using search = SearchAlgorithm; - algo = search::Find(args, exhaustive_search, deterministic, ctx); - workspace_size = search::GetWorkspaceSize(args, algo); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ - // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable - // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ - // FWD_ALGO_IMPLICIT_GEMM manually. - if (ctx.Attr("groups") > 1) { - algo = static_cast(0); - } -#endif - - // ------------------- cudnn conv forward --------------------- - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - -// NOTE(zhiqiu): inplace addto is not supportted in double grad yet. -// ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; -// VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); - -#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args.idesc.desc(), input_data, - args.wdesc.desc(), filter_data, args.cdesc.desc(), algo, - &beta, args.odesc.desc(), output_data, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args.idesc.desc(), - input_data + i * group_offset_in, args.wdesc.desc(), - filter_data + i * group_offset_filter, args.cdesc.desc(), - algo, workspace_ptr, workspace_size, &beta, - args.odesc.desc(), output_data + i * group_offset_out)); - }, - workspace_size); - } -#endif - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_output, output); - } - } -}; - -template -class CUDNNConvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); - - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - } - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - } - - std::vector dilations = ctx.Attr>("dilations"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - 
"Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvGradOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW"); - - // transform Tensor - Tensor transformed_input_channel(input->type()); - Tensor transformed_output_grad_channel(output_grad->type()); - Tensor transformed_input_grad_channel(input->type()); - Tensor transformed_filter_channel(filter->type()); - Tensor transformed_filter_grad_channel(filter->type()); - - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input, output_grad, input_grad and tensor from " - "NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - TransToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - - if (input_grad) { - ResizeToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy - // the data of input_grad to transformed_input_grad_channel. - if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - TransToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - } - } - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output_grad_channel.ShareDataWith(*output_grad); - if (input_grad) { - transformed_input_grad_channel.ShareDataWith(*input_grad); - } - } - - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - - if (filter_grad) { - ResizeToChannelLast( - ctx, filter_grad, &transformed_filter_grad_channel); - } - } else { - transformed_filter_channel.ShareDataWith(*filter); - if (filter_grad) { - transformed_filter_grad_channel.ShareDataWith(*filter_grad); - } - } - - // update paddings - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // cuDNN only supports padding the same amount on every dimension. - // So we create a new padded input tensor. 
- int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_input(input->type()); - Tensor transformed_input_grad(input->type()); - std::vector padding_common(data_dim, 0); - std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - transformed_input_channel.dims()[data_dim + 1]; - } - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - - transformed_input_grad.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (input_grad) { - transformed_input_grad = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - // pad for input - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (input_grad) { - transformed_input_grad.ShareDataWith(transformed_input_grad_channel); - } - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* output_grad_data = transformed_output_grad_channel.data(); - const T* filter_data = transformed_filter_channel.data(); - T* filter_grad_data = nullptr; - T* input_grad_data = nullptr; - T* transformed_input_grad_data = nullptr; - - ConvArgs args1{&transformed_input_grad, - &transformed_filter_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_input, - &transformed_filter_grad_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? 
DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_tensor = GetCudnnTensorFormat(layout); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC, &o_n, - &o_c, &o_d, &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n, - &o_c, &o_d, &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; -// ------------------- cudnn backward algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - // input data workspace_size - size_t workspace_size_d = 0; - // weight workspace_size - size_t workspace_size_w = 0; - int iwo_groups = groups; - int c_groups = 1; - -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - if (input_grad) { - // ------------------- cudnn descriptors --------------------- - input_grad_data = input_grad->data(); - transformed_input_grad_data = transformed_input_grad.data(); - args1.handle = handle; - args1.idesc.set(transformed_input_grad, layout_tensor); - args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups); - args1.odesc.set(transformed_output_grad_channel, layout_tensor); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size_d = - std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find(args1, exhaustive_search, deterministic, - workspace_size_d, ctx); -#else - using search1 = SearchAlgorithm; - data_algo = - search1::Find(args1, exhaustive_search, deterministic, ctx); - workspace_size_d = std::max(workspace_size_d, - search1::GetWorkspaceSize(args1, data_algo)); -#endif - } - - if (filter_grad) { - // ------------------- cudnn descriptors --------------------- - filter_grad_data = transformed_filter_grad_channel.data(); - args2.handle = handle; - args2.idesc.set(transformed_input, layout_tensor); - args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, - iwo_groups); - args2.odesc.set(transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size_w = - std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); - filter_algo = search2::Find(args2, exhaustive_search, deterministic, - workspace_size_w, ctx); -#else - using search2 = SearchAlgorithm; - filter_algo = - search2::Find(args2, exhaustive_search, deterministic, ctx); - workspace_size_w = std::max( - workspace_size_w, search2::GetWorkspaceSize(args2, 
filter_algo)); -#endif - } - - // ------------------- cudnn conv backward data --------------------- - ScalingParamType alpha = 1.0f; -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - ScalingParamType beta = 0.0f; -#else - ScalingParamType beta = - (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) ? 1.0f : 0.0f; -#endif - VLOG(4) << "Conv_grad: use_addto = " - << (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")); - - if (input_grad) { -// When beta is 0, it is unnecessary to reset input_grad. -// When beta is 1, the output cannot be reset since addt strategy used. -#ifdef PADDLE_WITH_HIP - if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - Tensor temp_tensor(transformed_input_grad.type()); - temp_tensor.Resize(transformed_input_grad.dims()); - T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), temp_tensor_data, - cudnn_workspace_ptr, workspace_size_d)); - }, - workspace_size_d); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( - handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), - transformed_input_grad_data, &alpha, args1.idesc.desc(), - temp_tensor_data, &beta, args1.idesc.desc(), - transformed_input_grad_data)); - } else { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data, cudnn_workspace_ptr, - workspace_size_d)); - }, - workspace_size_d); - } - -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args1.wdesc.desc(), - filter_data + i * group_offset_filter, args1.odesc.desc(), - output_grad_data + i * group_offset_out, - args1.cdesc.desc(), data_algo, cudnn_workspace_ptr, - workspace_size_d, &beta, args1.idesc.desc(), - transformed_input_grad_data + i * group_offset_in)); - }, - workspace_size_d); - } -#endif - if (!is_sys_pad) { - std::vector starts(transformed_input_channel.dims().size(), 0); - std::vector axes(transformed_input_channel.dims().size(), 0); - - for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - - transformed_input_grad_channel.mutable_data(ctx.GetPlace()); - if (transformed_input_channel.dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } - } - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_input_grad_channel, input_grad); - } - } - - // filter_grad do not use inplace addto. - ScalingParamType beta_filter = 0.0f; - // ------------------- cudnn conv backward filter --------------------- - if (filter_grad) { -// Because beta is zero, it is unnecessary to reset filter_grad. 
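For reference, the alpha and beta arguments threaded through these calls follow cuDNN's y = alpha * conv(...) + beta * y convention. A condensed restatement of how the deleted kernel sets them (a sketch, not the file's exact layout):

```cpp
ScalingParamType<T> alpha = 1.0f;
// Input-grad: beta == 1 accumulates into the existing gradient when the
// inplace addto strategy is enabled; MIOpen supports only beta == 0, which is
// why the HIP branch above adds the two tensors via miopenOpTensor instead.
ScalingParamType<T> beta =
    (ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto")) ? 1.0f : 0.0f;
// Filter-grad: never accumulated in place, so beta stays 0 and the output
// buffer needs no prior reset.
ScalingParamType<T> beta_filter = 0.0f;
```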
-#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), output_grad_data, - args2.idesc.desc(), input_data, args2.cdesc.desc(), - filter_algo, &beta, args2.wdesc.desc(), filter_grad_data, - cudnn_workspace_ptr, workspace_size_w)); - }, - workspace_size_w); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args2.idesc.desc(), - input_data + i * group_offset_in, args2.odesc.desc(), - output_grad_data + i * group_offset_out, - args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr, - workspace_size_w, &beta_filter, args2.wdesc.desc(), - filter_grad_data + i * group_offset_filter)); - }, - workspace_size_w); - } -#endif - - if (compute_format == DataLayout::kNHWC) { - TransToChannelFirst( - ctx, &transformed_filter_grad_channel, filter_grad); - } - } - } -}; - -/* - * Inputs: I, W, dO, ddI, ddW - * Outputs: ddO, dW, dI - * ddo = conv(ddI, W) + conv(I, ddW) - * dW = conv_bp_filter(ddI, dO) - * dI = conv_bp_data(ddW, dO) - */ -template -class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto X = ctx.Input("Input"); - auto W = ctx.Input("Filter"); - auto dO = ctx.Input("DOutput"); - auto ddX = ctx.Input("DDInput"); - auto ddW = ctx.Input("DDFilter"); - - auto ddO = ctx.Output("DDOutput"); - auto dW = ctx.Output("DFilter"); - auto dX = ctx.Output("DInput"); - if (ddO) { - ddO->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, ddO, static_cast(0)); - } - if (dW) { - dW->mutable_data(ctx.GetPlace()); - } - if (dX) { - dX->mutable_data(ctx.GetPlace()); - } - - // const T* x = X->data(); - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - const std::vector& strides = ctx.Attr>("strides"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - std::vector paddings = ctx.Attr>("paddings"); - - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - Tensor transformed_X_channel(X->type()); - Tensor transformed_dO_channel(dO->type()); - Tensor transformed_ddX_channel(X->type()); - - Tensor transformed_ddO_channel(dO->type()); - Tensor transformed_dX_channel(X->type()); - - if (channel_last) { - ResizeToChannelFirst( - ctx, X, &transformed_X_channel); - TransToChannelFirst( - ctx, X, 
&transformed_X_channel); - - ResizeToChannelFirst( - ctx, dO, &transformed_dO_channel); - TransToChannelFirst( - ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst( - ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst( - ctx, dX, &transformed_dX_channel); - transformed_dX_channel.mutable_data(ctx.GetPlace()); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (ddO) { - transformed_ddO_channel.ShareDataWith(*ddO); - } - if (dX) { - transformed_dX_channel.ShareDataWith(*dX); - } - } - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_X(X->type()); - Tensor transformed_ddX(X->type()); - - Tensor transformed_dX(X->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - transformed_dX.Resize(new_input_shape); - - transformed_X = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (ddX) { - transformed_ddX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - if (dX) { - transformed_dX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X.ShareDataWith(transformed_X_channel); - if (ddX) { - transformed_ddX.ShareDataWith(transformed_ddX_channel); - } - if (dX) { - transformed_dX.ShareDataWith(transformed_dX_channel); - } - - if (paddings.size() == 
data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = platform::CudnnDataType::type; - - auto handle = dev_ctx.cudnn_handle(); - - ConvArgs args1{&transformed_ddX, - W, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{ - &transformed_X, ddW, &transformed_ddO_channel, strides, padding_common, - dilations, dtype}; - ConvArgs args3{&transformed_ddX, - dW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args4{ - &transformed_dX, ddW, &transformed_dO_channel, strides, padding_common, - dilations, dtype}; - -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t fwd_algo1 = - static_cast(0); - miopenConvFwdAlgorithm_t fwd_algo2 = - static_cast(0); - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionFwdAlgo_t fwd_algo1 = - static_cast(0); - cudnnConvolutionFwdAlgo_t fwd_algo2 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - - auto layout = GetCudnnTensorFormat(DataLayout::kNCHW); - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.handle = handle; - args1.idesc.set(transformed_ddX, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - fwd_algo1 = search1::Find(args1, exhaustive_search, false, - workspace_size, ctx); -#else - using search1 = SearchAlgorithm; - fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); - workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.handle = handle; - args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_algo2 = search2::Find(args2, exhaustive_search, false, - workspace_size, ctx); -#else - using search2 = SearchAlgorithm; - fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, fwd_algo2)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.handle = handle; - args3.idesc.set(transformed_ddX, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search3 = SearchAlgorithm; - workspace_size 
= - std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find(args3, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search3 = SearchAlgorithm; - filter_algo = - search3::Find(args3, exhaustive_search, deterministic, ctx); - workspace_size = std::max(workspace_size, - search3::GetWorkspaceSize(args3, filter_algo)); -#endif - } - - if (ddW && dX) { - transformed_dx = transformed_dX.data(); - - args4.handle = handle; - args4.idesc.set(transformed_dX, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search4 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find(args4, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search4 = SearchAlgorithm; - data_algo = - search4::Find(args4, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, - &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO_channel.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = W->numel() / groups; - - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. - // ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : - // 0.0f; - // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); - auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), ddx, - args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, - &beta, args1.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, workspace_ptr, workspace_size, &beta, - args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (ddW) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), - ddw, args2.cdesc.desc(), fwd_algo2, &beta, - args2.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args2.idesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, workspace_ptr, workspace_size, &alpha, - args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_ddO_channel, ddO); - } - } - T* transformed_dy_channel = transformed_dO_channel.data(); - if (dW && ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), transformed_dy_channel, - args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, - &beta, args3.wdesc.desc(), dw, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args3.idesc.desc(), - ddx + i * group_offset_in, args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), filter_algo, workspace_ptr, - workspace_size, &beta, args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - workspace_size); - } -#endif - } - - if (dX && ddW) { - ddw = ddW->data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args4.odesc.desc(), transformed_dy_channel, - args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, - &beta, args4.idesc.desc(), transformed_dx, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - 
PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args4.wdesc.desc(), - ddw + i * group_offset_filter, args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.cdesc.desc(), data_algo, workspace_ptr, - workspace_size, &beta, args4.idesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); - } -#endif - - if (!is_sys_pad) { - // reverse padded input - std::vector starts(X->dims().size(), 0); - std::vector axes(X->dims().size(), 0); - - for (size_t i = 0; i < X->dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - if (X->dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_dX_channel, dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue -// Use depthwise_conv2d in MIOPEN to resolve this issue -REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else -#if CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else 
-REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 9c9795143eb78dc5c1b22ec792d8753f915c976e..66f718693847837a4d169a5cab9629a1f668244f 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims, } template -static void RemovePaddingSlice(const framework::ExecutionContext& context, +static void RemovePaddingSlice(const phi::GPUContext& context, const Tensor* input, Tensor* out, const std::vector& starts, const std::vector& axes) { - auto& place = - *context.template device_context().eigen_device(); + auto& place = *context.eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); auto offsets = Eigen::array(); @@ -128,11 +128,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -170,11 +169,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -212,11 +210,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, 
size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index e345a4d2603b630508e299207984f4708217a1d8..8213e877f722433488cd826bb63cba376972c57a 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( paddle::framework::DataTypeToString(input_data_type), paddle::framework::DataTypeToString(filter_data_type))); } -#ifndef PADDLE_WITH_ASCEND_CL - if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ( - library, framework::LibraryType::kCUDNN, - platform::errors::InvalidArgument( - "float16 can only be used when CUDNN or NPU is used")); - } -#endif +// #ifndef PADDLE_WITH_ASCEND_CL +// if (input_data_type == framework::proto::VarType::FP16) { +// PADDLE_ENFORCE_EQ( +// library, framework::LibraryType::kCUDNN, +// platform::errors::InvalidArgument( +// "float16 can only be used when CUDNN or NPU is used")); +// } +// #endif #if PADDLE_WITH_CUDA if (input_data_type == framework::proto::VarType::BF16 && library == framework::LibraryType::kCUDNN) { @@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ops::Conv3DDoubleGradMaker); REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad); -// depthwise conv kernel -// TODO(xingzhaolong): neon kernel for mobile -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); - -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - REGISTER_OP_VERSION(conv2d) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc deleted file mode 100644 index d07593f5c02e9129c1f333667baccb0531bc31f9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/conv_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d, - ops::DepthwiseConvKernel, - ops::DepthwiseConvKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad, - ops::DepthwiseConvGradKernel, - ops::DepthwiseConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 26166362da8a2984dc3c0670b186b85800767fb7..a5d888765bf37d45d501a3dbe5437f7c2ab5fc51 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -214,817 +213,5 @@ class ConvOpDoubleGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override; }; -template -class GemmConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - // The filter will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. 
- Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output(output->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } - - // update padding and dilation - auto trans_in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims = - phi::slice_ddim(trans_in_dims, 2, trans_in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - auto& dev_ctx = context.template device_context(); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - // filter_shape_vec: - // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - - // output_shape_vec: - // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - phi::vectorize(transformed_output.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: - // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, - // o_d,o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = trans_in_dims[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: - // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * - // o_w) - - framework::DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. 
- Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim in_matrix_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output.dims()[1], - transformed_output.numel() / - (transformed_output.dims()[0] * transformed_output.dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output.dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - auto blas = phi::funcs::GetBlas(dev_ctx); - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = - transformed_input.Slice(i, i + 1).Resize(in_matrix_shape); - Tensor out_batch = - transformed_output.Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice, - T(0.0)); - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_output, - output); - } - } -}; - -template -class GemmConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - // The filter and filter_grad will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. 
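The GemmConvKernel being removed above is the textbook im2col + GEMM lowering: each batch item is unfolded so that convolution collapses into one matrix multiply per group. A self-contained sketch of the idea (single group, NCHW, square stride and padding; illustrative only, not the deleted Paddle code):

#include <vector>

// Unfold one [C, H, W] image into a [C * kh * kw, oh * ow] matrix so that
// convolution with an [M, C * kh * kw] filter becomes a single GEMM.
void Im2Col(const std::vector<float>& img, int C, int H, int W, int kh, int kw,
            int stride, int pad, int oh, int ow, std::vector<float>* col) {
  for (int c = 0; c < C * kh * kw; ++c) {
    int w_off = c % kw;        // same decomposition order as Paddle:
    int h_off = (c / kw) % kh; // width fastest, then height,
    int c_in = c / (kw * kh);  // then input channel
    for (int y = 0; y < oh; ++y) {
      for (int x = 0; x < ow; ++x) {
        int h_in = y * stride - pad + h_off;
        int w_in = x * stride - pad + w_off;
        bool inside = h_in >= 0 && h_in < H && w_in >= 0 && w_in < W;
        (*col)[(c * oh + y) * ow + x] =
            inside ? img[(c_in * H + h_in) * W + w_in] : 0.0f;
      }
    }
  }
}

// out[M, oh * ow] = filter[M, K] * col[K, oh * ow], with K = C * kh * kw.
// Paddle delegates this step to blas.MatMul; a naive loop shows the shapes.
void GemmConv(const std::vector<float>& filter, const std::vector<float>& col,
              int M, int K, int N, std::vector<float>* out) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k) acc += filter[m * K + k] * col[k * N + n];
      (*out)[m * N + n] = acc;
    }
  }
}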
- Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output_grad(output_grad->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - - // update padding and dilation - auto in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - auto& dev_ctx = context.template device_context(); - - // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - phi::vectorize(transformed_output_grad.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, - // o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = transformed_input.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (i_c/g * k_h * k_w, o_h * o_w) - // or - // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - - framework::DDim input_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output_grad.dims()[1], - transformed_output_grad.numel() / (transformed_output_grad.dims()[0] * - transformed_output_grad.dims()[1])}; - - // convolution backward input operator: gemm + col2im(or col2vol) - // convolution backward weight operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output_grad.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call 
the matrix multiplication interface. - Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->dtype()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - // if is_expand is false, the operation of set_zero is unnecessary, - // because math::matmul will reset input_grad. - if (is_expand) { - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_grad_batch = - transformed_input_grad.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - Tensor in_grad_slice = - in_grad_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col_matrix.ShareDataWith(in_grad_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0), - &col_matrix, T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &in_grad_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); - } - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - Tensor filter_grad_ = *filter_grad; - filter_grad_.Resize(filter_matrix_shape); - set_zero(dev_ctx, filter_grad, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_batch = transformed_input.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // im2col - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor filter_grad_slice = - filter_grad_.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0), - &filter_grad_slice, T(1.0)); - } - } - } - } -}; - -template -class GemmConvDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use 
CPUPlace.")); - const Tensor* X = ctx.Input("Input"); - const Tensor* dY = ctx.Input("DOutput"); - const Tensor* ddX = ctx.Input("DDInput"); - const Tensor* ddW_in = ctx.Input("DDFilter"); - - Tensor* ddY = ctx.Output("DDOutput"); - Tensor* dW = ctx.Output("DFilter"); - Tensor* dX = ctx.Output("DInput"); - Tensor W = GET_DATA_SAFELY(ctx.Input("Filter"), "Input", "Filter", - "GemmConvDoubleGrad"); - if (!ddY && !dW && !dX) return; - - const int groups = ctx.Attr("groups"); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_X(X->dtype()); - Tensor transformed_dY(dY->dtype()); - Tensor transformed_ddX(X->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, X, &transformed_X); - TransToChannelFirst(ctx, X, &transformed_X); - - ResizeToChannelFirst(ctx, dY, &transformed_dY); - TransToChannelFirst(ctx, dY, &transformed_dY); - - if (ddX) { - ResizeToChannelFirst(ctx, ddX, &transformed_ddX); - TransToChannelFirst(ctx, ddX, &transformed_ddX); - } - } else { - transformed_X = *X; - transformed_dY = *dY; - if (ddX) { - transformed_ddX = *ddX; - } - } - - // update padding and dilation - auto in_dims = transformed_X.dims(); - auto filter_dims = W.dims(); - - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_X.dims()[0]); - std::vector filter_shape_vec(phi::vectorize(W.dims())); - std::vector output_shape_vec( - phi::vectorize(transformed_dY.dims())); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - // col_shape [in_channel/group, kh, kw, oh, ow] - col_shape_vec[0] = transformed_X.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - // col_matrix_shape [in_channel/group * kh * kw, oh * ow] - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - // input_shape [Cin, H, W] - framework::DDim input_shape = - phi::slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); - // filter_matrix_shape [Cout, Cin * kh * kw] - framework::DDim filter_matrix_shape = {W.dims()[0], - W.numel() / W.dims()[0]}; - - W.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - transformed_dY.dims()[1], - transformed_dY.numel() / - (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; - int in_step = static_cast(transformed_X.dims()[1]) / groups; - int out_step = static_cast(transformed_dY.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col = ctx.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - // dx 
convolution double grad: gemm + col2im(col2vol) - // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, - // oH, oW) - if (dX && ddW_in) { - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - dX->mutable_data(ctx.GetPlace()); - - Tensor transformed_dX(dX->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, dX, &transformed_dX); - - } else { - transformed_dX = *dX; - } - // if is_expand is false, the operation of set_zero is unnecessary - // because math::matmul will reset dx - if (is_expand) { - set_zero(dev_ctx, &transformed_dX, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - Tensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col_matrix.ShareDataWith(dx_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, - T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &dx_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_dX, dX); - } - } - - // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, - // oH, oW) - // dw convolution double grad: im2col(vol2col) + gemm - if (dW && ddX) { - dW->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dW, static_cast(0)); - Tensor dW_arr = *dW; - dW_arr.Resize(filter_matrix_shape); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; ++g) { - // im2col - Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - - Tensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(dy_slice, false, col_matrix, true, T(1.0), &dw_slice, - T(1.0)); - } - } - } - - // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), - // w/ddw(Cout, Cin, kh, kw) - // ddy convolution double grad: im2col(vol2col) + gemm - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - - Tensor transformed_ddY(ddY->dtype()); - if (channel_last) { - ResizeToChannelFirst(ctx, ddY, &transformed_ddY); - } else { - transformed_ddY = *ddY; - } - - set_zero(dev_ctx, &transformed_ddY, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - Tensor ddy_batch = - transformed_ddY.Slice(i, i + 
1).Resize(output_matrix_shape); - for (int g = 0; g < groups; ++g) { - // gemm - Tensor ddy_slice = ddy_batch.Slice(g * out_step, (g + 1) * out_step); - - if (ddX) { - Tensor ddx_batch = - transformed_ddX.Slice(i, i + 1).Resize(input_shape); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - Tensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(w_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(0.0)); - } - - if (ddW_in) { - Tensor x_batch = transformed_X.Slice(i, i + 1).Resize(input_shape); - Tensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); - - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - if (!is_expand) { - col.ShareDataWith(x_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, x_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(1.0)); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_ddY, ddY); - } - } - } -}; - -template -class DepthwiseConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - if (channel_last) { - PADDLE_ENFORCE_EQ( - output->dims()[output->dims().size() - 1] % - input->dims()[input->dims().size() - 1], - 0, platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1])); - } else { - PADDLE_ENFORCE_EQ( - output->dims()[1] % input->dims()[1], 0, - platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. 
But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[1], input->dims()[1])); - } - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - auto& dev_ctx = context.template device_context(); - - if (fuse_relu) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } else { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } - } -}; - -template -class DepthwiseConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? 
false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (fuse_relu) { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } else { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - if (fuse_relu) { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } else { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 141a99f60f104c3bf32e16a1254d0f5eec623645..1841b78af32dd95d6884d5eb78ad30322ba7723e 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -244,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search = SearchAlgorithm; workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find(args, false, deterministic, workspace_size, ctx); + algo = search::Find( + args, false, deterministic, workspace_size, + ctx.template device_context()); #else using search = SearchAlgorithm; - algo = search::Find(args, false, deterministic, ctx); + algo = search::Find( + args, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, algo)); #endif @@ -501,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search1 = SearchAlgorithm; workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = - search1::Find(args1, false, deterministic, workspace_size, ctx); + data_algo = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - data_algo = search1::Find(args1, false, deterministic, ctx); + data_algo = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); #endif @@ -523,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = - search2::Find(args2, false, deterministic, workspace_size, ctx); + filter_algo = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - filter_algo = search2::Find(args2, false, deterministic, ctx); + filter_algo = search2::Find( + args2, false, 
deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); #endif @@ -944,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = - search1::Find(args1, false, deterministic, workspace_size, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - bwd_algo1 = search1::Find(args1, false, deterministic, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); #endif } @@ -965,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = - search2::Find(args2, false, deterministic, workspace_size, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - bwd_algo2 = search2::Find(args2, false, deterministic, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); #endif @@ -990,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search3 = SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = - search3::Find(args3, false, deterministic, workspace_size, ctx); + filter_algo = search3::Find( + args3, false, deterministic, workspace_size, + ctx.template device_context()); #else using search3 = SearchAlgorithm; - filter_algo = search3::Find(args3, false, deterministic, ctx); + filter_algo = search3::Find( + args3, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); #endif @@ -1013,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search4 = SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = - search4::Find(args4, false, deterministic, workspace_size, ctx); + data_algo = search4::Find( + args4, false, deterministic, workspace_size, + ctx.template device_context()); #else using search4 = SearchAlgorithm; - data_algo = search4::Find(args4, false, deterministic, ctx); + data_algo = search4::Find( + args4, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); #endif diff --git a/paddle/fluid/operators/conv_transpose_op.cu b/paddle/fluid/operators/conv_transpose_op.cu index b2a4910222f1178d23e94eade9580248bb103c88..054cb4b33895b02a816cc2bff82b1c9052bc645d 100644 --- a/paddle/fluid/operators/conv_transpose_op.cu +++ b/paddle/fluid/operators/conv_transpose_op.cu @@ -13,10 +13,150 @@ See the License for the specific language governing permissions and limitations under the License. 
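One pattern worth noting across all of these conv_transpose call sites: the HIP and cuDNN branches order the two steps differently. MIOpen sizes the workspace first and searches within that budget, while cuDNN picks the algorithm first and then asks how much workspace it needs. Schematically (condensed from the hunks above):

#ifdef PADDLE_WITH_HIP
  // MIOpen: compute the workspace bound, then find an algorithm under it.
  workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
  algo = search::Find<T>(args, false, deterministic, workspace_size,
                         ctx.template device_context<phi::GPUContext>());
#else
  // cuDNN: find the algorithm, then size the workspace it requires.
  algo = search::Find<T>(args, false, deterministic,
                         ctx.template device_context<phi::GPUContext>());
  workspace_size =
      std::max(workspace_size, search::GetWorkspaceSize(args, algo));
#endif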
*/ #include "paddle/fluid/operators/conv_transpose_op.h" +#include "paddle/phi/kernels/gpu/depthwise_conv.h" namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class DepthwiseConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + PADDLE_ENFORCE_EQ( + groups, filter.dims()[0], + platform::errors::InvalidArgument( + "groups should be error to the 1st dimension of filter. But " + "received groups is %d and filter dimension[0] is %d", + groups, filter.dims()[0])); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + for (auto v : dilations) { + PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( + "dilations should be 1 in depthwise conv. " + "But received dilations is %d", + v)); + } + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + + framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, output, static_cast(0)); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad( + static_cast::TYPE&>(dev_ctx), + *output, filter, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, output, data_layout); + } +}; + +template +class DepthwiseConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + auto& dev_ctx = context.template device_context(); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + 
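The depthwise conv_transpose kernels being moved into this file reuse the ordinary depthwise-conv functors with their roles swapped, which is worth spelling out: the forward pass of a transposed convolution is mathematically the backward-by-input pass of an ordinary convolution, and vice versa. In outline (comments only, mirroring the calls in this file):

// Forward:  y = conv_transpose(x, w)
//   implemented as DepthwiseConvInputGradFunctor(out=y, filter=w, out_grad=x).
// Backward dx: dx = conv(dy, w)
//   implemented as DepthwiseConvFunctor(input=dy, filter=w) -> input_grad.
// Backward dw:
//   implemented as DepthwiseConvFilterGradFunctor(*output_grad, *input, ...),
//   i.e. the filter-grad pass of ordinary conv with input and output
//   gradient exchanged relative to the non-transposed operator.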
+ framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + if (input_grad) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv( + static_cast::TYPE&>(dev_ctx), + *output_grad, filter, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, input_grad, data_layout); + } + + if (filter_grad) { + phi::funcs::SetConstant set_zero; + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad( + static_cast::TYPE&>(dev_ctx), + *output_grad, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, filter_grad, data_layout); + } + } +}; + +} // namespace operators +} // namespace paddle // conv2d REGISTER_OP_CUDA_KERNEL(conv2d_transpose, ops::GemmConvTransposeKernel, diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 76d6ad6bf2ff7361a90fb6f013f989db5a2b8845..ee0fb7ab3683364f6db3cffd7ddef67c61f19433 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } }; -template -class DepthwiseConvTransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - int groups = context.Attr("groups"); - PADDLE_ENFORCE_EQ( - groups, filter.dims()[0], - platform::errors::InvalidArgument( - "groups should be error to the 1st dimension of filter. But " - "received groups is %d and filter dimension[0] is %d", - groups, filter.dims()[0])); - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - for (auto v : dilations) { - PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( - "dilations should be 1 in depthwise conv. 
" - "But received dilations is %d", - v)); - } - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - output->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, output, static_cast(0)); - - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad( - dev_ctx, *output, filter, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, output, data_layout); - } -}; - -template -class DepthwiseConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - auto& dev_ctx = context.template device_context(); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - if (input_grad) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv( - dev_ctx, *output_grad, filter, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, input_grad, data_layout); - } - - if (filter_grad) { - phi::funcs::SetConstant set_zero; - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad( - dev_ctx, *output_grad, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, filter_grad, data_layout); - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 1864bdbb86667290474d297cc481f5d6352c8022..b3792a176fabeb8406fd2f1b83c6723207dad2f1 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ 
b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace framework = paddle::framework; @@ -29,10 +30,10 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(conv2d); -USE_OP(conv2d_grad); -USE_OP_DEVICE_KERNEL(conv2d, CUDNN); -USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN); +USE_OP_ITSELF(conv2d); +USE_OP_ITSELF(conv2d_grad); +PD_DECLARE_KERNEL(conv2d, GPUDNN, ALL_LAYOUT); +PD_DECLARE_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT); template void InitRandomTensor(const std::vector &dims, diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h deleted file mode 100644 index e41f0aedf39ef582b4533b1eeb6ccda1e8ed7e49..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -using DataLayout = framework::DataLayout; - -/* - * \brief Compute the depthwise convolution which include - * forward process and backpropagation process - */ -template -class DepthwiseConvFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, framework::Tensor* output, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvInputGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* input_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvFilterGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* filter_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 42bf1f471deb5238fdb34dcd9284972930305f58..bc5a589ed6fb137c5013253a65971dcf80d4ac72 100644 --- 
a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace platform { class CPUDeviceContext; @@ -141,6 +143,116 @@ class Vol2ColFunctor { } }; +template +class Vol2ColFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* col, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol.dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol.dims().size())); + + PADDLE_ENFORCE_EQ(col->dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col->dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]); + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + // changed + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + const T* vol_data = vol.data(); + T* col_data = col->data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int c_in = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + w; + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } else { + vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + c_in; + } + col_data[col_idx] = + (h_pad < 0 || h_pad >= input_height || w_pad < 0 || + w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) + ? static_cast(0) + : vol_data[vol_idx]; + } + } + } + } + } +}; + /* * vol = [input_channels,input_depth, input_height, input_width] * col = @@ -258,10 +370,125 @@ class Col2VolFunctor { } }; +template +class Col2VolFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* vol, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol->dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol->dims().size())); + + PADDLE_ENFORCE_EQ(col.dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col.dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? 
vol->dims()[3] : vol->dims()[2]); + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + T* vol_data = vol->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int cIm = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && + w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } else { + vol_idx = + ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + cIm; + } + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + + w; + vol_data[vol_idx] += col_data[col_idx]; + } + } + } + } + } + } +}; + template class Vol2ColFunctor; template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; + template class Col2VolFunctor; template class Col2VolFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 2fdeecf89346fcf15f38b291ed5af49b8a2c8fc0..05cd264cf3ec9ee6e47d822d7e4d79ab7cd64441 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ 
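All of the PADDLE_ENFORCE_EQ shape checks in the new CPU specializations above instantiate one formula per spatial axis: out = (in + pad_a + pad_b - (dilation * (k - 1) + 1)) / stride + 1. A quick compile-time spot check of the arithmetic (values illustrative only):

// depth 8, pads 1 + 1, kernel 3, dilation 1, stride 1 -> output depth 8
static_assert((8 + 1 + 1 - (1 * (3 - 1) + 1)) / 1 + 1 == 8,
              "vol2col shape formula sanity check");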
b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -33,7 +33,7 @@ USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); -USE_OP(conv2d); +USE_OP_ITSELF(conv2d); USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); namespace paddle { @@ -55,7 +55,7 @@ class CacheTester { onednn_dev_ctx_->ResetBlobMap(nullptr); } - bool Analyze(unsigned short int num_entries) { + bool Analyze(uint16_t num_entries) { // Number of created objects in cache should be as expected (num_entries) return onednn_dev_ctx_->GetCachedObjectsNumber() == num_entries; } diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index f625d57df2ef2dc2f9505853dc5e07e5d9e0022e..688a0e54a0cf4f0f041704b03c5d256a7c17d1ec 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include +#include "paddle/phi/common/place.h" #include "paddle/utils/any.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index ef51d6daf6a0052f39c2cf6253c208412cbb6904..4ffa1826a29fa3904b959a1e8f2fd9ceb27511b4 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -10,7 +10,7 @@ add_subdirectory(funcs) set_property(GLOBAL PROPERTY PHI_KERNELS "") set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor softmax) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) diff --git a/paddle/phi/kernels/conv_grad_grad_kernel.h b/paddle/phi/kernels/conv_grad_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..339f1c00eaa505cdcf8976abae92a6c93cfd50eb --- /dev/null +++ b/paddle/phi/kernels/conv_grad_grad_kernel.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
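A minimal round-trip sketch of the new phi::CPUContext vol2col/col2vol specializations instantiated above; tensor names, shapes, and attribute values here are illustrative assumptions, not part of this patch:

// Sketch only: assumes the phi::CPUContext instantiations added in vol2col.cc.
#include <vector>
#include "paddle/fluid/operators/math/vol2col.h"

void Vol2ColRoundTrip(const phi::CPUContext& ctx,
                      const paddle::framework::Tensor& vol,  // [C, D, H, W]
                      paddle::framework::Tensor* col,        // preallocated 7-D
                      paddle::framework::Tensor* vol_grad) { // same dims as vol
  const std::vector<int> dilations = {1, 1, 1};
  const std::vector<int> strides = {1, 1, 1};
  const std::vector<int> paddings = {0, 0, 0};  // 3-entry form; 6-entry also accepted

  // Scatter the volume into column form (turns conv3d into a GEMM).
  paddle::operators::math::Vol2ColFunctor<phi::CPUContext, float> vol2col;
  vol2col(ctx, vol, dilations, strides, paddings, col,
          paddle::framework::DataLayout::kNCHW);

  // Accumulate the columns back into a volume (the adjoint, used in backprop).
  paddle::operators::math::Col2VolFunctor<phi::CPUContext, float> col2vol;
  col2vol(ctx, *col, dilations, strides, paddings, vol_grad,
          paddle::framework::DataLayout::kNCHW);
}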
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvGradGradKernel(const Context& dev_ctx,
+                        paddle::optional<const DenseTensor&> input_grad_grad,
+                        paddle::optional<const DenseTensor&> filter_grad_grad,
+                        const DenseTensor& out_grad,
+                        const DenseTensor& input,
+                        const DenseTensor& filter,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        const std::string& padding_algorithm,
+                        int groups,
+                        const std::vector<int>& dilations,
+                        const std::string& data_format,
+                        bool use_addto,
+                        int workspace_size_MB,
+                        bool exhaustive_search,
+                        DenseTensor* out_grad_grad,
+                        DenseTensor* input_grad,
+                        DenseTensor* filter_grad);
+
+template <typename T, typename Context>
+void Conv3DGradGradKernel(const Context& dev_ctx,
+                          paddle::optional<const DenseTensor&> input_grad_grad,
+                          paddle::optional<const DenseTensor&> filter_grad_grad,
+                          const DenseTensor& out_grad,
+                          const DenseTensor& input,
+                          const DenseTensor& filter,
+                          const std::vector<int>& strides,
+                          const std::vector<int>& paddings,
+                          const std::string& padding_algorithm,
+                          int groups,
+                          const std::vector<int>& dilations,
+                          const std::string& data_format,
+                          bool use_addto,
+                          int workspace_size_MB,
+                          bool exhaustive_search,
+                          DenseTensor* out_grad_grad,
+                          DenseTensor* input_grad,
+                          DenseTensor* filter_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/conv_grad_kernel.h b/paddle/phi/kernels/conv_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..bad30989ac90d8b46fee99039c2772d00c7d939a
--- /dev/null
+++ b/paddle/phi/kernels/conv_grad_kernel.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
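The double-grad declarations above take paddle::optional inputs because a caller may supply ddX, ddW, or both. A sketch of the usual unpacking; get_ptr() follows paddle::optional's boost-style interface, and the helper name is hypothetical:

// Sketch: how an implementation typically unpacks the optional ddX/ddW inputs.
template <typename T, typename Context>
void UnpackDoubleGradInputs(
    paddle::optional<const phi::DenseTensor&> input_grad_grad,
    paddle::optional<const phi::DenseTensor&> filter_grad_grad) {
  const phi::DenseTensor* ddX = input_grad_grad.get_ptr();  // nullptr if absent
  const phi::DenseTensor* ddW = filter_grad_grad.get_ptr();
  if (ddX == nullptr && ddW == nullptr) {
    return;  // nothing to differentiate a second time
  }
  // ... compute ddOut from ddX and/or ddW, skipping whichever is absent ...
}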
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvGradKernel(const Context& dev_ctx,
+                    const DenseTensor& out_grad,
+                    const DenseTensor& input,
+                    const DenseTensor& filter,
+                    const std::vector<int>& strides,
+                    const std::vector<int>& paddings,
+                    const std::string& padding_algorithm,
+                    int groups,
+                    const std::vector<int>& dilations,
+                    const std::string& data_format,
+                    bool use_addto,
+                    int workspace_size_MB,
+                    bool exhaustive_search,
+                    DenseTensor* input_grad,
+                    DenseTensor* filter_grad);
+
+template <typename T, typename Context>
+void Conv3DGradKernel(const Context& dev_ctx,
+                      const DenseTensor& out_grad,
+                      const DenseTensor& input,
+                      const DenseTensor& filter,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings,
+                      const std::string& padding_algorithm,
+                      int groups,
+                      const std::vector<int>& dilations,
+                      const std::string& data_format,
+                      bool use_addto,
+                      int workspace_size_MB,
+                      bool exhaustive_search,
+                      DenseTensor* input_grad,
+                      DenseTensor* filter_grad);
+
+template <typename T, typename Context>
+void DepthwiseConvGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out_grad,
+                             const DenseTensor& input,
+                             const DenseTensor& filter,
+                             const std::vector<int>& strides,
+                             const std::vector<int>& paddings,
+                             const std::string& padding_algorithm,
+                             int groups,
+                             const std::vector<int>& dilations,
+                             const std::string& data_format,
+                             bool use_addto,
+                             int workspace_size_MB,
+                             bool exhaustive_search,
+                             bool fuse_relu,
+                             DenseTensor* input_grad,
+                             DenseTensor* filter_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/conv_kernel.h b/paddle/phi/kernels/conv_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb0bfdd0275b5050054c620e722b0e7653fd678a
--- /dev/null
+++ b/paddle/phi/kernels/conv_kernel.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
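Because the kernels declared above are plain function templates, backend code can invoke them directly once a device context and tensors are in hand. A sketch with placeholder attribute values; "EXPLICIT" and the 512 MB workspace default mirror the framework's usual conv attributes:

// Sketch only: direct functional invocation of the new phi conv grad kernel.
void RunConv2DGrad(const phi::CPUContext& ctx,
                   const phi::DenseTensor& out_grad,
                   const phi::DenseTensor& input,
                   const phi::DenseTensor& filter,
                   phi::DenseTensor* input_grad,
                   phi::DenseTensor* filter_grad) {
  phi::ConvGradKernel<float, phi::CPUContext>(
      ctx, out_grad, input, filter,
      /*strides=*/{1, 1}, /*paddings=*/{0, 0},
      /*padding_algorithm=*/"EXPLICIT", /*groups=*/1,
      /*dilations=*/{1, 1}, /*data_format=*/"NCHW",
      /*use_addto=*/false, /*workspace_size_MB=*/512,
      /*exhaustive_search=*/false, input_grad, filter_grad);
}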
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvKernel(const Context& dev_ctx,
+                const DenseTensor& input,
+                const DenseTensor& filter,
+                const std::vector<int>& strides,
+                const std::vector<int>& paddings,
+                const std::string& padding_algorithm,
+                int groups,
+                const std::vector<int>& dilations,
+                const std::string& data_format,
+                bool use_addto,
+                int workspace_size_MB,
+                bool exhaustive_search,
+                DenseTensor* out);
+
+template <typename T, typename Context>
+void Conv3DKernel(const Context& dev_ctx,
+                  const DenseTensor& input,
+                  const DenseTensor& filter,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string& padding_algorithm,
+                  int groups,
+                  const std::vector<int>& dilations,
+                  const std::string& data_format,
+                  bool use_addto,
+                  int workspace_size_MB,
+                  bool exhaustive_search,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void DepthwiseConvKernel(const Context& dev_ctx,
+                         const DenseTensor& input,
+                         const DenseTensor& filter,
+                         const std::vector<int>& strides,
+                         const std::vector<int>& paddings,
+                         const std::string& padding_algorithm,
+                         int groups,
+                         const std::vector<int>& dilations,
+                         const std::string& data_format,
+                         bool use_addto,
+                         int workspace_size_MB,
+                         bool exhaustive_search,
+                         bool fuse_relu,
+                         DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f157bb017f81c5636a87ddcecb82e977e4fd18ba
--- /dev/null
+++ b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
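All of the declarations above share one shape contract, the same one the PADDLE_ENFORCE_EQ checks in vol2col verify per spatial dimension. A small self-contained restatement (illustrative helper, not part of the patch):

// out = (in + pad_front + pad_back - (dilation * (k - 1) + 1)) / stride + 1
inline int ConvOutputSize(int in, int k, int dilation, int pad_front,
                          int pad_back, int stride) {
  const int effective_k = dilation * (k - 1) + 1;  // dilated kernel extent
  return (in + pad_front + pad_back - effective_k) / stride + 1;
}
// e.g. in=5, k=3, dilation=1, pads=0, stride=1 -> (5 - 3) / 1 + 1 = 3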
+
+#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+template <typename T, typename Context>
+void Conv3DGradGradKernel(const Context& ctx,
+                          paddle::optional<const DenseTensor&> input_grad_grad,
+                          paddle::optional<const DenseTensor&> filter_grad_grad,
+                          const DenseTensor& out_grad,
+                          const DenseTensor& input,
+                          const DenseTensor& filter,
+                          const std::vector<int>& strides,
+                          const std::vector<int>& paddings_t,
+                          const std::string& padding_algorithm,
+                          int groups,
+                          const std::vector<int>& dilations_t,
+                          const std::string& data_format,
+                          bool use_addto,
+                          int workspace_size_MB,
+                          bool exhaustive_search_t,
+                          DenseTensor* out_grad_grad,
+                          DenseTensor* input_grad,
+                          DenseTensor* filter_grad) {
+  ConvGradGradKernel<T>(ctx,
+                        input_grad_grad,
+                        filter_grad_grad,
+                        out_grad,
+                        input,
+                        filter,
+                        strides,
+                        paddings_t,
+                        padding_algorithm,
+                        groups,
+                        dilations_t,
+                        data_format,
+                        use_addto,
+                        workspace_size_MB,
+                        exhaustive_search_t,
+                        out_grad_grad,
+                        input_grad,
+                        filter_grad);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
+}
+
+PD_REGISTER_KERNEL(conv3d_grad_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Conv3DGradGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..994ad861bd15b747524d7e0f47a0ed5b8ee465cd
--- /dev/null
+++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc
@@ -0,0 +1,103 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
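The file above shows the pattern this patch repeats for every backend: a thin forwarding wrapper plus a PD_REGISTER_KERNEL binding an op name to a kernel template per backend/layout/dtype. A hypothetical registration for a new kernel (names here are invented for illustration):

// Sketch of the registration pattern; my_scale / MyScaleKernel are
// hypothetical and not part of this patch.
#include "paddle/phi/core/kernel_registry.h"

namespace phi {
template <typename T, typename Context>
void MyScaleKernel(const Context& dev_ctx, const DenseTensor& x, float s,
                   DenseTensor* out);  // definition assumed elsewhere
}  // namespace phi

PD_REGISTER_KERNEL(my_scale,            // op name the framework looks up
                   CPU,                 // backend
                   ALL_LAYOUT,          // registered for every tensor layout
                   phi::MyScaleKernel,  // kernel function template
                   float,
                   double) {}           // body may adjust per-kernel metadata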
+
+#include "paddle/phi/kernels/conv_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DepthwiseConvGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out_grad,
+                             const DenseTensor& input,
+                             const DenseTensor& filter,
+                             const std::vector<int>& strides,
+                             const std::vector<int>& paddings,
+                             const std::string& padding_algorithm,
+                             int groups,
+                             const std::vector<int>& dilations,
+                             const std::string& data_format,
+                             bool use_addto,
+                             int workspace_size_MB,
+                             bool exhaustive_search,
+                             bool fuse_relu,
+                             DenseTensor* input_grad,
+                             DenseTensor* filter_grad) {
+  ConvGradKernel<T>(dev_ctx,
+                    out_grad,
+                    input,
+                    filter,
+                    strides,
+                    paddings,
+                    padding_algorithm,
+                    groups,
+                    dilations,
+                    data_format,
+                    use_addto,
+                    workspace_size_MB,
+                    exhaustive_search,
+                    input_grad,
+                    filter_grad);
+}
+
+template <typename T, typename Context>
+void Conv3DGradKernel(const Context& dev_ctx,
+                      const DenseTensor& out_grad,
+                      const DenseTensor& input,
+                      const DenseTensor& filter,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings,
+                      const std::string& padding_algorithm,
+                      int groups,
+                      const std::vector<int>& dilations,
+                      const std::string& data_format,
+                      bool use_addto,
+                      int workspace_size_MB,
+                      bool exhaustive_search,
+                      DenseTensor* input_grad,
+                      DenseTensor* filter_grad) {
+  ConvGradKernel<T>(dev_ctx,
+                    out_grad,
+                    input,
+                    filter,
+                    strides,
+                    paddings,
+                    padding_algorithm,
+                    groups,
+                    dilations,
+                    data_format,
+                    use_addto,
+                    workspace_size_MB,
+                    exhaustive_search,
+                    input_grad,
+                    filter_grad);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    conv2d_grad, CPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {}
+
+PD_REGISTER_KERNEL(depthwise_conv2d_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConvGradKernel,
+                   float,
+                   double) {}
+
+PD_REGISTER_KERNEL(
+    conv3d_grad, CPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/conv_kernel.cc b/paddle/phi/kernels/cpu/conv_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e0b4ee7d5776fdaf51955b2d35bb339735411a28
--- /dev/null
+++ b/paddle/phi/kernels/cpu/conv_kernel.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
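Note that on CPU there is no specialized depthwise path: the wrapper above forwards to the generic ConvGradKernel and simply drops fuse_relu. What makes a convolution depthwise is only the groups attribute; a sketch of the invariant (illustrative helper, not part of the patch):

// Depthwise conv: every input channel forms its own group, so the generic
// grouped-conv code path handles it, just less efficiently than the GPU
// functors later in this patch.
inline bool IsDepthwise(int input_channels, int groups) {
  return groups == input_channels;
}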
+ +#include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/impl/conv_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void DepthwiseConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* out) { + ConvKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + +template +void Conv3DKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out) { + ConvKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d, CPU, ALL_LAYOUT, phi::ConvKernel, float, double) {} + +PD_REGISTER_KERNEL(depthwise_conv2d, + CPU, + ALL_LAYOUT, + phi::DepthwiseConvKernel, + float, + double) {} + +PD_REGISTER_KERNEL(conv3d, CPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h new file mode 100644 index 0000000000000000000000000000000000000000..d26d89086b27e3db8ccfcf339c51c6a2fdf1988a --- /dev/null +++ b/paddle/phi/kernels/cpu/conv_util.h @@ -0,0 +1,91 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/ddim.h" + +namespace phi { + +template +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilation, + const std::string padding_algorithm, + const DDim data_dims, + const std::vector& strides, + const std::vector& ksize) { + // set padding size == data_dims.size() * 2 + auto data_shape = vectorize(data_dims); + if (static_cast(paddings->size()) == data_dims.size()) { + for (int i = 0; i < data_dims.size(); ++i) { + T copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + PADDLE_ENFORCE_EQ( + data_dims.size() * 2, + paddings->size(), + phi::errors::InvalidArgument( + "Attribute padding's size should be the same or twice as the " + "input's dimension. 
" + "But recieved: padding's size is %d, padding is [%s]; input's " + "dimension is %d, input's shape is [%s].", + paddings->size(), + make_ddim(*paddings), + data_dims.size(), + data_dims)); + } + + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (int i = 0; i < data_dims.size(); ++i) { + T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; + T pad_sum = + std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], + static_cast(0)); + T pad_0 = pad_sum / 2; + T pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + + // dilation + *(dilation->begin() + i) = 1; + } + + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +inline bool IsExpand(const std::vector& filter_dim, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + if (paddings.size() != strides.size()) { + for (size_t j = 0; j < paddings.size(); ++j) { + padding_0 = padding_0 && (paddings[j] == 0); + } + } + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + +} // namespace phi diff --git a/paddle/phi/kernels/depthwise_conv_grad_kernel.h b/paddle/phi/kernels/depthwise_conv_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b5eff76e90c472f6deda08ebb560b4763337ab53 --- /dev/null +++ b/paddle/phi/kernels/depthwise_conv_grad_kernel.h @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi {} // namespace phi diff --git a/paddle/phi/kernels/depthwise_conv_kernel.h b/paddle/phi/kernels/depthwise_conv_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b5eff76e90c472f6deda08ebb560b4763337ab53 --- /dev/null +++ b/paddle/phi/kernels/depthwise_conv_kernel.h @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi {} // namespace phi diff --git a/paddle/phi/kernels/funcs/batch_norm_utils.h b/paddle/phi/kernels/funcs/batch_norm_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..21ebae8487ffc3588034a8ea5feeab8ac1c47fa8 --- /dev/null +++ b/paddle/phi/kernels/funcs/batch_norm_utils.h @@ -0,0 +1,143 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using Tensor = DenseTensor; + +template +inline void ResizeToChannelFirst(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } +} + +template +inline void ResizeToChannelLast(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[4]; + in_dims_vec[4] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + 
transformed_input->mutable_data<T>(context.GetPlace());
+  }
+}
+
+template <typename DeviceContext, typename T>
+inline void TransToChannelFirst(const DeviceContext& context,
+                                const Tensor* input,
+                                Tensor* transformed_input) {
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    std::vector<int> axis{0, 4, 1, 2, 3};
+    phi::funcs::Transpose<DeviceContext, T, 5> trans5;
+    trans5(context, *input, transformed_input, axis);
+
+  } else if (dim == 2) {
+    std::vector<int> axis{0, 3, 1, 2};
+    phi::funcs::Transpose<DeviceContext, T, 4> trans4;
+    trans4(context, *input, transformed_input, axis);
+  } else if (dim == 1) {
+    std::vector<int> axis{0, 2, 1};
+    phi::funcs::Transpose<DeviceContext, T, 3> trans3;
+    trans3(context, *input, transformed_input, axis);
+  }
+}
+
+template <typename DeviceContext, typename T>
+inline void TransToChannelLast(const DeviceContext& context,
+                               const Tensor* input,
+                               Tensor* transformed_input) {
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    std::vector<int> axis{0, 2, 3, 4, 1};
+    phi::funcs::Transpose<DeviceContext, T, 5> trans5;
+    trans5(context, *input, transformed_input, axis);
+
+  } else if (dim == 2) {
+    std::vector<int> axis{0, 2, 3, 1};
+    phi::funcs::Transpose<DeviceContext, T, 4> trans4;
+    trans4(context, *input, transformed_input, axis);
+  } else if (dim == 1) {
+    std::vector<int> axis{0, 2, 1};
+    phi::funcs::Transpose<DeviceContext, T, 3> trans3;
+    trans3(context, *input, transformed_input, axis);
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/padding.h b/paddle/phi/kernels/funcs/padding.h
index 6d10ff2dfcf39c6b57084e99eb31fc1d888f5f75..e2c4e766b605b570463da12c39c456923c439916 100644
--- a/paddle/phi/kernels/funcs/padding.h
+++ b/paddle/phi/kernels/funcs/padding.h
@@ -15,10 +15,10 @@ limitations under the License. */
 #pragma once
 #include <utility>
 #include <vector>
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 
 namespace phi {
 namespace funcs {
diff --git a/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6449a193a082e5d17926de9252a79e4c069be224
--- /dev/null
+++ b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
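The Resize/Trans helper pairs above are meant to be used together when a kernel receives NHWC data but computes in NCHW. A sketch of the assumed usage (the wrapper name is illustrative):

// Sketch: reserve a channel-first buffer of the right shape, then
// physically permute the values into it.
template <typename Context, typename T>
void ToChannelFirst(const Context& ctx, const phi::DenseTensor& nhwc_input,
                    phi::DenseTensor* nchw_buffer) {
  // Allocates nchw_buffer with dims permuted to channel-first.
  phi::ResizeToChannelFirst<Context, T>(ctx, &nhwc_input, nchw_buffer);
  // Transposes the data (e.g. axis {0, 3, 1, 2} for a 4-D tensor).
  phi::TransToChannelFirst<Context, T>(ctx, &nhwc_input, nchw_buffer);
}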
+
+#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
+}
diff --git a/paddle/phi/kernels/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4df7bb26adf845b4a8f52f1c92beb8621002b3da
--- /dev/null
+++ b/paddle/phi/kernels/gpu/conv_grad_kernel.cu
@@ -0,0 +1,62 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void Conv3DGradKernel(const Context& dev_ctx,
+                      const DenseTensor& out_grad,
+                      const DenseTensor& input,
+                      const DenseTensor& filter,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings,
+                      const std::string& padding_algorithm,
+                      int groups,
+                      const std::vector<int>& dilations,
+                      const std::string& data_format,
+                      bool use_addto,
+                      int workspace_size_MB,
+                      bool exhaustive_search,
+                      DenseTensor* input_grad,
+                      DenseTensor* filter_grad) {
+  ConvGradKernel<T>(dev_ctx,
+                    out_grad,
+                    input,
+                    filter,
+                    strides,
+                    paddings,
+                    padding_algorithm,
+                    groups,
+                    dilations,
+                    data_format,
+                    use_addto,
+                    workspace_size_MB,
+                    exhaustive_search,
+                    input_grad,
+                    filter_grad);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    conv2d_grad, GPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {}
+
+PD_REGISTER_KERNEL(
+    conv3d_grad, GPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/conv_kernel.cu b/paddle/phi/kernels/gpu/conv_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..680ee4426af0661d39d4c7bd0abf9e52c4594995
--- /dev/null
+++ b/paddle/phi/kernels/gpu/conv_kernel.cu
@@ -0,0 +1,56 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
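The CPU and GPU registrations above bind the same ConvGradKernel template body; only the Context parameter differs. An explicit instantiation would spell the full signature out as follows (sketch; it assumes the impl header is visible at the point of instantiation):

// Sketch: explicit instantiation showing the full parameter list once.
template void phi::ConvGradKernel<float, phi::CPUContext>(
    const phi::CPUContext&, const phi::DenseTensor&, const phi::DenseTensor&,
    const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&,
    const std::string&, int, const std::vector<int>&, const std::string&,
    bool, int, bool, phi::DenseTensor*, phi::DenseTensor*);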
+ +#include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/impl/conv_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void Conv3DKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out) { + ConvKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d, GPU, ALL_LAYOUT, phi::ConvKernel, float, double) {} + +PD_REGISTER_KERNEL(conv3d, GPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/conv_test_kernel.cu b/paddle/phi/kernels/gpu/conv_test_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0544a1e298b8e7dc871d13f546398a5c28308b0e --- /dev/null +++ b/paddle/phi/kernels/gpu/conv_test_kernel.cu @@ -0,0 +1,13 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/phi/kernels/gpu/depthwise_conv.h similarity index 62% rename from paddle/fluid/operators/math/depthwise_conv.cu rename to paddle/phi/kernels/gpu/depthwise_conv.h index a4665a8f9a62dde6bfdbad3b05d7065e05f0a92f..5270a4b2fdb8d77aa1dfb20a166a9676b007c93f 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#pragma once #include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/hostdevice.h" + #ifdef __NVCC__ #include #endif @@ -21,7 +25,7 @@ limitations under the License. 
*/ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/math/depthwise_conv.h" + #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -30,6 +34,58 @@ namespace paddle { namespace operators { namespace math { +using DataLayout = framework::DataLayout; + +/* + * \brief Compute the depthwise convolution which include + * forward process and backpropagation process + */ +template +class DepthwiseConvFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* output, + const DataLayout data_layout = DataLayout::kNCHW); +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* input_grad, + const DataLayout data_layout = DataLayout::kNCHW); +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* filter_grad, + const DataLayout data_layout = DataLayout::kNCHW); +}; + template static __forceinline__ __device__ T WarpReduceSum(T val, int warp_size) { typedef cub::WarpReduce WarpReduce; @@ -293,8 +349,12 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( } } -template +template __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { int final_filter_multiplier = filter_multiplier; int h_stride = stride_height; @@ -306,34 +366,88 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { } if (c_filter == -1) { if (data_layout != DataLayout::kNHWC) { - KernelDepthwiseConvNCHW( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, - output_data); + KernelDepthwiseConvNCHW(input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + output_data); } else { - KernelDepthwiseConvNHWC( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, - output_data); + KernelDepthwiseConvNHWC(input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + output_data); } } else { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvCFilterNCHW( 
- input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, output_data); } else { KernelDepthwiseConvCFilterNHWC( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, output_data); } } @@ -464,7 +578,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC( } } -template __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( ARG_DEFINE_KernelDepthwiseConvInputGrad) { @@ -525,7 +641,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( } } -template __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( ARG_DEFINE_KernelDepthwiseConvInputGrad) { @@ -595,8 +713,12 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( } } -template +template __global__ void KernelDepthwiseConvInputGradSp( ARG_DEFINE_KernelDepthwiseConvInputGrad) { int final_filter_multiplier = filter_multiplier; @@ -611,36 +733,100 @@ __global__ void KernelDepthwiseConvInputGradSp( if (c_filter_multiplier == 0 || c_filter == -1) { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvInputGradNCHW( - input_data, output_grad_data, filter_data, batch_size, - output_channels, output_height, output_width, input_channels, - input_height, input_width, final_filter_multiplier, filter_height, - filter_width, h_stride, w_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } else { KernelDepthwiseConvInputGradNHWC( - input_data, output_grad_data, filter_data, batch_size, - output_channels, output_height, output_width, input_channels, - input_height, input_width, final_filter_multiplier, filter_height, - filter_width, h_stride, w_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } } else { if (data_layout != DataLayout::kNHWC) { - KernelDepthwiseConvInputGradCFilterNCHW( - input_data, output_grad_data, filter_data, batch_size, - 
output_channels, output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, filter_height, - filter_width, c_stride, c_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + c_filter_multiplier, + filter_height, + filter_width, + c_stride, + c_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } else { - KernelDepthwiseConvInputGradCFilterNHWC( - input_data, output_grad_data, filter_data, batch_size, - output_channels, output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, filter_height, - filter_width, c_stride, c_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + c_filter_multiplier, + filter_height, + filter_width, + c_stride, + c_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } } } @@ -648,13 +834,25 @@ __global__ void KernelDepthwiseConvInputGradSp( // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. template __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { + const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { T s = 0; int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; @@ -697,13 +895,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( template __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { + const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const 
int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { int bid = blockIdx.z; int image_h = blockIdx.y; int kernel_iw = blockIdx.x % filter_width; @@ -743,13 +953,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( template __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { + const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { const int bid = blockIdx.z; int image_h = blockIdx.x * dilate_height + blockIdx.y; if (image_h >= output_height) { @@ -804,16 +1026,31 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( } } -template -__global__ void KernelDepthwiseConvFilterGradSp( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { +template +__global__ void KernelDepthwiseConvFilterGradSp(const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { int final_filter_multiplier = filter_multiplier; int h_stride = stride_height; int w_stride = stride_width; @@ -825,34 +1062,91 @@ __global__ void KernelDepthwiseConvFilterGradSp( if (c_filter_multiplier == 0 || c_filter == -1) { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvFilterGradNCHW( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } else { 
KernelDepthwiseConvFilterGradNHWC( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } } else { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvFilterGradNCHW( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } else { - KernelDepthwiseConvFilterGradCFilterNHWC( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } } @@ -864,15 +1158,15 @@ __global__ void KernelDepthwiseConvFilterGradSp( * height and width, respectively. 
*/ template -class DepthwiseConvFunctor { +class DepthwiseConvFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, framework::Tensor* output, + const std::vector& dilations, + framework::Tensor* output, const DataLayout data_layout = DataLayout::kNCHW) { const int batch_size = input.dims()[0]; const int input_channels = @@ -905,12 +1199,14 @@ class DepthwiseConvFunctor(context.GetPlace()); std::vector perm_axis({2, 3, 0, 1}); - phi::funcs::TransposeNormal trans; + phi::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); filter_data = filter_hwc.data(); } @@ -940,7 +1236,8 @@ class DepthwiseConvFunctor<<>>( \ - input_data, filter_data, batch_size, output_channels, output_height, \ - output_width, input_channels, input_height, input_width, \ - filter_multiplier, ksize_height, ksize_width, stride_height, \ - stride_width, padding_height, padding_width, dilate_height, \ - dilate_width, output_data); \ - } else { \ - KernelDepthwiseConvSp< \ - T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ - fuse_relu_before_conv><<>>( \ - input_data, filter_data, batch_size, output_channels, output_height, \ - output_width, input_channels, input_height, input_width, \ - filter_multiplier, ksize_height, ksize_width, stride_height, \ - stride_width, padding_height, padding_width, dilate_height, \ - dilate_width, output_data); \ - } \ - return; \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + if (c_filter == -1) { \ + threads.x = block_size; \ + grid.x = grid_size; \ + threads.y = threads.z = grid.y = grid.z = 1; \ + } \ + if (data_layout != DataLayout::kNHWC) { \ + KernelDepthwiseConvSp< \ + T, \ + c_filter_multiplier, \ + c_stride, \ + c_filter, \ + DataLayout::kNCHW, \ + fuse_relu_before_conv><<>>( \ + input_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + output_data); \ + } else { \ + KernelDepthwiseConvSp< \ + T, \ + c_filter_multiplier, \ + c_stride, \ + c_filter, \ + DataLayout::kNHWC, \ + fuse_relu_before_conv><<>>( \ + input_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + output_data); \ + } \ + return; \ } check_case(1, 1, 3); check_case(1, 1, 5); @@ -1004,10 +1337,9 @@ class DepthwiseConvFunctor -class DepthwiseConvInputGradFunctor { +class DepthwiseConvInputGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& filter, const framework::Tensor& output_grad, @@ -1048,12 +1380,14 @@ class 
DepthwiseConvInputGradFunctor(context.GetPlace()); std::vector perm_axis({2, 3, 0, 1}); - phi::funcs::TransposeNormal trans; + phi::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); filter_data = filter_hwc.data(); } @@ -1078,7 +1412,8 @@ class DepthwiseConvInputGradFunctor<<>>( \ - input_data, output_grad_data, filter_data, batch_size, \ - output_channels, output_height, output_width, input_channels, \ - input_height, input_width, filter_multiplier, ksize_height, \ - ksize_width, stride_height, stride_width, padding_height, \ - padding_width, dilate_height, dilate_width, input_grad_data); \ + input_data, \ + output_grad_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + input_grad_data); \ } else { \ KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ + T, \ + c_filter_multiplier, \ + c_stride, \ + c_filter, \ + DataLayout::kNHWC, \ fuse_relu_before_conv><<>>( \ - input_data, output_grad_data, filter_data, batch_size, \ - output_channels, output_height, output_width, input_channels, \ - input_height, input_width, filter_multiplier, ksize_height, \ - ksize_width, stride_height, stride_width, padding_height, \ - padding_width, dilate_height, dilate_width, input_grad_data); \ + input_data, \ + output_grad_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + input_grad_data); \ } \ return; \ } @@ -1129,10 +1502,11 @@ class DepthwiseConvInputGradFunctor -class DepthwiseConvFilterGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& output_grad, const std::vector& strides, @@ -1187,7 +1561,8 @@ class DepthwiseConvFilterGradFunctor<<>>( \ - output_grad_data, input_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, filter_grad_data); \ + output_grad_data, \ + input_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + filter_grad_data); \ } else { \ framework::Tensor filter_grad_hwc; \ if (c_filter != -1) { \ - framework::DDim filter_grad_hwc_dims( \ - {filter_grad->dims()[2], filter_grad->dims()[3], \ - filter_grad->dims()[0], filter_grad->dims()[1]}); \ + framework::DDim filter_grad_hwc_dims({filter_grad->dims()[2], \ + filter_grad->dims()[3], \ + filter_grad->dims()[0], \ + filter_grad->dims()[1]}); \ filter_grad_hwc.Resize(filter_grad_hwc_dims); \ filter_grad_hwc.mutable_data(context.GetPlace()); \ - phi::funcs::SetConstant set_zero; \ + phi::funcs::SetConstant set_zero; \ 
set_zero(context, &filter_grad_hwc, static_cast(0)); \ filter_grad_data = filter_grad_hwc.data(); \ } else { \ @@ -1231,16 +1625,34 @@ class DepthwiseConvFilterGradFunctor<<>>( \ - output_grad_data, input_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, filter_grad_data); \ + output_grad_data, \ + input_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + filter_grad_data); \ if (c_filter != -1) { \ std::vector perm_axis({2, 3, 0, 1}); \ - phi::funcs::TransposeNormal trans; \ + phi::funcs::TransposeNormal trans; \ trans(context, filter_grad_hwc, filter_grad, perm_axis); \ } \ } \ @@ -1263,31 +1675,23 @@ class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; -template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFunctor; -template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; -template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..4f27b6fde99ffa652c734d363ad7731f75b495f4 --- /dev/null +++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu @@ -0,0 +1,142 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
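[Editor's note] The rest of this new file is a thin phi wrapper: it normalizes paddings/dilations, then dispatches to the existing fluid math functors, turning the runtime `fuse_relu` flag into a compile-time template argument. A minimal sketch of that dispatch pattern; `GPUContext`, `InputGradFunctor`, and `RunInputGrad` here are simplified illustrative stand-ins, not the real Paddle types:

    #include <cstdio>

    struct GPUContext {};  // stand-in for phi::GPUContext

    // Stand-in for DepthwiseConvInputGradFunctor: the bool template
    // parameter selects the ReLU-fused kernel variant at compile time.
    template <typename T, bool fuse_relu>
    struct InputGradFunctor {
      void operator()(const GPUContext&) const {
        std::printf("launch kernel, fuse_relu = %d\n",
                    static_cast<int>(fuse_relu));
      }
    };

    // Runtime flag -> compile-time specialization, as in the kernel below.
    template <typename T>
    void RunInputGrad(const GPUContext& ctx, bool fuse_relu) {
      if (fuse_relu) {
        InputGradFunctor<T, true>()(ctx);
      } else {
        InputGradFunctor<T, false>()(ctx);
      }
    }

    int main() {
      GPUContext ctx;
      RunInputGrad<float>(ctx, true);   // prints: launch kernel, fuse_relu = 1
      RunInputGrad<float>(ctx, false);  // prints: launch kernel, fuse_relu = 0
      return 0;
    }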
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/depthwise_conv.h" + +namespace phi { + +template +void DepthwiseConvGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + const DenseTensor* output_grad = &out_grad; + + if (!input_grad && !filter_grad) return; + + std::vector strides = strides_t; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + // update padding and dilation + auto in_dims = input.dims(); + auto filter_dims = filter.dims(); + + DDim in_data_dims; + const paddle::framework::DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format); + if (data_layout != paddle::framework::DataLayout::kNHWC) { + in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + } + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true; + if (!is_sys_pad) { + for (size_t i = 0; i < strides.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + phi::funcs::SetConstant set_zero; + + if (input_grad) { + input_grad->mutable_data(dev_ctx.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + + if (fuse_relu) { + paddle::operators::math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, + input, + filter, + *output_grad, + strides, + paddings, + dilations, + input_grad, + data_layout); + } else { + paddle::operators::math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, + input, + filter, + *output_grad, + strides, + paddings, + dilations, + input_grad, + data_layout); + } + } + + if (filter_grad) { + filter_grad->mutable_data(dev_ctx.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + if (fuse_relu) { + paddle::operators::math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, + input, + *output_grad, + strides, + paddings, + dilations, + filter_grad, + data_layout); + } else { + paddle::operators::math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, + input, + *output_grad, + strides, + paddings, + dilations, + filter_grad, + data_layout); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(depthwise_conv2d_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c50ceae33fc790a763e02bed9b6bed4879b2c547 --- /dev/null +++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu @@ 
-0,0 +1,130 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/operators/conv_op.h" + +#include "paddle/phi/kernels/gpu/depthwise_conv.h" + +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +namespace phi { + +template +void DepthwiseConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* out) { + DenseTensor* output = out; + output->mutable_data(dev_ctx.GetPlace()); + + const std::vector strides = strides_t; + std::vector dilations = dilations_t; + std::vector paddings = paddings_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + if (channel_last) { + PADDLE_ENFORCE_EQ( + output->dims()[output->dims().size() - 1] % + input.dims()[input.dims().size() - 1], + 0, + phi::errors::InvalidArgument( + "ShapeError: The output channels must be a multiple of the " + "input channels. But receivced output channel number is %d " + "and input channel number is %d", + output->dims()[output->dims().size() - 1], + input.dims()[input.dims().size() - 1])); + } else { + PADDLE_ENFORCE_EQ( + output->dims()[1] % input.dims()[1], + 0, + phi::errors::InvalidArgument( + "ShapeError: The output channels must be a multiple of the " + "input channels. But receivced output channel number is %d " + "and input channel number is %d", + output->dims()[1], + input.dims()[1])); + } + + // update padding and dilation + auto in_dims = input.dims(); + auto filter_dims = filter.dims(); + + DDim in_data_dims; + const paddle::framework::DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format); + if (data_layout != paddle::framework::DataLayout::kNHWC) { + in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + } + + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + bool is_sys_pad = strides.size() * 2 == paddings.size() ? 
false : true; + if (!is_sys_pad) { + for (size_t i = 0; i < strides.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + + if (fuse_relu) { + paddle::operators::math::DepthwiseConvFunctor + depthwiseConv; + depthwiseConv(dev_ctx, + input, + filter, + strides, + paddings, + dilations, + output, + data_layout); + } else { + paddle::operators::math::DepthwiseConvFunctor + depthwiseConv; + depthwiseConv(dev_ctx, + input, + filter, + strides, + paddings, + dilations, + output, + data_layout); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(depthwise_conv2d, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu new file mode 100644 index 0000000000000000000000000000000000000000..b4a6fe337c8d21e37beb0d6e5219e1a5edf1f9e8 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu @@ -0,0 +1,834 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/conv_grad_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/eigen.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#endif + +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/funcs/padding.h" + +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConvCudnnGradGradKernel( + const Context& ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + auto X = &input; + auto W = &filter; + auto dO = &out_grad; + auto ddX = input_grad_grad.get_ptr(); + auto ddW = filter_grad_grad.get_ptr(); + + auto ddO = out_grad_grad; + auto dW = filter_grad; + auto dX = input_grad; + if (ddO) { + ddO->mutable_data(ctx.GetPlace()); + phi::funcs::SetConstant set_zero; + set_zero(ctx, ddO, static_cast(0)); + } + if (dW) { + dW->mutable_data(ctx.GetPlace()); + } + if (dX) { + dX->mutable_data(ctx.GetPlace()); + } + + // 
const T* x = X->data(); + const T* dy = dO->data(); + const T* w = W->data(); + + const T* ddx = nullptr; + const T* ddw = nullptr; + T *dw, *dx, *ddy; + dw = dx = ddy = nullptr; + T* transformed_dx = nullptr; + std::vector dilations = dilations_t; + + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool deterministic = FLAGS_cudnn_deterministic; + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, + false, + phi::errors::InvalidArgument( + "Cann't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + + std::vector paddings = paddings_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform Tensors to channel first----------- + DenseTensor transformed_X_channel(X->type()); + DenseTensor transformed_dO_channel(dO->type()); + DenseTensor transformed_ddX_channel(X->type()); + + DenseTensor transformed_ddO_channel(dO->type()); + DenseTensor transformed_dX_channel(X->type()); + + if (channel_last) { + ResizeToChannelFirst(ctx, X, &transformed_X_channel); + TransToChannelFirst(ctx, X, &transformed_X_channel); + + ResizeToChannelFirst(ctx, dO, &transformed_dO_channel); + TransToChannelFirst(ctx, dO, &transformed_dO_channel); + + if (ddX) { + ResizeToChannelFirst(ctx, ddX, &transformed_ddX_channel); + TransToChannelFirst(ctx, ddX, &transformed_ddX_channel); + } + + if (ddO) { + ResizeToChannelFirst(ctx, ddO, &transformed_ddO_channel); + } + if (dX) { + ResizeToChannelFirst(ctx, dX, &transformed_dX_channel); + transformed_dX_channel.mutable_data(ctx.GetPlace()); + } + + } else { + transformed_X_channel = *X; + transformed_dO_channel = *dO; + if (ddX) { + transformed_ddX_channel = *ddX; + } + if (ddO) { + transformed_ddO_channel.ShareDataWith(*ddO); + } + if (dX) { + transformed_dX_channel.ShareDataWith(*dX); + } + } + + auto in_dims = transformed_X_channel.dims(); + auto filter_dims = W->dims(); + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); + DenseTensor transformed_X(X->type()); + DenseTensor transformed_ddX(X->type()); + + DenseTensor transformed_dX(X->type()); + + std::vector padding_common(data_dim, 0); + std::vector input_pad(X->dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + new_input_shape_vec[0] = transformed_X_channel.dims()[0]; + new_input_shape_vec[1] = transformed_X_channel.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); + padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); + new_input_shape_vec[i + 2] = + transformed_X_channel.dims()[i + 2] + padding_diff[i]; + input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + } + DDim new_input_shape(make_ddim(new_input_shape_vec)); + transformed_X.Resize(new_input_shape); + transformed_ddX.Resize(new_input_shape); + transformed_dX.Resize(new_input_shape); + + transformed_X.mutable_data(ctx.GetPlace()); + + if (ddX) { + 
transformed_ddX.mutable_data(ctx.GetPlace()); + } + if (dX) { + transformed_dX.mutable_data(ctx.GetPlace()); + } + + // pad for input + const int rank = X->dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + if (ddX) { + funcs::PadFunction(ctx, + input_pad, + transformed_ddX_channel, + pad_value, + &transformed_ddX); + } + } break; + case 5: { + funcs::PadFunction( + ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + if (ddX) { + funcs::PadFunction(ctx, + input_pad, + transformed_ddX_channel, + pad_value, + &transformed_ddX); + } + } break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + + } else { + transformed_X.ShareDataWith(transformed_X_channel); + if (ddX) { + transformed_ddX.ShareDataWith(transformed_ddX_channel); + } + if (dX) { + transformed_dX.ShareDataWith(transformed_dX_channel); + } + + if (paddings.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[2 * i]; + } + } + } + + const T* x = transformed_X.data(); + + int iwo_group = groups; + int c_group = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_group = 1; + c_group = groups; + groups = 1; +#endif + auto dtype = paddle::platform::CudnnDataType::type; + + auto handle = ctx.cudnn_handle(); + + paddle::operators::ConvArgs args1{&transformed_ddX, + W, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args2{&transformed_X, + ddW, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args3{&transformed_ddX, + dW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args4{&transformed_dX, + ddW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype}; + +#ifdef PADDLE_WITH_HIP + miopenConvFwdAlgorithm_t fwd_algo1 = static_cast(0); + miopenConvFwdAlgorithm_t fwd_algo2 = static_cast(0); + miopenConvBwdDataAlgorithm_t data_algo = + static_cast(0); + miopenConvBwdWeightsAlgorithm_t filter_algo = + static_cast(0); +#else + cudnnConvolutionFwdAlgo_t fwd_algo1 = + static_cast(0); + cudnnConvolutionFwdAlgo_t fwd_algo2 = + static_cast(0); + cudnnConvolutionBwdDataAlgo_t data_algo = + static_cast(0); + cudnnConvolutionBwdFilterAlgo_t filter_algo = + static_cast(0); +#endif + + auto layout = paddle::platform::GetCudnnTensorFormat( + paddle::platform::DataLayout::kNCHW); + + // ddo = conv(ddI, W) + conv(I, ddW) + size_t workspace_size = 0; + + T* transformed_ddy_channel = nullptr; + if (ddO) { + ddy = ddO->data(); + transformed_ddy_channel = transformed_ddO_channel.data(); + if (ddX) { + args1.handle = handle; + args1.idesc.set(transformed_ddX, iwo_group); + args1.wdesc.set(*W, layout, iwo_group); + args1.odesc.set(transformed_ddO_channel, iwo_group); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search1 = + paddle::operators::SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + fwd_algo1 = search1::Find( + args1, exhaustive_search, false, workspace_size, ctx); +#else + using search1 = + paddle::operators::SearchAlgorithm; + fwd_algo1 = search1::Find(args1, exhaustive_search, false, 
ctx); + workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); +#endif + } + + if (ddW) { + ddw = ddW->data(); + args2.handle = handle; + args2.idesc.set(transformed_X, iwo_group); + args2.wdesc.set(*ddW, layout, iwo_group); + args2.odesc.set(transformed_ddO_channel, iwo_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search2 = + paddle::operators::SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + fwd_algo2 = search2::Find( + args2, exhaustive_search, false, workspace_size, ctx); +#else + using search2 = + paddle::operators::SearchAlgorithm; + fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2)); +#endif + } + } + + if (dW && ddX) { + dw = dW->data(); + args3.handle = handle; + args3.idesc.set(transformed_ddX, iwo_group); + args3.wdesc.set(*dW, layout, iwo_group); + args3.odesc.set(transformed_dO_channel, iwo_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search3 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_algo = search3::Find( + args3, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search3 = + paddle::operators::SearchAlgorithm; + filter_algo = + search3::Find(args3, exhaustive_search, deterministic, ctx); + workspace_size = + std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); +#endif + } + + if (ddW && dX) { + transformed_dx = transformed_dX.data(); + + args4.handle = handle; + args4.idesc.set(transformed_dX, iwo_group); + args4.wdesc.set(*ddW, layout, iwo_group); + args4.odesc.set(transformed_dO_channel, iwo_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search4 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); + data_algo = search4::Find( + args4, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search4 = + paddle::operators::SearchAlgorithm; + data_algo = search4::Find(args4, exhaustive_search, deterministic, ctx); + workspace_size = + std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); +#endif + } + + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW( + transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); + + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(transformed_dO_channel.dims(), + DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = W->numel() / groups; + + paddle::operators::ScalingParamType alpha = 1.0f; + paddle::operators::ScalingParamType beta = 0.0f; + + // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. + // ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : + // 0.0f; + // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); + auto wkspace_handle = ctx.cudnn_workspace_handle(); + + if (ddO) { + if (ddX) { + ddx = transformed_ddX.data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args1.idesc.desc(), + ddx, + args1.wdesc.desc(), + w, + args1.cdesc.desc(), + fwd_algo1, + &beta, + args1.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args1.idesc.desc(), + ddx + i * group_offset_in, + args1.wdesc.desc(), + w + i * group_offset_filter, + args1.cdesc.desc(), + fwd_algo1, + workspace_ptr, + workspace_size, + &beta, + args1.odesc.desc(), + transformed_ddy_channel + i * group_offset_out)); + }, + workspace_size); + } +#endif + } + if (ddW) { +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args2.idesc.desc(), + x, + args2.wdesc.desc(), + ddw, + args2.cdesc.desc(), + fwd_algo2, + &beta, + args2.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args2.idesc.desc(), + x + i * group_offset_in, + args2.wdesc.desc(), + ddw + i * group_offset_filter, + args2.cdesc.desc(), + fwd_algo2, + workspace_ptr, + workspace_size, + &alpha, + args2.odesc.desc(), + transformed_ddy_channel + i * group_offset_out)); + }, + workspace_size); + } +#endif + } + if (channel_last) { + TransToChannelLast(ctx, &transformed_ddO_channel, ddO); + } + } + T* transformed_dy_channel = transformed_dO_channel.data(); + if (dW && ddX) { + ddx = transformed_ddX.data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args3.odesc.desc(), + transformed_dy_channel, + args3.idesc.desc(), + ddx, + args3.cdesc.desc(), + filter_algo, + &beta, + args3.wdesc.desc(), + dw, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardFilter( + handle, + &alpha, + args3.idesc.desc(), + ddx + i * group_offset_in, + args3.odesc.desc(), + transformed_dy_channel + i * group_offset_out, + args3.cdesc.desc(), + filter_algo, + workspace_ptr, + workspace_size, + &beta, + args3.wdesc.desc(), + dw + i * group_offset_filter)); + }, + workspace_size); + } +#endif + } + + if (dX && ddW) { + ddw = ddW->data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args4.odesc.desc(), + transformed_dy_channel, + args4.wdesc.desc(), + ddw, + args4.cdesc.desc(), + data_algo, + &beta, + args4.idesc.desc(), + transformed_dx, + 
workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardData( + handle, + &alpha, + args4.wdesc.desc(), + ddw + i * group_offset_filter, + args4.odesc.desc(), + transformed_dy_channel + i * group_offset_out, + args4.cdesc.desc(), + data_algo, + workspace_ptr, + workspace_size, + &beta, + args4.idesc.desc(), + transformed_dx + i * group_offset_in)); + }, + workspace_size); + } +#endif + + if (!is_sys_pad) { + // reverse padded input + std::vector starts(X->dims().size(), 0); + std::vector axes(X->dims().size(), 0); + + for (size_t i = 0; i < X->dims().size(); ++i) { + starts[i] = input_pad[2 * i]; + axes[i] = i; + } + if (X->dims().size() == 4) { + paddle::operators::RemovePaddingSlice( + ctx, &transformed_dX, &transformed_dX_channel, starts, axes); + } else { + paddle::operators::RemovePaddingSlice( + ctx, &transformed_dX, &transformed_dX_channel, starts, axes); + } + } + if (channel_last) { + TransToChannelLast(ctx, &transformed_dX_channel, dX); + } + } +} + +template +void DepthwiseConvCudnnGradGradKernel( + const Context& ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + bool fuse_relu, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvCudnnGradGradKernel(ctx, + input_grad_grad, + filter_grad_grad, + out_grad, + input, + filter, + strides, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search_t, + out_grad_grad, + input_grad, + filter_grad); +} + +template +void Conv3DCudnnGradGradKernel( + const Context& ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvCudnnGradGradKernel(ctx, + input_grad_grad, + filter_grad_grad, + out_grad, + input, + filter, + strides, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search_t, + out_grad_grad, + input_grad, + filter_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvCudnnGradGradKernel, + float, + phi::dtype::float16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + 
phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +#else + +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +#endif + +#endif diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu new file mode 100644 index 0000000000000000000000000000000000000000..64148e902fdb2123aa3f81846999b5d90f356cd6 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu @@ -0,0 +1,683 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
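[Editor's note] The three registration blocks just above gate dtype coverage on the build: HIP/ROCm builds register float and float16 only; CUDA builds always register float, double, and float16, and add bfloat16 when cuDNN >= 8.1, since bfloat16 convolutions need that cuDNN release. A self-contained sketch of the idiom; `CUDNN_VERSION_ASSUMED` and the macro body are illustrative assumptions that mirror, not quote, Paddle's `CUDNN_VERSION_MIN`:

    #include <cstdio>

    // Pretend an installed cuDNN 8.1.0; in a real build this comes from cudnn.h.
    #define CUDNN_VERSION_ASSUMED 8100

    // Assumed shape of the check: encode (major, minor, patch) the same way
    // the cuDNN version number is encoded, then compare.
    #define CUDNN_VERSION_MIN(major, minor, patch) \
      (CUDNN_VERSION_ASSUMED >= ((major)*1000 + (minor)*100 + (patch)))

    int main() {
    #if CUDNN_VERSION_MIN(8, 1, 0)
      std::puts("register conv2d_grad_grad: float, double, float16, bfloat16");
    #else
      std::puts("register conv2d_grad_grad: float, double, float16");
    #endif
      return 0;
    }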
+ +#include "paddle/phi/kernels/conv_grad_kernel.h" + +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/eigen.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#endif + +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/funcs/padding.h" + +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" + +namespace phi { + +template +void ConvCudnnGradKernel(const Context& ctx, + const DenseTensor& output_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + } + if (filter_grad) { + filter_grad->mutable_data(ctx.GetPlace()); + } + + std::vector dilations = dilations_t; + std::vector strides = strides_t; + std::vector paddings = paddings_t; + + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool deterministic = FLAGS_cudnn_deterministic; + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, + false, + phi::errors::InvalidArgument( + "Cann't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + auto dtype = paddle::platform::CudnnDataType::type; + +#ifdef PADDLE_WITH_HIP + // HIP MIOPEN ONLY SUPPORT NCHW format + auto compute_format = paddle::platform::DataLayout::kNCHW; +#else + const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx); + auto compute_format = compute_in_nhwc && channel_last + ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; +#endif + VLOG(3) << "Compute ConvGradOp with cuDNN:" + << " data_format=" << data_format << " compute_format=" + << (compute_format == paddle::platform::DataLayout::kNHWC ? 
"NHWC" + : "NCHW"); + + // transform Tensor + DenseTensor transformed_input_channel(input.type()); + DenseTensor transformed_output_grad_channel(output_grad.type()); + DenseTensor transformed_input_grad_channel(input.type()); + DenseTensor transformed_filter_channel(filter.type()); + DenseTensor transformed_filter_grad_channel(filter.type()); + + if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { + VLOG(3) << "Transform input, output_grad, input_grad and tensor from " + "NHWC to NCHW."; + ResizeToChannelFirst(ctx, &input, &transformed_input_channel); + TransToChannelFirst(ctx, &input, &transformed_input_channel); + + ResizeToChannelFirst( + ctx, &output_grad, &transformed_output_grad_channel); + TransToChannelFirst( + ctx, &output_grad, &transformed_output_grad_channel); + + if (input_grad) { + ResizeToChannelFirst( + ctx, input_grad, &transformed_input_grad_channel); + // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy + // the data of input_grad to transformed_input_grad_channel. + if (use_addto) { + TransToChannelFirst( + ctx, input_grad, &transformed_input_grad_channel); + } + } + } else { + transformed_input_channel.ShareDataWith(input); + transformed_output_grad_channel.ShareDataWith(output_grad); + if (input_grad) { + transformed_input_grad_channel.ShareDataWith(*input_grad); + } + } + + if (compute_format == paddle::platform::DataLayout::kNHWC) { + VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC."; + ResizeToChannelLast(ctx, &filter, &transformed_filter_channel); + TransToChannelLast(ctx, &filter, &transformed_filter_channel); + + if (filter_grad) { + ResizeToChannelLast( + ctx, filter_grad, &transformed_filter_grad_channel); + } + } else { + transformed_filter_channel.ShareDataWith(filter); + if (filter_grad) { + transformed_filter_grad_channel.ShareDataWith(*filter_grad); + } + } + + // update paddings + auto in_dims = transformed_input_channel.dims(); + auto filter_dims = transformed_filter_channel.dims(); + DDim in_data_dims; + DDim filter_data_dims; + if (compute_format == paddle::platform::DataLayout::kNCHW) { + in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + } else { + in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1); + } + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + // cuDNN only supports padding the same amount on every dimension. + // So we create a new padded input tensor. 
+ int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); + Tensor transformed_input(input.type()); + Tensor transformed_input_grad(input.type()); + std::vector padding_common(data_dim, 0); + std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + new_input_shape_vec[0] = transformed_input_channel.dims()[0]; + if (compute_format == paddle::platform::DataLayout::kNCHW) { + new_input_shape_vec[1] = transformed_input_channel.dims()[1]; + } else { + new_input_shape_vec[data_dim + 1] = + transformed_input_channel.dims()[data_dim + 1]; + } + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); + padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); + if (compute_format == paddle::platform::DataLayout::kNCHW) { + new_input_shape_vec[i + 2] = + transformed_input_channel.dims()[i + 2] + padding_diff[i]; + } else { + new_input_shape_vec[i + 1] = + transformed_input_channel.dims()[i + 1] + padding_diff[i]; + } + if (compute_format == paddle::platform::DataLayout::kNCHW) { + input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + } else { + input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; + } + } + DDim new_input_shape(make_ddim(new_input_shape_vec)); + transformed_input.Resize(new_input_shape); + transformed_input.mutable_data(ctx.GetPlace()); + + transformed_input_grad.Resize(new_input_shape); + + if (input_grad) { + transformed_input_grad.mutable_data(ctx.GetPlace()); + } + // pad for input + const int rank = transformed_input_channel.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction(ctx, + input_pad, + transformed_input_channel, + pad_value, + &transformed_input); + } break; + case 5: { + funcs::PadFunction(ctx, + input_pad, + transformed_input_channel, + pad_value, + &transformed_input); + } break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + } else { + transformed_input.ShareDataWith(transformed_input_channel); + if (input_grad) { + transformed_input_grad.ShareDataWith(transformed_input_grad_channel); + } + if (paddings.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[2 * i]; + } + } + } + + const T* input_data = transformed_input.data(); + const T* output_grad_data = transformed_output_grad_channel.data(); + const T* filter_data = transformed_filter_channel.data(); + T* filter_grad_data = nullptr; + T* input_grad_data = nullptr; + T* transformed_input_grad_data = nullptr; + + paddle::operators::ConvArgs args1{&transformed_input_grad, + &transformed_filter_channel, + &transformed_output_grad_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args2{&transformed_input, + &transformed_filter_grad_channel, + &transformed_output_grad_channel, + strides, + padding_common, + dilations, + dtype}; + + auto handle = ctx.cudnn_handle(); + // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout + paddle::platform::DataLayout layout = + compute_format == paddle::platform::DataLayout::kNHWC 
+ ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; + if (transformed_input.dims().size() == 5) { + layout = compute_format == paddle::platform::DataLayout::kNHWC + ? paddle::platform::DataLayout::kNDHWC + : paddle::platform::DataLayout::kNCDHW; + } + auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); + auto workspace_handle = ctx.cudnn_workspace_handle(); + + int i_n, i_c, i_d, i_h, i_w; + int o_n, o_c, o_d, o_h, o_w; + if (compute_format == paddle::platform::DataLayout::kNHWC) { + paddle::operators::GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNHWC, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(), + paddle::platform::DataLayout::kNHWC, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } else { + paddle::operators::GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(), + paddle::platform::DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = transformed_filter_channel.numel() / groups; +// ------------------- cudnn backward algorithm --------------------- +#ifdef PADDLE_WITH_HIP + miopenConvBwdDataAlgorithm_t data_algo = + static_cast(0); + miopenConvBwdWeightsAlgorithm_t filter_algo = + static_cast(0); +#else + cudnnConvolutionBwdDataAlgo_t data_algo = + static_cast(0); + cudnnConvolutionBwdFilterAlgo_t filter_algo = + static_cast(0); +#endif + // input data workspace_size + size_t workspace_size_d = 0; + // weight workspace_size + size_t workspace_size_w = 0; + int iwo_groups = groups; + int c_groups = 1; + +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + + if (input_grad) { + // ------------------- cudnn descriptors --------------------- + input_grad_data = input_grad->data(); + transformed_input_grad_data = transformed_input_grad.data(); + + args1.handle = handle; + args1.idesc.set(transformed_input_grad, layout_tensor); + args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups); + args1.odesc.set(transformed_output_grad_channel, layout_tensor); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_groups); + +#ifdef PADDLE_WITH_HIP + using search1 = + paddle::operators::SearchAlgorithm; + workspace_size_d = + std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); + data_algo = search1::Find( + args1, exhaustive_search, deterministic, workspace_size_d, ctx); +#else + using search1 = + paddle::operators::SearchAlgorithm; + data_algo = search1::Find(args1, exhaustive_search, deterministic, ctx); + workspace_size_d = + std::max(workspace_size_d, search1::GetWorkspaceSize(args1, data_algo)); +#endif + } + + if (filter_grad) { + // ------------------- cudnn descriptors --------------------- + filter_grad_data = transformed_filter_grad_channel.data(); + args2.handle = handle; + args2.idesc.set(transformed_input, layout_tensor); + args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups); + args2.odesc.set(transformed_output_grad_channel, layout_tensor); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search2 = + 
paddle::operators::SearchAlgorithm; + workspace_size_w = + std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); + filter_algo = search2::Find( + args2, exhaustive_search, deterministic, workspace_size_w, ctx); +#else + using search2 = + paddle::operators::SearchAlgorithm; + filter_algo = + search2::Find(args2, exhaustive_search, deterministic, ctx); + workspace_size_w = std::max(workspace_size_w, + search2::GetWorkspaceSize(args2, filter_algo)); +#endif + } + + // ------------------- cudnn conv backward data --------------------- + paddle::operators::ScalingParamType alpha = 1.0f; +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + paddle::operators::ScalingParamType beta = 0.0f; +#else + paddle::operators::ScalingParamType beta = use_addto ? 1.0f : 0.0f; + +#endif + VLOG(4) << "Conv_grad: use_addto = " << use_addto; + + if (input_grad) { +// When beta is 0, it is unnecessary to reset input_grad. +// When beta is 1, the output cannot be reset since addt strategy used. +#ifdef PADDLE_WITH_HIP + if (use_addto) { + DenseTensor temp_tensor(transformed_input_grad.type()); + temp_tensor.Resize(transformed_input_grad.dims()); + T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + output_grad_data, + args1.wdesc.desc(), + filter_data, + args1.cdesc.desc(), + data_algo, + &beta, + args1.idesc.desc(), + temp_tensor_data, + cudnn_workspace_ptr, + workspace_size_d)); + }, + workspace_size_d); + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor( + handle, + miopenTensorOpAdd, + &alpha, + args1.idesc.desc(), + transformed_input_grad_data, + &alpha, + args1.idesc.desc(), + temp_tensor_data, + &beta, + args1.idesc.desc(), + transformed_input_grad_data)); + } else { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + output_grad_data, + args1.wdesc.desc(), + filter_data, + args1.cdesc.desc(), + data_algo, + &beta, + args1.idesc.desc(), + transformed_input_grad_data, + cudnn_workspace_ptr, + workspace_size_d)); + }, + workspace_size_d); + } + +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardData( + handle, + &alpha, + args1.wdesc.desc(), + filter_data + i * group_offset_filter, + args1.odesc.desc(), + output_grad_data + i * group_offset_out, + args1.cdesc.desc(), + data_algo, + cudnn_workspace_ptr, + workspace_size_d, + &beta, + args1.idesc.desc(), + transformed_input_grad_data + i * group_offset_in)); + }, + workspace_size_d); + } +#endif + if (!is_sys_pad) { + std::vector starts(transformed_input_channel.dims().size(), 0); + std::vector axes(transformed_input_channel.dims().size(), 0); + + for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) { + starts[i] = input_pad[2 * i]; + axes[i] = i; + } + + transformed_input_grad_channel.mutable_data(ctx.GetPlace()); + if (transformed_input_channel.dims().size() == 4) { + paddle::operators::RemovePaddingSlice( + ctx, + &transformed_input_grad, + &transformed_input_grad_channel, + starts, + axes); + } else { + paddle::operators::RemovePaddingSlice( + ctx, + &transformed_input_grad, + &transformed_input_grad_channel, + 
starts, + axes); + } + } + + if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { + TransToChannelLast( + ctx, &transformed_input_grad_channel, input_grad); + } + } + + // filter_grad do not use inplace addto. + paddle::operators::ScalingParamType beta_filter = 0.0f; + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { +// Because beta is zero, it is unnecessary to reset filter_grad. +#ifdef PADDLE_WITH_HIP + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args2.odesc.desc(), + output_grad_data, + args2.idesc.desc(), + input_data, + args2.cdesc.desc(), + filter_algo, + &beta, + args2.wdesc.desc(), + filter_grad_data, + cudnn_workspace_ptr, + workspace_size_w)); + }, + workspace_size_w); +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardFilter( + handle, + &alpha, + args2.idesc.desc(), + input_data + i * group_offset_in, + args2.odesc.desc(), + output_grad_data + i * group_offset_out, + args2.cdesc.desc(), + filter_algo, + cudnn_workspace_ptr, + workspace_size_w, + &beta_filter, + args2.wdesc.desc(), + filter_grad_data + i * group_offset_filter)); + }, + workspace_size_w); + } +#endif + + if (compute_format == paddle::platform::DataLayout::kNHWC) { + TransToChannelFirst( + ctx, &transformed_filter_grad_channel, filter_grad); + } + } +} + +template +void Conv3DCudnnGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvCudnnGradKernel(dev_ctx, + out_grad, + input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + input_grad, + filter_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(conv2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradKernel, + float, + phi::dtype::float16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(conv2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(conv3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(conv2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradKernel, + float, + double, + phi::dtype::float16) {} + +#endif + +#endif diff --git a/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu new file mode 100644 index 0000000000000000000000000000000000000000..931b6d68845e27297784603c2427178eae6b6f7d --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu @@ -0,0 +1,476 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/conv_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/framework/eigen.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/operators/conv_miopen_helper.h"
+#else
+#include "paddle/fluid/operators/conv_cudnn_helper.h"
+#endif
+
+#include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+
+#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvCudnnKernel(const Context& ctx,
+                     const DenseTensor& input,
+                     const DenseTensor& filter,
+                     const std::vector<int>& strides,
+                     const std::vector<int>& paddings_t,
+                     const std::string& padding_algorithm,
+                     int groups,
+                     const std::vector<int>& dilations_t,
+                     const std::string& data_format,
+                     bool use_addto,
+                     int workspace_size_MB,
+                     bool exhaustive_search_t,
+                     DenseTensor* output) {
+  output->mutable_data<T>(ctx.GetPlace());
+  std::vector<int> paddings = paddings_t;
+  std::vector<int> dilations = dilations_t;
+
+  bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
+  bool deterministic = FLAGS_cudnn_deterministic;
+  auto exhaustive_deterministic = exhaustive_search && deterministic;
+  PADDLE_ENFORCE_EQ(exhaustive_deterministic,
+                    false,
+                    phi::errors::InvalidArgument(
+                        "Can't set exhaustive_search True and "
+                        "FLAGS_cudnn_deterministic True at the same time."));
+
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+
+  auto dtype = paddle::platform::CudnnDataType<T>::type;
+
+#ifdef PADDLE_WITH_HIP
+  // HIP MIOPEN only supports the NCHW format
+  auto compute_format = paddle::platform::DataLayout::kNCHW;
+#else
+  // Tensor Cores, introduced with Volta GPUs, support faster conv ops
+  // with FP16 in the NHWC data format.
+  const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx);
+  // We will only do data format conversion from NHWC to NCHW;
+  // cuDNN converts NCHW to NHWC automatically on Tensor Cores.
+  auto compute_format = compute_in_nhwc && channel_last
+                            ? paddle::platform::DataLayout::kNHWC
+                            : paddle::platform::DataLayout::kNCHW;
+#endif
+  VLOG(3) << "Compute ConvOp with cuDNN:"
+          << " data_format=" << data_format << " compute_format="
+          << (compute_format == paddle::platform::DataLayout::kNHWC ? "NHWC"
+                                                                    : "NCHW");
+
+  // ------------ transformed tensor -----------
+  DenseTensor transformed_input_channel(input.type());
+  DenseTensor transformed_output(output->type());
+  DenseTensor transformed_filter_channel(filter.type());
+  T* output_data = nullptr;
+  if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
+    VLOG(3) << "Transform input tensor from NHWC to NCHW.";
+    ResizeToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
+    TransToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
+
+    ResizeToChannelFirst<Context, T>(ctx, output, &transformed_output);
+
+  } else {
+    transformed_input_channel.ShareDataWith(input);
+    transformed_output.ShareDataWith(*output);
+  }
+  if (compute_format == paddle::platform::DataLayout::kNHWC) {
+    VLOG(3) << "Transform filter tensor from NCHW to NHWC.";
+    ResizeToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
+    TransToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
+  } else {
+    transformed_filter_channel.ShareDataWith(filter);
+  }
+  output_data = transformed_output.data<T>();
+
+  // update padding and dilation
+  auto in_dims = transformed_input_channel.dims();
+  auto filter_dims = transformed_filter_channel.dims();
+  DDim in_data_dims;
+  DDim filter_data_dims;
+
+  if (compute_format == paddle::platform::DataLayout::kNCHW) {
+    in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
+    filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  } else {
+    in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
+    filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1);
+  }
+
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
+
+  int data_dim = strides.size();  // 2d or 3d
+  bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
+
+  DenseTensor transformed_input;
+  std::vector<int> padding_common(data_dim, 0);
+  if (!is_sys_pad) {
+    std::vector<int> padding_diff(data_dim);
+    std::vector<int> new_input_shape_vec(data_dim + 2);
+    new_input_shape_vec[0] = transformed_input_channel.dims()[0];
+
+    if (compute_format == paddle::platform::DataLayout::kNCHW) {
+      new_input_shape_vec[1] = transformed_input_channel.dims()[1];
+    } else {
+      new_input_shape_vec[data_dim + 1] =
+          transformed_input_channel.dims()[data_dim + 1];
+    }
+
+    std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
+    for (size_t i = 0; i < data_dim; ++i) {
+      padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
+      padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
+      if (compute_format == paddle::platform::DataLayout::kNCHW) {
+        new_input_shape_vec[i + 2] =
+            transformed_input_channel.dims()[i + 2] + padding_diff[i];
+      } else {
+        new_input_shape_vec[i + 1] =
+            transformed_input_channel.dims()[i + 1] + padding_diff[i];
+      }
+      if (compute_format == paddle::platform::DataLayout::kNCHW) {
+        input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
+        input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
+      } else {
+        input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
+        input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
+      }
+    }
+    DDim new_input_shape(make_ddim(new_input_shape_vec));
+    transformed_input.Resize(new_input_shape);
+    transformed_input.mutable_data<T>(ctx.GetPlace());
+
+    const int rank = transformed_input_channel.dims().size();
+    T pad_value(0.0);
+    switch (rank) {
+      case 4: {
+        funcs::PadFunction<Context, T, 4>(ctx,
+                                          input_pad,
+                                          transformed_input_channel,
+                                          pad_value,
+                                          &transformed_input);
+      } break;
+      case 5: {
+        funcs::PadFunction<Context, T, 5>(ctx,
+                                          input_pad,
+                                          transformed_input_channel,
+                                          pad_value,
+                                          &transformed_input);
+      } break;
+      default:
+        PADDLE_THROW(phi::errors::InvalidArgument(
+            "ConvOp only supports tensors with 4 or 5 dimensions."));
+    }
+
+  } else {
+    transformed_input.ShareDataWith(transformed_input_channel);
+    if (paddings.size() == data_dim) {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings[i];
+      }
+    } else {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings[2 * i];
+      }
+    }
+  }
+
+  const T* input_data = transformed_input.data<T>();
+
+  const T* filter_data = transformed_filter_channel.data<T>();
+
+  // ------------------- cudnn descriptors ---------------------
+  paddle::operators::ConvArgs args{&transformed_input,
+                                   &transformed_filter_channel,
+                                   &transformed_output,
+                                   strides,
+                                   padding_common,
+                                   dilations,
+                                   dtype};
+
+  auto handle = ctx.cudnn_handle();
+  auto workspace_handle = ctx.cudnn_workspace_handle();
+  paddle::platform::DataLayout layout =
+      compute_format == paddle::platform::DataLayout::kNHWC
+          ? paddle::platform::DataLayout::kNHWC
+          : paddle::platform::DataLayout::kNCHW;
+  if (transformed_input.dims().size() == 5) {
+    layout = compute_format == paddle::platform::DataLayout::kNHWC
+                 ? paddle::platform::DataLayout::kNDHWC
+                 : paddle::platform::DataLayout::kNCDHW;
+  }
+  auto layout_format = paddle::platform::GetCudnnTensorFormat(layout);
+
+  args.handle = handle;
+
+#ifdef PADDLE_WITH_HIP
+  // MIOPEN needs groups to be set in cdesc; see miopen_desc.h
+  args.cdesc.set(dtype,
+                 padding_common,
+                 strides,
+                 dilations,
+                 paddle::platform::AllowTF32Cudnn(),
+                 groups);
+#else
+  args.cdesc.set(dtype,
+                 padding_common,
+                 strides,
+                 dilations,
+                 paddle::platform::AllowTF32Cudnn());
+#endif
+
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
+  // cuDNN 7 supports groups natively, so there is no need to handle them
+  // manually.
+  // FIXME(typhoonzero): find a better way to disable groups
+  // rather than setting it to 1.
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      paddle::platform::dynload::cudnnSetConvolutionGroupCount(
+          args.cdesc.desc(), groups));
+  groups = 1;
+#endif
+#ifdef PADDLE_WITH_HIP
+  // MIOPEN does not set groups in wdesc after groups are set in cdesc
+  groups = 1;
+#endif
+  args.idesc.set(transformed_input, layout_format);
+  args.wdesc.set(transformed_filter_channel, layout_format, groups);
+  args.odesc.set(transformed_output, layout_format);
+  int i_n, i_c, i_d, i_h, i_w;
+  int o_n, o_c, o_d, o_h, o_w;
+
+  if (compute_format == paddle::platform::DataLayout::kNHWC) {
+    paddle::operators::GetNCDHW(transformed_input.dims(),
+                                paddle::platform::DataLayout::kNHWC,
+                                &i_n,
+                                &i_c,
+                                &i_d,
+                                &i_h,
+                                &i_w);
+    paddle::operators::GetNCDHW(transformed_output.dims(),
+                                paddle::platform::DataLayout::kNHWC,
+                                &o_n,
+                                &o_c,
+                                &o_d,
+                                &o_h,
+                                &o_w);
+  } else {
+    paddle::operators::GetNCDHW(transformed_input.dims(),
+                                paddle::platform::DataLayout::kNCHW,
+                                &i_n,
+                                &i_c,
+                                &i_d,
+                                &i_h,
+                                &i_w);
+    paddle::operators::GetNCDHW(transformed_output.dims(),
+                                paddle::platform::DataLayout::kNCHW,
+                                &o_n,
+                                &o_c,
+                                &o_d,
+                                &o_h,
+                                &o_w);
+  }
+
+  int group_offset_in = i_c / groups * i_h * i_w * i_d;
+  int group_offset_out = o_c / groups * o_h * o_w * o_d;
+  int group_offset_filter = transformed_filter_channel.numel() / groups;
+  // ------------------- cudnn conv workspace ---------------------
+  size_t workspace_size = 0;  // final workspace to allocate.
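+  // (On the CUDA path the final workspace size depends on the algorithm the
+  // search below selects; on the MIOPEN path it is queried up front.)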
+// ------------------- cudnn conv algorithm ---------------------
+#ifdef PADDLE_WITH_HIP
+  miopenConvFwdAlgorithm_t algo{};
+  using search = paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+  workspace_size = search::GetWorkspaceSize(args);
+  algo = search::Find<T>(
+      args, exhaustive_search, deterministic, workspace_size, ctx);
+#else
+  cudnnConvolutionFwdAlgo_t algo{};
+  using search =
+      paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
+  algo = search::Find<T>(args, exhaustive_search, deterministic, ctx);
+  workspace_size = search::GetWorkspaceSize(args, algo);
+#endif
+
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
+  // When groups > 1, SearchAlgorithm may pick
+  // CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, but that algorithm is
+  // unstable in forward computation, so manually fall back to
+  // CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM.
+  if (groups > 1) {
+    algo = static_cast<cudnnConvolutionFwdAlgo_t>(0);
+  }
+#endif
+
+  // ------------------- cudnn conv forward ---------------------
+  paddle::operators::ScalingParamType<T> alpha = 1.0f;
+  paddle::operators::ScalingParamType<T> beta = 0.0f;
+
+// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
+// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
+// VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
+
+#ifdef PADDLE_WITH_HIP
+  workspace_handle.RunFunc(
+      [&](void* workspace_ptr) {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            paddle::platform::dynload::miopenConvolutionForward(
+                handle,
+                &alpha,
+                args.idesc.desc(),
+                input_data,
+                args.wdesc.desc(),
+                filter_data,
+                args.cdesc.desc(),
+                algo,
+                &beta,
+                args.odesc.desc(),
+                output_data,
+                workspace_ptr,
+                workspace_size));
+      },
+      workspace_size);
+#else
+  for (int i = 0; i < groups; i++) {
+    workspace_handle.RunFunc(
+        [&](void* workspace_ptr) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              paddle::platform::dynload::cudnnConvolutionForward(
+                  handle,
+                  &alpha,
+                  args.idesc.desc(),
+                  input_data + i * group_offset_in,
+                  args.wdesc.desc(),
+                  filter_data + i * group_offset_filter,
+                  args.cdesc.desc(),
+                  algo,
+                  workspace_ptr,
+                  workspace_size,
+                  &beta,
+                  args.odesc.desc(),
+                  output_data + i * group_offset_out));
+        },
+        workspace_size);
+  }
+#endif
+
+  if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
+    TransToChannelLast<Context, T>(ctx, &transformed_output, output);
+  }
+}
+
+template <typename T, typename Context>
+void Conv3DCudnnKernel(const Context& dev_ctx,
+                       const DenseTensor& input,
+                       const DenseTensor& filter,
+                       const std::vector<int>& strides,
+                       const std::vector<int>& paddings,
+                       const std::string& padding_algorithm,
+                       int groups,
+                       const std::vector<int>& dilations,
+                       const std::string& data_format,
+                       bool use_addto,
+                       int workspace_size_MB,
+                       bool exhaustive_search,
+                       DenseTensor* out) {
+  ConvCudnnKernel<T>(dev_ctx,
+                     input,
+                     filter,
+                     strides,
+                     paddings,
+                     padding_algorithm,
+                     groups,
+                     dilations,
+                     data_format,
+                     use_addto,
+                     workspace_size_MB,
+                     exhaustive_search,
+                     out);
+}
+
+} // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(conv2d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnKernel,
+                   float,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(conv3d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
+#if CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(conv2d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(conv3d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
+PD_REGISTER_KERNEL(conv2d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(conv3d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#endif
+
+#endif
+
+// TODO: register bfloat16
diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..93bc5b64adc170901aeffeadfa64d6b5d7ea8c60
--- /dev/null
+++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/framework/eigen.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/operators/conv_miopen_helper.h"
+#else
+#include "paddle/fluid/operators/conv_cudnn_helper.h"
+#endif
+
+#include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+
+#include "paddle/fluid/platform/dynload/cudnn.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+
+DECLARE_bool(cudnn_deterministic);
+DECLARE_uint64(conv_workspace_size_limit);
+DECLARE_bool(cudnn_exhaustive_search);
+
+namespace phi {
+
+static inline bool IsVoltaOrLater(const phi::GPUContext& dev_ctx) {
+  return dev_ctx.GetComputeCapability() >= 70;
+}
+
+// inline cudnnTensorFormat_t GetCudnnTensorFormat(
+//     const phi::DataLayout& order) {  // Not used
+//   switch (order) {
+//     case phi::DataLayout::kNHWC:
+//       return CUDNN_TENSOR_NHWC;
+//     case phi::DataLayout::kNCHW:
+//       return CUDNN_TENSOR_NCHW;
+//     case phi::DataLayout::NCDHW:
+//       return CUDNN_TENSOR_NCHW;  // NOTE: cuDNN treats Nd tensors the same
+//     case phi::DataLayout::NDHWC:
+//       return CUDNN_TENSOR_NHWC;  // add, liyamei
+//     default:
+//       PADDLE_THROW(phi::errors::Unimplemented(
+//           "CUDNN has no equivalent dataLayout for input order."));
+//   }
+//   return CUDNN_TENSOR_NCHW;
+// }
+
+static inline void GetNCDHW(const DDim& dims,
+                            const phi::DataLayout& layout,
+                            int* N,
+                            int* C,
+                            int* D,
+                            int* H,
+                            int* W) {
+  *N = dims[0];
+  *C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
+  int i = layout == phi::DataLayout::kNCHW ?
0 : 1; + if (dims.size() == 5) { + *D = dims[2 - i]; + *H = dims[3 - i]; + *W = dims[4 - i]; + } else { + *D = 1; + *H = dims[2 - i]; + *W = dims[3 - i]; + } +} + +} // namespace phi + +// PD_REGISTER_KERNEL(convdnn, GPU, ALL_LAYOUT, phi::ConvKernel, float, double +// ) {} diff --git a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..fbcebf371a61bd3d652888b5eaad56185499726b --- /dev/null +++ b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h @@ -0,0 +1,330 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConvGradGradKernel(const Context& dev_ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + const DenseTensor* X = &input; + const DenseTensor* dY = &out_grad; + const DenseTensor* ddX = input_grad_grad.get_ptr(); + const DenseTensor* ddW_in = filter_grad_grad.get_ptr(); + + DenseTensor* ddY = out_grad_grad; + DenseTensor* dW = filter_grad; + DenseTensor* dX = input_grad; + DenseTensor W = filter; + + if (!ddY && !dW && !dX) return; + + const std::vector strides = strides_t; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform Tensor + DenseTensor transformed_X(X->type()); + DenseTensor transformed_dY(dY->type()); + DenseTensor transformed_ddX(X->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, X, &transformed_X); + TransToChannelFirst(dev_ctx, X, &transformed_X); + + ResizeToChannelFirst(dev_ctx, dY, &transformed_dY); + TransToChannelFirst(dev_ctx, dY, &transformed_dY); + + if (ddX) { + ResizeToChannelFirst(dev_ctx, ddX, &transformed_ddX); + TransToChannelFirst(dev_ctx, ddX, &transformed_ddX); + } + } else { + transformed_X = *X; + transformed_dY = *dY; + if (ddX) { + transformed_ddX = *ddX; + } + } + + // update padding and dilation + auto in_dims = transformed_X.dims(); + auto filter_dims = W.dims(); + + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = 
slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + const int batch_size = static_cast(transformed_X.dims()[0]); + std::vector filter_shape_vec(vectorize(W.dims())); + std::vector output_shape_vec(vectorize(transformed_dY.dims())); + + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + // col_shape [in_channel/group, kh, kw, oh, ow] + col_shape_vec[0] = transformed_X.dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; + } + DDim col_shape(make_ddim(col_shape_vec)); + // col_matrix_shape [in_channel/group * kh * kw, oh * ow] + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); + // input_shape [Cin, H, W] + DDim input_shape = + slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); + // filter_matrix_shape [Cout, Cin * kh * kw] + DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]}; + + W.Resize(filter_matrix_shape); + DDim output_matrix_shape = { + transformed_dY.dims()[1], + transformed_dY.numel() / + (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; + int in_step = static_cast(transformed_X.dims()[1]) / groups; + int out_step = static_cast(transformed_dY.dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + DenseTensor col; + DenseTensor col_matrix; + if (is_expand) { + col.Resize(col_shape); + col.mutable_data(dev_ctx.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + phi::funcs::SetConstant set_zero; + auto blas = phi::funcs::GetBlas(dev_ctx); + + // dx convolution double grad: gemm + col2im(col2vol) + // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, + // oH, oW) + if (dX && ddW_in) { + Tensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + dX->mutable_data(dev_ctx.GetPlace()); + + DenseTensor transformed_dX(dX->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, dX, &transformed_dX); + + } else { + transformed_dX = *dX; + } + // if is_expand is false, the operation of set_zero is unnecessary + // because math::matmul will reset dx + if (is_expand) { + set_zero(dev_ctx, &transformed_dX, static_cast(0)); + } + paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math:: + Col2ImFunctor + col2im; + + for (int i = 0; i < batch_size; i++) { + DenseTensor dy_batch = + transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); + DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col_matrix.ShareDataWith(dx_slice); + col_matrix.Resize(col_matrix_shape); + } + blas.MatMul( + ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, + col, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &dx_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); + } + } + } + if (channel_last) { + 
TransToChannelLast(dev_ctx, &transformed_dX, dX); + } + } + + // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, + // oH, oW) + // dw convolution double grad: im2col(vol2col) + gemm + if (dW && ddX) { + dW->mutable_data(dev_ctx.GetPlace()); + set_zero(dev_ctx, dW, static_cast(0)); + DenseTensor dW_arr = *dW; + dW_arr.Resize(filter_matrix_shape); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + DenseTensor dy_batch = + transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); + Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; ++g) { + // im2col + DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + ddx_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + } + + DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0)); + } + } + } + + // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), + // w/ddw(Cout, Cin, kh, kw) + // ddy convolution double grad: im2col(vol2col) + gemm + if (ddY) { + ddY->mutable_data(dev_ctx.GetPlace()); + + DenseTensor transformed_ddY(ddY->type()); + if (channel_last) { + ResizeToChannelFirst(dev_ctx, ddY, &transformed_ddY); + } else { + transformed_ddY = *ddY; + } + + set_zero(dev_ctx, &transformed_ddY, static_cast(0)); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + DenseTensor ddy_batch = + transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape); + for (int g = 0; g < groups; ++g) { + // gemm + DenseTensor ddy_slice = + ddy_batch.Slice(g * out_step, (g + 1) * out_step); + + if (ddX) { + DenseTensor ddx_batch = + transformed_ddX.Slice(i, i + 1).Resize(input_shape); + DenseTensor ddx_slice = + ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + ddx_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + } + DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0)); + } + + if (ddW_in) { + DenseTensor x_batch = + transformed_X.Slice(i, i + 1).Resize(input_shape); + DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); + + DenseTensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + if (!is_expand) { + col.ShareDataWith(x_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + x_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, x_slice, dilations, 
strides, paddings, &col); + } + + // gemm + DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0)); + } + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_ddY, ddY); + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..f1971aca800b59171a2e741dbebce6d8adaf7899 --- /dev/null +++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h @@ -0,0 +1,257 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/kernels/conv_grad_kernel.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConvGradKernel(const Context& dev_ctx, + const DenseTensor& output_grad, + const DenseTensor& input, + const DenseTensor& filter_t, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + // The filter and filter_grad will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. 
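+  // (The assignment DenseTensor filter = filter_t; below copies tensor
+  // metadata only; the underlying allocation is shared, so the copy is cheap.)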
+ + if (!input_grad && !filter_grad) return; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + DenseTensor filter = filter_t; + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + DenseTensor transformed_input(input.type()); + DenseTensor transformed_output_grad(output_grad.type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, &input, &transformed_input); + TransToChannelFirst(dev_ctx, &input, &transformed_input); + + ResizeToChannelFirst( + dev_ctx, &output_grad, &transformed_output_grad); + TransToChannelFirst( + dev_ctx, &output_grad, &transformed_output_grad); + } else { + transformed_input = input; + transformed_output_grad = output_grad; + } + + // update padding and dilation + auto in_dims = transformed_input.dims(); + auto filter_dims = filter.dims(); + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + const int batch_size = static_cast(transformed_input.dims()[0]); + + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(vectorize(filter.dims())); + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec( + vectorize(transformed_output_grad.dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, + // o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = transformed_input.dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + DDim col_shape(make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (i_c/g * k_h * k_w, o_h * o_w) + // or + // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); + + DDim input_shape = + slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size()); + + DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + DDim output_matrix_shape = { + transformed_output_grad.dims()[1], + transformed_output_grad.numel() / (transformed_output_grad.dims()[0] * + transformed_output_grad.dims()[1])}; + + // convolution backward input operator: gemm + col2im(or col2vol) + // convolution backward weight operator: im2col(or vol2col) + gemm + int in_step = static_cast(transformed_input.dims()[1]) / groups; + int out_step = static_cast(transformed_output_grad.dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + + DenseTensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. 
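+  // col has shape (i_c/g, k_h, k_w, o_h, o_w); flattened to 2-D it is
+  // (i_c/g * k_h * k_w, o_h * o_w): the gemm result that col2im/col2vol
+  // scatters back for input_grad, and the im2col output the filter_grad
+  // gemm consumes.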
+ DenseTensor col_matrix; + if (is_expand) { + col.Resize(col_shape); + col.mutable_data(dev_ctx.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + phi::funcs::SetConstant set_zero; + auto blas = phi::funcs::GetBlas(dev_ctx); + + if (input_grad) { + input_grad->mutable_data(dev_ctx.GetPlace()); + DenseTensor transformed_input_grad(input_grad->type()); + if (channel_last) { + ResizeToChannelFirst( + dev_ctx, input_grad, &transformed_input_grad); + + } else { + transformed_input_grad = *input_grad; + } + // if is_expand is false, the operation of set_zero is unnecessary, + // because math::matmul will reset input_grad. + if (is_expand) { + set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); + } + paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math:: + Col2ImFunctor + col2im; + + for (int i = 0; i < batch_size; i++) { + DenseTensor out_grad_batch = + transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); + DenseTensor in_grad_batch = + transformed_input_grad.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + DenseTensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor filter_slice = + filter.Slice(g * out_step, (g + 1) * out_step); + + DenseTensor in_grad_slice = + in_grad_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col_matrix.ShareDataWith(in_grad_slice); + col_matrix.Resize(col_matrix_shape); + } + blas.MatMul(filter_slice, + true, + out_grad_slice, + false, + T(1.0), + &col_matrix, + T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, + col, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &in_grad_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); + } + } + } + if (channel_last) { + TransToChannelLast( + dev_ctx, &transformed_input_grad, input_grad); + } + } + + if (filter_grad) { + filter_grad->mutable_data(dev_ctx.GetPlace()); + Tensor filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + set_zero(dev_ctx, filter_grad, static_cast(0)); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; i++) { + DenseTensor out_grad_batch = + transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); + DenseTensor in_batch = + transformed_input.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // im2col + DenseTensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + in_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + + } else if (data_dim == 3U) { + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + DenseTensor filter_grad_slice = + filter_grad_.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul(out_grad_slice, + false, + col_matrix, + true, + T(1.0), + &filter_grad_slice, + T(1.0)); + } + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/conv_kernel_impl.h b/paddle/phi/kernels/impl/conv_kernel_impl.h new file mode 100644 index 
0000000000000000000000000000000000000000..1945468f02551b8e348687ae578c9f23a038b8ca
--- /dev/null
+++ b/paddle/phi/kernels/impl/conv_kernel_impl.h
@@ -0,0 +1,183 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/phi/kernels/conv_kernel.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvKernel(const Context& dev_ctx,
+                const DenseTensor& input,
+                const DenseTensor& filter_t,
+                const std::vector<int>& strides,
+                const std::vector<int>& paddings_t,
+                const std::string& padding_algorithm,
+                int groups,
+                const std::vector<int>& dilations_t,
+                const std::string& data_format,
+                bool use_addto,
+                int workspace_size_MB,
+                bool exhaustive_search,
+                DenseTensor* output) {
+  std::vector<int> paddings = paddings_t;
+  std::vector<int> dilations = dilations_t;
+  DenseTensor filter = filter_t;
+  // The filter will be reshaped in the calculations, so we use an
+  // assignment here, which avoids modifying the variable in the Scope.
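+  // Overall structure below: im2col (vol2col for 3-D) unfolds each input
+  // patch into a column, then one gemm per group multiplies the reshaped
+  // filter by those columns to produce the corresponding output slice.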
+ output->mutable_data(dev_ctx.GetPlace()); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + DenseTensor transformed_input(input.type()); + DenseTensor transformed_output(output->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, &input, &transformed_input); + TransToChannelFirst(dev_ctx, &input, &transformed_input); + + ResizeToChannelFirst(dev_ctx, output, &transformed_output); + + } else { + transformed_input = input; + transformed_output = *output; + } + + // update padding and dilation + auto trans_in_dims = transformed_input.dims(); + auto filter_dims = filter.dims(); + + DDim in_data_dims = slice_ddim(trans_in_dims, 2, trans_in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + const int batch_size = static_cast(transformed_input.dims()[0]); + + // filter_shape_vec: + // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(vectorize(filter.dims())); + + // output_shape_vec: + // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec(vectorize(transformed_output.dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: + // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, + // o_d,o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = trans_in_dims[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + + DDim col_shape(make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: + // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * + // o_w) + + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim); + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + + DenseTensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. 
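+  // When is_expand is false (1x1 filter, unit stride, zero padding, unit
+  // dilation), im2col is the identity, so col/col_matrix below simply alias
+  // the current input slice instead of materializing columns.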
+ DenseTensor col_matrix; + if (is_expand) { + // col = context.AllocateTmpTensor(col_shape, dev_ctx); + col.Resize(col_shape); + col.mutable_data(dev_ctx.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + DDim in_matrix_shape = + slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size()); + + DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + DDim output_matrix_shape = { + transformed_output.dims()[1], + transformed_output.numel() / + (transformed_output.dims()[0] * transformed_output.dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(transformed_input.dims()[1]) / groups; + int out_step = static_cast(transformed_output.dims()[1]) / groups; + + paddle::operators::math::Vol2ColFunctor vol2col; + paddle::operators::math:: + Im2ColFunctor + im2col; + + auto blas = phi::funcs::GetBlas(dev_ctx); + for (int i = 0; i < batch_size; i++) { + DenseTensor in_batch = + transformed_input.Slice(i, i + 1).Resize(in_matrix_shape); + DenseTensor out_batch = + transformed_output.Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + in_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + + } else if (data_dim == 3U) { + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + DenseTensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + filter_slice, false, col_matrix, false, T(1.0), &out_slice, T(0.0)); + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_output, output); + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..a755fdb19ec4b86d4b5265c1d6bce5eecdb9b5b3 --- /dev/null +++ b/paddle/phi/ops/compat/conv2d_sig.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
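+//
+// Maps the legacy fluid conv2d op (Input/Filter variables, Output result and
+// the attribute list) onto the phi conv2d/conv2d_grad/conv2d_grad_grad
+// kernel signatures below.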
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d", + {"Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {"Output"}); +} + +KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_grad", + {GradVarName("Output"), "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {GradVarName("Input"), GradVarName("Filter")}); +} + +KernelSignature Conv2dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_grad_grad", + {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {"DDOutput", "DInput", "DFilter"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_grad, phi::Conv2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_grad_grad, + phi::Conv2dDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/conv3d_sig.cc b/paddle/phi/ops/compat/conv3d_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..a036afac82a8d49455b1a226e62f5fe757d4b4b9 --- /dev/null +++ b/paddle/phi/ops/compat/conv3d_sig.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
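+//
+// Same attribute list as conv2d_sig.cc; only the kernel names change to the
+// conv3d variants.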
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv3d",
+                         {"Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search"},
+                         {"Output"});
+}
+
+KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv3d_grad",
+                         {GradVarName("Output"), "Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search"},
+                         {GradVarName("Input"), GradVarName("Filter")});
+}
+
+KernelSignature Conv3dDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv3d_grad_grad",
+                         {"DDInput", "DDFilter", "DOutput", "Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search"},
+                         {"DDOutput", "DInput", "DFilter"});
+}
+
+} // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(conv3d, phi::Conv3dOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(conv3d_grad, phi::Conv3dGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(conv3d_grad_grad,
+                           phi::Conv3dDoubleGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/depthwise_conv2d_sig.cc b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2b6801f73bcdb0f20090e3ea7e75b7257bde4e3
--- /dev/null
+++ b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
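+//
+// Mirrors conv2d_sig.cc with one extra depthwise-only attribute,
+// fuse_relu_before_depthwise_conv, threaded through all three signatures.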
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DepthwiseConv2dOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("depthwise_conv2d", + {"Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search", + "fuse_relu_before_depthwise_conv"}, + {"Output"}); +} + +KernelSignature DepthwiseConv2dGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("depthwise_conv2d_grad", + {GradVarName("Output"), "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search", + "fuse_relu_before_depthwise_conv"}, + {GradVarName("Input"), GradVarName("Filter")}); +} + +KernelSignature DepthwiseConv2dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("depthwise_conv2d_grad_grad", + {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search", + "fuse_relu_before_depthwise_conv"}, + {"DDOutput", "DInput", "DFilter"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d, + phi::DepthwiseConv2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad, + phi::DepthwiseConv2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad_grad, + phi::DepthwiseConv2dDoubleGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py index dc460cb16f68c14df2cd7f7f087c602b945ffc7d..ca77177125fcdddf198e6783939bf84b4ccd9b0e 100644 --- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py @@ -230,4 +230,5 @@ def load_tests(loader, standard_tests, pattern): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py index f933d5bf7a48f14b0f4cb4f7ce274744f28c4c24..892fa649a6c5b31b54db8204acc76a7cc8794136 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py @@ -18,6 +18,7 @@ import paddle.fluid.dygraph as dg import paddle.nn.functional as F import paddle.fluid.initializer as I import unittest +import paddle def _reverse_repeat_list(t, n): @@ -284,4 +285,5 @@ def load_tests(loader, standard_tests, pattern): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 826f886dab1725e9e26a8826a2277a2d99f93fb6..6a9f7a47f66cce3879e813836745b3e609affd50 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -604,7 +604,7 @@ class TestWithInput1x1Filter1x1(TestConv2DOp): self.groups = 3 -#----------------Conv2DCUDNN---------------- +# #----------------Conv2DCUDNN---------------- create_test_cudnn_class(TestConv2DOp) create_test_cudnn_class(TestWithPad) diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 
5f23d04dde51cc21e66098ee6e37027bf82d7537..8cf779ccfdd4292f4cb6cbe74bf58b8ee7b37411 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -20,6 +20,7 @@ import numpy as np import paddle.fluid.core as core from op_test import OpTest import paddle.fluid as fluid +import paddle def conv3d_forward_naive(input, @@ -1001,4 +1002,5 @@ class TestConv3DAPI_Error(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py index 81c6aa1fd17d9ad16a1d24f32e5f55b3b71ca629..784d89b93f9859253a5722232954e7db1080afed 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers @@ -44,7 +45,6 @@ class TestConvDoubleGradCheck(unittest.TestCase): def test_grad(self): places = [fluid.CPUPlace()] - places = [] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) @@ -120,7 +120,8 @@ class TestConv3DDoubleGradCheck(unittest.TestCase): [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) def test_grad(self): - places = [fluid.CPUPlace()] + #places = [fluid.CPUPlace()] + places = [] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: @@ -503,4 +504,5 @@ class TestDepthWiseConvDoubleGradCheck(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py index cec48724da2fe08d0eefcdf0d2df3c54c9aa363d..8e0a744ecdbdabac1a248ed1f0a0d08934749e55 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py @@ -534,4 +534,5 @@ class TestFunctionalConv2DErrorCase13(TestFunctionalConv2DErrorCase12): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py index 8ccaf30cbdb34bcd215bd6b76431c7a6acfeaa3e..6c208160658820d57456e46f86b76659f4e4f80d 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py @@ -509,4 +509,5 @@ class TestFunctionalConv3DErrorCase12(TestFunctionalConv3DErrorCase11): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py index 9b739ebdfb23c680a86a54a3fa00398805ee8968..d391b04aa4772efbf7fadb7a9556aafd445197db 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py @@ -117,4 +117,5 @@ class TestMNIST(TestParallelExecutorBase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 
33f304ef33d67004585a01142d24b17bfd4908bc..0a08aa4ba12693e2216ae1b131ea41a5abaabd2a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -617,4 +617,5 @@ class TestStarGANWithGradientPenalty(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main()
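The test files above all gain paddle.enable_static() before unittest.main() because these suites exercise the static-graph path. A minimal static-mode sketch of the conv2d path this patch serves (shapes and names here are illustrative, not taken from the tests):

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.enable_static()  # run the static-graph path, as the tests above do

    main = paddle.static.Program()
    startup = paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        x = paddle.static.data(name='x', shape=[1, 3, 8, 8], dtype='float32')
        w = paddle.static.data(name='w', shape=[4, 3, 3, 3], dtype='float32')
        # Dispatches to the conv2d kernel registered in this patch.
        y = F.conv2d(x, w, stride=1, padding=1)

    place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \
        else paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    out, = exe.run(main,
                   feed={'x': np.ones([1, 3, 8, 8], np.float32),
                         'w': np.ones([4, 3, 3, 3], np.float32)},
                   fetch_list=[y])
    print(out.shape)  # (1, 4, 8, 8): 3x3 filter, stride 1, padding 1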