From 31f57f29ac676de7128415d9b9aa56df4880baaa Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Mon, 24 Oct 2022 10:10:54 +0800 Subject: [PATCH] Move the header file of conv cudnn and miopen to phi directory. (#47248) --- paddle/fluid/framework/var_type_traits.cc | 3 +- .../fluid/framework/var_type_traits_test.cc | 2 - .../fluid/operators/fused/conv_fusion_op.cu | 12 +- .../fused/fusion_conv_inception_op.cu | 6 +- .../kernels/gpudnn/conv_cudnn_v7.h} | 141 +++++------- .../kernels/gpudnn/conv_gpudnn_base.h} | 71 ++++-- .../kernels/gpudnn/conv_gpudnn_info.h} | 8 +- .../kernels/gpudnn/conv_grad_grad_kernel.cu | 126 +++++------ paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 148 ++++++------ paddle/phi/kernels/gpudnn/conv_kernel.cu | 91 ++++---- .../kernels/gpudnn}/conv_miopen_helper.h | 55 +---- .../gpudnn/conv_transpose_grad_kernel.cu | 214 ++++++++---------- .../kernels/gpudnn/conv_transpose_kernel.cu | 36 ++- paddle/phi/kernels/impl/conv_cudnn_impl.h | 4 +- 14 files changed, 415 insertions(+), 502 deletions(-) rename paddle/{fluid/operators/conv_cudnn_helper.h => phi/kernels/gpudnn/conv_cudnn_v7.h} (87%) rename paddle/{fluid/operators/conv_base_helper.h => phi/kernels/gpudnn/conv_gpudnn_base.h} (63%) rename paddle/{fluid/operators/conv_cudnn_op_cache.h => phi/kernels/gpudnn/conv_gpudnn_info.h} (90%) rename paddle/{fluid/operators => phi/kernels/gpudnn}/conv_miopen_helper.h (75%) diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 6331ee8861..2a53e2f885 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -26,15 +26,14 @@ #endif #include -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h" +#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h" #endif #ifdef PADDLE_WITH_HIP #if defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT #include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT #endif -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT #include "paddle/fluid/operators/miopen_rnn_cache.h" #endif diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 4a81f66948..63eb0bef5d 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -26,7 +26,6 @@ #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h" #endif #ifdef PADDLE_WITH_HIP @@ -34,7 +33,6 @@ #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT #include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT #endif -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT #include "paddle/fluid/operators/miopen_rnn_cache.h" #endif #if defined(PADDLE_WITH_XPU_BKCL) diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 5eee2c9332..a9b577e7f4 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -16,10 +16,10 @@ limitations under the License. */ #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/kernels/funcs/padding.h" +#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h" DECLARE_int64(cudnn_exhaustive_search_times); @@ -216,7 +216,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { cudnn_conv_desc, cudnn_output_desc, output_data, - kNUM_CUDNN_FWD_ALGS, + phi::kNUM_CUDNN_FWD_ALGS, &find_count, &find_result, cudnn_workspace_ptr, @@ -337,7 +337,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { int best_algo_idx = 0; size_t tmp_size = 0; std::unique_ptr perf_results( - new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); + new cudnnConvolutionFwdAlgoPerf_t[phi::kNUM_CUDNN_FWD_ALGS]); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( handle, @@ -345,7 +345,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, - kNUM_CUDNN_FWD_ALGS, + phi::kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo = (perf_results.get())[best_algo_idx].algo; @@ -378,7 +378,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { [&]() -> SearchFuseResult { int returned_algo_count; SearchFuseResult fwd_result; - std::array + std::array fwd_perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_GPU_SUCCESS( @@ -391,7 +391,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { cudnn_conv_desc, cudnn_output_desc, output_data, - kNUM_CUDNN_FWD_ALGS, + phi::kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(), cudnn_workspace, diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 9eee08600a..07cfb44a31 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h" namespace paddle { namespace operators { @@ -206,7 +206,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { int best_algo_idx = 0; size_t tmp_size = 0; std::unique_ptr perf_results( - new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); + new cudnnConvolutionFwdAlgoPerf_t[phi::kNUM_CUDNN_FWD_ALGS]); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( handle, @@ -214,7 +214,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { filter_desc[i], conv_desc[i], out_desc[i], - kNUM_CUDNN_FWD_ALGS, + phi::kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo[i] = (perf_results.get())[best_algo_idx].algo; diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h similarity index 87% rename from paddle/fluid/operators/conv_cudnn_helper.h rename to paddle/phi/kernels/gpudnn/conv_cudnn_v7.h index 0388665a15..7010365f0d 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h @@ -14,52 +14,15 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/operators/conv_base_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h" -namespace paddle { -namespace operators { +namespace phi { using ConvArgs = ConvArgsBase; -template -static void RemovePaddingSlice(const phi::GPUContext& context, - const phi::DenseTensor* input, - phi::DenseTensor* out, - const std::vector& starts, - const std::vector& axes) { - auto& place = *context.eigen_device(); - auto in_dims = input->dims(); - auto new_out_dims = out->dims(); - auto offsets = Eigen::DSizes(); - auto extents = Eigen::DSizes(); - for (size_t i = 0; i < D; ++i) { - offsets[i] = 0; - extents[i] = new_out_dims[i]; - } - - for (size_t i = 0; i < axes.size(); ++i) { - int start = starts[i]; - if (start < 0) { - start = (start + in_dims[axes[i]]); - } - start = std::max(start, 0); - offsets[axes[i]] = start; - } - - auto in_t = - phi::EigenTensor::From(*input); - auto out_t = phi::EigenTensor::From( - *out, new_out_dims); - - phi::funcs::EigenSlice, T, D>::Eval( - place, out_t, in_t, offsets, extents); -} - static inline double ToMegaBytes(size_t bytes) { return static_cast(bytes) / (1 << 20); } @@ -70,12 +33,12 @@ static inline bool UseFixedWorkspace() { static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { if (!use_fixed_workspace) { - int device_id = platform::GetCurrentDeviceId(); + int device_id = phi::backends::gpu::GetCurrentDeviceId(); int64_t allocated = - memory::DeviceMemoryStatCurrentValue("Allocated", device_id); + paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id); int64_t reserved = - memory::DeviceMemoryStatCurrentValue("Reserved", device_id); - int64_t availble = platform::GpuAvailableMemToAlloc(); + paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id); + int64_t availble = paddle::platform::GpuAvailableMemToAlloc(); VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) << " MB, reserved=" << ToMegaBytes(reserved) << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB."; @@ -164,14 +127,13 @@ struct SearchAlgorithmBase { cudnnConvolutionFwdAlgo_t algo) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( - args.handle, - args.idesc.desc(), - args.wdesc.desc(), - args.cdesc.desc(), - args.odesc.desc(), - algo, - &workspace_size)); + phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + &workspace_size)); return workspace_size; } @@ -193,7 +155,7 @@ struct SearchAlgorithmBase { int best_algo_idx = 0; std::vector perf_results(kNUM_CUDNN_FWD_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( + phi::dynload::cudnnGetConvolutionForwardAlgorithm_v7( args.handle, args.idesc.desc(), args.wdesc.desc(), @@ -220,7 +182,7 @@ struct SearchAlgorithmBase { << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm( + phi::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), @@ -233,7 +195,7 @@ struct SearchAlgorithmBase { } #else PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm( + phi::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), @@ -261,7 +223,7 @@ struct SearchAlgorithmBase { std::vector perf_results(kNUM_CUDNN_FWD_ALGS); auto cudnn_find_func = [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + phi::dynload::cudnnFindConvolutionForwardAlgorithmEx( args.handle, args.idesc.desc(), args.x->data(), @@ -299,15 +261,14 @@ struct SearchAlgorithmBase { size_t max_workspace_size = 0; for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) { size_t workspace_size = 0; - auto status = - platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( - args.handle, - args.idesc.desc(), - args.wdesc.desc(), - args.cdesc.desc(), - args.odesc.desc(), - static_cast(algo), - &workspace_size); + auto status = phi::dynload::cudnnGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + static_cast(algo), + &workspace_size); if (status == CUDNN_STATUS_SUCCESS && workspace_size <= workspace_size_limit) { max_workspace_size = std::max(workspace_size, max_workspace_size); @@ -339,7 +300,7 @@ struct SearchAlgorithmBase { cudnnConvolutionBwdDataAlgo_t algo) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -369,7 +330,7 @@ struct SearchAlgorithmBase { int best_algo_idx = 0; std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( + phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -404,7 +365,7 @@ struct SearchAlgorithmBase { << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -417,7 +378,7 @@ struct SearchAlgorithmBase { } #else PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -445,7 +406,7 @@ struct SearchAlgorithmBase { std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); auto cudnn_find_func = [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx( + phi::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx( args.handle, args.wdesc.desc(), args.w->data(), @@ -484,7 +445,7 @@ struct SearchAlgorithmBase { for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) { size_t workspace_size = 0; auto status = - platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -519,10 +480,10 @@ struct SearchAlgorithmBase { static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionBwdFilterAlgo_t algo) { - platform::CUDAGraphCaptureModeGuard guard; + paddle::platform::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -552,7 +513,7 @@ struct SearchAlgorithmBase { int best_algo_idx = 0; std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( + phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -575,7 +536,7 @@ struct SearchAlgorithmBase { << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -588,7 +549,7 @@ struct SearchAlgorithmBase { } #else PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -612,7 +573,7 @@ struct SearchAlgorithmBase { size_t workspace_size_limit = CalcWorkspaceLimitInBytes(UseFixedWorkspace()); auto workspace_handle = ctx.cudnn_workspace_handle(); - if (platform::CudnnDataType::type != CUDNN_DATA_HALF) { + if (paddle::platform::CudnnDataType::type != CUDNN_DATA_HALF) { size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit); VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) @@ -620,7 +581,7 @@ struct SearchAlgorithmBase { auto cudnn_find_func = [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx( + phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx( args.handle, args.idesc.desc(), args.x->data(), @@ -649,7 +610,7 @@ struct SearchAlgorithmBase { int max_algos = GetAlgorithmMaxCount(args.handle); std::vector perf_results(max_algos); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithm( + phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -676,7 +637,7 @@ struct SearchAlgorithmBase { #if CUDNN_VERSION_MIN(7, 0, 1) int max_algos = 0; auto status = - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( handle, &max_algos); if (status == gpuSuccess) { VLOG(5) << "[BackwardFilter] max_algos: predefined=" @@ -694,7 +655,7 @@ struct SearchAlgorithmBase { for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) { size_t workspace_size = 0; auto status = - platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -762,7 +723,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { bool enable_autotune = true) { SearchResult result; bool use_autotune = false; - auto dtype = platform::CudnnDataType::type; + auto dtype = paddle::platform::CudnnDataType::type; SetConvMathType(ctx, dtype, args.cdesc); if (deterministic) { @@ -819,12 +780,13 @@ struct SearchAlgorithm : public SearchAlgorithmBase { return result; } - static void SetConvMathType(const phi::GPUContext& ctx, - cudnnDataType_t dtype, - const platform::ConvolutionDescriptor& cdesc) { + static void SetConvMathType( + const phi::GPUContext& ctx, + cudnnDataType_t dtype, + const paddle::platform::ConvolutionDescriptor& cdesc) { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "Enable Tensor Core for FLOAT16"; #if CUDA_VERSION >= 11000 @@ -832,21 +794,20 @@ struct SearchAlgorithm : public SearchAlgorithmBase { } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { VLOG(5) << "Enable Tensor Core for BFLOAT16"; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8, 1, 0) } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT"; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_FMA_MATH)); #endif // CUDA_VERSION >= 11000 } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_DEFAULT_MATH)); } #endif } }; -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h similarity index 63% rename from paddle/fluid/operators/conv_base_helper.h rename to paddle/phi/kernels/gpudnn/conv_gpudnn_base.h index 00c24ead0c..7d29e64c68 100644 --- a/paddle/fluid/operators/conv_base_helper.h +++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h @@ -20,21 +20,19 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/conv_search_cache.h" -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h" -namespace paddle { -namespace operators { +namespace phi { -using Tensor = phi::DenseTensor; -using DataLayout = platform::DataLayout; -using framework::AlgorithmsCache; -using framework::ConvSearchCache; +using GPUDNNDataLayout = paddle::platform::DataLayout; template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; +using ScalingParamType = + typename paddle::platform::CudnnDataType::ScalingParamType; // As the container of searchAlgorithm::Find() result. template @@ -71,9 +69,9 @@ static std::ostream& operator<<(std::ostream& out, const std::vector& v) { template struct ConvArgsBase { HandleT handle; - platform::TensorDescriptor idesc, odesc; - platform::FilterDescriptor wdesc; - platform::ConvolutionDescriptor cdesc; + paddle::platform::TensorDescriptor idesc, odesc; + paddle::platform::FilterDescriptor wdesc; + paddle::platform::ConvolutionDescriptor cdesc; const phi::DenseTensor *x, *w, *o; DataT cudnn_dtype; @@ -88,7 +86,7 @@ struct ConvArgsBase { int group; // data foramt - DataLayout data_layout; + GPUDNNDataLayout data_layout; ConvArgsBase(const phi::DenseTensor* x, const phi::DenseTensor* w, @@ -98,7 +96,7 @@ struct ConvArgsBase { const std::vector d, DataT dtype, int g, - DataLayout layout) + GPUDNNDataLayout layout) : x(x), w(w), o(o), @@ -131,16 +129,16 @@ struct ConvArgsBase { } }; -static inline void GetNCDHW(const framework::DDim& dims, - const DataLayout& layout, +static inline void GetNCDHW(const phi::DDim& dims, + const GPUDNNDataLayout& layout, int* N, int* C, int* D, int* H, int* W) { *N = dims[0]; - *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - int i = layout == DataLayout::kNCHW ? 0 : 1; + *C = layout == GPUDNNDataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + int i = layout == GPUDNNDataLayout::kNCHW ? 0 : 1; if (dims.size() == 5) { *D = dims[2 - i]; *H = dims[3 - i]; @@ -152,5 +150,38 @@ static inline void GetNCDHW(const framework::DDim& dims, } } -} // namespace operators -} // namespace paddle +template +static void RemovePaddingSlice(const phi::GPUContext& context, + const phi::DenseTensor* input, + phi::DenseTensor* out, + const std::vector& starts, + const std::vector& axes) { + auto& place = *context.eigen_device(); + auto in_dims = input->dims(); + auto new_out_dims = out->dims(); + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); + for (size_t i = 0; i < D; ++i) { + offsets[i] = 0; + extents[i] = new_out_dims[i]; + } + + for (size_t i = 0; i < axes.size(); ++i) { + int start = starts[i]; + if (start < 0) { + start = (start + in_dims[axes[i]]); + } + start = std::max(start, 0); + offsets[axes[i]] = start; + } + + auto in_t = + phi::EigenTensor::From(*input); + auto out_t = phi::EigenTensor::From( + *out, new_out_dims); + + phi::funcs::EigenSlice, T, D>::Eval( + place, out_t, in_t, offsets, extents); +} + +} // namespace phi diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_info.h similarity index 90% rename from paddle/fluid/operators/conv_cudnn_op_cache.h rename to paddle/phi/kernels/gpudnn/conv_gpudnn_info.h index 3d704c8be3..e52c2210c8 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_info.h @@ -18,15 +18,14 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" DECLARE_int64(conv_workspace_size_limit); DECLARE_bool(cudnn_exhaustive_search); DECLARE_int64(cudnn_exhaustive_search_times); -namespace paddle { -namespace operators { +namespace phi { + #ifdef PADDLE_WITH_HIP static constexpr size_t kNUM_CUDNN_FWD_ALGS = 1; static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 1; @@ -39,5 +38,4 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; #endif -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index e61f58450b..7b4bd05991 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -19,9 +19,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" #else -#include "paddle/fluid/operators/conv_cudnn_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" @@ -257,55 +257,53 @@ void ConvCudnnGradGradKernel( auto layout = paddle::platform::GetCudnnTensorFormat( paddle::platform::DataLayout::kNCHW); - paddle::operators::ConvArgs args1{&transformed_ddX, - W, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - paddle::operators::ConvArgs args2{&transformed_X, - ddW, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - paddle::operators::ConvArgs args3{&transformed_ddX, - dW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - paddle::operators::ConvArgs args4{&transformed_dX, - ddW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; + ConvArgs args1{&transformed_ddX, + W, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; + ConvArgs args2{&transformed_X, + ddW, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; + ConvArgs args3{&transformed_ddX, + dW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; + ConvArgs args4{&transformed_dX, + ddW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; #ifdef PADDLE_WITH_HIP - paddle::operators::SearchResult fwd_result1; - paddle::operators::SearchResult fwd_result2; - paddle::operators::SearchResult data_result; - paddle::operators::SearchResult - filter_result; + SearchResult fwd_result1; + SearchResult fwd_result2; + SearchResult data_result; + SearchResult filter_result; #else - paddle::operators::SearchResult fwd_result1; - paddle::operators::SearchResult fwd_result2; - paddle::operators::SearchResult data_result; - paddle::operators::SearchResult - filter_result; + SearchResult fwd_result1; + SearchResult fwd_result2; + SearchResult data_result; + SearchResult filter_result; #endif // ddo = conv(ddI, W) + conv(I, ddW) @@ -328,14 +326,12 @@ void ConvCudnnGradGradKernel( c_group); #ifdef PADDLE_WITH_HIP - using search1 = - paddle::operators::SearchAlgorithm; + using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); fwd_result1.algo = search1::Find( args1, exhaustive_search, false, workspace_size, ctx); #else - using search1 = - paddle::operators::SearchAlgorithm; + using search1 = SearchAlgorithm; fwd_result1 = search1::Find(ctx, args1, exhaustive_search, false); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); #endif @@ -355,15 +351,13 @@ void ConvCudnnGradGradKernel( c_group); #ifdef PADDLE_WITH_HIP - using search2 = - paddle::operators::SearchAlgorithm; + using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); fwd_result2.algo = search2::Find( args2, exhaustive_search, false, workspace_size, ctx); #else - using search2 = - paddle::operators::SearchAlgorithm; + using search2 = SearchAlgorithm; fwd_result2 = search2::Find(ctx, args2, exhaustive_search, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); @@ -385,14 +379,12 @@ void ConvCudnnGradGradKernel( c_group); #ifdef PADDLE_WITH_HIP - using search3 = - paddle::operators::SearchAlgorithm; + using search3 = SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); filter_result.algo = search3::Find( args3, exhaustive_search, deterministic, workspace_size, ctx); #else - using search3 = - paddle::operators::SearchAlgorithm; + using search3 = SearchAlgorithm; filter_result = search3::Find(ctx, args3, exhaustive_search, deterministic); workspace_size = std::max( @@ -415,14 +407,12 @@ void ConvCudnnGradGradKernel( c_group); #ifdef PADDLE_WITH_HIP - using search4 = - paddle::operators::SearchAlgorithm; + using search4 = SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); data_result.algo = search4::Find( args4, exhaustive_search, deterministic, workspace_size, ctx); #else - using search4 = - paddle::operators::SearchAlgorithm; + using search4 = SearchAlgorithm; data_result = search4::Find(ctx, args4, exhaustive_search, deterministic); workspace_size = std::max( @@ -447,8 +437,8 @@ void ConvCudnnGradGradKernel( int group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = W->numel() / groups; - paddle::operators::ScalingParamType alpha = 1.0f; - paddle::operators::ScalingParamType beta = 0.0f; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : @@ -657,10 +647,10 @@ void ConvCudnnGradGradKernel( axes[i] = i; } if (X->dims().size() == 4) { - paddle::operators::RemovePaddingSlice( + RemovePaddingSlice( ctx, &transformed_dX, &transformed_dX_channel, starts, axes); } else { - paddle::operators::RemovePaddingSlice( + RemovePaddingSlice( ctx, &transformed_dX, &transformed_dX_channel, starts, axes); } } diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 2d61ec6e62..aa8cbd6c87 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -19,9 +19,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" #else -#include "paddle/fluid/operators/conv_cudnn_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" @@ -256,24 +256,24 @@ void ConvCudnnGradKernel(const Context& ctx, ? paddle::platform::DataLayout::kNHWC : paddle::platform::DataLayout::kNCHW; - paddle::operators::ConvArgs args1{&transformed_input_grad, - &transformed_filter_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype, - groups, - layout}; - paddle::operators::ConvArgs args2{&transformed_input, - &transformed_filter_grad_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype, - groups, - layout}; + ConvArgs args1{&transformed_input_grad, + &transformed_filter_channel, + &transformed_output_grad_channel, + strides, + padding_common, + dilations, + dtype, + groups, + layout}; + ConvArgs args2{&transformed_input, + &transformed_filter_grad_channel, + &transformed_output_grad_channel, + strides, + padding_common, + dilations, + dtype, + groups, + layout}; auto handle = ctx.cudnn_handle(); // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout @@ -289,35 +289,35 @@ void ConvCudnnGradKernel(const Context& ctx, int i_n, i_c, i_d, i_h, i_w; int o_n, o_c, o_d, o_h, o_w; if (compute_format == paddle::platform::DataLayout::kNHWC) { - paddle::operators::GetNCDHW(transformed_input.dims(), - paddle::platform::DataLayout::kNHWC, - &i_n, - &i_c, - &i_d, - &i_h, - &i_w); - paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(), - paddle::platform::DataLayout::kNHWC, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); + GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNHWC, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + GetNCDHW(transformed_output_grad_channel.dims(), + paddle::platform::DataLayout::kNHWC, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); } else { - paddle::operators::GetNCDHW(transformed_input.dims(), - paddle::platform::DataLayout::kNCHW, - &i_n, - &i_c, - &i_d, - &i_h, - &i_w); - paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(), - paddle::platform::DataLayout::kNCHW, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); + GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + GetNCDHW(transformed_output_grad_channel.dims(), + paddle::platform::DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); } int group_offset_in = i_c / groups * i_h * i_w * i_d; @@ -326,13 +326,11 @@ void ConvCudnnGradKernel(const Context& ctx, // ------------------- cudnn backward algorithm --------------------- #ifdef PADDLE_WITH_HIP - paddle::operators::SearchResult bwd_result; - paddle::operators::SearchResult - filter_result; + SearchResult bwd_result; + SearchResult filter_result; #else - paddle::operators::SearchResult bwd_result; - paddle::operators::SearchResult - filter_result; + SearchResult bwd_result; + SearchResult filter_result; #endif // input data workspace_size size_t workspace_size_d = 0; @@ -364,15 +362,13 @@ void ConvCudnnGradKernel(const Context& ctx, c_groups); #ifdef PADDLE_WITH_HIP - using search1 = - paddle::operators::SearchAlgorithm; + using search1 = SearchAlgorithm; workspace_size_d = std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); bwd_result.algo = search1::Find( args1, exhaustive_search, deterministic, workspace_size_d, ctx); #else - using search1 = - paddle::operators::SearchAlgorithm; + using search1 = SearchAlgorithm; bwd_result = search1::Find(ctx, args1, exhaustive_search, deterministic); workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size); #endif @@ -392,15 +388,13 @@ void ConvCudnnGradKernel(const Context& ctx, paddle::platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP - using search2 = - paddle::operators::SearchAlgorithm; + using search2 = SearchAlgorithm; workspace_size_w = std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); filter_result.algo = search2::Find( args2, exhaustive_search, deterministic, workspace_size_w, ctx); #else - using search2 = - paddle::operators::SearchAlgorithm; + using search2 = SearchAlgorithm; filter_result = search2::Find(ctx, args2, exhaustive_search, deterministic); VLOG(3) << "filter algo: " << filter_result.algo << ", time " @@ -410,12 +404,12 @@ void ConvCudnnGradKernel(const Context& ctx, } // ------------------- cudnn conv backward data --------------------- - paddle::operators::ScalingParamType alpha = 1.0f; + ScalingParamType alpha = 1.0f; #ifdef PADDLE_WITH_HIP // MIOPEN ONLY support beta to be 0.0f - paddle::operators::ScalingParamType beta = 0.0f; + ScalingParamType beta = 0.0f; #else - paddle::operators::ScalingParamType beta = use_addto ? 1.0f : 0.0f; + ScalingParamType beta = use_addto ? 1.0f : 0.0f; #endif VLOG(4) << "Conv_grad: use_addto = " << use_addto; @@ -515,19 +509,17 @@ void ConvCudnnGradKernel(const Context& ctx, ctx.template Alloc(&transformed_input_grad_channel); if (transformed_input_channel.dims().size() == 4) { - paddle::operators::RemovePaddingSlice( - ctx, - &transformed_input_grad, - &transformed_input_grad_channel, - starts, - axes); + RemovePaddingSlice(ctx, + &transformed_input_grad, + &transformed_input_grad_channel, + starts, + axes); } else { - paddle::operators::RemovePaddingSlice( - ctx, - &transformed_input_grad, - &transformed_input_grad_channel, - starts, - axes); + RemovePaddingSlice(ctx, + &transformed_input_grad, + &transformed_input_grad_channel, + starts, + axes); } } @@ -538,7 +530,7 @@ void ConvCudnnGradKernel(const Context& ctx, } // filter_grad do not use inplace addto. - paddle::operators::ScalingParamType beta_filter = 0.0f; + ScalingParamType beta_filter = 0.0f; // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { // Because beta is zero, it is unnecessary to reset filter_grad. diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 7a6e8d8148..a44a98450d 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -19,9 +19,9 @@ #include "paddle/phi/core/kernel_registry.h" #ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" #else -#include "paddle/fluid/operators/conv_cudnn_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" @@ -205,15 +205,15 @@ void ConvCudnnKernel(const Context& ctx, const T* filter_data = transformed_filter_channel.data(); // ------------------- cudnn descriptors --------------------- - paddle::operators::ConvArgs args{&transformed_input, - &transformed_filter_channel, - &transformed_output, - strides, - padding_common, - dilations, - dtype, - groups, - compute_format}; + ConvArgs args{&transformed_input, + &transformed_filter_channel, + &transformed_output, + strides, + padding_common, + dilations, + dtype, + groups, + compute_format}; auto handle = ctx.cudnn_handle(); auto workspace_handle = ctx.cudnn_workspace_handle(); @@ -266,35 +266,35 @@ void ConvCudnnKernel(const Context& ctx, int o_n, o_c, o_d, o_h, o_w; if (compute_format == paddle::platform::DataLayout::kNHWC) { - paddle::operators::GetNCDHW(transformed_input.dims(), - paddle::platform::DataLayout::kNHWC, - &i_n, - &i_c, - &i_d, - &i_h, - &i_w); - paddle::operators::GetNCDHW(transformed_output.dims(), - paddle::platform::DataLayout::kNHWC, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); + GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNHWC, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + GetNCDHW(transformed_output.dims(), + paddle::platform::DataLayout::kNHWC, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); } else { - paddle::operators::GetNCDHW(transformed_input.dims(), - paddle::platform::DataLayout::kNCHW, - &i_n, - &i_c, - &i_d, - &i_h, - &i_w); - paddle::operators::GetNCDHW(transformed_output.dims(), - paddle::platform::DataLayout::kNCHW, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); + GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + GetNCDHW(transformed_output.dims(), + paddle::platform::DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); } int group_offset_in = i_c / groups * i_h * i_w * i_d; @@ -304,15 +304,14 @@ void ConvCudnnKernel(const Context& ctx, size_t workspace_size = 0; // final workspace to allocate. // ------------------- cudnn conv algorithm --------------------- #ifdef PADDLE_WITH_HIP - paddle::operators::SearchResult fwd_result; - using search = paddle::operators::SearchAlgorithm; + SearchResult fwd_result; + using search = SearchAlgorithm; workspace_size = search::GetWorkspaceSize(args); fwd_result.algo = search::Find( args, exhaustive_search, deterministic, workspace_size, ctx); #else - paddle::operators::SearchResult fwd_result; - using search = - paddle::operators::SearchAlgorithm; + SearchResult fwd_result; + using search = SearchAlgorithm; fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); workspace_size = fwd_result.workspace_size; #endif @@ -328,8 +327,8 @@ void ConvCudnnKernel(const Context& ctx, #endif // ------------------- cudnn conv forward --------------------- - paddle::operators::ScalingParamType alpha = 1.0f; - paddle::operators::ScalingParamType beta = 0.0f; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/phi/kernels/gpudnn/conv_miopen_helper.h similarity index 75% rename from paddle/fluid/operators/conv_miopen_helper.h rename to paddle/phi/kernels/gpudnn/conv_miopen_helper.h index 907ae50941..9cdbcc1265 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/phi/kernels/gpudnn/conv_miopen_helper.h @@ -14,48 +14,12 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/conv_base_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h" -namespace paddle { -namespace operators { +namespace phi { using ConvArgs = ConvArgsBase; -template -static void RemovePaddingSlice(const phi::GPUContext& context, - const phi::DenseTensor* input, - phi::DenseTensor* out, - const std::vector& starts, - const std::vector& axes) { - auto& place = *context.eigen_device(); - auto in_dims = input->dims(); - auto new_out_dims = out->dims(); - auto offsets = Eigen::array(); - auto extents = Eigen::array(); - for (size_t i = 0; i < D; ++i) { - offsets[i] = 0; - extents[i] = new_out_dims[i]; - } - - for (size_t i = 0; i < axes.size(); ++i) { - int start = starts[i]; - if (start < 0) { - start = (start + in_dims[axes[i]]); - } - start = std::max(start, 0); - offsets[axes[i]] = start; - } - auto in_t = - framework::EigenTensor::From( - *input); - - auto out_t = - framework::EigenTensor::From( - *out, new_out_dims); - out_t.device(place) = in_t.slice(offsets, extents); -} - template struct SearchAlgorithm {}; @@ -78,7 +42,7 @@ struct SearchAlgorithm { miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenFindConvolutionForwardAlgorithm( + phi::dynload::miopenFindConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.x->data(), @@ -104,7 +68,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( + phi::dynload::miopenConvolutionForwardGetWorkSpaceSize( args.handle, args.wdesc.desc(), args.idesc.desc(), @@ -134,7 +98,7 @@ struct SearchAlgorithm { miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( + phi::dynload::miopenFindConvolutionBackwardDataAlgorithm( args.handle, args.odesc.desc(), args.o->data(), @@ -160,7 +124,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( + phi::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( args.handle, args.odesc.desc(), args.wdesc.desc(), @@ -190,7 +154,7 @@ struct SearchAlgorithm { miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( + phi::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( args.handle, args.odesc.desc(), args.o->data(), @@ -216,7 +180,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( + phi::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( args.handle, args.odesc.desc(), args.idesc.desc(), @@ -227,5 +191,4 @@ struct SearchAlgorithm { } }; -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index d05bd58e33..640ebd23d9 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -28,11 +28,11 @@ limitations under the License. */ #include "paddle/phi/kernels/transpose_kernel.h" #ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" #include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" #else -#include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif namespace phi { @@ -173,33 +173,31 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, auto dtype = paddle::platform::CudnnDataType::type; - paddle::operators::ConvArgs args1{&transformed_dout, - &filter, - &x_transpose, - strides, - padding_common, - dilations_, - dtype, - groups, - layout}; - paddle::operators::ConvArgs args2{&transformed_dout, - &filter, - &x_transpose, - strides, - padding_common, - dilations_, - dtype, - groups, - layout}; + ConvArgs args1{&transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype, + groups, + layout}; + ConvArgs args2{&transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype, + groups, + layout}; #ifdef PADDLE_WITH_HIP - paddle::operators::SearchResult fwd_result; - paddle::operators::SearchResult - filter_result; + SearchResult fwd_result; + SearchResult filter_result; #else - paddle::operators::SearchResult fwd_result; - paddle::operators::SearchResult - filter_result; + SearchResult fwd_result; + SearchResult filter_result; #endif auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); @@ -222,14 +220,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, paddle::platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP - using search1 = - paddle::operators::SearchAlgorithm; + using search1 = SearchAlgorithm; workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); fwd_result.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); #else - using search1 = - paddle::operators::SearchAlgorithm; + using search1 = SearchAlgorithm; fwd_result = search1::Find(ctx, args1, false, deterministic, false); workspace_size = std::max( workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); @@ -249,14 +245,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, paddle::platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP - using search2 = - paddle::operators::SearchAlgorithm; + using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); filter_result.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else - using search2 = - paddle::operators::SearchAlgorithm; + using search2 = SearchAlgorithm; filter_result = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); @@ -269,8 +263,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, int dout_offset = transformed_dout.numel() / transformed_dout.dims()[0] / groups; int filter_offset = filter.numel() / groups; - paddle::operators::ScalingParamType alpha = 1.0f; - paddle::operators::ScalingParamType beta = 0.0f; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; auto workspace_handle = ctx.cudnn_workspace_handle(); if (dx) { // Because beta is zero, it is unnecessary to reset dx. @@ -631,55 +625,53 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( auto handle = ctx.cudnn_handle(); auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); - paddle::operators::ConvArgs args1{&transformed_ddout_channel, - &filter, - &transformed_ddx, - strides, - padding_common, - dilations_, - dtype, - groups, - GPUDNNDataLayout::kNCHW}; - paddle::operators::ConvArgs args2{&transformed_ddout_channel, - &ddfilter, - &transformed_x, - strides, - padding_common, - dilations_, - dtype, - groups, - GPUDNNDataLayout::kNCHW}; - - paddle::operators::ConvArgs args3{&transformed_dout, - dfilter, - &transformed_ddx_channel, - strides, - padding_common, - dilations_, - dtype, - groups, - GPUDNNDataLayout::kNCHW}; - paddle::operators::ConvArgs args4{&transformed_dout, - &ddfilter, - &transformed_dx_channel, - strides, - padding_common, - dilations_, - dtype, - groups, - GPUDNNDataLayout::kNCHW}; + ConvArgs args1{&transformed_ddout_channel, + &filter, + &transformed_ddx, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + ConvArgs args2{&transformed_ddout_channel, + &ddfilter, + &transformed_x, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + + ConvArgs args3{&transformed_dout, + dfilter, + &transformed_ddx_channel, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + ConvArgs args4{&transformed_dout, + &ddfilter, + &transformed_dx_channel, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; #ifdef PADDLE_WITH_HIP - paddle::operators::SearchResult bwd_result1; - paddle::operators::SearchResult bwd_result2; - paddle::operators::SearchResult - filter_result; - paddle::operators::SearchResult fwd_result; + SearchResult bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; #else - paddle::operators::SearchResult bwd_result1; - paddle::operators::SearchResult bwd_result2; - paddle::operators::SearchResult - filter_result; - paddle::operators::SearchResult fwd_result; + SearchResult bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; #endif // ddo = conv(ddI, filter) + conv(I, ddfilter) @@ -702,14 +694,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( paddle::platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP - using search1 = - paddle::operators::SearchAlgorithm; + using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); bwd_result1.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); #else - using search1 = - paddle::operators::SearchAlgorithm; + using search1 = SearchAlgorithm; bwd_result1 = search1::Find(ctx, args1, false, deterministic, false); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); #endif @@ -726,14 +716,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( paddle::platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP - using search2 = - paddle::operators::SearchAlgorithm; + using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); bwd_result2.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else - using search2 = - paddle::operators::SearchAlgorithm; + using search2 = SearchAlgorithm; bwd_result2 = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); @@ -753,14 +741,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( paddle::platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP - using search3 = - paddle::operators::SearchAlgorithm; + using search3 = SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); filter_result.algo = search3::Find(args3, false, deterministic, workspace_size, ctx); #else - using search3 = - paddle::operators::SearchAlgorithm; + using search3 = SearchAlgorithm; filter_result = search3::Find(ctx, args3, false, deterministic, false); workspace_size = std::max( workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); @@ -781,14 +767,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( paddle::platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP - using search4 = - paddle::operators::SearchAlgorithm; + using search4 = SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); fwd_result.algo = search4::Find(args4, false, deterministic, workspace_size, ctx); #else - using search4 = - paddle::operators::SearchAlgorithm; + using search4 = SearchAlgorithm; fwd_result = search4::Find(ctx, args4, false, deterministic, false); workspace_size = std::max( workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); @@ -796,22 +780,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( } int i_n, i_c, i_d, i_h, i_w; - paddle::operators::GetNCDHW(transformed_x.dims(), - GPUDNNDataLayout::kNCHW, - &i_n, - &i_c, - &i_d, - &i_h, - &i_w); + GetNCDHW(transformed_x.dims(), + GPUDNNDataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); int o_n, o_c, o_d, o_h, o_w; - paddle::operators::GetNCDHW(transformed_dout.dims(), - GPUDNNDataLayout::kNCHW, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); + GetNCDHW(transformed_dout.dims(), + GPUDNNDataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); int group_offset_in = transformed_x.numel() / transformed_x.dims()[0] / groups; @@ -819,8 +803,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( transformed_dout.numel() / transformed_dout.dims()[0] / groups; int group_offset_filter = filter.numel() / groups; - paddle::operators::ScalingParamType alpha = 1.0f; - paddle::operators::ScalingParamType beta = 0.0f; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; auto wkspace_handle = ctx.cudnn_workspace_handle(); diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 84332f0ccb..b98a6d6ae4 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -26,11 +26,11 @@ limitations under the License. */ #include "paddle/phi/kernels/transpose_kernel.h" #ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" #include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" #else -#include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif namespace phi { @@ -199,15 +199,15 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, auto dtype = paddle::platform::CudnnDataType::type; // ------------------- cudnn descriptors --------------------- - paddle::operators::ConvArgs args{&transformed_out, - &filter, - &transformed_x, - strides, - padding_common, - dilations_, - dtype, - groups, - data_layout}; + ConvArgs args{&transformed_out, + &filter, + &transformed_x, + strides, + padding_common, + dilations_, + dtype, + groups, + data_layout}; args.handle = handle; args.idesc.set(transformed_out, iwo_groups); args.wdesc.set(filter, layout_tensor, iwo_groups); @@ -220,16 +220,14 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, c_groups); #ifdef PADDLE_WITH_HIP - paddle::operators::SearchResult bwd_result; - using search = - paddle::operators::SearchAlgorithm; + SearchResult bwd_result; + using search = SearchAlgorithm; workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); bwd_result.algo = search::Find(args, false, deterministic, workspace_size, ctx); #else - paddle::operators::SearchResult bwd_result; - using search = - paddle::operators::SearchAlgorithm; + SearchResult bwd_result; + using search = SearchAlgorithm; bwd_result = search::Find(ctx, args, false, deterministic, false); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); @@ -239,8 +237,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, int x_offset = transformed_x.numel() / transformed_x.dims()[0] / groups; int out_offset = transformed_out.numel() / transformed_out.dims()[0] / groups; int filter_offset = filter.numel() / groups; - paddle::operators::ScalingParamType alpha = 1.0f; - paddle::operators::ScalingParamType beta = 0.0f; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; auto workspace_handle = ctx.cudnn_workspace_handle(); for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h index 132eda7596..ba7f69f9c3 100644 --- a/paddle/phi/kernels/impl/conv_cudnn_impl.h +++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h @@ -19,9 +19,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" #else -#include "paddle/fluid/operators/conv_cudnn_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" -- GitLab