From 88ec08a70e9a6501983cbc671812be3433f9f08a Mon Sep 17 00:00:00 2001
From: From00
Date: Mon, 14 Mar 2022 10:48:38 +0800
Subject: [PATCH] Move Pool OPs to phi (#40208)

* Move Pool OPs to phi
* Fix CI error
* Fix conflicts
---
 paddle/fluid/framework/infershape_utils.cc | 3 +-
 paddle/fluid/imperative/prepared_operator.h | 23 +-
 .../inference/tensorrt/convert/pool2d_op.cc | 2 +-
 .../inference/tensorrt/convert/pool3d_op.cc | 2 +-
 .../tensorrt/convert/test_pool2d_op.cc | 2 +-
 .../tensorrt/plugin/pool3d_op_plugin.cu | 22 +-
 .../tensorrt/plugin/pool_op_plugin.cu | 22 +-
 paddle/fluid/operators/flatten_op.h | 1 -
 paddle/fluid/operators/math/CMakeLists.txt | 1 -
 paddle/fluid/operators/math/pooling.h | 315 ----
 .../fluid/operators/mkldnn/pool_mkldnn_op.cc | 16 +-
 .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 3 +-
 paddle/fluid/operators/pool_cudnn_op.cu.cc | 567 -------
 paddle/fluid/operators/pool_op.cc | 182 +--
 paddle/fluid/operators/pool_op.cu | 48 -
 paddle/fluid/operators/pool_op.h | 304 +---
 paddle/fluid/operators/pool_op_mlu.cc | 15 +-
 paddle/fluid/operators/pool_op_npu.cc | 12 +-
 paddle/fluid/operators/pool_op_xpu.cc | 6 +-
 paddle/fluid/operators/pool_with_index_op.cc | 135 +-
 .../fluid/operators/pool_with_index_op.cu.cc | 43 -
 paddle/fluid/operators/pool_with_index_op.h | 121 --
 paddle/fluid/operators/spp_op.h | 31 +-
 paddle/fluid/operators/squeeze_op.h | 1 -
 paddle/fluid/operators/unsqueeze_op.h | 1 -
 paddle/phi/core/meta_tensor.h | 6 +-
 paddle/phi/infermeta/backward.cc | 29 +
 paddle/phi/infermeta/backward.h | 25 +
 paddle/phi/infermeta/unary.cc | 185 +++
 paddle/phi/infermeta/unary.h | 24 +
 paddle/phi/kernels/CMakeLists.txt | 23 +-
 paddle/phi/kernels/concat_kernel.h | 2 +-
 paddle/phi/kernels/cpu/pool_grad_kernel.cc | 49 +
 paddle/phi/kernels/cpu/pool_kernel.cc | 41 +
 paddle/phi/kernels/cpu/split_kernel.cc | 2 +-
 paddle/phi/kernels/funcs/CMakeLists.txt | 9 +-
 .../math => phi/kernels/funcs}/pooling.cc | 423 ++---
 .../math => phi/kernels/funcs}/pooling.cu | 1421 +++++++++++------
 paddle/phi/kernels/funcs/pooling.h | 469 ++++++
 paddle/phi/kernels/gpu/pool_grad_kernel.cu | 60 +
 paddle/phi/kernels/gpu/pool_kernel.cu | 54 +
 paddle/phi/kernels/gpu/split_kernel.cu | 2 +-
 paddle/phi/kernels/gpudnn/pool_gpudnn.h | 43 +
 paddle/phi/kernels/gpudnn/pool_grad_kernel.cu | 448 ++++++
 paddle/phi/kernels/gpudnn/pool_kernel.cu | 312 ++++
 .../phi/kernels/impl/pool_grad_kernel_impl.h | 332 ++++
 paddle/phi/kernels/impl/pool_kernel_impl.h | 321 ++++
 paddle/phi/kernels/pool_grad_kernel.h | 145 ++
 paddle/phi/kernels/pool_kernel.h | 105 ++
 paddle/phi/ops/compat/pool_sig.cc | 154 ++
 paddle/phi/tests/core/test_meta_fn_utils.cc | 6 +-
 51 files changed, 4148 insertions(+), 2420 deletions(-)
 delete mode 100644 paddle/fluid/operators/math/pooling.h
 delete mode 100644 paddle/fluid/operators/pool_cudnn_op.cu.cc
 delete mode 100644 paddle/fluid/operators/pool_op.cu
 delete mode 100644 paddle/fluid/operators/pool_with_index_op.cu.cc
 delete mode 100644 paddle/fluid/operators/pool_with_index_op.h
 create mode 100644 paddle/phi/kernels/cpu/pool_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/pool_kernel.cc
 rename paddle/{fluid/operators/math => phi/kernels/funcs}/pooling.cc (83%)
 rename paddle/{fluid/operators/math => phi/kernels/funcs}/pooling.cu (54%)
 create mode 100644 paddle/phi/kernels/funcs/pooling.h
 create mode 100644 paddle/phi/kernels/gpu/pool_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/pool_kernel.cu
 create mode 100644 paddle/phi/kernels/gpudnn/pool_gpudnn.h
 create mode 100644
paddle/phi/kernels/gpudnn/pool_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpudnn/pool_kernel.cu create mode 100644 paddle/phi/kernels/impl/pool_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/pool_kernel_impl.h create mode 100644 paddle/phi/kernels/pool_grad_kernel.h create mode 100644 paddle/phi/kernels/pool_kernel.h create mode 100644 paddle/phi/ops/compat/pool_sig.cc diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index f57674d560..5119c30690 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -297,7 +297,8 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature; // 2. build infermeta context - phi::InferMetaContext infer_meta_context(ctx->IsRuntime()); + phi::InferMetaContext infer_meta_context( + {ctx->IsRuntime(), ctx->IsRunMKLDNNKernel()}); auto& input_names = std::get<0>(signature.args); auto& attr_names = std::get<1>(signature.args); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index d7c0c8cc54..91e6974fa2 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -264,14 +264,23 @@ void BuildDygraphPhiKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); - if ((it == ins.end()) && - (input_defs[i].type_index == - std::type_index(typeid(paddle::optional)))) { - kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); - auto end_idx = start_idx + 1; - kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); - continue; + if (it == ins.end()) { + if (LIKELY(input_defs[i].type_index == + std::type_index( + typeid(paddle::optional)))) { + kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); + auto end_idx = start_idx + 1; + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + continue; + } else { + PADDLE_THROW(phi::errors::NotFound( + "Can not find input variable '%s' for %s OP, please check whether " + "the name setting in OpArgumentMapping is consistent with that in " + "OpMaker.", + input_names[i], pt_kernel_signature.name)); + } } + auto ins_vector = it->second; size_t end_idx = start_idx + ins_vector.size(); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index fe04d552e4..7b65d2d7c9 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -328,5 +328,5 @@ class Pool2dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc index b8e87a8d94..5a306f622a 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc @@ -224,5 +224,5 @@ class Pool3dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool3d); +USE_OP_ITSELF(pool3d); REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index bded833505..36f13262a7 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -71,4 +71,4 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu index 861a9aa9d0..5596a89a08 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -108,16 +108,14 @@ int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool3d_type_ == Pool3DType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); } else if (pool3d_type_ == Pool3DType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); @@ -351,16 +349,14 @@ int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool3d_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); } else if (pool3d_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 6d711c26ad..9bfe98d759 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -84,16 +84,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool_type_ == PoolType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, false, odatas[0], stream, pool_process); } else if (pool_type_ == PoolType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, exclusive_, adaptive_, odatas[0], stream, @@ -292,16 +290,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, false, output, stream, pool_process); } else if (pool_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, exclusive_, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 5ef13b38c8..feae954e35 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/phi_utils.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/flatten_grad_kernel.h" diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 31a98d9f63..af1069cb86 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -20,7 +20,6 @@ math_library(sampler DEPS generator) # math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) -math_library(pooling) if(WITH_MKLDNN) math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler) diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h deleted file mode 100644 index dfd3dad386..0000000000 --- a/paddle/fluid/operators/math/pooling.h +++ /dev/null @@ -1,315 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -/* - * \brief Extracting simple operations from pooling. - * Both MaxPool and AvgPool need "initial", "compute" and "finalize" - * operation. - * MaxPool initializes temp variable to the negative maximum to find the - * maximum value in the pooling field. - * AvgPool initializes temp variable to the zero to accumulate all values - * in pool pooling, and finally takes the average. - * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. - */ -template -class MaxPool { - public: - DEVICE inline T initial() { return static_cast(-FLT_MAX); } - DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } - DEVICE inline void finalize(const T& pool_field, T* y) {} -}; - -template -class AvgPool { - using MT = typename details::MPTypeTrait::Type; - MT intermediate_res; - - public: - DEVICE inline T initial() { - intermediate_res = static_cast(0.0f); - return static_cast(0); - } - - DEVICE inline void compute(const T& x, T* y) { - intermediate_res += static_cast(x); - } - - DEVICE inline void finalize(const T& pool_field, T* y) { - *y = static_cast(intermediate_res / (static_cast(pool_field))); - } -}; - -template -class MaxPoolGrad { - public: - static constexpr bool use_x = true; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += dy * static_cast(x == y); - } -}; - -template -class AvgPoolGrad { - public: - static constexpr bool use_x = false; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += (scale * dy); - } -}; - -/* used for adaptive pool to calculate start and end index of each divided grid - */ -HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - -/* - * \brief Getting pooling results, and calculating gradient. - * - * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C - * is the number of channels, H and W is the height and width of feature. - * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C - * is the number of channels, D, H and W is the depth, height and width of - * feature. - * - * In max pooling, it is possible that the pooling region has multiple maximum - * elements. In this case, we should compute the gradient of the first maximum - * element. - * This is different from average pooling. So we rewrite the max_pool_grad: - * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
- */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool2dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool2dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool3dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool3dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, 
bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -/* - * \brief Getting max pooling results and corresponding max index, and - * calculating gradient. - * In up-sampling-pooling, it is necessary to know max element index. - * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in - * NCDHW format. - */ -template -class MaxPool2dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool2dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -template -class MaxPool3dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool3dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index ab02d4cfed..1078b451c5 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { using framework::DataLayout; +using framework::Tensor; using dnnl::memory; using dnnl::pooling_backward; using dnnl::pooling_forward; @@ -83,11 +85,11 @@ class PoolingMKLDNNHandler phi::slice_ddim(input_dims, 2, input_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); const auto src_tz = phi::vectorize(input->dims()); const auto dst_tz = phi::vectorize(output->dims()); @@ -173,11 +175,11 @@ class PoolingMKLDNNHandler framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); auto src_tz = phi::vectorize(in_x->dims()); auto diff_src_tz = phi::vectorize(in_x_grad->dims()); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 9d0062e313..717af61b85 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -26,13 +26,14 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/kernel_registry.h" -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); namespace paddle { diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc deleted file mode 100644 index 6335004e69..0000000000 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ /dev/null @@ -1,567 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/pool_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/operator.h" -#endif -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; -using DataLayout = platform::DataLayout; -using PoolingMode = platform::PoolingMode; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; - -DataLayout getLayoutFromStr(std::string data_format) { - if (data_format == "NHWC") { - return DataLayout::kNHWC; - } else if (data_format == "NCHW") { - return DataLayout::kNCHW; - } else if (data_format == "NCDHW") { - return DataLayout::kNCDHW; - } else { - return DataLayout::kNCDHW; - } -} - -template -class PoolCUDNNOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - Tensor *output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - - // -----------------transformed tensor ------------------------ - - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - DataLayout layout; - - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - 
transformed_output.Resize(output->dims()); - - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans; - trans(dev_ctx, *input, &transformed_input, axis); - - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - } - - const T *tranformed_input_data = transformed_input.data(); - T *tranformed_output_data = transformed_output.mutable_data( - transformed_output.dims(), ctx.GetPlace()); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - pooling_mode = PoolingMode::kMaximum; - } else { - pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - -#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data, - false, pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, - tranformed_output_data)); -#endif - // add - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, transformed_output, output, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose trans; - trans(dev_ctx, transformed_output, output, axis); - } -#endif - } -}; - -template -class PoolCUDNNGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - const Tensor *output = ctx.Input("Out"); - const Tensor *output_grad = - ctx.Input(framework::GradVarName("Out")); - Tensor *input_grad = ctx.Output(framework::GradVarName("X")); - - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - -#ifdef PADDLE_WITH_HIP - if (pooling_type == "max") { - using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap; - using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc; - auto &all_op_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - std::string op_type = "pool2d_grad"; - auto kernels_iter = all_op_kernels.find(op_type); - PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), - platform::errors::Unavailable( - "There are no kernels which are registered in the %s operator.", - op_type)); - OpKernelMap &kernels = kernels_iter->second; - paddle::framework::OpKernelType expected_kernel_key( 
- paddle::framework::ToDataType(typeid(T)), ctx.GetPlace()); - auto kernel_iter = kernels.find(expected_kernel_key); - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", - op_type, KernelTypeToString(expected_kernel_key))); - std::unique_ptr kernel_func_( - new OpKernelFunc(kernel_iter->second)); - (*kernel_func_)(ctx); - return; - } -#endif - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - // ------- tensor grad -------------- - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_output_grad(output_grad->type()); - - input_grad->mutable_data(ctx.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - DataLayout layout; - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans5_v3; - trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); - -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans4; - trans4(dev_ctx, *input, &transformed_input, axis); - - // output - 
transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans4_v2; - trans4_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans4_v3; - trans4_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - transformed_output_grad = *output_grad; - transformed_input_grad = *input_grad; - } - - const T *input_data = transformed_input.data(); - const T *output_data = transformed_output.data(); - const T *output_grad_data = transformed_output_grad.data(); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - if (FLAGS_cudnn_deterministic) { - pooling_mode = PoolingMode::kMaximumDeterministic; - } else { - pooling_mode = PoolingMode::kMaximum; - } - } else { - pooling_mode = exclusive ? PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - if (input_grad) { - T *input_grad_data = transformed_input_grad.mutable_data( - transformed_input_grad.dims(), ctx.GetPlace()); -// Because beta is zero, it is unnecessary to reset input_grad. 
-#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data, pool_workspace)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data)); -#endif - - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v4; - trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose - trans4_v4; - trans4_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#endif - } - } -}; - -template -class PoolCUDNNGradGradOpKernel : public PoolCUDNNOpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - std::string pooling_type = ctx.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad only supports avgpool.")); - } else { - PoolCUDNNOpKernel::Compute(ctx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel); -#else -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -REGISTER_OP_KERNEL(pool2d_grad_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -#endif diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index ae095c2fa7..44f3d8090e 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,6 +15,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/pool_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -23,125 +29,6 @@ limitations under the License. */ namespace paddle { namespace operators { -int PoolOutputSize(int input_size, int filter_size, int padding_1, - int padding_2, int stride, bool ceil_mode) { - int output_size; - if (!ceil_mode) { - output_size = - (input_size - filter_size + padding_1 + padding_2) / stride + 1; - } else { - output_size = - (input_size - filter_size + padding_1 + padding_2 + stride - 1) / - stride + - 1; - } - PADDLE_ENFORCE_GT( - output_size, 0, - platform::errors::InvalidArgument( - "the output size must be greater than 0. But received: " - "output_size = %d due to the settings of input_size(%d), " - "padding(%d,%d), " - "k_size(%d) and stride(%d). Please check again!", - output_size, input_size, padding_1, padding_2, filter_size, stride)); - return output_size; -} - -void PoolOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of Pool operator is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of Pool operator is not found.")); - - std::string pooling_type = ctx->Attrs().Get("pooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool ceil_mode = ctx->Attrs().Get("ceil_mode"); - bool adaptive = ctx->Attrs().Get("adaptive"); - bool global_pooling = ctx->Attrs().Get("global_pooling"); - std::string data_format = ctx->Attrs().Get("data_format"); - std::string padding_algorithm = - ctx->Attrs().Get("padding_algorithm"); - - auto in_x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_x_dims.size() == 4 || in_x_dims.size() == 5, true, - platform::errors::InvalidArgument( - "the input of Op(pool) should be 4-D or 5-D Tensor. But " - "received: %u-D Tensor and it's shape is [%s].", - in_x_dims.size(), in_x_dims)); - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "the dimension of input minus the size of " - "Attr(ksize) must be euqal to 2 in Op(pool). " - "But received: the dimension of input minus the size " - "of Attr(ksize) is %d, the " - "input's dimension is %d, the shape of input " - "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].", - in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims, - ksize.size(), phi::make_ddim(ksize))); - - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "the size of Attr(ksize) and Attr(strides) in " - "Op(pool) must be equal. 
" - "But received: Attr(ksize)'s size is %d, Attr(strides)'s " - "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].", - ksize.size(), strides.size(), phi::make_ddim(ksize), - phi::make_ddim(strides))); - - // MKL-DNN Kernels are using NCHW order of dims description - // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && - (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings if "SAME" or global_pooling - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - std::vector output_shape; - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (int i = 0; i < data_dims.size(); ++i) { - if ((!ctx->IsRuntime()) && (data_dims[i] < 0)) { - output_shape.push_back(data_dims[i]); - } else { - output_shape.push_back( - PoolOutputSize(data_dims[i], ksize[i], paddings[2 * i], - paddings[2 * i + 1], strides[i], ceil_mode)); - } - } - } - - // output_N = input_N - output_shape.insert(output_shape.begin(), in_x_dims[0]); - // output_C = input_C - if (channel_last) { - output_shape.push_back(in_x_dims[in_x_dims.size() - 1]); - } else { - output_shape.insert(output_shape.begin() + 1, in_x_dims[1]); - } - - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", "Out"); -} - bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { if (ctx.Attr("adaptive") == false) return true; // (jczaja): oneDNN is supporting only unchangable in size pool window @@ -216,16 +103,6 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound( - "Input(X) of Pool Gradoperator is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::NotFound( - "Input(X@GRAD) of Pool Gradoperator is not found.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); -} - framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; @@ -471,7 +348,7 @@ class Pool2dOpGradGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("pool2d_grad_grad"); + grad_op->SetType("pool2d_double_grad"); grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); grad_op->SetAttrMap(this->Attrs()); @@ -692,35 +569,34 @@ Example: namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pool2d, Pool2dInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool2d_grad, Pool2dGradInferShapeFunctor, + PD_INFER_META(phi::PoolGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool2d_double_grad, + Pool2dDoubleGradInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); + REGISTER_OPERATOR( pool2d, ops::PoolOp, ops::Pool2dOpMaker, ops::PoolOpInferVarType, paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); + 
paddle::framework::DefaultGradOpMaker, + Pool2dInferShapeFunctor); REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad, ops::Pool2dOpGradGradMaker, - ops::Pool2dOpGradGradMaker); -REGISTER_OPERATOR(pool2d_grad_grad, ops::PoolOp); - -REGISTER_OP_CPU_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool2d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); -REGISTER_OP_CPU_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); + ops::Pool2dOpGradGradMaker, + Pool2dGradInferShapeFunctor); +REGISTER_OPERATOR(pool2d_double_grad, ops::PoolOp, + Pool2dDoubleGradInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(pool3d, Pool3dInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool3d_grad, Pool3dGradInferShapeFunctor, + PD_INFER_META(phi::PoolGradInferMeta)); REGISTER_OPERATOR( pool3d, ops::PoolOp, ops::Pool3dOpMaker, ops::PoolOpInferVarType, paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad); - -REGISTER_OP_CPU_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool3d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); + paddle::framework::DefaultGradOpMaker, + Pool3dInferShapeFunctor); +REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad, Pool3dGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_op.cu b/paddle/fluid/operators/pool_op.cu deleted file mode 100644 index 069ce0c1fd..0000000000 --- a/paddle/fluid/operators/pool_op.cu +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/pool_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index bea6506ee8..d48ac3bd35 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -12,19 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once +// NOTE(Ruibiao): Difficult to remove code from this header file because too +// many files rely on it through "mkldnn_reuse.h" -#include -#include -#include +#pragma once -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__HIPCC__) || defined(__NVCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#endif namespace paddle { namespace operators { @@ -35,8 +28,6 @@ class PoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -50,8 +41,6 @@ class PoolOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -71,292 +60,5 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override; }; -template -inline void UpdatePadding(std::vector* paddings, const bool global_pooling, - const bool adaptive, - const std::string padding_algorithm, - const framework::DDim data_dims, - const std::vector& strides, - const std::vector& ksize) { - // set padding size == data_dims.size() * 2 - auto data_shape = phi::vectorize(data_dims); - if (static_cast(paddings->size()) == data_dims.size()) { - for (int i = 0; i < data_dims.size(); ++i) { - T copy_pad = *(paddings->begin() + 2 * i); - paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); - } - } else { - PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(), - platform::errors::InvalidArgument( - "Paddings size %d should be the same or twice as the " - "pooling size %d.", - paddings->size(), data_dims.size() * 2)); - } - - // when padding_algorithm is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (int i = 0; i < data_dims.size(); ++i) { - T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; - T pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], - static_cast(0)); - T pad_0 = pad_sum / 2; - T pad_1 = pad_sum - pad_0; - *(paddings->begin() + i * 2) = pad_0; - *(paddings->begin() + i * 2 + 1) = pad_1; - } - } else if (padding_algorithm == "VALID") { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } - - // if global_pooling == true or adaptive == true, padding will be ignore - if (global_pooling || adaptive) { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } -} - -template -inline void UpdateKsize(std::vector* ksize, - const framework::DDim data_dims) { - ksize->resize(static_cast(data_dims.size())); - for (size_t i = 0; i < ksize->size(); ++i) { - *(ksize->begin() + i) = static_cast(data_dims[i]); - } -} - -inline int getReduceNum(const framework::Tensor& input, - const framework::Tensor* output, - const std::string data_format, - std::vector* reduce_dim) { - // data_format only can be NCHW - bool channel_last = (data_format == "NHWC"); - if (channel_last) { - return 0; - } - int reduce_num = 0; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - if ((output_height == 1) && (output_width == 1)) { - 
reduce_dim->push_back(2); - reduce_dim->push_back(3); - reduce_num = input.dims()[2] * input.dims()[3]; - } - return reduce_num; -} - -template -class PoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::string data_format = context.Attr("data_format"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - auto& dev_ctx = context.template device_context(); - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool2d_forward; - paddle::operators::math::MaxPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - std::vector reduce_dim; - int reduce_num = getReduceNum(*in_x, out, data_format, &reduce_dim); - if (reduce_num > 0 && - adaptive) { // for adaptive_avg_pool2d && output_size == 1 -#if defined(__HIPCC__) || defined(__NVCC__) - auto stream = dev_ctx.stream(); - TensorReduceImpl>( - dev_ctx, *in_x, out, kps::DivideFunctor(reduce_num), - reduce_dim, stream); -#else // for cpu - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); -#endif - } else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1 - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); - } - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool3d_forward; - paddle::operators::math::MaxPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool3d_forward; - paddle::operators::math::AvgPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, 
- exclusive, adaptive, out, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class PoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - std::string data_format = context.Attr("data_format"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - auto& dev_ctx = context.template device_context(); - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant(dev_ctx, in_x_grad, static_cast(0.0)); - - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor - pool2d_backward; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool2dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool2d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor - pool3d_backward; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool3d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; - -template -class PoolGradGradKernel : public PoolKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::string pooling_type = context.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad 
only supports avgpool.")); - } else { - PoolKernel::Compute(context); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index 08656e6423..fa88d128a9 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -80,10 +81,10 @@ class MLUPoolOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType()); @@ -191,10 +192,10 @@ class MLUPoolGradOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } // inputs need with NHWC layout diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc index bd26d6350d..0efcb8b798 100644 --- a/paddle/fluid/operators/pool_op_npu.cc +++ b/paddle/fluid/operators/pool_op_npu.cc @@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
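The MLU and NPU kernels in this patch now call phi::funcs::UpdatePadding / UpdateKernelSize instead of the helpers that previously lived in pool_op.h. A minimal standalone sketch of that padding-update arithmetic, assuming the usual SAME/VALID conventions; the UpdatePaddingSketch name and signature below are illustrative, not the phi helper itself.

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

// Expand one-value-per-dim paddings into (before, after) pairs, then apply the
// "SAME"/"VALID" algorithms; global or adaptive pooling zeroes the paddings.
void UpdatePaddingSketch(std::vector<int>* paddings, bool global_pooling,
                         bool adaptive, const std::string& padding_algorithm,
                         const std::vector<int>& data_dims,
                         const std::vector<int>& strides,
                         const std::vector<int>& ksize) {
  if (paddings->size() == data_dims.size()) {
    for (size_t i = 0; i < data_dims.size(); ++i) {
      int copy_pad = (*paddings)[2 * i];
      paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
    }
  }
  if (padding_algorithm == "SAME") {
    for (size_t i = 0; i < data_dims.size(); ++i) {
      int out = (data_dims[i] + strides[i] - 1) / strides[i];  // ceil(in / stride)
      int pad_sum =
          std::max((out - 1) * strides[i] + ksize[i] - data_dims[i], 0);
      (*paddings)[2 * i] = pad_sum / 2;                // pad before
      (*paddings)[2 * i + 1] = pad_sum - pad_sum / 2;  // pad after
    }
  } else if (padding_algorithm == "VALID") {
    std::fill(paddings->begin(), paddings->end(), 0);
  }
  if (global_pooling || adaptive) {  // padding is ignored in these modes
    std::fill(paddings->begin(), paddings->end(), 0);
  }
}

int main() {
  std::vector<int> paddings{0, 0};
  UpdatePaddingSketch(&paddings, false, false, "SAME", {7, 7}, {2, 2}, {3, 3});
  // SAME padding for a 7x7 input, 3x3 kernel, stride 2 -> prints "1 1 1 1"
  std::printf("%d %d %d %d\n", paddings[0], paddings[1], paddings[2],
              paddings[3]);
  return 0;
}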
*/ -#include "paddle/fluid/operators/pool_op.h" + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -68,8 +70,8 @@ class NPUPoolOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], platform::errors::InvalidArgument( @@ -201,8 +203,8 @@ class NPUPoolGradOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index 402dd6c108..87c437d8a7 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -8,13 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" + #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_XPU namespace paddle { namespace operators { +using framework::Tensor; + xpu::Pooling_t XPUPoolingType(const std::string& pooltype, bool exclusive, bool is_test) { if (pooltype == "max") { diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index d061f9ae05..e0341f4a4b 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_with_index_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,71 +32,6 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Mask"), true, - platform::errors::InvalidArgument( - "Output(Mask) of Pooling should not be null.")); - - auto in_x_dims = ctx->GetInputDim("X"); - - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool adaptive = ctx->Attrs().Get("adaptive"); - - PADDLE_ENFORCE( - in_x_dims.size() == 4 || in_x_dims.size() == 5, - platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D " - "tensor but received %dD-Tensor", - in_x_dims.size())); - - if (ctx->Attrs().Get("global_pooling")) { - ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_dims[i + 2]); - } - } - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "The input size %d minus the kernel size %d should equal to 2.", - in_x_dims.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "Strides size %d and pooling size %d should be the same.", - strides.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), paddings.size(), - platform::errors::InvalidArgument( - "Paddings size %d and pooling size %d should be the same.", - paddings.size(), ksize.size())); - - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (size_t i = 0; i < ksize.size(); ++i) { - if ((!ctx->IsRuntime()) && (in_x_dims[i + 2] < 0)) { - output_shape.push_back(in_x_dims[i + 2]); - } else { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); - } - } - } - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->SetOutputDim("Mask", phi::make_ddim(output_shape)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -106,22 +45,6 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Mask"), true, - platform::errors::InvalidArgument("Input(Mask) must not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) must not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should not be null.")); - 
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "Output(X@GRAD) should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -335,40 +258,34 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index, + MaxPool2dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index_grad, + MaxPool2dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); + REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool2dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool2dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool2dWithIndexGradInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index, + MaxPool3dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index_grad, + MaxPool3dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool3dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool3dWithIndexGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc deleted file mode 100644 index 5497dcbd9c..0000000000 --- a/paddle/fluid/operators/pool_with_index_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/pool_with_index_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); - -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h deleted file mode 100644 index 6e51a833f5..0000000000 --- a/paddle/fluid/operators/pool_with_index_op.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxPoolWithIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - Tensor* mask = context.Output("Mask"); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - - auto& dev_ctx = context.template device_context(); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x->dims()[i + 2]); - } - } - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor - pool2d_forward; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor - pool3d_forward; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class MaxPoolWithIndexGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* mask = context.Input("Mask"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); 
++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_grad->dims()[i + 2]); - } - } - - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.template device_context(); - phi::funcs::set_constant(device_ctx, in_x_grad, 0); - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor - pool2d_backward; - pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor - pool3d_backward; - pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index bff8061814..aa944cfcfb 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -16,9 +16,10 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -53,14 +54,20 @@ class SppKernel : public framework::OpKernel { out_level.mutable_data(output_shape, context.GetPlace()); // pooling if (pooling_type == "max") { - math::Pool2dFunctor, T> pool_forward; - math::MaxPool max_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::MaxPool, T> + pool_forward; + phi::funcs::MaxPool max_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, max_process); } else if (pooling_type == "avg") { - math::Pool2dFunctor, T> pool_forward; - math::AvgPool avg_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPool, T> + pool_forward; + phi::funcs::AvgPool avg_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, avg_process); @@ -95,7 +102,9 @@ class SppGradKernel : public framework::OpKernel { std::string pooling_type = context.template Attr("pooling_type"); auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; + phi::funcs::SetConstant< + typename framework::ConvertToPhiContext::TYPE, T> + zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); auto out_stride = phi::stride(out->dims()); @@ -145,14 +154,18 @@ class SppGradKernel : public framework::OpKernel { outgrad_level.Resize(out_shape); // pooling backward if (pooling_type == "max") { - math::MaxPool2dGradFunctor pool2d_backward; + phi::funcs::MaxPool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, T> + pool2d_backward; pool2d_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, in_x_grad); } else if (pooling_type == "avg") { - math::Pool2dGradFunctor, T> + phi::funcs::Pool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPoolGrad, T> pool_backward; - math::AvgPoolGrad avg_process; + phi::funcs::AvgPoolGrad avg_process; pool_backward(context.template 
device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, true, false, in_x_grad, avg_process); diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h index 58e5440689..a776a78616 100644 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h index 7f676cbb65..f6112fb59c 100644 --- a/paddle/fluid/operators/unsqueeze_op.h +++ b/paddle/fluid/operators/unsqueeze_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 79f8d1c057..10c3a7c1a3 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -26,11 +26,13 @@ namespace phi { // TODO(chenweihang): add other flags if needed struct MetaConfig { bool is_runtime{true}; - + bool is_run_mkldnn_kernel{false}; MetaConfig() = default; // supporting implicit construction is easier to use - MetaConfig(bool is_runtime) : is_runtime(is_runtime) {} // NOLINT + MetaConfig(bool is_runtime, bool is_run_mkldnn_kernel) + : is_runtime(is_runtime), + is_run_mkldnn_kernel(is_run_mkldnn_kernel) {} // NOLINT }; class MetaTensor { diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index a2bdf6b963..37d1a234b5 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -122,6 +122,35 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, dx->share_meta(dout); } +void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, + const MetaTensor& mask, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* dx) { + dx->share_meta(x); +} + +void PoolGradInferMeta(const MetaTensor& x, + const MetaTensor& out, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* dx) { + dx->share_meta(x); +} + void PsroiPoolGradInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 921df46011..06ee5a205d 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -54,6 +54,16 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, int axis, MetaTensor* dx); +void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, + const MetaTensor& mask, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* dx); + void PsroiPoolGradInferMeta(const MetaTensor& x, const MetaTensor& rois, 
paddle::optional rois_num, @@ -64,6 +74,21 @@ void PsroiPoolGradInferMeta(const MetaTensor& x, float spatial_scale, MetaTensor* dx); +void PoolGradInferMeta(const MetaTensor& x, + const MetaTensor& out, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* dx); + void ScatterGradInferMeta(const MetaTensor& index, const MetaTensor& updates, const MetaTensor& out_grad, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index f7693c2f90..d09a2191fb 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" namespace phi { @@ -553,6 +554,78 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(DataType::BOOL); } +void MaxPoolWithIndexInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* out, + MetaTensor* mask, + MetaConfig config) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + auto x_dims = x.dims(); + + PADDLE_ENFORCE( + x_dims.size() == 4 || x_dims.size() == 5, + errors::InvalidArgument( + "Pooling intput should be 4-D or 5-D tensor but received %dD-Tensor", + x_dims.size())); + + if (global_pooling) { + kernel_size_.resize(static_cast(x_dims.size()) - 2); + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(x_dims[i + 2]); + } + } + + PADDLE_ENFORCE_EQ( + x_dims.size() - kernel_size_.size(), + 2U, + errors::InvalidArgument( + "The input size %d minus the kernel size %d should equal to 2.", + x_dims.size(), + kernel_size_.size())); + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + strides.size(), + errors::InvalidArgument( + "Strides size %d and pooling size %d should be the same.", + strides.size(), + kernel_size_.size())); + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + paddings_.size(), + errors::InvalidArgument( + "Paddings size %d and pooling size %d should be the same.", + paddings_.size(), + kernel_size_.size())); + + std::vector output_shape({x_dims[0], x_dims[1]}); + if (adaptive) { + output_shape.insert( + output_shape.end(), kernel_size_.begin(), kernel_size_.end()); + } else { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + if ((!config.is_runtime) && (x_dims[i + 2] < 0)) { + output_shape.push_back(x_dims[i + 2]); + } else { + output_shape.push_back(funcs::MaxPoolOutputSize( + x_dims[i + 2], kernel_size_[i], paddings_[i], strides[i])); + } + } + } + + out->set_dims(make_ddim(output_shape)); + out->set_dtype(x.dtype()); + + mask->set_dims(make_ddim(output_shape)); + mask->set_dtype(paddle::experimental::CppTypeToDataType::Type()); +} + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, @@ -675,6 +748,118 @@ void PixelShuffleInferMeta(const MetaTensor& x, out->set_dims(output_dims); } +void PoolInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, 
+ bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* out, + MetaConfig config) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + auto x_dims = x.dims(); + PADDLE_ENFORCE_EQ( + x_dims.size() == 4 || x_dims.size() == 5, + true, + errors::InvalidArgument( + "the input of Op(pool) should be 4-D or 5-D Tensor. But " + "received: %u-D Tensor and it's shape is [%s].", + x_dims.size(), + x_dims)); + + PADDLE_ENFORCE_EQ(x_dims.size() - kernel_size_.size(), + 2U, + errors::InvalidArgument( + "the dimension of input minus the size of " + "Attr(kernel_size_) must be euqal to 2 in Op(pool). " + "But received: the dimension of input minus the size " + "of Attr(kernel_size_) is %d, the " + "input's dimension is %d, the shape of input " + "is [%s], the Attr(kernel_size_)'s size is %d, the " + "Attr(kernel_size_) is [%s].", + x_dims.size() - kernel_size_.size(), + x_dims.size(), + x_dims, + kernel_size_.size(), + make_ddim(kernel_size_))); + + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + strides.size(), + errors::InvalidArgument( + "the size of Attr(kernel_size_) and Attr(strides) in " + "Op(pool) must be equal. " + "But received: Attr(kernel_size_)'s size is %d, Attr(strides)'s " + "size is %d, Attr(kernel_size_) is [%s], Attr(strides)is [%s].", + kernel_size_.size(), + strides.size(), + make_ddim(kernel_size_), + make_ddim(strides))); + + // MKL-DNN Kernels are using NCHW order of dims description + // so we ignore data_format consideration for MKL-DNN kernel + const bool channel_last = (config.is_run_mkldnn_kernel == false) && + (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings if "SAME" or global_pooling + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + std::vector output_shape; + if (adaptive) { + output_shape.insert( + output_shape.end(), kernel_size_.begin(), kernel_size_.end()); + } else { + for (int i = 0; i < data_dims.size(); ++i) { + if ((!config.is_runtime) && (data_dims[i] < 0)) { + output_shape.push_back(data_dims[i]); + } else { + output_shape.push_back(funcs::PoolOutputSize(data_dims[i], + kernel_size_[i], + paddings_[2 * i], + paddings_[2 * i + 1], + strides[i], + ceil_mode)); + } + } + } + + // output_N = input_N + output_shape.insert(output_shape.begin(), x_dims[0]); + // output_C = input_C + if (channel_last) { + output_shape.push_back(x_dims[x_dims.size() - 1]); + } else { + output_shape.insert(output_shape.begin() + 1, x_dims[1]); + } + + out->set_dims(make_ddim(output_shape)); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dims(x.dims()); out->set_dtype(dtype::ToReal(x.dtype())); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 539b6dcba4..a1fc6fd405 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -98,6 +98,16 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); +void MaxPoolWithIndexInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const 
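For the non-adaptive case, PoolInferMeta above defers to funcs::PoolOutputSize. A standalone sketch of that arithmetic, assuming the usual convention that ceil_mode rounds the division up; the name below is illustrative, not the phi helper.

#include <cstdio>

int PoolOutputSizeSketch(int input, int ksize, int pad_before, int pad_after,
                         int stride, bool ceil_mode) {
  int numerator = input - ksize + pad_before + pad_after;
  if (ceil_mode) {
    numerator += stride - 1;  // round the division up instead of down
  }
  return numerator / stride + 1;
}

int main() {
  // 7-wide input, 2-wide kernel, stride 2, no padding:
  // floor mode -> 3, ceil mode -> 4.
  std::printf("%d %d\n", PoolOutputSizeSketch(7, 2, 0, 0, 2, false),
              PoolOutputSizeSketch(7, 2, 0, 0, 2, true));
  return 0;
}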
std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* out, + MetaTensor* mask, + MetaConfig config = MetaConfig()); + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, @@ -114,6 +124,20 @@ void PixelShuffleInferMeta(const MetaTensor& x, const std::string& data_format, MetaTensor* out); +void PoolInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); void ReduceInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 093cb65497..d443b7bb2a 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -11,7 +11,7 @@ set_property(GLOBAL PROPERTY PHI_KERNELS "") # [ 1. Common kernel compilation dependencies ] set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) @@ -27,22 +27,25 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. 
-set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel maxout_kernel maxout_grad_kernel put_along_axis_kernel put_along_axis_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel eigh_kernel segment_pool_kernel segment_pool_grad_kernel matrix_power_kernel matrix_power_grad_kernel) +set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel) +kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) +kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) -kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) +kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) +kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) +kernel_library(pool_kernel DEPS ${COMMON_KERNEL_DEPS} pooling) kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) -kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) -kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) kernel_library(segment_pool_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling) kernel_library(segment_pool_grad_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling) +kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) # 4. 
auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index 4e72159aec..cf83ab9aaa 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -40,7 +40,7 @@ DenseTensor Concat(const Context& dev_ctx, DenseTensor dense_out; MetaTensor meta_out(&dense_out); - ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out, /*is_runtime=*/true); + ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out); ConcatKernel(dev_ctx, x, axis, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/cpu/pool_grad_kernel.cc b/paddle/phi/kernels/cpu/pool_grad_kernel.cc new file mode 100644 index 0000000000..bb97694d8f --- /dev/null +++ b/paddle/phi/kernels/cpu/pool_grad_kernel.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + pool2d_grad, CPU, ALL_LAYOUT, phi::Pool2dGradKernel, float, double) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + CPU, + ALL_LAYOUT, + phi::Pool2dDoubleGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index_grad, + CPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL( + pool3d_grad, CPU, ALL_LAYOUT, phi::Pool3dGradKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool3d_with_index_grad, + CPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/cpu/pool_kernel.cc b/paddle/phi/kernels/cpu/pool_kernel.cc new file mode 100644 index 0000000000..1d57e282c3 --- /dev/null +++ b/paddle/phi/kernels/cpu/pool_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/impl/pool_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d, CPU, ALL_LAYOUT, phi::Pool2dKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index, + CPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d, CPU, ALL_LAYOUT, phi::Pool3dKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool3d_with_index, + CPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 324798effb..ea8e2702c1 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -38,7 +38,7 @@ void SplitKernel(const Context& dev_ctx, out_metas_ptr.push_back(&out_metas.back()); } - phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr, true); + phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr); for (size_t i = 0; i < out_metas.size(); ++i) { outs[i]->Resize(out_metas[i].dims()); diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index e0db7b51f8..942eecae16 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -3,11 +3,12 @@ add_subdirectory(blas) add_subdirectory(lapack) add_subdirectory(detail) -math_library(math_function DEPS blas dense_tensor tensor) -math_library(segment_pooling) -math_library(sequence2batch) +math_library(concat_and_split_functor DEPS dense_tensor) math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) -math_library(concat_and_split_functor DEPS dense_tensor) +math_library(math_function DEPS blas dense_tensor tensor) math_library(matrix_reduce DEPS dense_tensor) math_library(matrix_inverse DEPS dense_tensor eigen3 blas) +math_library(pooling DEPS dense_tensor) +math_library(segment_pooling) +math_library(sequence2batch) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/phi/kernels/funcs/pooling.cc similarity index 83% rename from paddle/fluid/operators/math/pooling.cc rename to paddle/phi/kernels/funcs/pooling.cc index f2e5e955ec..10c88b9798 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/phi/kernels/funcs/pooling.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -11,11 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/pooling.h" -namespace paddle { -namespace operators { -namespace math { +#include "paddle/phi/kernels/funcs/pooling.h" + +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" + +namespace phi { +namespace funcs { /* * Tensors are in NCHW or NHWC format. @@ -25,13 +29,16 @@ namespace math { * height_down, width_left and width_right, respectively. 
*/ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; @@ -50,7 +57,7 @@ class Pool2dFunctor { const int output_stride = output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -101,12 +108,16 @@ class Pool2dFunctor { } } - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -131,7 +142,7 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -244,14 +255,19 @@ class Pool2dFunctor { * height_down, width_left and width_right, respectively. 
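For reference, a standalone sketch of the per-window loop a CPU Pool2dFunctor performs for average pooling on one plane, including the exclusive/inclusive divisor choice; this illustrates the algorithm only and is not the phi functor.

#include <algorithm>
#include <cstdio>
#include <vector>

// 'exclusive' divides by the number of in-bounds elements the window actually
// covers; otherwise the divisor stays at the full kernel area kh * kw.
std::vector<float> AvgPool2dPlane(const std::vector<float>& in, int H, int W,
                                  int kh, int kw, int sh, int sw, int ph,
                                  int pw, bool exclusive) {
  const int OH = (H - kh + 2 * ph) / sh + 1;
  const int OW = (W - kw + 2 * pw) / sw + 1;
  std::vector<float> out(OH * OW, 0.f);
  for (int oh = 0; oh < OH; ++oh) {
    int hstart = std::max(oh * sh - ph, 0);
    int hend = std::min(oh * sh - ph + kh, H);
    for (int ow = 0; ow < OW; ++ow) {
      int wstart = std::max(ow * sw - pw, 0);
      int wend = std::min(ow * sw - pw + kw, W);
      float sum = 0.f;
      for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
          sum += in[h * W + w];
        }
      }
      int divisor = exclusive ? (hend - hstart) * (wend - wstart) : kh * kw;
      out[oh * OW + ow] = sum / divisor;
    }
  }
  return out;
}

int main() {
  std::vector<float> in(4 * 4, 1.f);  // 4x4 plane of ones
  // 3x3 kernel, stride 1, padding 1: exclusive keeps every output at 1.0,
  // inclusive shrinks border outputs because the divisor stays 9.
  auto excl = AvgPool2dPlane(in, 4, 4, 3, 3, 1, 1, 1, 1, true);
  auto incl = AvgPool2dPlane(in, 4, 4, 3, 3, 1, 1, 1, 1, false);
  std::printf("corner exclusive=%.3f inclusive=%.3f\n", excl[0], incl[0]);
  return 0;
}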
*/ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -270,7 +286,7 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -324,13 +340,18 @@ class Pool2dGradFunctor { } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - bool exclusive, bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -357,7 +378,7 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -451,10 +472,11 @@ class Pool2dGradFunctor { h * input_width * input_channels + w * input_channels + c; auto output_idx = ph * output_width * output_channels + pw * output_channels + c; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -477,13 +499,16 @@ class Pool2dGradFunctor { * height_down, width_left and width_right, respectively. 
*/ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -502,7 +527,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -536,12 +561,15 @@ class MaxPool2dGradFunctor { } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -568,7 +596,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); if (!channel_last) { const int input_stride = input_height * input_width; @@ -641,29 +669,17 @@ class MaxPool2dGradFunctor { } } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; - -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor, - double>; -template class Pool2dGradFunctor, - double>; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, double>; +template class Pool2dGradFunctor, double>; /* * Tensors are in NCDHW or NDHWC format. 
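A standalone sketch of the backward rule the MaxPool2dGradFunctor hunks implement: each output gradient is routed to the first input element in the window that equals the pooled maximum. Illustrative only, not the phi functor.

#include <algorithm>
#include <cstdio>
#include <vector>

void MaxPool2dGradPlane(const std::vector<float>& in,
                        const std::vector<float>& out,
                        const std::vector<float>& dout, int H, int W, int OH,
                        int OW, int kh, int kw, int sh, int sw, int ph, int pw,
                        std::vector<float>* din) {
  din->assign(H * W, 0.f);
  for (int oh = 0; oh < OH; ++oh) {
    int hstart = std::max(oh * sh - ph, 0);
    int hend = std::min(oh * sh - ph + kh, H);
    for (int ow = 0; ow < OW; ++ow) {
      int wstart = std::max(ow * sw - pw, 0);
      int wend = std::min(ow * sw - pw + kw, W);
      bool stop = false;  // stop after the first element that matches the max
      for (int h = hstart; h < hend && !stop; ++h) {
        for (int w = wstart; w < wend && !stop; ++w) {
          if (in[h * W + w] == out[oh * OW + ow]) {
            (*din)[h * W + w] += dout[oh * OW + ow];
            stop = true;
          }
        }
      }
    }
  }
}

int main() {
  // 2x2 input, 2x2 kernel, stride 2: the single output picks the max (4.0),
  // so the whole output gradient lands on that position.
  std::vector<float> in{1.f, 2.f, 3.f, 4.f}, out{4.f}, dout{1.f}, din;
  MaxPool2dGradPlane(in, out, dout, 2, 2, 1, 1, 2, 2, 2, 2, 0, 0, &din);
  for (float g : din) std::printf("%.1f ", g);  // 0.0 0.0 0.0 1.0
  std::printf("\n");
  return 0;
}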
@@ -674,13 +690,16 @@ template class Pool2dGradFunctor -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; @@ -704,7 +723,7 @@ class Pool3dFunctor { const int output_stride = output_depth * output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -771,12 +790,16 @@ class Pool3dFunctor { } } } - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -807,7 +830,7 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -966,14 +989,19 @@ class Pool3dFunctor { * height_up, height_down, width_left and width_right, respectively. 
*/ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -997,7 +1025,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -1051,10 +1079,11 @@ class Pool3dGradFunctor { int input_idx = (d * input_height + h) * input_width + w; int output_idx = (pd * output_height + ph) * output_width + pw; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1068,13 +1097,18 @@ class Pool3dGradFunctor { } } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - bool exclusive, bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1105,7 +1139,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -1164,10 +1198,11 @@ class Pool3dGradFunctor { int input_idx = (d * input_height + h) * input_width + w; int output_idx = (pd * output_height + ph) * output_width + pw; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1241,10 +1276,11 @@ class Pool3dGradFunctor { ((pd * output_height + ph) * 
output_width + pw) * output_channels + c; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1270,13 +1306,16 @@ class Pool3dGradFunctor { * height_up, height_down, width_left and width_right, respectively. */ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -1300,7 +1339,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -1342,12 +1381,15 @@ class MaxPool3dGradFunctor { } } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1378,7 +1420,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); if (!channel_last) { const int input_stride = input_depth * input_height * input_width; @@ -1475,29 +1517,17 @@ class MaxPool3dGradFunctor { } } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; - -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor, - double>; -template class Pool3dGradFunctor, - double>; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dFunctor, double>; +template class 
Pool3dFunctor, double>; +template class Pool3dGradFunctor, double>; +template class Pool3dGradFunctor, double>; /* * All tensors are in NCHW format. @@ -1505,13 +1535,16 @@ template class Pool3dGradFunctor -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -1528,8 +1561,8 @@ class MaxPool2dWithIndexFunctor { const int output_stride = output_height * output_width; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int hstart, hend; int wstart, wend; @@ -1583,14 +1616,16 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_height = input_grad->dims()[2]; const int input_width = input_grad->dims()[3]; @@ -1602,7 +1637,7 @@ class MaxPool2dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); for (int n = 0; n < batch_size; ++n) { for (int c = 0; c < output_channels; ++c) { @@ -1622,14 +1657,10 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; /* * All tensors are in NCDHW format. 
@@ -1637,13 +1668,16 @@ template class MaxPool2dWithIndexGradFunctor -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -1665,8 +1699,8 @@ class MaxPool3dWithIndexFunctor { const int output_stride = output_depth * output_height * output_width; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int dstart, dend; int hstart, hend; @@ -1735,14 +1769,16 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. */ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_depth = input_grad->dims()[2]; const int input_height = input_grad->dims()[3]; @@ -1756,7 +1792,7 @@ class MaxPool3dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); for (int n = 0; n < batch_size; ++n) { for (int c = 0; c < output_channels; ++c) { @@ -1779,14 +1815,9 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu similarity index 54% rename from paddle/fluid/operators/math/pooling.cu rename to paddle/phi/kernels/funcs/pooling.cu index 9d96345eb1..4cf5e1c02c 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 paddlepaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,63 +12,72 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/kernels/funcs/pooling.h" + #include #include - -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { struct FastDivModForPooling { public: - platform::FastDivMod channel; - platform::FastDivMod width; - platform::FastDivMod height; + paddle::platform::FastDivMod channel; + paddle::platform::FastDivMod width; + paddle::platform::FastDivMod height; explicit HOSTDEVICE FastDivModForPooling(const int channels, const int output_width, const int output_height) { - channel = platform::FastDivMod(channels); - width = platform::FastDivMod(output_width); - height = platform::FastDivMod(output_height); + channel = paddle::platform::FastDivMod(channels); + width = paddle::platform::FastDivMod(output_width); + height = paddle::platform::FastDivMod(output_height); } }; struct FastDivModForPoolingWithMoreStaff { public: - platform::FastDivMod channel; - platform::FastDivMod width; - platform::FastDivMod height; - platform::FastDivMod ksize_w; - platform::FastDivMod ksize_h; - platform::FastDivMod stride_w; - platform::FastDivMod stride_h; + paddle::platform::FastDivMod channel; + paddle::platform::FastDivMod width; + paddle::platform::FastDivMod height; + paddle::platform::FastDivMod ksize_w; + paddle::platform::FastDivMod ksize_h; + paddle::platform::FastDivMod stride_w; + paddle::platform::FastDivMod stride_h; explicit HOSTDEVICE FastDivModForPoolingWithMoreStaff( - const int channels, const int input_width, const int input_height, - const int ksize_width, const int ksize_height, const int stride_width, + const int channels, + const int input_width, + const int input_height, + const int ksize_width, + const int ksize_height, + const int stride_width, const int stride_height) { - channel = platform::FastDivMod(channels); - width = platform::FastDivMod(input_width); - height = platform::FastDivMod(input_height); - ksize_w = platform::FastDivMod(ksize_width); - ksize_h = platform::FastDivMod(ksize_height); - stride_w = platform::FastDivMod(stride_width); - stride_h = platform::FastDivMod(stride_height); + channel = paddle::platform::FastDivMod(channels); + width = paddle::platform::FastDivMod(input_width); + height = paddle::platform::FastDivMod(input_height); + ksize_w = paddle::platform::FastDivMod(ksize_width); + ksize_h = paddle::platform::FastDivMod(ksize_height); + stride_w = paddle::platform::FastDivMod(stride_width); + stride_h = paddle::platform::FastDivMod(stride_height); } }; template -__device__ void OffsetPreparationFor4Dimension( - int index, bool channel_last, FastDivModForPooling divmods, - const int pad_width, const int pad_height, const int aux_width, - const int aux_height, int* w_offset, int* h_offset, int* c_offset, - int* stride) { +__device__ void OffsetPreparationFor4Dimension(int index, + bool channel_last, + FastDivModForPooling divmods, + const int pad_width, + const int pad_height, + const int aux_width, + const int aux_height, + int* w_offset, + int* h_offset, + int* c_offset, + int* stride) { if (!channel_last) { /* NCHW */ auto input_width_divmod = divmods.width.Divmod(index); auto input_height_divmod = 
divmods.height.Divmod(input_width_divmod.val[0]); @@ -91,21 +100,40 @@ __device__ void OffsetPreparationFor4Dimension( } template -__global__ void KernelPool2D( - const int nthreads, const T* input_data, const int channels, - const int input_height, const int input_width, const int output_height, - const int output_width, const int ksize_height, const int ksize_width, - const int stride_height, const int stride_width, const int padding_height, - const int padding_width, FastDivModForPooling divmods, - PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data, - bool channel_last = false) { +__global__ void KernelPool2D(const int nthreads, + const T* input_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + FastDivModForPooling divmods, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* output_data, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int hstart, hend, wstart, wend; int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, channel_last, divmods, 0, 0, input_width, input_height, - &w_offset, &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + channel_last, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; if (adaptive) { @@ -139,25 +167,43 @@ __global__ void KernelPool2D( } template -__global__ void KernelPool2DGrad( - const int nthreads, const T* __restrict__ input_data, - const T* __restrict__ output_data, const const T* __restrict__ output_grad, - const int output_width, const int output_height, const int input_width, - const int input_height, const int ksize_width, const int ksize_height, - const int stride_width, const int stride_height, const int padding_width, - const int padding_height, FastDivModForPoolingWithMoreStaff divmods, - PoolProcess pool_process, bool exclusive, bool adaptive, - T* __restrict__ input_grad, bool channel_last = false) { +__global__ void KernelPool2DGrad(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ output_data, + const const T* __restrict__ output_grad, + const int output_width, + const int output_height, + const int input_width, + const int input_height, + const int ksize_width, + const int ksize_height, + const int stride_width, + const int stride_height, + const int padding_width, + const int padding_height, + FastDivModForPoolingWithMoreStaff divmods, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* __restrict__ input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { T input = static_cast(0); T input_grad_data = static_cast(0); int phstart, phend, pwstart, pwend; int w_offset, h_offset, c_offset, output_offset; - OffsetPreparationFor4Dimension<>(index, channel_last, divmods, - padding_width, padding_height, - output_width, output_height, &w_offset, - &h_offset, &c_offset, &output_offset); + OffsetPreparationFor4Dimension<>(index, + channel_last, + divmods, + padding_width, + padding_height, + output_width, + output_height, + &w_offset, + &h_offset, + &c_offset, + &output_offset); if 
(pool_process.use_x) { input = input_data[index]; output_data += output_offset; @@ -188,7 +234,9 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute(input, ouput_value, output_grad[output_sub_idx], + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], static_cast(1.0 / pool_size), &input_grad_data); } @@ -217,9 +265,11 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute( - input, ouput_value, output_grad[output_sub_idx], - static_cast(1.0 / pool_size), &input_grad_data); + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), + &input_grad_data); } } } else { @@ -232,9 +282,11 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute( - input, ouput_value, output_grad[output_sub_idx], - static_cast(1.0 / pool_size), &input_grad_data); + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), + &input_grad_data); } } } @@ -244,19 +296,38 @@ __global__ void KernelPool2DGrad( } template -__global__ void KernelMaxPool2DGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, const int channels, const int input_height, - const int input_width, const int output_height, const int output_width, - const int ksize_height, const int ksize_width, const int stride_height, - const int stride_width, const int padding_height, const int padding_width, - T* input_grad, FastDivModForPooling divmods, bool channel_last = false) { +__global__ void KernelMaxPool2DGrad(const int nthreads, + const T* input_data, + const T* output_data, + const T* output_grad, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + T* input_grad, + FastDivModForPooling divmods, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, channel_last, divmods, 0, 0, input_width, input_height, - &w_offset, &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + channel_last, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; input_grad += input_offset; @@ -285,17 +356,24 @@ __global__ void KernelMaxPool2DGrad( if (maxIndex != -1) { // atomic add - platform::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]); + paddle::platform::CudaAtomicAdd(input_grad + maxIndex, + output_grad[index]); } } } template void Pool2dDirectCUDAFunctor::operator()( - const T* input, const std::vector& input_shape, - const std::vector& output_shape, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - bool exclusive, bool adaptive, T* output, gpuStream_t stream, + const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool 
adaptive, + T* output, + gpuStream_t stream, PoolProcess pool_compute) { const int batch_size = input_shape[0]; const int input_channels = input_shape[1]; @@ -314,7 +392,7 @@ void Pool2dDirectCUDAFunctor::operator()( int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - // platform::ChangeThreadNum(context, &thread_num); + // paddle::platform::ChangeThreadNum(context, &thread_num); thread_num = 512; #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -323,11 +401,24 @@ void Pool2dDirectCUDAFunctor::operator()( auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); - KernelPool2D<<>>( - nthreads, input, input_channels, input_height, input_width, output_height, - output_width, ksize_height, ksize_width, stride_height, stride_width, - padding_height, padding_width, pool_divmods, pool_compute, exclusive, - adaptive, output); + KernelPool2D<<>>(nthreads, + input, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_compute, + exclusive, + adaptive, + output); } /* @@ -338,13 +429,16 @@ void Pool2dDirectCUDAFunctor::operator()( * height_down, width_left and width_right, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -361,12 +455,12 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -375,17 +469,35 @@ class Pool2dFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelPool2D<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, pool_divmods, pool_process, - exclusive, adaptive, output_data); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_process, + exclusive, + adaptive, + output_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool 
adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -410,12 +522,12 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -424,10 +536,25 @@ class Pool2dFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelPool2D<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, pool_divmods, pool_process, - exclusive, adaptive, output_data, channel_last); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_process, + exclusive, + adaptive, + output_data, + channel_last); } }; /* @@ -438,16 +565,18 @@ class Pool2dFunctor { * height_down, width_left and width_right, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -465,30 +594,53 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; - auto pool_divmods = FastDivModForPoolingWithMoreStaff( - input_channels, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height); - - auto config = GetGpuLaunchConfig1D(context, nthreads); - KernelPool2DGrad<<< - config.block_per_grid, config.thread_per_block, 0, context.stream()>>>( - nthreads, input_data, output_data, output_grad_data, output_width, - output_height, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height, padding_width, padding_height, - pool_divmods, pool_process, exclusive, adaptive, input_grad_data); + auto pool_divmods = FastDivModForPoolingWithMoreStaff(input_channels, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height); + + auto config = 
phi::backends::gpu::GetGpuLaunchConfig1D(context, nthreads); + KernelPool2DGrad<<>>(nthreads, + input_data, + output_data, + output_grad_data, + output_width, + output_height, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height, + padding_width, + padding_height, + pool_divmods, + pool_process, + exclusive, + adaptive, + input_grad_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -514,21 +666,41 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; - auto pool_divmods = FastDivModForPoolingWithMoreStaff( - input_channels, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height); - - auto config = GetGpuLaunchConfig1D(context, nthreads); - KernelPool2DGrad<<< - config.block_per_grid, config.thread_per_block, 0, context.stream()>>>( - nthreads, input_data, output_data, output_grad_data, output_width, - output_height, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height, padding_width, padding_height, - pool_divmods, pool_process, exclusive, adaptive, input_grad_data, - channel_last); + auto pool_divmods = FastDivModForPoolingWithMoreStaff(input_channels, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(context, nthreads); + KernelPool2DGrad<<>>(nthreads, + input_data, + output_data, + output_grad_data, + output_width, + output_height, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height, + padding_width, + padding_height, + pool_divmods, + pool_process, + exclusive, + adaptive, + input_grad_data, + channel_last); } }; @@ -540,16 +712,16 @@ class Pool2dGradFunctor { * height_down, width_left and width_right, respectively. 
*/ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - framework::Tensor* input_grad) { + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -567,7 +739,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_height * output_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -577,17 +749,33 @@ class MaxPool2dGradFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_height, input_width, output_height, output_width, ksize_height, - ksize_width, stride_height, stride_width, padding_height, padding_width, - input_grad_data, pool_divmods); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + input_grad_data, + pool_divmods); } - void operator()( - const platform::CUDADeviceContext& context, - const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad) { + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -614,7 +802,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_height * output_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -625,71 +813,80 @@ class MaxPool2dGradFunctor { FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_height, input_width, output_height, output_width, ksize_height, - ksize_width, stride_height, stride_width, padding_height, padding_width, - input_grad_data, pool_divmods, channel_last); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + 
input_grad_data, + pool_divmods, + channel_last); } }; -template class Pool2dDirectCUDAFunctor, - float>; -template class Pool2dDirectCUDAFunctor, - float>; - -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; - -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor, - double>; -template class Pool2dGradFunctor, - double>; - -template class Pool2dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPool, - paddle::platform::float16>; -template class Pool2dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPool, - paddle::platform::float16>; -template class Pool2dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPoolGrad, - paddle::platform::float16>; -template class Pool2dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPoolGrad, - paddle::platform::float16>; +template class Pool2dDirectCUDAFunctor, float>; +template class Pool2dDirectCUDAFunctor, float>; + +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, double>; +template class Pool2dGradFunctor, double>; + +template class Pool2dFunctor, + dtype::float16>; +template class Pool2dFunctor, + dtype::float16>; +template class Pool2dGradFunctor, + dtype::float16>; +template class Pool2dGradFunctor, + dtype::float16>; template -__global__ void KernelPool3D( - const int nthreads, const T* input_data, const int channels, - const int input_depth, const int input_height, const int input_width, - const int output_depth, const int output_height, const int output_width, - const int ksize_depth, const int ksize_height, const int ksize_width, - const int stride_depth, const int stride_height, const int stride_width, - const int padding_depth, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data, - bool channel_last = false) { +__global__ void KernelPool3D(const int nthreads, + const T* input_data, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* output_data, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw, ph, pd, c, batch_idx; @@ -764,16 +961,31 @@ __global__ void KernelPool3D( } template -__global__ void KernelPool3DGrad( - const int nthreads, const T* __restrict__ input_data, - const T* __restrict__ output_data, const T* __restrict__ output_grad, - const int channels, const int input_depth, const int input_height, - const int input_width, const int 
output_depth, const int output_height, - const int output_width, const int ksize_depth, const int ksize_height, - const int ksize_width, const int stride_depth, const int stride_height, - const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, PoolProcess pool_process, bool exclusive, - bool adaptive, T* input_grad, bool channel_last = false) { +__global__ void KernelPool3DGrad(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ output_data, + const T* __restrict__ output_grad, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset, h_offset, d_offset, c_offset, batch_idx, output_stride; @@ -867,7 +1079,9 @@ __global__ void KernelPool3DGrad( : (pd * output_height + ph) * output_width + pw; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute(input, ouput_value, output_grad[output_sub_idx], + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], static_cast(1.0 / pool_size), &input_grad_data); } @@ -878,15 +1092,28 @@ __global__ void KernelPool3DGrad( } template -__global__ void KernelMaxPool3DGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, const int channels, const int input_depth, - const int input_height, const int input_width, const int output_depth, - const int output_height, const int output_width, const int ksize_depth, - const int ksize_height, const int ksize_width, const int stride_depth, - const int stride_height, const int stride_width, const int padding_depth, - const int padding_height, const int padding_width, T* input_grad, - bool channel_last = false) { +__global__ void KernelMaxPool3DGrad(const int nthreads, + const T* input_data, + const T* output_data, + const T* output_grad, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + T* input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw, ph, pd, c, batch_idx; @@ -949,17 +1176,23 @@ __global__ void KernelMaxPool3DGrad( } if (maxIdx != -1) { // atomic add - platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); + paddle::platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); } } } template void Pool3dDirectCUDAFunctor::operator()( - const T* input, const std::vector& input_shape, - const std::vector& output_shape, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - bool exclusive, bool adaptive, T* output, gpuStream_t stream, + const T* 
input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, PoolProcess pool_compute) { const int batch_size = input_shape[0]; const int input_channels = input_shape[1]; @@ -990,11 +1223,28 @@ void Pool3dDirectCUDAFunctor::operator()( dim3 threads(thread_num, 1); dim3 grid(blocks, 1); - KernelPool3D<<>>( - nthreads, input, input_channels, input_depth, input_height, input_width, - output_depth, output_height, output_width, ksize_depth, ksize_height, - ksize_width, stride_depth, stride_height, stride_width, padding_depth, - padding_height, padding_width, pool_compute, exclusive, adaptive, output); + KernelPool3D<<>>(nthreads, + input, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_compute, + exclusive, + adaptive, + output); } /* @@ -1006,13 +1256,16 @@ void Pool3dDirectCUDAFunctor::operator()( * height_up, height_down, width_left and width_right, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -1034,31 +1287,52 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); dim3 grid(blocks, 1); KernelPool3D<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, pool_process, exclusive, - adaptive, output_data); + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + output_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + 
const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1089,24 +1363,42 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); dim3 grid(blocks, 1); KernelPool3D<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, pool_process, exclusive, - adaptive, output_data, channel_last); + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + output_data, + channel_last); } }; @@ -1119,16 +1411,18 @@ class Pool3dFunctor { * height_up, height_down, width_left and width_right, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -1152,7 +1446,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1161,21 +1455,43 @@ class Pool3dGradFunctor { dim3 grid(blocks, 1); KernelPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, adaptive, input_grad_data); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + 
input_grad_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1206,7 +1522,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1215,11 +1531,30 @@ class Pool3dGradFunctor { dim3 grid(blocks, 1); KernelPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, adaptive, input_grad_data, + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + input_grad_data, channel_last); // add channel_last } }; @@ -1233,16 +1568,16 @@ class Pool3dGradFunctor { * height_up, height_down, width_left and width_right, respectively. 
*/ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - framework::Tensor* input_grad) { + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1265,7 +1600,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; @@ -1274,18 +1609,37 @@ class MaxPool3dGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, input_grad_data); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + input_grad_data); } - void operator()( - const platform::CUDADeviceContext& context, - const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad) { + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1316,7 +1670,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; @@ -1325,77 +1679,93 @@ class MaxPool3dGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, input_grad_data, channel_last); // add channel_last + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + 
ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + input_grad_data, + channel_last); // add channel_last } }; -template class Pool3dDirectCUDAFunctor, - float>; -template class Pool3dDirectCUDAFunctor, - float>; - -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; - -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor, - double>; -template class Pool3dGradFunctor, - double>; - -template class Pool3dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPool, - paddle::platform::float16>; -template class Pool3dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPool, - paddle::platform::float16>; -template class Pool3dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPoolGrad, - paddle::platform::float16>; -template class Pool3dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPoolGrad, - paddle::platform::float16>; +template class Pool3dDirectCUDAFunctor, float>; +template class Pool3dDirectCUDAFunctor, float>; + +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dFunctor, double>; +template class Pool3dFunctor, double>; +template class Pool3dGradFunctor, double>; +template class Pool3dGradFunctor, double>; + +template class Pool3dFunctor, + dtype::float16>; +template class Pool3dFunctor, + dtype::float16>; +template class Pool3dGradFunctor, + dtype::float16>; +template class Pool3dGradFunctor, + dtype::float16>; template -__global__ void KernelMaxPool2dWithIdx( - const int nthreads, const T1* input_data, const int channels, - const int input_height, const int input_width, const int output_height, - const int output_width, const int ksize_height, const int ksize_width, - const int stride_height, const int stride_width, const int padding_height, - const int padding_width, bool adaptive, T1* output_data, T2* mask_data, - FastDivModForPooling divmods) { +__global__ void KernelMaxPool2dWithIdx(const int nthreads, + const T1* input_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + bool adaptive, + T1* output_data, + T2* mask_data, + FastDivModForPooling divmods) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int hstart, hend, wstart, wend; int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, false, divmods, 0, 0, input_width, input_height, &w_offset, - &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + false, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; if (adaptive) { @@ -1431,20 +1801,38 @@ __global__ void KernelMaxPool2dWithIdx( } template -__global__ void 
KernelMaxPool2DWithIdxGrad( - const int nthreads, const T1* output_grad, const T2* mask_data, - const int channels, const int input_height, const int input_width, - const int output_height, const int output_width, const int ksize_height, - const int ksize_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, bool adaptive, - T1* input_grad, FastDivModForPooling divmods) { +__global__ void KernelMaxPool2DWithIdxGrad(const int nthreads, + const T1* output_grad, + const T2* mask_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + bool adaptive, + T1* input_grad, + FastDivModForPooling divmods) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int phstart, phend, pwstart, pwend; int w_offset, h_offset, c_offset, output_offset; - OffsetPreparationFor4Dimension( - index, false, divmods, 0, 0, output_width, output_height, &w_offset, - &h_offset, &c_offset, &output_offset); + OffsetPreparationFor4Dimension(index, + false, + divmods, + 0, + 0, + output_width, + output_height, + &w_offset, + &h_offset, + &c_offset, + &output_offset); mask_data += output_offset; output_grad += output_offset; @@ -1487,13 +1875,16 @@ __global__ void KernelMaxPool2DWithIdxGrad( * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -1509,13 +1900,13 @@ class MaxPool2dWithIndexFunctor { const int padding_width = paddings[1]; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -1525,10 +1916,23 @@ class MaxPool2dWithIndexFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2dWithIdx<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, adaptive, output_data, - mask_data, pool_divmods); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + adaptive, + output_data, + mask_data, + pool_divmods); } }; @@ 
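// Launch sketch with illustrative numbers (not taken from the patch): one
// thread per output element, so N = 8, C = 64, H_out = W_out = 14 gives
// nthreads = 8 * 64 * 14 * 14 = 100352 and blocks = (100352 + 1024 - 1) / 1024
// = 98 with 1024 threads per block (ChangeThreadNum may lower the per-block
// count on Jetson). The mask written alongside the output records where each
// window's maximum was found and is consumed by the matching grad kernel below.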
-1538,14 +1942,16 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; const int input_height = input_grad->dims()[2]; @@ -1561,7 +1967,7 @@ class MaxPool2dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -1571,31 +1977,53 @@ class MaxPool2dWithIndexGradFunctor { auto pool_divmods = FastDivModForPooling(input_channels, input_width, input_height); KernelMaxPool2DWithIdxGrad<<>>( - nthreads, output_grad_data, mask_data, input_channels, input_height, - input_width, output_height, output_width, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, adaptive, - input_grad_data, pool_divmods); + nthreads, + output_grad_data, + mask_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + adaptive, + input_grad_data, + pool_divmods); } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template -__global__ void KernelMaxPool3DWithIdx( - const int nthreads, const T1* input_data, const int channels, - const int input_depth, const int input_height, const int input_width, - const int output_depth, const int output_height, const int output_width, - const int ksize_depth, const int ksize_height, const int ksize_width, - const int stride_depth, const int stride_height, const int stride_width, - const int padding_depth, const int padding_height, const int padding_width, - bool adaptive, T1* output_data, T2* mask_data) { +__global__ void KernelMaxPool3DWithIdx(const int nthreads, + const T1* input_data, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + bool adaptive, + T1* output_data, + T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -1650,14 +2078,27 @@ __global__ void 
KernelMaxPool3DWithIdx( } template -__global__ void KernelMaxPool3DWithIdxGrad( - const int nthreads, const T1* output_grad, const T2* mask, - const int channels, const int input_depth, const int input_height, - const int input_width, const int output_depth, const int output_height, - const int output_width, const int ksize_depth, const int ksize_height, - const int ksize_width, const int stride_depth, const int stride_height, - const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, bool adaptive, T1* input_grad) { +__global__ void KernelMaxPool3DWithIdxGrad(const int nthreads, + const T1* output_grad, + const T2* mask, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + bool adaptive, + T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset = index % input_width; @@ -1727,13 +2168,16 @@ __global__ void KernelMaxPool3DWithIdxGrad( * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1754,14 +2198,14 @@ class MaxPool3dWithIndexFunctor { const int padding_width = paddings[2]; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -1769,10 +2213,26 @@ class MaxPool3dWithIndexFunctor { dim3 grid(blocks, 1); KernelMaxPool3DWithIdx<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, adaptive, output_data, + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + adaptive, + output_data, mask_data); } }; @@ -1783,14 +2243,16 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. 
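 * For example, ksize = {2, 2, 2} with strides = {2, 2, 2} and
 * paddings = {0, 0, 0} pools non-overlapping 2x2x2 blocks, halving the
 * depth, height and width of the feature map.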
*/ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; const int input_depth = input_grad->dims()[2]; @@ -1811,7 +2273,7 @@ class MaxPool3dWithIndexGradFunctor { const T1* output_grad_data = output_grad.data(); const T2* mask_data = mask.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1820,23 +2282,34 @@ class MaxPool3dWithIndexGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DWithIdxGrad<<>>( - nthreads, output_grad_data, mask_data, input_channels, input_depth, - input_height, input_width, output_depth, output_height, output_width, - ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, - stride_width, padding_depth, padding_height, padding_width, adaptive, + nthreads, + output_grad_data, + mask_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + adaptive, input_grad_data); } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h new file mode 100644 index 0000000000..19c6d52c4c --- /dev/null +++ b/paddle/phi/kernels/funcs/pooling.h @@ -0,0 +1,469 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/macros.h" // import FLT_MAX +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/gpu_decls.h" +#endif + +namespace phi { +namespace funcs { + +/* + * \brief Extracting simple operations from pooling. + * Both MaxPool and AvgPool need "initial", "compute" and "finalize" + * operation. + * MaxPool initializes temp variable to the negative maximum to find the + * maximum value in the pooling field. + * AvgPool initializes temp variable to the zero to accumulate all values + * in pool pooling, and finally takes the average. + * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. + */ +template +class MaxPool { + public: + DEVICE inline T initial() { return static_cast(-FLT_MAX); } + DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } + DEVICE inline void finalize(const T& pool_field, T* y) {} +}; + +template +class AvgPool { + using MT = typename dtype::MPTypeTrait::Type; + MT intermediate_res; + + public: + DEVICE inline T initial() { + intermediate_res = static_cast(0.0f); + return static_cast(0); + } + + DEVICE inline void compute(const T& x, T* y) { + intermediate_res += static_cast(x); + } + + DEVICE inline void finalize(const T& pool_field, T* y) { + *y = static_cast(intermediate_res / (static_cast(pool_field))); + } +}; + +template +class MaxPoolGrad { + public: + static constexpr bool use_x = true; + HOSTDEVICE inline void compute( + const T& x, const T& y, const T& dy, T scale, T* dx) { + *dx += dy * static_cast(x == y); + } +}; + +template +class AvgPoolGrad { + public: + static constexpr bool use_x = false; + HOSTDEVICE inline void compute( + const T& x, const T& y, const T& dy, T scale, T* dx) { + *dx += (scale * dy); + } +}; + +/* used for adaptive pool to calculate start and end index of each divided grid + */ +HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + +/* + * \brief Getting pooling results, and calculating gradient. + * + * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C + * is the number of channels, H and W is the height and width of feature. + * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C + * is the number of channels, D, H and W is the depth, height and width of + * feature. + * + * In max pooling, it is possible that the pooling region has multiple maximum + * elements. In this case, we should compute the gradient of the first maximum + * element. + * This is different from average pooling. So we rewrite the max_pool_grad: + * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
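 * For example, if a 2x2 window holds {3, 5, 5, 1}, the max-specific grad
 * functors route the whole upstream gradient to the first 5, whereas the
 * generic MaxPoolGrad test (x == y) above would credit both tied maxima.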
+ */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class Pool2dDirectCUDAFunctor { + public: + void operator()(const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, + PoolProcess pool_compute); +}; +#endif + +template +class Pool2dFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); + + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); +}; + +template +class Pool2dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); +}; + +template +class MaxPool2dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class Pool3dDirectCUDAFunctor { + public: + void operator()(const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, + PoolProcess pool_compute); +}; +#endif + +template +class Pool3dFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); +}; + +template +class Pool3dGradFunctor { 
+ public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); +}; + +template +class MaxPool3dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad); +}; + +/* + * \brief Getting max pooling results and corresponding max index, and + * calculating gradient. + * In up-sampling-pooling, it is necessary to know max element index. + * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in + * NCDHW format. + */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask); +}; + +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad); +}; + +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask); +}; + +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad); +}; + +inline int PoolOutputSize(int input_size, + int filter_size, + int padding_1, + int padding_2, + int stride, + bool ceil_mode) { + int output_size; + if (!ceil_mode) { + output_size = + (input_size - filter_size + padding_1 + padding_2) / stride + 1; + } else { + output_size = + (input_size - filter_size + padding_1 + padding_2 + stride - 1) / + stride + + 1; + } + PADDLE_ENFORCE_GT( + output_size, + 0, + errors::InvalidArgument( + "the output size must be greater than 0. But received: " + "output_size = %d due to the settings of input_size(%d), " + "padding(%d,%d), " + "k_size(%d) and stride(%d). 
Please check again!", + output_size, + input_size, + padding_1, + padding_2, + filter_size, + stride)); + return output_size; +} + +inline int MaxPoolOutputSize(int input_size, + int filter_size, + int padding, + int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +template +inline void UpdatePadding(std::vector* paddings, + const bool global_pooling, + const bool adaptive, + const std::string padding_algorithm, + const DDim data_dims, + const std::vector& strides, + const std::vector& kernel_size) { + // set padding size == data_dims.size() * 2 + auto data_shape = vectorize(data_dims); + if (static_cast(paddings->size()) == data_dims.size()) { + for (int i = 0; i < data_dims.size(); ++i) { + T copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + PADDLE_ENFORCE_EQ(data_dims.size() * 2, + paddings->size(), + errors::InvalidArgument( + "Paddings size %d should be the same or twice as the " + "pooling size %d.", + paddings->size(), + data_dims.size() * 2)); + } + + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (int i = 0; i < data_dims.size(); ++i) { + T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; + T pad_sum = + std::max((out_size - 1) * strides[i] + kernel_size[i] - data_shape[i], + static_cast(0)); + T pad_0 = pad_sum / 2; + T pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + } + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } + + // if global_pooling == true or adaptive == true, padding will be ignore + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +template +inline void UpdateKernelSize(std::vector* kernel_size, + const DDim data_dims) { + kernel_size->resize(static_cast(data_dims.size())); + for (size_t i = 0; i < kernel_size->size(); ++i) { + *(kernel_size->begin() + i) = static_cast(data_dims[i]); + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu new file mode 100644 index 0000000000..a5ab6a1ccd --- /dev/null +++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d_grad, + GPU, + ALL_LAYOUT, + phi::Pool2dGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + GPU, + ALL_LAYOUT, + phi::Pool2dDoubleGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d_grad, + GPU, + ALL_LAYOUT, + phi::Pool3dGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool3d_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu new file mode 100644 index 0000000000..e8641395be --- /dev/null +++ b/paddle/phi/kernels/gpu/pool_kernel.cu @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/impl/pool_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d, + GPU, + ALL_LAYOUT, + phi::Pool2dKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool2d_with_index, + GPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d, + GPU, + ALL_LAYOUT, + phi::Pool3dKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool3d_with_index, + GPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index c28fc3794f..83c2ec4b6e 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -37,7 +37,7 @@ void SplitKernel(const Context& dev_ctx, out_metas_ptr.push_back(&out_metas.back()); } - phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr, true); + phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr); for (size_t i = 0; i < out_metas.size(); ++i) { outs[i]->Resize(out_metas[i].dims()); diff --git a/paddle/phi/kernels/gpudnn/pool_gpudnn.h b/paddle/phi/kernels/gpudnn/pool_gpudnn.h new file mode 100644 index 0000000000..0cf2c99146 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/pool_gpudnn.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace phi { + +using GPUDNNDataLayout = paddle::platform::DataLayout; +using PoolingMode = paddle::platform::PoolingMode; +using ScopedPoolingDescriptor = paddle::platform::ScopedPoolingDescriptor; +using ScopedTensorDescriptor = paddle::platform::ScopedTensorDescriptor; + +template +using ScalingParamType = + typename paddle::platform::CudnnDataType::ScalingParamType; + +inline GPUDNNDataLayout GetLayoutFromStr(std::string data_format) { + if (data_format == "NHWC") { + return GPUDNNDataLayout::kNHWC; + } else if (data_format == "NCHW") { + return GPUDNNDataLayout::kNCHW; + } else if (data_format == "NCDHW") { + return GPUDNNDataLayout::kNCDHW; + } else { + return GPUDNNDataLayout::kNCDHW; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu new file mode 100644 index 0000000000..b731d03347 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu @@ -0,0 +1,448 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h" + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/pool_kernel.h" + +#ifdef PADDLE_WITH_HIP +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" // PoolGradRawGPUDNNKernel will call PoolGradRawKernel for pooling type "max" in ROCm +#endif + +namespace phi { + +template +void PoolGradRawGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_gpu_place(ctx.GetPlace()), + true, + errors::InvalidArgument("Pool operator CUDA kernel must use CUDAPlace " + "rather than CPUPlace.")); + + const DenseTensor* input = &x; + const DenseTensor* output = &out; + const DenseTensor* output_grad = &dout; + DenseTensor* input_grad = dx; + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + +#ifdef PADDLE_WITH_HIP + if (pooling_type == "max") { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings_, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); + return; + } +#endif + + // update paddings + auto in_x_dims = input->dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); + } else { + data_dims = slice_ddim(in_x_dims, 2, in_x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + // ------- tensor grad -------------- + DenseTensor transformed_input(input->type()); + DenseTensor transformed_output(output->type()); + DenseTensor transformed_output_grad(output_grad->type()); + + ctx.template Alloc(input_grad); + DenseTensor transformed_input_grad(input_grad->type()); + GPUDNNDataLayout layout; + const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; + const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; + if (data_format == str_NDHWC) { + layout = GPUDNNDataLayout::kNCDHW; + std::vector axis{0, 4, 1, 2, 3}; + + // input + transformed_input.Resize(input->dims()); + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans5; + trans5(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[4]; + out_dims_vec[2] = output->dims()[1]; + 
out_dims_vec[3] = output->dims()[2]; + out_dims_vec[4] = output->dims()[3]; + transformed_output.Resize(make_ddim(out_dims_vec)); + + ctx.Alloc(&transformed_output, output->type()); + + funcs::Transpose trans5_v2; + trans5_v2(ctx, *output, &transformed_output, axis); + + // output grad + transformed_output_grad.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output_grad, output_grad->type()); + + funcs::Transpose trans5_v3; + trans5_v3(ctx, *output_grad, &transformed_output_grad, axis); + + // input grad + transformed_input_grad.Resize(make_ddim(in_dims_vec)); + +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + } else if (data_format == str_NHWC) { + layout = GPUDNNDataLayout::kNCHW; + + std::vector axis{0, 3, 1, 2}; + + // input + transformed_input.Resize(input->dims()); + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans4; + trans4(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[3]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + transformed_output.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output, output->type()); + + funcs::Transpose trans4_v2; + trans4_v2(ctx, *output, &transformed_output, axis); + + // output grad + transformed_output_grad.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output_grad, output_grad->type()); + + funcs::Transpose trans4_v3; + trans4_v3(ctx, *output_grad, &transformed_output_grad, axis); + + // input grad + transformed_input_grad.Resize(make_ddim(in_dims_vec)); +#endif + } else { + layout = GetLayoutFromStr(data_format); + transformed_input = *input; + transformed_output = *output; + transformed_output_grad = *output_grad; + transformed_input_grad = *input_grad; + } + + const T* input_data = transformed_input.data(); + const T* output_data = transformed_output.data(); + const T* output_grad_data = transformed_output_grad.data(); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#else + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#endif + PoolingMode pooling_mode; + if (pooling_type == "max") { + if (FLAGS_cudnn_deterministic) { + pooling_mode = PoolingMode::kMaximumDeterministic; + } else { + pooling_mode = PoolingMode::kMaximum; + } + } else { + pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive + : PoolingMode::kAverageInclusive; + } + +#ifdef PADDLE_WITH_HIP + miopenPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#else + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#endif + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cudnn_handle(); + ScalingParamType alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = ctx.template Alloc(&transformed_input_grad); +// Because beta is zero, it is unnecessary to reset input_grad. +#ifdef PADDLE_WITH_HIP + char* pool_workspace; + size_t pool_worksize = 0; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingGetWorkSpaceSizeV2( + cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingBackward(handle, + cudnn_pool_desc, + &alpha, + cudnn_output_desc, + output_data, + cudnn_output_desc, + output_grad_data, + cudnn_input_desc, + input_data, + &beta, + cudnn_input_desc, + input_grad_data, + pool_workspace)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnPoolingBackward(handle, + cudnn_pool_desc, + &alpha, + cudnn_output_desc, + output_data, + cudnn_output_desc, + output_grad_data, + cudnn_input_desc, + input_data, + &beta, + cudnn_input_desc, + input_grad_data)); +#endif + + if (data_format == str_NDHWC) { + std::vector axis{0, 2, 3, 4, 1}; + funcs::Transpose trans5_v4; + trans5_v4(ctx, transformed_input_grad, input_grad, axis); + } +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + if (data_format == str_NHWC) { + std::vector axis{0, 2, 3, 1}; + funcs::Transpose trans4_v4; + trans4_v4(ctx, transformed_input_grad, input_grad, axis); + } +#endif + } +} + +template +void Pool2dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawGPUDNNKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void Pool2dDoubleGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + if (pooling_type == "max") { + PADDLE_THROW( + errors::InvalidArgument("Pool op grad grad only supports avgpool.")); + } else { + Pool2dGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + ceil_mode, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); + } +} + +template +void Pool3dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + 
const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawGPUDNNKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +} // namespace phi + +using phi::dtype::float16; + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(pool2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dGradGPUDNNKernel, + float, + float16) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dDoubleGradGPUDNNKernel, + float, + float16) {} +PD_REGISTER_KERNEL(pool3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool3dGradGPUDNNKernel, + float, + float16) {} +#else +PD_REGISTER_KERNEL(pool2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dGradGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dDoubleGradGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(pool3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool3dGradGPUDNNKernel, + float, + double, + float16) {} +#endif diff --git a/paddle/phi/kernels/gpudnn/pool_kernel.cu b/paddle/phi/kernels/gpudnn/pool_kernel.cu new file mode 100644 index 0000000000..d8f9656677 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/pool_kernel.cu @@ -0,0 +1,312 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h" + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" + +namespace phi { + +template +void PoolRawGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_gpu_place(ctx.GetPlace()), + true, + errors::InvalidArgument("Pool operator CUDA kernel must use CUDAPlace " + "rather than CPUPlace.")); + + const DenseTensor* input = &x; + DenseTensor* output = out; + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + ctx.template Alloc(output); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings_ + auto x_dims = input->dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; + const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; + + // -----------------transformed tensor ------------------------ + + DenseTensor transformed_input(input->type()); + DenseTensor transformed_output(output->type()); + GPUDNNDataLayout layout; + + if (data_format == str_NDHWC) { + layout = GPUDNNDataLayout::kNCDHW; + std::vector axis{0, 4, 1, 2, 3}; + + // input + transformed_input.Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans5; + trans5(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[4]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + out_dims_vec[4] = output->dims()[3]; + transformed_output.Resize(make_ddim(out_dims_vec)); +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + } else if (data_format == str_NHWC) { + layout = GPUDNNDataLayout::kNCHW; + + std::vector axis{0, 3, 1, 2}; + + transformed_input.Resize(input->dims()); + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans; + trans(ctx, *input, &transformed_input, axis); + + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + 
out_dims_vec[1] = output->dims()[3]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + transformed_output.Resize(make_ddim(out_dims_vec)); +#endif + } else { + layout = GetLayoutFromStr(data_format); + transformed_input = *input; + transformed_output = *output; + } + + const T* tranformed_input_data = transformed_input.data(); + T* tranformed_output_data = ctx.template Alloc(&transformed_output); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#else + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#endif + PoolingMode pooling_mode; + if (pooling_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = exclusive ? PoolingMode::kAverageExclusive + : PoolingMode::kAverageInclusive; + } + +#ifdef PADDLE_WITH_HIP + miopenPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#else + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#endif + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cudnn_handle(); + ScalingParamType alpha = 1.0f, beta = 0.0f; + +#ifdef PADDLE_WITH_HIP + char* pool_workspace; + size_t pool_workernel_size_ = 0; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingGetWorkSpaceSizeV2( + cudnn_pool_desc, cudnn_output_desc, &pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenPoolingForward(handle, + cudnn_pool_desc, + &alpha, + cudnn_input_desc, + tranformed_input_data, + &beta, + cudnn_output_desc, + tranformed_output_data, + false, + pool_workspace, + pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cudnnPoolingForward(handle, + cudnn_pool_desc, + &alpha, + cudnn_input_desc, + tranformed_input_data, + &beta, + cudnn_output_desc, + tranformed_output_data)); +#endif + // add + if (data_format == str_NDHWC) { + std::vector axis{0, 2, 3, 4, 1}; + funcs::Transpose trans5_v2; + trans5_v2(ctx, transformed_output, output, axis); + } +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + if (data_format == str_NHWC) { + std::vector axis{0, 2, 3, 1}; + funcs::Transpose trans; + trans(ctx, transformed_output, output, axis); + } +#endif +} + +template +void Pool2dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void 
Pool3dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +} // namespace phi + +using phi::dtype::float16; + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + pool2d, GPUDNN, ALL_LAYOUT, phi::Pool2dGPUDNNKernel, float, float16) {} +PD_REGISTER_KERNEL( + pool3d, GPUDNN, ALL_LAYOUT, phi::Pool3dGPUDNNKernel, float, float16) {} +#else +PD_REGISTER_KERNEL(pool2d, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(pool3d, + GPUDNN, + ALL_LAYOUT, + phi::Pool3dGPUDNNKernel, + float, + double, + float16) {} +#endif diff --git a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h new file mode 100644 index 0000000000..7fe89ce34c --- /dev/null +++ b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h @@ -0,0 +1,332 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/pool_kernel.h" + +namespace phi { + +template +void PoolGradRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + // update paddings + auto x_dims = x.dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + if (dx) { + ctx.template Alloc(dx); + funcs::SetConstant set_constant; + set_constant(ctx, dx, static_cast(0.0)); + + switch (kernel_size_.size()) { + case 2: { + if (pooling_type == "max") { + funcs::MaxPool2dGradFunctor pool2d_backward; + pool2d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + dx); + } else if (pooling_type == "avg") { + funcs::Pool2dGradFunctor, T> + pool2d_backward; + funcs::AvgPoolGrad pool_process; + pool2d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + dx, + pool_process); + } + } break; + case 3: { + if (pooling_type == "max") { + funcs::MaxPool3dGradFunctor pool3d_backward; + pool3d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + dx); + } else if (pooling_type == "avg") { + funcs::Pool3dGradFunctor, T> + pool3d_backward; + funcs::AvgPoolGrad pool_process; + pool3d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + dx, + pool_process); + } + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } + } +} + +template +void MaxPoolWithIndexGradRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + if (global_pooling) { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(dx->dims()[i + 2]); + } + } + + if (dx) { + ctx.template Alloc(dx); + funcs::set_constant(ctx, dx, 0); + + switch (kernel_size_.size()) { + case 2: { + funcs::MaxPool2dWithIndexGradFunctor pool2d_backward; + pool2d_backward( + ctx, dout, mask, kernel_size_, strides, paddings_, adaptive, dx); + } break; + case 3: { + funcs::MaxPool3dWithIndexGradFunctor pool3d_backward; + 
pool3d_backward( + ctx, dout, mask, kernel_size_, strides, paddings_, adaptive, dx); + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } + } +} + +template +void Pool2dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void Pool2dDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + if (pooling_type == "max") { + PADDLE_THROW( + errors::InvalidArgument("Pool op grad grad only supports avgpool.")); + } else { + Pool2dKernel(ctx, + x, + kernel_size, + strides, + paddings, + ceil_mode, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); + } +} + +template +void MaxPool2dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + MaxPoolWithIndexGradRawKernel(ctx, + x, + mask, + dout, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + dx); +} + +template +void Pool3dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void MaxPool3dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + MaxPoolWithIndexGradRawKernel(ctx, + x, + mask, + dout, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + dx); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/pool_kernel_impl.h b/paddle/phi/kernels/impl/pool_kernel_impl.h new file mode 100644 index 0000000000..665d02fd01 --- /dev/null +++ b/paddle/phi/kernels/impl/pool_kernel_impl.h @@ -0,0 +1,321 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/pool_kernel.h" + +#include +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/pooling.h" + +#if defined(__HIPCC__) || defined(__NVCC__) +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" +#endif + +namespace phi { + +inline int GetReduceNum(const DenseTensor& input, + const DenseTensor* output, + const std::string data_format, + std::vector* reduce_dim) { + // data_format only can be NCHW + bool channel_last = (data_format == "NHWC"); + if (channel_last) { + return 0; + } + int reduce_num = 0; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + if ((output_height == 1) && (output_width == 1)) { + reduce_dim->push_back(2); + reduce_dim->push_back(3); + reduce_num = input.dims()[2] * input.dims()[3]; + } + return reduce_num; +} + +template +void PoolRawKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + // update paddings + auto x_dims = x.dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + switch (kernel_size_.size()) { + case 2: { + if (pooling_type == "max") { + funcs::Pool2dFunctor, T> pool2d_forward; + funcs::MaxPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + true, + false, + out, + pool_process); + + } else if (pooling_type == "avg") { + std::vector reduce_dim; + int reduce_num = GetReduceNum(x, out, data_format, &reduce_dim); + if (reduce_num > 0 && + adaptive) { // for adaptive_avg_pool2d && output_size == 1 +#if defined(__HIPCC__) || defined(__NVCC__) + auto stream = ctx.stream(); + funcs::ReduceKernel>( + ctx, x, out, kps::DivideFunctor(reduce_num), reduce_dim); +#else // for cpu + funcs::Pool2dFunctor, T> pool2d_forward; + funcs::AvgPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + out, + pool_process); +#endif + } else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1 + funcs::Pool2dFunctor, T> pool2d_forward; + funcs::AvgPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + 
adaptive, + out, + pool_process); + } + } + } break; + case 3: { + if (pooling_type == "max") { + funcs::Pool3dFunctor, T> pool3d_forward; + funcs::MaxPool pool_process; + pool3d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + true, + false, + out, + pool_process); + } else if (pooling_type == "avg") { + funcs::Pool3dFunctor, T> pool3d_forward; + funcs::AvgPool pool_process; + pool3d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + out, + pool_process); + } + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } +} + +template +void MaxPoolWithIndexRawKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + if (global_pooling) { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(x.dims()[i + 2]); + } + } + + switch (kernel_size_.size()) { + case 2: { + funcs::MaxPool2dWithIndexFunctor pool2d_forward; + pool2d_forward( + ctx, x, kernel_size_, strides, paddings_, adaptive, out, mask); + } break; + case 3: { + funcs::MaxPool3dWithIndexFunctor pool3d_forward; + pool3d_forward( + ctx, x, kernel_size_, strides, paddings_, adaptive, out, mask); + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } +} + +template +void Pool2dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void MaxPool2dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + MaxPoolWithIndexRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + out, + mask); +} + +template +void Pool3dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void MaxPool3dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + MaxPoolWithIndexRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + out, + mask); +} + +} // namespace phi diff --git 
a/paddle/phi/kernels/pool_grad_kernel.h b/paddle/phi/kernels/pool_grad_kernel.h new file mode 100644 index 0000000000..0658dc22c8 --- /dev/null +++ b/paddle/phi/kernels/pool_grad_kernel.h @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pool2dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool2dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool2dDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool2dDoubleGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool2dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx); + +template +void Pool3dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool3dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + 
bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void MaxPool3dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/pool_kernel.h b/paddle/phi/kernels/pool_kernel.h new file mode 100644 index 0000000000..348af02181 --- /dev/null +++ b/paddle/phi/kernels/pool_kernel.h @@ -0,0 +1,105 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pool2dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool2dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool2dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask); + +template +void Pool3dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool3dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool3dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask); + +} // namespace phi diff --git 
a/paddle/phi/ops/compat/pool_sig.cc b/paddle/phi/ops/compat/pool_sig.cc new file mode 100644 index 0000000000..390d3db5e7 --- /dev/null +++ b/paddle/phi/ops/compat/pool_sig.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Pool2dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature Pool2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d_grad", + {"X", "Out", GradVarName("Out")}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {GradVarName("X")}); +} + +KernelSignature Pool2dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d_double_grad", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature MaxPool2dWithIndexOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool2d_with_index", + {"X"}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {"Out", "Mask"}); +} + +KernelSignature MaxPool2dWithIndexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool2d_with_index_grad", + {"X", "Mask", GradVarName("Out")}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {GradVarName("X")}); +} + +KernelSignature Pool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool3d", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature Pool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool3d_grad", + {"X", "Out", GradVarName("Out")}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {GradVarName("X")}); +} + +KernelSignature MaxPool3dWithIndexOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool3d_with_index", + {"X"}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {"Out", "Mask"}); +} + +KernelSignature MaxPool3dWithIndexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool3d_with_index_grad", + {"X", "Mask", GradVarName("Out")}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, 
+ {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(pool2d, phi::Pool2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool2d_grad, phi::Pool2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool2d_double_grad, + phi::Pool2dDoubleGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index, + phi::MaxPool2dWithIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index_grad, + phi::MaxPool2dWithIndexGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(pool3d, phi::Pool3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool3d_grad, phi::Pool3dGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index, + phi::MaxPool3dWithIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index_grad, + phi::MaxPool3dWithIndexGradOpArgumentMapping); diff --git a/paddle/phi/tests/core/test_meta_fn_utils.cc b/paddle/phi/tests/core/test_meta_fn_utils.cc index f4288c2aa2..399112d09c 100644 --- a/paddle/phi/tests/core/test_meta_fn_utils.cc +++ b/paddle/phi/tests/core/test_meta_fn_utils.cc @@ -52,7 +52,7 @@ TEST(MetaFnFactory, InferMetaFnExists) { phi::InferMetaContext ctx; ctx.EmplaceBackInput(shared_meat_x); ctx.EmplaceBackOutput(shared_meta_out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("sign")(&ctx); EXPECT_EQ(dense_out1.dims().size(), dense_out2.dims().size()); @@ -78,7 +78,7 @@ TEST(MetaFnFactory, CopyInferMetaFn) { ctx.EmplaceBackAttr(Backend::CPU); ctx.EmplaceBackAttr(false); ctx.EmplaceBackOutput(shared_meta_out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("copy_to")(&ctx); EXPECT_EQ(dense_out1.dims().size(), dense_out2.dims().size()); @@ -105,7 +105,7 @@ TEST(MetaFnFactory, SplitInferMetaFn) { ctx.EmplaceBackAttr(num_or_sections); ctx.EmplaceBackAttr(axis); ctx.EmplaceBackOutputs(out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("split")(&ctx); ASSERT_EQ(dense_out1.dims().size(), 2); -- GitLab