From d65a7a46d2994a9646659738f0887a084f610a60 Mon Sep 17 00:00:00 2001
From: chentianyu03
Date: Fri, 1 Apr 2022 14:50:36 +0800
Subject: [PATCH] [Phi] Move interpolate kernels into phi (#40855)

* add interpolate cpu kernel
* fix nullptr bug
* add interpolate gpu kernel
* fix unit test error
* remove raw kernels
* add cuda kernel impl
* add infermeta
* recover accidentally deleted kernels in interpolate op
* fix grad x_grad name error
* remove interpolate_v2_op.h
* rm unused code
* fix xpu build error
* fix build error
* fix namespace error
* add register header for npu
* fix infermeta error
* modify by review
* add the missing args in test_trt_convert_nearest_interp_v2
---
 paddle/fluid/framework/operator.cc            |   10 +-
 paddle/fluid/imperative/prepared_operator.h   |   11 +
 paddle/fluid/operators/interpolate_v2_op.cc   |   65 +-
 paddle/fluid/operators/interpolate_v2_op.cu   | 2210 -----------------
 paddle/fluid/operators/interpolate_v2_op.h    | 1618 ------------
 .../fluid/operators/interpolate_v2_op_npu.cc  |   20 +-
 .../fluid/operators/interpolate_v2_op_xpu.cc  |   21 +-
 paddle/phi/backends/gpu/gpu_launch_config.h   |   37 +
 paddle/phi/core/infermeta_utils.cc            |   17 +
 paddle/phi/core/infermeta_utils.h             |   22 +
 paddle/phi/core/kernel_context.h              |   16 +
 paddle/phi/core/kernel_registry.h             |    7 +
 paddle/phi/core/kernel_utils.h                |   25 +
 paddle/phi/infermeta/multiary.cc              |  500 ++++
 paddle/phi/infermeta/multiary.h               |   16 +
 .../kernels/cpu/interpolate_grad_kernel.cc    | 1067 ++++++++
 paddle/phi/kernels/cpu/interpolate_kernel.cc  | 1225 +++++++++
 paddle/phi/kernels/funcs/aligned_vector.h     |    2 +-
 .../phi/kernels/funcs/interpolate_function.h  |  154 ++
 .../kernels/gpu/interpolate_grad_kernel.cu    | 1601 ++++++++++++
 paddle/phi/kernels/gpu/interpolate_kernel.cu  | 1479 +++++++++++
 paddle/phi/kernels/interpolate_grad_kernel.h  |   39 +
 paddle/phi/kernels/interpolate_kernel.h       |  110 +
 paddle/phi/ops/compat/interpolate_sig.cc      |  194 ++
 .../test_trt_convert_nearest_interp_v2.py     |    2 +
 25 files changed, 6586 insertions(+), 3882 deletions(-)
 delete mode 100644 paddle/fluid/operators/interpolate_v2_op.cu
 delete mode 100644 paddle/fluid/operators/interpolate_v2_op.h
 create mode 100644 paddle/phi/kernels/cpu/interpolate_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/interpolate_kernel.cc
 create mode 100644 paddle/phi/kernels/funcs/interpolate_function.h
 create mode 100644 paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/interpolate_kernel.cu
 create mode 100644 paddle/phi/kernels/interpolate_grad_kernel.h
 create mode 100644 paddle/phi/kernels/interpolate_kernel.h
 create mode 100644 paddle/phi/ops/compat/interpolate_sig.cc

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 83380d1f26..19fa0f6673 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -2167,7 +2167,11 @@ void OperatorWithKernel::BuildPhiKernelContext(
                       typeid(paddle::optional)) ||
               input_defs[i].type_index ==
                   std::type_index(
-                      typeid(paddle::optional)))) {
+                      typeid(paddle::optional)) ||
+              input_defs[i].type_index ==
+                  std::type_index(
+                      typeid(paddle::optional<
+                             const std::vector>)))) {
       pt_kernel_context->EmplaceBackInputWithoutSetRange(nullptr);
       auto end_idx = start_idx + 1;
       pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx),
@@ -2429,6 +2433,10 @@ void OperatorWithKernel::BuildPhiKernelContext(
                    std::type_index(typeid(std::vector))) {
           pt_kernel_context->EmplaceBackAttr(
               BOOST_GET_CONST(std::vector, attr_it->second));
+        } else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_it->second)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 43bed5fd35..04d0b4ca7a 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -272,6 +272,14 @@ void BuildDygraphPhiKernelContext( auto end_idx = start_idx + 1; kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); continue; + } else if (input_defs[i].type_index == + std::type_index( + typeid(paddle::optional< + const std::vector>))) { + kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); + auto end_idx = start_idx + 1; + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + continue; } else { PADDLE_THROW(phi::errors::NotFound( "Can not find input variable '%s' for %s OP, please check whether " @@ -545,6 +553,9 @@ void BuildDygraphPhiKernelContext( std::type_index(typeid(std::vector))) { kernel_ctx->EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index 4b5a18141d..d0d7b7694f 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -9,11 +9,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/interpolate_v2_op.h" #include #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" + #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -722,64 +726,51 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateV2GradNoNeedBufferVarsInferer, // not // compatible with interp_op, so a new one is added in paddle2.0 namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(bilinear_interp_v2, BilinearInterpInferShapeFunctor, + PD_INFER_META(phi::InterpolateInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(nearest_interp_v2, NearestInterpInferShapeFunctor, + PD_INFER_META(phi::InterpolateInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trilinear_interp_v2, + TrilinearInterpInferShapeFunctor, + PD_INFER_META(phi::InterpolateInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(bicubic_interp_v2, BicubicInterpInferShapeFunctor, + PD_INFER_META(phi::InterpolateInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(linear_interp_v2, LinearInterpInferShapeFunctor, + PD_INFER_META(phi::InterpolateInferMeta)); + REGISTER_OPERATOR(bilinear_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, ops::InterpolateV2GradMaker, - ops::InterpolateV2GradMaker); + ops::InterpolateV2GradMaker, + BilinearInterpInferShapeFunctor); REGISTER_OPERATOR(bilinear_interp_v2_grad, ops::InterpolateV2OpGrad, ops::InterpolateV2GradNoNeedBufferVarsInferer); REGISTER_OPERATOR(nearest_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, ops::InterpolateV2GradMaker, - ops::InterpolateV2GradMaker); + ops::InterpolateV2GradMaker, + NearestInterpInferShapeFunctor); REGISTER_OPERATOR(nearest_interp_v2_grad, ops::InterpolateV2OpGrad, ops::InterpolateV2GradNoNeedBufferVarsInferer); REGISTER_OPERATOR(trilinear_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, ops::InterpolateV2GradMaker, - ops::InterpolateV2GradMaker); + ops::InterpolateV2GradMaker, + TrilinearInterpInferShapeFunctor); REGISTER_OPERATOR(trilinear_interp_v2_grad, ops::InterpolateV2OpGrad, ops::InterpolateV2GradNoNeedBufferVarsInferer); REGISTER_OPERATOR(bicubic_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, ops::InterpolateV2GradMaker, - ops::InterpolateV2GradMaker); + ops::InterpolateV2GradMaker, + BicubicInterpInferShapeFunctor); REGISTER_OPERATOR(bicubic_interp_v2_grad, ops::InterpolateV2OpGrad, ops::InterpolateV2GradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(bilinear_interp_v2, ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel); -REGISTER_OP_CPU_KERNEL(bilinear_interp_v2_grad, - ops::InterpolateV2GradKernel, - ops::InterpolateV2GradKernel); -REGISTER_OP_CPU_KERNEL(nearest_interp_v2, ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel); -REGISTER_OP_CPU_KERNEL(nearest_interp_v2_grad, - ops::InterpolateV2GradKernel, - ops::InterpolateV2GradKernel); -REGISTER_OP_CPU_KERNEL(trilinear_interp_v2, ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel); -REGISTER_OP_CPU_KERNEL(trilinear_interp_v2_grad, - ops::InterpolateV2GradKernel, - ops::InterpolateV2GradKernel); REGISTER_OPERATOR(linear_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, ops::InterpolateV2GradMaker, - ops::InterpolateV2GradMaker); + ops::InterpolateV2GradMaker, + LinearInterpInferShapeFunctor); 
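
(Note on the registrations above: the REGISTER_OP_CPU_KERNEL entries removed from this file are re-expressed as phi kernel registrations inside the newly added files such as paddle/phi/kernels/cpu/interpolate_kernel.cc. A minimal sketch of that phi-style registration, assuming the migrated CPU entry point is named phi::BilinearInterpKernel and keeps the float/double/uint8_t type list of the deleted fluid kernel:

    // Hypothetical sketch, not the verbatim contents of the new file.
    #include "paddle/phi/core/kernel_registry.h"

    PD_REGISTER_KERNEL(bilinear_interp_v2,         // kernel name seen by the framework
                       CPU,                        // backend
                       ALL_LAYOUT,                 // registered for all layouts
                       phi::BilinearInterpKernel,  // assumed kernel entry point
                       float,
                       double,
                       uint8_t) {}

With the per-dtype kernels registered through phi, the operator registrations in this file only keep the op, its grad makers, and the shared InterpolateInferMeta infershape functor.)
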
REGISTER_OPERATOR(linear_interp_v2_grad, ops::InterpolateV2OpGrad, ops::InterpolateV2GradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(linear_interp_v2, ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel); -REGISTER_OP_CPU_KERNEL(linear_interp_v2_grad, - ops::InterpolateV2GradKernel, - ops::InterpolateV2GradKernel); -REGISTER_OP_CPU_KERNEL(bicubic_interp_v2, ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel); -REGISTER_OP_CPU_KERNEL(bicubic_interp_v2_grad, - ops::InterpolateV2GradKernel, - ops::InterpolateV2GradKernel); diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu deleted file mode 100644 index cd297c53f8..0000000000 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ /dev/null @@ -1,2210 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include -#include -#include "paddle/fluid/operators/interpolate_v2_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/fast_divmod.h" -#include "paddle/phi/kernels/funcs/math_cuda_utils.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; -using platform::FastDivMod; -using DataLayout = framework::DataLayout; - -static inline int GetLastPow2(int n) { - n |= (n >> 1); - n |= (n >> 2); - n |= (n >> 4); - n |= (n >> 8); - n |= (n >> 16); - return std::max(1, n - (n >> 1)); -} - -inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( - const platform::CUDADeviceContext& context, int num_img, int height, - int width) { - const int kThreadsPerBlock = 256; - int max_threads_per_block = context.GetMaxThreadsPerBlock(); // 1024 - int max_threads = std::min(kThreadsPerBlock, max_threads_per_block); - - int block_x = std::min(GetLastPow2(width), max_threads); - int block_y = std::min(GetLastPow2(height), max_threads / block_x); - int block_z = std::min(num_img, max_threads / block_x / block_y); - - auto max_grid_dim = context.GetCUDAMaxGridDimSize(); - int grid_x = std::min(max_grid_dim[0], platform::DivUp(width, block_x)); - int grid_y = std::min(max_grid_dim[1], platform::DivUp(height, block_y)); - int grid_z = - std::min(max_grid_dim[2], platform::DivUp(num_img, block_z * 4)); - - const int capability = context.GetComputeCapability(); - platform::GpuLaunchConfig config; - config.compute_capability = capability; - config.thread_per_block = dim3(block_x, block_y, block_z); - config.block_per_grid = dim3(grid_x, grid_y, grid_z); - return config; -} - -template -__forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( - int* in_img_idx, int* x_id, T* lambda1, T* lambda2, T src_x, - const int in_img_x) { - src_x = (src_x > 0) ? src_x : 0.f; - *in_img_idx = static_cast(src_x); - *x_id = (*in_img_idx < in_img_x - 1) ? 
1 : 0; - *lambda1 = src_x - *in_img_idx; - *lambda2 = 1.f - *lambda1; -} - -struct FastDivModForInterpolate { - public: - FastDivMod channels_div; - FastDivMod output_w_div; - FastDivMod output_wc_div; - - explicit HOSTDEVICE FastDivModForInterpolate(const int channels, - const int output_w, - const int outout_wc) - : channels_div(FastDivMod(channels)), - output_w_div(FastDivMod(output_w)), - output_wc_div(FastDivMod(outout_wc)) {} -}; - -template -__global__ void KeNearestNeighborInterpNCHWFw( - const T* in, const size_t in_img_h, const size_t in_img_w, T* out, - const size_t out_img_h, const size_t out_img_w, const size_t nc, - const float ratio_h, const float ratio_w, const bool align_corners) { - int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; - int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; - int nc_id = threadIdx.z + blockIdx.z * blockDim.z; - int nc_stride = blockDim.z * gridDim.z; - - // nearest_sampling by multiple read in_addr and write to out_addr - int in_img_idx = (align_corners) - ? static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - int in_img_idy = (align_corners) - ? static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - - int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; - int in_index_stride = nc_stride * in_img_h * in_img_w; - - int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; - int out_index_stride = nc_stride * out_img_h * out_img_w; - - // prevent from multiple threads writing - if (out_img_idx < out_img_w && out_img_idy < out_img_h) { - while (nc_id < nc) { - out[out_index] = in[in_index]; - in_index += in_index_stride; - out_index += out_index_stride; - nc_id += nc_stride; - } - } -} - -template -__global__ void KeNearestNeighborInterpFw( - const T* in, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, FastDivModForInterpolate divmods) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int in_img_size = in_img_h * in_img_w; - int out_img_size = out_img_h * out_img_w; - - for (; tid < nthreads; tid += stride) { - auto out_id_divmod = divmods.output_w_div.Divmod(tid); - int out_id_h = out_id_divmod.val[0]; - int out_id_w = out_id_divmod.val[1]; - - int channel_id = divmods.channels_div.Divmod(tid).val[1]; - auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); - int out_img_idy = outimg_id_divmod.val[0]; - int out_img_idx = - divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; - - int in_img_idy = (align_corners) - ? static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - int in_img_idx = (align_corners) - ? 
static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - - out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } -} - -template -__global__ void KeNearestNeighbor3DInterpFw( - const T* in, const size_t in_img_d, const size_t in_img_h, - const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, - const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, - const size_t output_h, const size_t output_w, const size_t num_channels, - const float ratio_d, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { - int nthreads = output_h * output_w; // ncdhw - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idt, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; - out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; - out_img_idx = tid % out_img_w; - } else { - out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); - out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / - (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idt = (align_corners) - ? static_cast(ratio_d * out_img_idt + 0.5) - : static_cast(ratio_d * out_img_idt); - - int in_img_idy = (align_corners) - ? static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - int in_img_idx = (align_corners) - ? static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - - if (data_layout == DataLayout::kNCHW) { - out[tid] = in[out_id_h * input_w + channel_id * in_img_size + - in_img_idt * in_img_h * in_img_w + in_img_idy * in_img_w + - in_img_idx]; - } else { - out[tid] = in[out_id_h * input_w + - in_img_idt * in_img_h * in_img_w * num_channels + - in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } - } -} - -template -__global__ void KeNearestNeighborInterpNCHWBw( - T* in, const size_t in_img_h, const size_t in_img_w, const T* out, - const size_t out_img_h, const size_t out_img_w, const size_t nc, - const float ratio_h, const float ratio_w, const bool align_corners) { - int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; - int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; - int nc_id = threadIdx.z + blockIdx.z * blockDim.z; - int nc_stride = blockDim.z * gridDim.z; - - // nearest_sampling by multiple read in_addr and write to out_addr - int in_img_idx = (align_corners) - ? static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - int in_img_idy = (align_corners) - ? 
static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - - int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; - int in_index_stride = nc_stride * in_img_h * in_img_w; - - int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; - int out_index_stride = nc_stride * out_img_h * out_img_w; - - // prevent from multiple threads writing - if (out_img_idx < out_img_w && out_img_idy < out_img_h) { - while (nc_id < nc) { - T* in_pos = &in[in_index]; - const T out_pos = out[out_index]; - platform::CudaAtomicAdd(in_pos, out_pos); - in_index += in_index_stride; - out_index += out_index_stride; - nc_id += nc_stride; - } - } -} - -template -__global__ void KeNearestNeighborInterpBw( - T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, - const size_t input_w, const T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, FastDivModForInterpolate divmods) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int in_img_size = in_img_h * in_img_w; - int out_img_size = out_img_h * out_img_w; - - for (; tid < nthreads; tid += stride) { - auto out_id_divmod = divmods.output_w_div.Divmod(tid); - int out_id_h = out_id_divmod.val[0]; - int out_id_w = out_id_divmod.val[1]; - - int channel_id = divmods.channels_div.Divmod(tid).val[1]; - auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); - int out_img_idy = outimg_id_divmod.val[0]; - int out_img_idx = - divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; - - int in_img_idy = (align_corners) - ? static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - int in_img_idx = (align_corners) - ? 
static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - - T* in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - - const T out_pos = out[tid]; - platform::CudaAtomicAdd(in_pos, out_pos); - } -} - -template -__global__ void KeNearestNeighbor3DInterpBw( - T* in, const size_t in_img_d, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, const T* out, - const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, - const size_t output_h, const size_t output_w, const size_t num_channels, - const float ratio_d, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idt, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; - out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; - out_img_idx = tid % out_img_w; - } else { - out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); - out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / - (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idt = (align_corners) - ? static_cast(ratio_d * out_img_idt + 0.5) - : static_cast(ratio_d * out_img_idt); - int in_img_idy = (align_corners) - ? static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - int in_img_idx = (align_corners) - ? static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - - T* in_pos; - if (data_layout == DataLayout::kNCHW) { - in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - in_img_idt * in_img_h * in_img_w + in_img_idy * in_img_w + - in_img_idx]; - } else { - in_pos = &in[out_id_h * input_w + - in_img_idt * in_img_h * in_img_w * num_channels + - in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } - const T out_pos = out[out_id_h * output_w + out_id_w]; - platform::CudaAtomicAdd(in_pos, out_pos); - } -} - -template -__global__ void KeLinearInterpFw(const T* in, const size_t in_img_w, - const size_t input_w, T* out, - const size_t out_img_w, const size_t output_h, - const size_t output_w, - const size_t num_channels, const float ratio_w, - const bool align_corners, const int align_mode, - const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - bool align_flag = (align_mode == 0 && !align_corners); - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idx = tid % out_img_w; - } else { - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idx = align_flag - ? 
static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) - : static_cast(ratio_w * out_img_idx); - in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w - int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id - - T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; - src_w = (src_w > 0) ? src_w : 0; - T w1lambda = - align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; - - if (data_layout == DataLayout::kNCHW) { - const T* in_pos = - &in[out_id_h * out_id_w + channel_id * in_img_size + in_img_idx]; - // linear interpolation - out[out_id_h * output_w + out_id_w] = - w2lambda * in_pos[0] + w1lambda * in_pos[w_id]; - - } else { - const T* in_pos = - &in[out_id_h * input_w + in_img_idx * num_channels + channel_id]; - // linear interpolation - out[out_id_h * output_w + out_id_w] = - w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]; - } - } -} - -template -__global__ void KeLinearInterpBw(T* in, const size_t in_img_w, - const size_t input_w, const T* out, - const size_t out_img_w, const size_t output_h, - const size_t output_w, - const size_t num_channels, const T ratio_w, - const bool align_corners, const int align_mode, - const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - bool align_flag = (align_mode == 0 && !align_corners); - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idx = tid % out_img_w; - } else { - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 - : ratio_w * out_img_idx; - in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w - int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id - - T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; - src_w = (src_w > 0) ? src_w : 0; - T w1lambda = - align_flag ? 
src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; - - T* in_pos; - if (data_layout == DataLayout::kNCHW) { - in_pos = &in[out_id_h * input_w + channel_id * in_img_size + in_img_idx]; - } else { - in_pos = &in[out_id_h * input_w + in_img_idx * num_channels + channel_id]; - } - const T* out_pos = &out[out_id_w]; - - if (data_layout == DataLayout::kNCHW) { - platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); - } else { - platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - w1lambda * out_pos[0]); - } - } -} - -template -__global__ void KeBilinearInterpNCHWFw(const T* in, const size_t in_img_h, - const size_t in_img_w, T* out, - const size_t out_img_h, - const size_t out_img_w, const size_t nc, - const float ratio_h, const float ratio_w, - const T align_type_value) { - int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; - int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; - int nc_id = threadIdx.z + blockIdx.z * blockDim.z; - int nc_stride = blockDim.z * gridDim.z; - - int in_img_idx, in_img_idy, h_id, w_id; - T h1lambda, w1lambda, h2lambda, w2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_img_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_img_h); - - int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; - int in_index_stride = nc_stride * in_img_h * in_img_w; - - int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; - int out_index_stride = nc_stride * out_img_h * out_img_w; - - // prevent from multiple threads writing - if (out_img_idx < out_img_w && out_img_idy < out_img_h) { - while (nc_id < nc) { - const T* in_pos = &in[in_index]; - out[out_index] = - h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + - h1lambda * (w2lambda * in_pos[h_id * in_img_w] + - w1lambda * in_pos[h_id * in_img_w + w_id]); - - in_index += in_index_stride; - out_index += out_index_stride; - nc_id += nc_stride; - } - } -} - -template -__global__ void KeBilinearInterpFw( - const T* in, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w, - const T align_type_value, FastDivModForInterpolate divmods) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for (; tid < nthreads; tid += stride) { - auto out_id_divmod = divmods.output_w_div.Divmod(tid); - int out_id_h = out_id_divmod.val[0]; - int out_id_w = out_id_divmod.val[1]; - - int channel_id = divmods.channels_div.Divmod(tid).val[1]; - auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); - int out_img_idy = outimg_id_divmod.val[0]; - int out_img_idx = - divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; - - int in_img_idx, in_img_idy, h_id, w_id; - T h1lambda, w1lambda, h2lambda, w2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - 
PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_img_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_img_h); - - // bilinear interpolation - const T* in_pos = - &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - out[tid] = - h2lambda * - (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) + - h1lambda * - (w2lambda * in_pos[h_id * in_img_w * num_channels] + - w1lambda * - in_pos[h_id * in_img_w * num_channels + w_id * num_channels]); - } -} - -/* Calculate the minimum of partial elements in a block */ -template -__inline__ __device__ T PartialBlockMin(T val, size_t threads_num_in_block, - unsigned mask) { - __shared__ T shared[WARP_SIZE]; - __shared__ T shared_last_val; - __shared__ int shared_last_idx; - int lane = threadIdx.x & 0x1f; - int wid = threadIdx.x >> 5; - int threshold = (threads_num_in_block & (-WARP_SIZE)); - - if (threadIdx.x < threshold) { - shared_last_idx = (threshold >> 5) - 1; - val = phi::funcs::warpReduceMin(val, mask); - if (lane == 0) { - shared[wid] = val; - } - } else { - shared_last_val = std::numeric_limits::max(); - platform::CudaAtomicMin(&shared_last_val, val); - shared[wid] = shared_last_val; - shared_last_idx = wid; - } - __syncthreads(); - - if (threadIdx.x < threshold) { - val = (lane <= shared_last_idx) ? shared[lane] - : std::numeric_limits::max(); - val = phi::funcs::warpReduceMin(val, mask); - shared_last_val = val; - } - __syncthreads(); - if (threadIdx.x >= threshold) { - val = shared_last_val; - } - return val; -} - -template -__global__ void KeBilinearInterpBwShareMemory( - T* in, const int in_h, const int in_w, const T* __restrict__ out, - const int out_h, const int out_w, const int n, const int num_channels, - float ratio_h, float ratio_w, const T align_type_value, bool is_nchw) { - __shared__ T s_data[2][1024]; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int in_chw = in_h * in_w * num_channels; - int out_chw = num_channels * out_h * out_w; - int nthreads = n * out_chw; - - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_w; - int out_img_idx = tid % out_w; - - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - // top_left_index is just input_index. 
- int input_index = out_id_h * in_chw + channel_id * in_img_size + - in_img_idy * in_w + in_img_idx; - int top_right_index = input_index + w_id; - int bot_left_index = input_index + h_id * in_w; - int bot_right_index = input_index + h_id * in_w + w_id; - int in_top_min_index, in_bot_min_index; - - s_data[0][threadIdx.x] = 0.f; - s_data[1][threadIdx.x] = 0.f; - int remain = nthreads - (tid & (-blockDim.x)); - int in_top_max_index = - phi::funcs::blockReduceMax(top_right_index, FINAL_MASK); - int in_bot_max_index = - phi::funcs::blockReduceMax(bot_right_index, FINAL_MASK); - - if (remain > blockDim.x) { - in_top_min_index = phi::funcs::blockReduceMin(input_index, FINAL_MASK); - in_bot_min_index = phi::funcs::blockReduceMin(bot_left_index, FINAL_MASK); - } else { - in_top_min_index = PartialBlockMin(input_index, remain, FINAL_MASK); - in_bot_min_index = PartialBlockMin(bot_left_index, remain, FINAL_MASK); - } - int upper_limit_share_idx = (in_top_max_index - in_top_min_index) > - (in_bot_max_index - in_bot_min_index) - ? (in_top_max_index - in_top_min_index) - : (in_bot_max_index - in_bot_min_index); - if (h_id != 0) { - platform::CudaAtomicAdd(&s_data[0][input_index - in_top_min_index], - h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&s_data[0][top_right_index - in_top_min_index], - h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&s_data[1][bot_left_index - in_bot_min_index], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd(&s_data[1][bot_right_index - in_bot_min_index], - h1lambda * w1lambda * value); - } else { - platform::CudaAtomicAdd(&s_data[0][top_right_index - in_top_min_index], - (h2lambda + h1lambda) * w1lambda * value); - platform::CudaAtomicAdd(&s_data[1][bot_left_index - in_bot_min_index], - (h1lambda + h2lambda) * w2lambda * value); - } - __syncthreads(); - - if (threadIdx.x <= upper_limit_share_idx) { - platform::CudaAtomicAdd(&in[in_top_min_index + threadIdx.x], - s_data[0][threadIdx.x]); - platform::CudaAtomicAdd(&in[in_bot_min_index + threadIdx.x], - s_data[1][threadIdx.x]); - } - } -} - -__device__ __forceinline__ int GetInputIndex(const size_t nc, const int height, - const int width, const int h, - const int w) { - return (nc * height + h) * width + w; -} - -template -__global__ void KeBilinearInterpNCHWBw(T* in, const int in_h, const int in_w, - const int out_h, const int out_w, - const int n, const int num_channels, - float ratio_h, float ratio_w, - const T* __restrict__ out, - const T align_type_value) { - int index = threadIdx.x + blockDim.x * blockIdx.x; - int stride = blockDim.x * gridDim.x; - int num_out = n * num_channels * out_h * out_w; - int num_in = n * num_channels * in_h * in_w; - - for (; index < num_out; index += stride) { - int index_tmp = index; - int w2 = index_tmp % out_w; - index_tmp /= out_w; - int h2 = index_tmp % out_h; - int nc = index_tmp / out_h; - - int h1, y_id; - T h1lambda, h0lambda; - T src_y = ratio_h * (h2 + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&h1, &y_id, &h1lambda, &h0lambda, - src_y, in_h); - int w1, x_id; - T w1lambda, w0lambda; - T src_x = ratio_w * (w2 + align_type_value) - align_type_value; - PreCalculatorForLinearInterpInputIndex(&w1, &x_id, &w1lambda, &w0lambda, - src_x, in_w); - - T d2val = out[index]; - - platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), - h0lambda * w0lambda * d2val); - platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), - h0lambda * w1lambda * d2val); - platform::CudaAtomicAdd(in + GetInputIndex(nc, 
in_h, in_w, h1 + y_id, w1), - h1lambda * w0lambda * d2val); - platform::CudaAtomicAdd( - in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), - h1lambda * w1lambda * d2val); - } -} - -template -__global__ void KeBilinearInterpBw(T* in, const int in_h, const int in_w, - const T* __restrict__ out, const int out_h, - const int out_w, const int n, - const int out_chw, const int num_channels, - float ratio_h, float ratio_w, - const T align_type_value, - FastDivModForInterpolate divmods) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int in_chw = in_h * in_w * num_channels; - int nthreads = n * out_chw; - - for (; tid < nthreads; tid += stride) { - auto out_id_divmod = divmods.output_w_div.Divmod(tid); - int out_id_h = out_id_divmod.val[0]; - int out_id_w = out_id_divmod.val[1]; - - int channel_id = divmods.channels_div.Divmod(tid).val[1]; - auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); - int out_img_idy = outimg_id_divmod.val[0]; - int out_img_idx = - divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; - - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T value = out[tid]; - T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + - in_img_idx * num_channels + channel_id]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd( - &in_pos[h_id * in_w * num_channels + w_id * num_channels], - h1lambda * w1lambda * value); - } -} - -template -__global__ void KeTrilinearInterpFw( - const T* in, const size_t in_img_d, const size_t in_img_h, - const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, - const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, - const size_t output_h, const size_t output_w, const size_t num_channels, - const float ratio_d, const float ratio_h, const float ratio_w, - const bool align_corners, const int align_mode, - const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - bool align_flag = (align_mode == 0 && !align_corners); - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idt, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; - out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; - out_img_idx = tid % out_img_w; - } else { - out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); - out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / - (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idt = align_flag - ? 
static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) - : static_cast(ratio_d * out_img_idt); - in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; - int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; - T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; - src_d = (src_d > 0) ? src_d : 0; - T d1lambda = - align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; - T d2lambda = 1.f - d1lambda; - - int in_img_idy = align_flag - ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) - : static_cast(ratio_h * out_img_idy); - in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; - int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; - src_h = (src_h > 0) ? src_h : 0; - T h1lambda = - align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; - T h2lambda = 1.f - h1lambda; - - int in_img_idx = align_flag - ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) - : static_cast(ratio_w * out_img_idx); - in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; - int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; - T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; - src_w = (src_w > 0) ? src_w : 0; - T w1lambda = - align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; - - if (data_layout == DataLayout::kNCHW) { - int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + - (in_img_idt * in_img_h + in_img_idy) * in_img_w + - in_img_idx; - const T* in_pos1 = &in[in_pos1_idx]; - int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; - const T* in_pos2 = &in[in_pos2_idx]; - - // trilinear interpolation - out[out_id_h * output_w + out_id_w] = - d2lambda * - (h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) + - h1lambda * (w2lambda * in_pos1[h_id * in_img_w] + - w1lambda * in_pos1[h_id * in_img_w + w_id])) + - d1lambda * - (h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) + - h1lambda * (w2lambda * in_pos2[h_id * in_img_w] + - w1lambda * in_pos2[h_id * in_img_w + w_id])); - - } else { - int in_pos1_idx = out_id_h * input_w + - in_img_idt * in_img_h * in_img_w * num_channels + - in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id; - const T* in_pos1 = &in[in_pos1_idx]; - int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; - const T* in_pos2 = &in[in_pos2_idx]; - - // trilinear interpolation - out[out_id_h * output_w + out_id_w] = - d2lambda * - (h2lambda * (w2lambda * in_pos1[0] + - w1lambda * in_pos1[w_id * num_channels]) + - h1lambda * (w2lambda * in_pos1[h_id * in_img_w * num_channels] + - w1lambda * in_pos1[h_id * in_img_w * num_channels + - w_id * num_channels])) + - d1lambda * - (h2lambda * (w2lambda * in_pos2[0] + - w1lambda * in_pos2[w_id * num_channels]) + - h1lambda * (w2lambda * in_pos2[h_id * in_img_w * num_channels] + - w1lambda * in_pos2[h_id * in_img_w * num_channels + - w_id * num_channels])); - } - } -} - -template -__global__ void KeTrilinearInterpBw( - T* in, const size_t in_img_d, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, const T* out, - const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, - const size_t output_h, const size_t output_w, const size_t num_channels, - const T ratio_d, const T ratio_h, const T ratio_w, const bool align_corners, - const int align_mode, const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - bool align_flag = (align_mode == 0 && 
!align_corners); - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idt, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; - out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; - out_img_idx = tid % out_img_w; - } else { - out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); - out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / - (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idt = align_flag - ? static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) - : static_cast(ratio_d * out_img_idt); - in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; - int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; - T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; - src_d = (src_d > 0) ? src_d : 0; - T d1lambda = - align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; - T d2lambda = 1.f - d1lambda; - - int in_img_idy = align_flag - ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) - : static_cast(ratio_h * out_img_idy); - in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; - int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; - src_h = (src_h > 0) ? src_h : 0; - T h1lambda = - align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; - T h2lambda = 1.f - h1lambda; - - int in_img_idx = align_flag - ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) - : static_cast(ratio_w * out_img_idx); - in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; - int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; - T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; - src_w = (src_w > 0) ? src_w : 0; - T w1lambda = - align_flag ? 
src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; - - if (data_layout == DataLayout::kNCHW) { - int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + - (in_img_idt * in_img_h + in_img_idy) * in_img_w + - in_img_idx; - T* in_pos1 = &in[in_pos1_idx]; - int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; - T* in_pos2 = &in[in_pos2_idx]; - - const T* out_pos = &out[out_id_h * output_w + out_id_w]; - - // trilinear interpolation grad - platform::CudaAtomicAdd(&in_pos1[0], - d2lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[w_id], - d2lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w], - d2lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], - d2lambda * h1lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[0], - d1lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[w_id], - d1lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w], - d1lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], - d1lambda * h1lambda * w1lambda * out_pos[0]); - } else { - int in_pos1_idx = out_id_h * input_w + - in_img_idt * in_img_h * in_img_w * num_channels + - in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id; - T* in_pos1 = &in[in_pos1_idx]; - int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; - T* in_pos2 = &in[in_pos2_idx]; - - const T* out_pos = &out[out_id_h * output_w + out_id_w]; - - // trilinear interpolation grad - platform::CudaAtomicAdd(&in_pos1[0], - d2lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[w_id * num_channels], - d2lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], - d2lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( - &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels], - d2lambda * h1lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[0], - d1lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[w_id * num_channels], - d1lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], - d1lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( - &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels], - d1lambda * h1lambda * w1lambda * out_pos[0]); - } - } -} - -template -__device__ __forceinline__ static T Kecubic_interp(const T x0, const T x1, - const T x2, const T x3, - T t) { - T coeffs[4]; - T a = -0.75; - T x_1 = t; - T x_2 = 1.0 - t; - coeffs[0] = cubic_convolution2(x_1 + 1.0, a); - coeffs[1] = cubic_convolution1(x_1, a); - coeffs[2] = cubic_convolution1(x_2, a); - coeffs[3] = cubic_convolution2(x_2 + 1.0, a); - return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; -} - -template -__global__ void KeBicubicInterpFw( - const T* in, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * 
gridDim.x; - - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idy, out_img_idx; - - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idy = (out_id_w % out_img_size) / out_img_w; - out_img_idx = tid % out_img_w; - } else { - out_img_idy = out_id_w / (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - T in_img_idy = align_corners - ? static_cast(ratio_h * out_img_idy) - : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); - int input_y = floorf(in_img_idy); - const T y_t = in_img_idy - input_y; - - T in_img_idx = align_corners - ? static_cast(ratio_w * out_img_idx) - : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); - int input_x = floorf(in_img_idx); - const T x_t = in_img_idx - input_x; - - T coefficients[4]; - const T* in_pos_0; - const T* in_pos_1; - const T* in_pos_2; - const T* in_pos_3; - int access_x_0; - if (data_layout == DataLayout::kNCHW) { - for (int k = 0; k < 4; k++) { - int access_y = - max(min(input_y - 1 + k, static_cast(in_img_h - 1)), 0); - access_x_0 = max(min(input_x - 1, static_cast(in_img_w - 1)), 0); - int access_x_1 = - max(min(input_x + 0, static_cast(in_img_w - 1)), 0); - int access_x_2 = - max(min(input_x + 1, static_cast(in_img_w - 1)), 0); - int access_x_3 = - max(min(input_x + 2, static_cast(in_img_w - 1)), 0); - - in_pos_0 = &in[out_id_h * input_w + channel_id * in_img_size + - access_y * in_img_w + access_x_0]; - in_pos_1 = &in[out_id_h * input_w + channel_id * in_img_size + - access_y * in_img_w + access_x_1]; - in_pos_2 = &in[out_id_h * input_w + channel_id * in_img_size + - access_y * in_img_w + access_x_2]; - in_pos_3 = &in[out_id_h * input_w + channel_id * in_img_size + - access_y * in_img_w + access_x_3]; - - coefficients[k] = Kecubic_interp(in_pos_0[0], in_pos_1[0], - in_pos_2[0], in_pos_3[0], x_t); - } - - out[out_id_h * output_w + out_id_w] = - Kecubic_interp(coefficients[0], coefficients[1], coefficients[2], - coefficients[3], y_t); - - } else { - for (int k = 0; k < 4; k++) { - int access_y = - max(min(input_y - 1 + k, static_cast((in_img_h - 1))), 0); - int access_x_0 = - max(min(input_x - 1, static_cast((in_img_w - 1))), 0); - int access_x_1 = - max(min(input_x + 0, static_cast((in_img_w - 1))), 0); - int access_x_2 = - max(min(input_x + 1, static_cast((in_img_w - 1))), 0); - int access_x_3 = - max(min(input_x + 2, static_cast((in_img_w - 1))), 0); - - const T* in_pos_0 = - &in[out_id_h * input_w + access_y * in_img_w * num_channels + - access_x_0 * num_channels + channel_id]; - const T* in_pos_1 = - &in[out_id_h * input_w + access_y * in_img_w * num_channels + - access_x_1 * num_channels + channel_id]; - const T* in_pos_2 = - &in[out_id_h * input_w + access_y * in_img_w * num_channels + - access_x_2 * num_channels + channel_id]; - const T* in_pos_3 = - &in[out_id_h * input_w + access_y * in_img_w * num_channels + - access_x_3 * num_channels + channel_id]; - - coefficients[k] = Kecubic_interp(in_pos_0[0], in_pos_1[0], in_pos_2[0], - in_pos_3[0], x_t); - } - - out[out_id_h * output_w + out_id_w] = - static_cast(Kecubic_interp(coefficients[0], coefficients[1], - coefficients[2], coefficients[3], y_t)); - } - } -} - -template -__global__ void KeBicubicInterpBw( - T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, - const 
size_t input_w, const T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idy = (out_id_w % out_img_size) / out_img_w; - out_img_idx = tid % out_img_w; - } else { - out_img_idy = out_id_w / (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - T in_img_idy = align_corners - ? static_cast(ratio_h * out_img_idy) - : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); - int input_y = floorf(in_img_idy); - const T y_t = in_img_idy - input_y; - - T in_img_idx = align_corners - ? static_cast(ratio_w * out_img_idx) - : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); - int input_x = floorf(in_img_idx); - - const T x_t = in_img_idx - input_x; - - T x_coeffs[4]; - T y_coeffs[4]; - - get_cubic_upsample_coefficients(x_coeffs, x_t); - get_cubic_upsample_coefficients(y_coeffs, y_t); - - const T* out_pos = &out[out_id_h * output_w + out_id_w]; - T* in_pos; - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - int access_y = max(min(static_cast(input_y - 1 + j), - static_cast(in_img_h - 1)), - 0); - int access_x = max(min(static_cast(input_x - 1 + i), - static_cast(in_img_w - 1)), - 0); - if (data_layout == DataLayout::kNCHW) { - in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - access_y * in_img_w + access_x]; - } else { - in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels + - access_x * num_channels + channel_id]; - } - platform::CudaAtomicAdd(&in_pos[0], - (out_pos[0] * y_coeffs[j] * x_coeffs[i])); - } - } - } -} - -template -static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - auto* input_data = input.data(); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_w = ctx.Attr("out_w"); - - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); - float scale_w = -1; - if (list_new_shape_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_shape_tensor); - out_w = new_size[0]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } else { - if (scale.size() > 0) { - scale_w = scale[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - 
platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } - } - if (scale_w > 0.) { - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_w = size_data[0]; - } - } - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_w}; - } else { - dim_out = {n, out_w, c}; - } - auto output_data = output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_w = 0.f; - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? static_cast(in_w - 1.0) / (out_w - 1.0) - : static_cast(new_scale_w); - } - - int64_t in_cw = c * in_w; - int64_t out_cw = c * out_w; - auto pixelNum = n * out_cw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("linear" == interp_method) { - KeLinearInterpFw<<>>( - input_data, in_w, in_cw, output_data, out_w, n, out_cw, c, ratio_w, - align_corners, align_mode, data_layout); - } -} - -template -static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - auto* input_data = input.data(); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); - float scale_w = -1; - float scale_h = -1; - if (list_new_shape_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_shape_tensor); - out_h = new_size[0]; - out_w = new_size[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_w = scale[1]; - scale_h = scale[0]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - 
platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_w > 0. && scale_h > 0.) { - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_h = size_data[0]; - out_w = size_data[1]; - } - } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_h, out_w}; - } else { - dim_out = {n, out_h, out_w, c}; - } - auto output_data = output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - int64_t in_hw = in_h * in_w; - int64_t out_hw = out_h * out_w; - int64_t in_chw = c * in_hw; - int64_t out_chw = c * out_hw; - - auto pixelNum = n * out_chw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("nearest" == interp_method) { - if (data_layout == DataLayout::kNCHW) { - // get launch 3D config - int nc = n * c; - platform::GpuLaunchConfig config_3d = - GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); - KeNearestNeighborInterpNCHWFw< - T><<>>( - input_data, in_h, in_w, output_data, out_h, out_w, nc, ratio_h, - ratio_w, align_corners); - } else { - int64_t cw = c * out_w; - auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); - KeNearestNeighborInterpFw< - T><<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners, interp_divmods); - } - } else if ("bilinear" == interp_method) { - dim3 thread_num = config.thread_per_block; -#ifdef WITH_NV_JETSON - if (config.compute_capability == 53 || config.compute_capability == 62) { - thread_num = 512; - } -#endif - const T align_type_value = (align_mode == 0 && !align_corners) ? 
0.5f : 0; - if (data_layout == DataLayout::kNCHW) { - // get launch 3D config - int nc = n * c; - platform::GpuLaunchConfig config_3d = - GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); - KeBilinearInterpNCHWFw< - T><<>>( - input_data, in_h, in_w, output_data, out_h, out_w, nc, ratio_h, - ratio_w, align_type_value); - } else { - int64_t cw = c * out_w; - auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); - KeBilinearInterpFw<<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods); - } - } else if ("bicubic" == interp_method) { -#ifdef __HIPCC__ - constexpr int thread_per_block = 256; -#else - constexpr int thread_per_block = 512; -#endif - KeBicubicInterpFw<<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners, data_layout); - } -} - -template -static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - auto* input_data = input.data(); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_d = ctx.Attr("out_d"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); - float scale_w = -1; - float scale_d = -1; - float scale_h = -1; - if (list_new_shape_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_shape_tensor); - out_d = new_size[0]; - out_h = new_size[1]; - out_w = new_size[2]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_d = scale_data[0]; - scale_h = scale_data[1]; - scale_w = scale_data[2]; - } else { - scale_d = scale_data[0]; - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } else { - if (scale.size() > 1) { - scale_d = scale[0]; - scale_h = scale[1]; - scale_w = scale[2]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in 
Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } - } - if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { - out_d = static_cast(in_d * scale_d); - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_d = size_data[0]; - out_h = size_data[1]; - out_w = size_data[2]; - } - } - PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument( - "out_d in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_d, out_h, out_w}; - } else { - dim_out = {n, out_d, out_h, out_w, c}; - } - auto output_data = output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_d == out_d && in_h == out_h && in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_d = 0.f; - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_d > 1) { - float new_scale_d = 0.f; - new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) - : static_cast(in_d) / out_d; - ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(new_scale_d); - } - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - int64_t in_dhw = in_d * in_h * in_w; - int64_t out_dhw = out_d * out_h * out_w; - int64_t in_cdhw = c * in_dhw; - int64_t out_cdhw = c * out_dhw; - - auto pixelNum = n * out_cdhw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("trilinear" == interp_method) { - KeTrilinearInterpFw<<>>( - input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, - out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, - align_mode, data_layout); - } else if ("nearest" == interp_method) { - KeNearestNeighbor3DInterpFw< - T><<>>( - input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, - out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, - data_layout); - } -} - -template -static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, const Tensor output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_w = ctx.Attr("out_w"); - float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } else { - if (scale.size() > 0) { - scale_w = scale[0]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } - } - if (scale_w > 0.) { - out_w = static_cast(in_w * scale_w); - } - - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_w = size_data[0]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_w = new_size[0]; - } - - auto* output_grad_data = output_grad.data(); - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_w}; - } else { - dim_grad = {n, in_w, c}; - } - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_w = 0.f; - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - int64_t in_cw = c * in_w; - int64_t out_cw = c * out_w; - auto pixelNum = n * out_cw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("linear" == interp_method) { - KeLinearInterpBw<<>>( - input_grad_data, in_w, in_cw, output_grad_data, out_w, n, out_cw, c, - ratio_w, align_corners, align_mode, data_layout); - } -} - -template -static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, const Tensor output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_w = scale[1]; - scale_h = scale[0]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_w > 0. && scale_h > 0.) 
{ - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_h = size_data[0]; - out_w = size_data[1]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_h = new_size[0]; - out_w = new_size[1]; - } - - auto* output_grad_data = output_grad.data(); - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_h, in_w}; - } else { - dim_grad = {n, in_h, in_w, c}; - } - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - int64_t in_hw = in_h * in_w; - int64_t out_hw = out_h * out_w; - int64_t in_chw = c * in_hw; - int64_t out_chw = c * out_hw; - auto pixelNum = n * out_chw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("nearest" == interp_method) { - if (data_layout == DataLayout::kNCHW) { - // get launch 3D config - int nc = n * c; - platform::GpuLaunchConfig config_3d = - GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); - KeNearestNeighborInterpNCHWBw< - T><<>>( - input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, nc, - ratio_h, ratio_w, align_corners); - } else { - int64_t cw = c * out_w; - auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); - KeNearestNeighborInterpBw< - T><<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, - out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, - interp_divmods); - } - } else if ("bilinear" == interp_method) { - const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0; - bool is_nchw = (data_layout == DataLayout::kNCHW) ? true : false; - bool optimize_flag = false; -#ifndef __HIPCC__ - optimize_flag = (in_h < (out_h >> 6) && in_w < (out_w >> 6)) - ? true - : ((in_h == 1 && in_w == 1) ? 
true : false); -#endif - - if (optimize_flag & is_nchw) { - KeBilinearInterpBwShareMemory< - T><<>>( - input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, - ratio_h, ratio_w, align_type_value, is_nchw); - } else if (!optimize_flag & is_nchw) { - // - const int num_kernels = n * c * out_h * out_w; - const int num_threads = - std::min(ctx.cuda_device_context().GetMaxThreadsPerBlock(), 1024); - KeBilinearInterpNCHWBw< - T><<>>( - input_grad_data, in_h, in_w, out_h, out_w, n, c, ratio_h, ratio_w, - output_grad_data, align_type_value); - } else { - int64_t cw = c * out_w; - auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); - KeBilinearInterpBw<<>>( - input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods); - } - } else if ("bicubic" == interp_method) { -#ifdef __HIPCC__ - constexpr int thread_per_block = 256; -#else - constexpr int thread_per_block = 512; -#endif - KeBicubicInterpBw<<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, - n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); - } -} - -template -static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, - const Tensor& output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_d = ctx.Attr("out_d"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_d = scale_data[0]; - scale_h = scale_data[1]; - scale_w = scale_data[2]; - } else { - scale_d = scale_data[0]; - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } else { - if (scale.size() > 1) { - scale_d = scale[0]; - scale_h = scale[1]; - scale_w = scale[2]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in Attr(scale) of Operator(interpolate) " - 
"should be greater than 0, but received value is %d.", - scale_d)); - } - } - if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { - out_d = static_cast(in_d * scale_d); - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_d = size_data[0]; - out_h = size_data[1]; - out_w = size_data[2]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_d = new_size[0]; - out_h = new_size[1]; - out_w = new_size[2]; - } - - auto* output_grad_data = output_grad.data(); - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_d, in_h, in_w}; - } else { - dim_grad = {n, in_d, in_h, in_w, c}; - } - auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_d == out_d && in_h == out_h && in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_d = 0.f; - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_d > 1) { - float new_scale_d = 0.f; - new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) - : static_cast(in_d) / out_d; - ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(new_scale_d); - } - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - int64_t in_dhw = in_d * in_h * in_w; - int64_t out_dhw = out_d * out_h * out_w; - int64_t in_cdhw = c * in_dhw; - int64_t out_cdhw = c * out_dhw; - - auto pixelNum = n * out_cdhw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("trilinear" == interp_method) { - KeTrilinearInterpBw<<>>( - input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, - out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, - align_mode, data_layout); - } else if ("nearest" == interp_method) { - KeNearestNeighbor3DInterpBw< - T><<>>( - input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, - out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, - data_layout); - } -} - -template -class InterpolateOpV2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::NotFound("This kernel only runs on GPU device.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - - auto input_dims = input->dims(); - if (input_dims.size() == 3) { // 1D interpolation - Interpolate1DCUDAFwd(ctx, *input, output); - } else if (input_dims.size() == 4) { // 2D interpolation - Interpolate2DCUDAFwd(ctx, *input, output); - } else if (input_dims.size() == 5) { // 3D interpolation - Interpolate3DCUDAFwd(ctx, *input, output); - } - } -}; - -template -class InterpolateV2GradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::NotFound("This kernel only runs on GPU device.")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - - auto output_grad_dims = output_grad->dims(); - if (output_grad_dims.size() == 3) { // 1D interpolation - Interpolate1DCUDABwd(ctx, input_grad, *output_grad); - } else if (output_grad_dims.size() == 4) { // 2D interpolation - Interpolate2DCUDABwd(ctx, input_grad, *output_grad); - } else if (output_grad_dims.size() == 5) { // 3D interpolation - Interpolate3DCUDABwd(ctx, input_grad, *output_grad); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(bilinear_interp_v2, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel); -REGISTER_OP_CUDA_KERNEL(bilinear_interp_v2_grad, - ops::InterpolateV2GradOpCUDAKernel, - ops::InterpolateV2GradOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(nearest_interp_v2, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel); -REGISTER_OP_CUDA_KERNEL(nearest_interp_v2_grad, - ops::InterpolateV2GradOpCUDAKernel, - ops::InterpolateV2GradOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(trilinear_interp_v2, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel); -REGISTER_OP_CUDA_KERNEL(trilinear_interp_v2_grad, - ops::InterpolateV2GradOpCUDAKernel, - ops::InterpolateV2GradOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(linear_interp_v2, ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel); 
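
[Reference note, not part of the patch] The forward and backward kernels deleted above (linear, bilinear, trilinear, nearest, bicubic) all share the same output-to-input coordinate mapping: with align_corners the ratio is (in - 1)/(out - 1); otherwise it is 1/scale when a positive scale is supplied, else in/out, and the linear-family kernels additionally apply a half-pixel shift when align_mode == 0. The standalone C++ sketch below restates that mapping for reference; the helper names ComputeRatio and SourceIndex are illustrative only and do not appear in the patch.

    // Illustrative sketch of the coordinate mapping used by the kernels above.
    #include <algorithm>

    // Ratio used by both the forward and backward paths. Returns 0 when the
    // output extent is 1, matching the deleted code's "if (out_w > 1)" guard.
    inline float ComputeRatio(int in_size, int out_size, float scale,
                              bool align_corners) {
      if (out_size <= 1) return 0.f;
      if (align_corners) {
        return static_cast<float>(in_size - 1) / (out_size - 1);
      }
      return (scale > 0.f) ? 1.f / scale
                           : static_cast<float>(in_size) / out_size;
    }

    // Source coordinate for one output index in the linear/bilinear/trilinear
    // and bicubic kernels. align_type_value is 0.5f when align_mode == 0 and
    // align_corners is false (half-pixel), otherwise 0. Nearest-neighbor
    // kernels instead round ratio * out_idx (adding 0.5 only if align_corners).
    inline float SourceIndex(int out_idx, float ratio, float align_type_value) {
      float idx = ratio * (out_idx + align_type_value) - align_type_value;
      return std::max(idx, 0.f);
    }
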
-REGISTER_OP_CUDA_KERNEL(linear_interp_v2_grad, - ops::InterpolateV2GradOpCUDAKernel, - ops::InterpolateV2GradOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(bicubic_interp_v2, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel); -REGISTER_OP_CUDA_KERNEL(bicubic_interp_v2_grad, - ops::InterpolateV2GradOpCUDAKernel, - ops::InterpolateV2GradOpCUDAKernel); diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h deleted file mode 100644 index f99d3f6c32..0000000000 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ /dev/null @@ -1,1618 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenTensor = framework::EigenTensor; -using Tensor = framework::Tensor; -using DataLayout = framework::DataLayout; - -inline std::vector get_new_shape( - const std::vector& list_new_shape_tensor) { - // get tensor from - std::vector vec_new_shape; - for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { - auto tensor = list_new_shape_tensor[i]; - PADDLE_ENFORCE_EQ(tensor->dims(), phi::make_ddim({1}), - platform::errors::InvalidArgument( - "The shape of dimension tensor should be [1]," - "but received d%.", - tensor->dims())); - if (platform::is_gpu_place(tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - vec_new_shape.push_back(static_cast(*temp.data())); - } else { - vec_new_shape.push_back(static_cast(*tensor->data())); - } - } - - return vec_new_shape; -} - -template -inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { - std::vector vec_new_data; - auto* new_data = new_data_tensor->data(); - framework::Tensor cpu_starts_tensor; - if (platform::is_gpu_place(new_data_tensor->place())) { - paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); - new_data = cpu_starts_tensor.data(); - } -#ifdef PADDLE_WITH_ASCEND_CL - if (platform::is_npu_place(new_data_tensor->place())) { - paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); - new_data = cpu_starts_tensor.data(); - } -#endif -#ifdef PADDLE_WITH_XPU - if (platform::is_xpu_place(new_data_tensor->place())) { - paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); - new_data = cpu_starts_tensor.data(); - } -#endif - vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); - return vec_new_data; -} - -inline void ExtractNCDWH(const framework::DDim& dims, - const DataLayout& data_layout, int* N, int* C, int* D, - int* H, int* W) { - *N = dims[0]; - - if (dims.size() == 3) { - *C = data_layout == DataLayout::kNCHW ? 
dims[1] : dims[2]; - *D = 1; - *H = 1; - *W = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - } else if (dims.size() == 4) { - *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[3]; - *D = 1; - *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - *W = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; - } else { - *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[4]; - *D = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - *H = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; - *W = data_layout == DataLayout::kNCHW ? dims[4] : dims[3]; - } -} - -template -static void NearestNeighborInterpolate(const Tensor& input, Tensor* output, - const float ratio_h, const float ratio_w, - const int n, const int c, - const int out_h, const int out_w, - const bool align_corners, - const DataLayout& data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - for (int k = 0; k < out_h; k++) { // loop for images - int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) - : static_cast(ratio_h * k); - - for (int l = 0; l < out_w; l++) { - int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) - : static_cast(ratio_w * l); - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - if (data_layout == DataLayout::kNCHW) { - output_t(i, j, k, l) = input_t(i, j, in_k, in_l); - } else { - output_t(i, k, l, j) = input_t(i, in_k, in_l, j); - } - } - } - } - } -} - -template -static void NearestNeighbor3DInterpolate( - const Tensor& input, Tensor* output, const float ratio_d, - const float ratio_h, const float ratio_w, const int n, const int c, - const int out_d, const int out_h, const int out_w, const bool align_corners, - const DataLayout& data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - for (int d = 0; d < out_d; d++) { // loop for images - int in_d = (align_corners) ? static_cast(ratio_d * d + 0.5) - : static_cast(ratio_d * d); - for (int k = 0; k < out_h; k++) { - int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) - : static_cast(ratio_h * k); - - for (int l = 0; l < out_w; l++) { - int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) - : static_cast(ratio_w * l); - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - if (data_layout == DataLayout::kNCHW) { - output_t(i, j, d, k, l) = input_t(i, j, in_d, in_k, in_l); - } else { // NDHWC - output_t(i, d, k, l, j) = input_t(i, in_d, in_k, in_l, j); - } - } - } - } - } - } -} - -template -static void LinearInterpolation(const Tensor& input, Tensor* output, - const float ratio_w, const int in_w, - const int n, const int c, const int out_w, - const bool align_corners, const bool align_mode, - const DataLayout data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - bool align_flag = (align_mode == 0 && !align_corners); - - std::vector vx_w, vx_e; - std::vector vd_w, vd_e; - vx_w.reserve(out_w); - vx_e.reserve(out_w); - vd_w.reserve(out_w); - vd_e.reserve(out_w); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int l = 0; l < out_w; l++) { - int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; // w - int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w; // w_id - - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? 
idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; // w1lambda - float d_e = 1.f - d_w; // w2lambda - { - vx_w[l] = x_w; - vx_e[l] = x_e; - vd_w[l] = d_w; - vd_e[l] = d_e; - } - } - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for collapse(3) -#endif - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - for (int l = 0; l < out_w; l++) { - // linear interpolation - T out_t; - if (data_layout == DataLayout::kNCHW) { - out_t = input_t(i, j, vx_w[l]) * vd_e[l] + - input_t(i, j, vx_e[l]) * vd_w[l]; - output_t(i, j, l) = out_t; - } else { - out_t = input_t(i, vx_w[l], j) * vd_e[l] + - input_t(i, vx_e[l], j) * vd_w[l]; - output_t(i, l, j) = out_t; - } - } - } - } -} - -template -static void LinearInterpolationGrad(const Tensor& output_grad, - Tensor* input_grad, const float ratio_w, - const int in_w, const int n, const int c, - const int out_w, const bool align_corners, - const int align_mode, - const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - bool align_flag = (align_mode == 0 && !align_corners); - for (int l = 0; l < out_w; l++) { - int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; // w - int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w; // w_id - - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; // w1lambda - float d_e = 1.f - d_w; // w2lambda - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - // linear interpolation grad - if (data_layout == DataLayout::kNCHW) { - const T grad = output_grad_t(i, j, l); - input_grad_t(i, j, x_w) += static_cast(grad * d_e); - input_grad_t(i, j, x_e) += static_cast(grad * d_w); - } else { - const T grad = output_grad_t(i, l, j); - input_grad_t(i, x_w, j) += static_cast(grad * d_e); - input_grad_t(i, x_e, j) += static_cast(grad * d_w); - } - } - } - } -} - -template -static void BilinearInterpolation(const Tensor& input, Tensor* output, - const float ratio_h, const float ratio_w, - const int in_h, const int in_w, const int n, - const int c, const int out_h, const int out_w, - const bool align_corners, - const bool align_mode, - const DataLayout data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - bool align_flag = (align_mode == 0 && !align_corners); - - std::vector vy_n, vy_s; - std::vector vd_n, vd_s; - vy_n.reserve(out_h); - vy_s.reserve(out_h); - vd_n.reserve(out_h); - vd_s.reserve(out_h); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int k = 0; k < out_h; k++) { - int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) - : static_cast(ratio_h * k); - y_n = (y_n > 0) ? y_n : 0; - int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); - float idx_src_y = ratio_h * (k + 0.5) - 0.5; - idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; - float d_n = align_flag ? 
idx_src_y - y_n : ratio_h * k - y_n; - float d_s = 1.f - d_n; - { - vy_n[k] = y_n; - vy_s[k] = y_s; - vd_n[k] = d_n; - vd_s[k] = d_s; - } - } - - std::vector vx_w, vx_e; - std::vector vd_w, vd_e; - vx_w.reserve(out_w); - vx_e.reserve(out_w); - vd_w.reserve(out_w); - vd_e.reserve(out_w); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int l = 0; l < out_w; l++) { - int x_w = (align_mode == 0 && !align_corners) - ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; - int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; - float d_e = 1.f - d_w; - { - vx_w[l] = x_w; - vx_e[l] = x_e; - vd_w[l] = d_w; - vd_e[l] = d_e; - } - } - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for collapse(4) -#endif - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - for (int k = 0; k < out_h; k++) { // loop for images - for (int l = 0; l < out_w; l++) { - // bilinear interpolation - T out_t; - if (data_layout == DataLayout::kNCHW) { - out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] + - input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] + - input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] + - input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l]; - output_t(i, j, k, l) = out_t; - - } else { - out_t = input_t(i, vy_n[k], vx_w[l], j) * vd_s[k] * vd_e[l] + - input_t(i, vy_s[k], vx_w[l], j) * vd_n[k] * vd_e[l] + - input_t(i, vy_n[k], vx_e[l], j) * vd_s[k] * vd_w[l] + - input_t(i, vy_s[k], vx_e[l], j) * vd_n[k] * vd_w[l]; - output_t(i, k, l, j) = out_t; - } - } - } - } - } -} - -template -static void TrilinearInterpolation( - const Tensor& input, Tensor* output, const float ratio_d, - const float ratio_h, const float ratio_w, const int in_d, const int in_h, - const int in_w, const int n, const int c, const int out_d, const int out_h, - const int out_w, const bool align_corners, const bool align_mode, - const DataLayout& data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - bool align_flag = (align_mode == 0 && !align_corners); - - std::vector vt_f, vt_b; - std::vector vd_f, vd_b; - vt_f.reserve(out_d); - vt_b.reserve(out_d); - vd_f.reserve(out_d); - vd_b.reserve(out_d); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int j = 0; j < out_d; j++) { - int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) - : static_cast(ratio_d * j); - t_f = (t_f > 0) ? t_f : 0; - int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); - float idx_src_t = ratio_d * (j + 0.5) - 0.5; - idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; - float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; - float d_b = 1.f - d_f; - { - vt_f[j] = t_f; - vt_b[j] = t_b; - vd_f[j] = d_f; - vd_b[j] = d_b; - } - } - - std::vector vy_n, vy_s; - std::vector vd_n, vd_s; - vy_n.reserve(out_h); - vy_s.reserve(out_h); - vd_n.reserve(out_h); - vd_s.reserve(out_h); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int k = 0; k < out_h; k++) { - int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) - : static_cast(ratio_h * k); - y_n = (y_n > 0) ? y_n : 0; - int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); - float idx_src_y = ratio_h * (k + 0.5) - 0.5; - idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; - float d_n = align_flag ? 
idx_src_y - y_n : ratio_h * k - y_n; - float d_s = 1.f - d_n; - { - vy_n[k] = y_n; - vy_s[k] = y_s; - vd_n[k] = d_n; - vd_s[k] = d_s; - } - } - - std::vector vx_w, vx_e; - std::vector vd_w, vd_e; - vx_w.reserve(out_w); - vx_e.reserve(out_w); - vd_w.reserve(out_w); - vd_e.reserve(out_w); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int l = 0; l < out_w; l++) { - int x_w = (align_mode == 0 && !align_corners) - ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; - int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; - float d_e = 1.f - d_w; - { - vx_w[l] = x_w; - vx_e[l] = x_e; - vd_w[l] = d_w; - vd_e[l] = d_e; - } - } - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for collapse(5) -#endif - for (int b = 0; b < n; b++) { // loop for batches - for (int i = 0; i < c; i++) { // loop for channels - for (int j = 0; j < out_d; j++) { // loop for D, H, W - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - // trilinear interpolation - if (data_layout == DataLayout::kNCHW) { - T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] * - vd_s[k] * vd_e[l] + - input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] * - vd_s[k] * vd_w[l] + - input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] * - vd_n[k] * vd_e[l] + - input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] * - vd_n[k] * vd_w[l] + - input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] * - vd_s[k] * vd_e[l] + - input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] * - vd_s[k] * vd_w[l] + - input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] * - vd_n[k] * vd_e[l] + - input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] * - vd_n[k] * vd_w[l]; - output_t(b, i, j, k, l) = out_t; - } else { - T out_t = input_t(b, vt_f[j], vy_n[k], vx_w[l], i) * vd_b[j] * - vd_s[k] * vd_e[l] + - input_t(b, vt_f[j], vy_n[k], vx_e[l], i) * vd_b[j] * - vd_s[k] * vd_w[l] + - input_t(b, vt_f[j], vy_s[k], vx_w[l], i) * vd_b[j] * - vd_n[k] * vd_e[l] + - input_t(b, vt_f[j], vy_s[k], vx_e[l], i) * vd_b[j] * - vd_n[k] * vd_w[l] + - input_t(b, vt_b[j], vy_n[k], vx_w[l], i) * vd_f[j] * - vd_s[k] * vd_e[l] + - input_t(b, vt_b[j], vy_n[k], vx_e[l], i) * vd_f[j] * - vd_s[k] * vd_w[l] + - input_t(b, vt_b[j], vy_s[k], vx_w[l], i) * vd_f[j] * - vd_n[k] * vd_e[l] + - input_t(b, vt_b[j], vy_s[k], vx_e[l], i) * vd_f[j] * - vd_n[k] * vd_w[l]; - output_t(b, j, k, l, i) = out_t; - } - } - } - } - } - } -} - -template -HOSTDEVICE inline T cubic_convolution1(T x, T A) { - return ((A + 2) * x - (A + 3)) * x * x + 1; -} - -template -HOSTDEVICE inline T cubic_convolution2(T x, T A) { - return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; -} - -template -HOSTDEVICE inline void get_cubic_upsample_coefficients(T coeffs[4], T t) { - T A = -0.75; - - T x1 = t; - coeffs[0] = cubic_convolution2(x1 + 1.0, A); - coeffs[1] = cubic_convolution1(x1, A); - - // opposite coefficients - T x2 = 1.0 - t; - coeffs[2] = cubic_convolution1(x2, A); - coeffs[3] = cubic_convolution2(x2 + 1.0, A); -} - -template -static inline T cubic_interp(T x0, T x1, T x2, T x3, T t) { - T coeffs[4]; - get_cubic_upsample_coefficients(coeffs, t); - - return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; -} - -template -static void BicubicInterpolation(const Tensor& input, Tensor* output, - const float ratio_h, const float ratio_w, - const int in_h, const int in_w, const int 
n, - const int c, const int out_h, const int out_w, - const bool align_corners, - const DataLayout data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - - for (int k = 0; k < out_h; k++) { // loop for images - T y_n = align_corners ? static_cast(ratio_h * k) - : static_cast(ratio_h * (k + 0.5) - 0.5); - int input_y = floorf(y_n); - const T y_t = y_n - input_y; - - for (int l = 0; l < out_w; l++) { - T x_n = align_corners ? static_cast(ratio_w * l) - : static_cast(ratio_w * (l + 0.5) - 0.5); - int input_x = floorf(x_n); - const T x_t = x_n - input_x; - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - T coefficients[4]; - // interp 4 times in x direction - for (int ii = 0; ii < 4; ii++) { - int access_y = std::max(std::min(input_y - 1 + ii, in_h - 1), - static_cast(0)); - int access_x_0 = - std::max(std::min(input_x - 1, in_w - 1), static_cast(0)); - int access_x_1 = - std::max(std::min(input_x + 0, in_w - 1), static_cast(0)); - int access_x_2 = - std::max(std::min(input_x + 1, in_w - 1), static_cast(0)); - int access_x_3 = - std::max(std::min(input_x + 2, in_w - 1), static_cast(0)); - if (data_layout == DataLayout::kNCHW) { - coefficients[ii] = - cubic_interp(input_t(i, j, access_y, access_x_0), - input_t(i, j, access_y, access_x_1), - input_t(i, j, access_y, access_x_2), - input_t(i, j, access_y, access_x_3), x_t); - } else { - coefficients[ii] = - cubic_interp(input_t(i, access_y, access_x_0, j), - input_t(i, access_y, access_x_1, j), - input_t(i, access_y, access_x_2, j), - input_t(i, access_y, access_x_3, j), x_t); - } - } - - // interp y direction - if (data_layout == DataLayout::kNCHW) { - output_t(i, j, k, l) = - cubic_interp(coefficients[0], coefficients[1], - coefficients[2], coefficients[3], y_t); - } else { - output_t(i, k, l, j) = - cubic_interp(coefficients[0], coefficients[1], - coefficients[2], coefficients[3], y_t); - } - } - } - } - } -} - -template -static void NearestNeighborInterpolateGrad( - const Tensor& output_grad, Tensor* input_grad, const float ratio_h, - const float ratio_w, const int n, const int c, const int out_h, - const int out_w, const bool align_corners, const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int k = 0; k < out_h; k++) { // loop for images - int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) - : static_cast(ratio_h * k); - - for (int l = 0; l < out_w; l++) { - int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) - : static_cast(ratio_w * l); - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - if (data_layout == DataLayout::kNCHW) { - input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l); - } else { - input_grad_t(i, in_k, in_l, j) += output_grad_t(i, k, l, j); - } - } - } - } - } -} - -template -static void NearestNeighbor3DInterpolateGrad( - const Tensor& output_grad, Tensor* input_grad, const float ratio_d, - const float ratio_h, const float ratio_w, const int n, const int c, - const int out_d, const int out_h, const int out_w, const bool align_corners, - const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int d = 0; d < out_d; d++) { - int in_d = (align_corners) ? 
static_cast(ratio_d * d + 0.5) - : static_cast(ratio_d * d); - for (int k = 0; k < out_h; k++) { // loop for images - int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) - : static_cast(ratio_h * k); - - for (int l = 0; l < out_w; l++) { - int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) - : static_cast(ratio_w * l); - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - if (data_layout == DataLayout::kNCHW) { - input_grad_t(i, j, in_d, in_k, in_l) += - output_grad_t(i, j, d, k, l); - } else { - input_grad_t(i, in_d, in_k, in_l, j) += - output_grad_t(i, d, k, l, j); - } - } - } - } - } - } -} - -template -static void BilinearInterpolationGrad( - const Tensor& output_grad, Tensor* input_grad, const float ratio_h, - const float ratio_w, const int in_h, const int in_w, const int n, - const int c, const int out_h, const int out_w, const bool align_corners, - const int align_mode, const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - bool align_flag = (align_mode == 0 && !align_corners); - for (int k = 0; k < out_h; k++) { // loop for images - int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) - : static_cast(ratio_h * k); - y_n = (y_n > 0) ? y_n : 0; - int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); - float idx_src_y = ratio_h * (k + 0.5) - 0.5; - idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; - float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; - float d_s = 1.f - d_n; - - for (int l = 0; l < out_w; l++) { - int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; - int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; - float d_e = 1.f - d_w; - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - // bilinear interpolation grad - if (data_layout == DataLayout::kNCHW) { - const T grad = output_grad_t(i, j, k, l); - input_grad_t(i, j, y_n, x_w) += static_cast(grad * d_s * d_e); - input_grad_t(i, j, y_s, x_w) += static_cast(grad * d_n * d_e); - input_grad_t(i, j, y_n, x_e) += static_cast(grad * d_s * d_w); - input_grad_t(i, j, y_s, x_e) += static_cast(grad * d_n * d_w); - } else { - const T grad = output_grad_t(i, k, l, j); - input_grad_t(i, y_n, x_w, j) += static_cast(grad * d_s * d_e); - input_grad_t(i, y_s, x_w, j) += static_cast(grad * d_n * d_e); - input_grad_t(i, y_n, x_e, j) += static_cast(grad * d_s * d_w); - input_grad_t(i, y_s, x_e, j) += static_cast(grad * d_n * d_w); - } - } - } - } - } -} - -template -static void TrilinearInterpolationGrad( - const Tensor& output_grad, Tensor* input_grad, const float ratio_d, - const float ratio_h, const float ratio_w, const int in_d, const int in_h, - const int in_w, const int n, const int c, const int out_d, const int out_h, - const int out_w, const bool align_corners, const int align_mode, - const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - bool align_flag = (align_mode == 0 && !align_corners); - for (int j = 0; j < out_d; j++) { // loop for D - int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) - : static_cast(ratio_d * j); - t_f = (t_f > 0) ? 
t_f : 0; - int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); - float idx_src_t = ratio_d * (j + 0.5) - 0.5; - idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; - float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; - float d_b = 1.f - d_f; - - for (int k = 0; k < out_h; k++) { // loop for H - int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) - : static_cast(ratio_h * k); - y_n = (y_n > 0) ? y_n : 0; - int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); - float idx_src_y = ratio_h * (k + 0.5) - 0.5; - idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; - float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; - float d_s = 1.f - d_n; - - for (int l = 0; l < out_w; l++) { // loop for W - int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; - int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; - float d_e = 1.f - d_w; - - for (int b = 0; b < n; b++) { // loop for batches - for (int i = 0; i < c; i++) { // loop for channels - // trilinear interpolation grad - if (data_layout == DataLayout::kNCHW) { - const T grad = output_grad_t(b, i, j, k, l); - input_grad_t(b, i, t_f, y_n, x_w) += - static_cast(grad * d_b * d_s * d_e); - input_grad_t(b, i, t_f, y_n, x_e) += - static_cast(grad * d_b * d_s * d_w); - input_grad_t(b, i, t_f, y_s, x_w) += - static_cast(grad * d_b * d_n * d_e); - input_grad_t(b, i, t_f, y_s, x_e) += - static_cast(grad * d_b * d_n * d_w); - input_grad_t(b, i, t_b, y_n, x_w) += - static_cast(grad * d_f * d_s * d_e); - input_grad_t(b, i, t_b, y_n, x_e) += - static_cast(grad * d_f * d_s * d_w); - input_grad_t(b, i, t_b, y_s, x_w) += - static_cast(grad * d_f * d_n * d_e); - input_grad_t(b, i, t_b, y_s, x_e) += - static_cast(grad * d_f * d_n * d_w); - } else { - const T grad = output_grad_t(b, j, k, l, i); - input_grad_t(b, t_f, y_n, x_w, i) += - static_cast(grad * d_b * d_s * d_e); - input_grad_t(b, t_f, y_n, x_e, i) += - static_cast(grad * d_b * d_s * d_w); - input_grad_t(b, t_f, y_s, x_w, i) += - static_cast(grad * d_b * d_n * d_e); - input_grad_t(b, t_f, y_s, x_e, i) += - static_cast(grad * d_b * d_n * d_w); - input_grad_t(b, t_b, y_n, x_w, i) += - static_cast(grad * d_f * d_s * d_e); - input_grad_t(b, t_b, y_n, x_e, i) += - static_cast(grad * d_f * d_s * d_w); - input_grad_t(b, t_b, y_s, x_w, i) += - static_cast(grad * d_f * d_n * d_e); - input_grad_t(b, t_b, y_s, x_e, i) += - static_cast(grad * d_f * d_n * d_w); - } - } - } - } - } - } -} - -template -static void BicubicInterpolationGrad(const Tensor& output_grad, - Tensor* input_grad, const float ratio_h, - const float ratio_w, const int in_h, - const int in_w, const int n, const int c, - const int out_h, const int out_w, - const bool align_corners, - const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int k = 0; k < out_h; k++) { // loop for images - T y_n = align_corners ? static_cast(ratio_h * k) - : static_cast(ratio_h * (k + 0.5) - 0.5); - int input_y = floorf(y_n); - T y_t = y_n - input_y; - - for (int l = 0; l < out_w; l++) { - T x_n = align_corners ? 
static_cast(ratio_w * l) - : static_cast(ratio_w * (l + 0.5) - 0.5); - int input_x = floorf(x_n); - T x_t = x_n - input_x; - - T x_coeffs[4]; - T y_coeffs[4]; - - get_cubic_upsample_coefficients(x_coeffs, x_t); - get_cubic_upsample_coefficients(y_coeffs, y_t); - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - // bicubic interpolation grad - for (int ii = 0; ii < 4; ii++) { - for (int jj = 0; jj < 4; jj++) { - int access_x = std::max(std::min(input_x - 1 + ii, in_w - 1), - static_cast(0)); - int access_y = std::max(std::min(input_y - 1 + jj, in_h - 1), - static_cast(0)); - if (data_layout == DataLayout::kNCHW) { - T grad = output_grad_t(i, j, k, l); - input_grad_t(i, j, access_y, access_x) += - grad * y_coeffs[jj] * x_coeffs[ii]; - } else { - T grad = output_grad_t(i, k, l, j); - input_grad_t(i, access_y, access_x, j) += - grad * y_coeffs[jj] * x_coeffs[ii]; - } - } - } - } - } - } - } -} - -template -static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_w = ctx.Attr("out_w"); - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - float scale_w = -1.; - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_w = new_size[0]; - } else { - // float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } else { - if (scale.size() > 0) { - scale_w = scale[0]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } - } - if (scale_w > 0.) { - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_w = out_size_data[0]; - } - } - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_w}; - } else { - dim_out = {n, out_w, c}; - } - output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_w = 0.f; - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - if ("linear" == interp_method) { - LinearInterpolation(input, output, ratio_w, in_w, n, c, out_w, - align_corners, align_mode, data_layout); - } -} - -template -static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_h = new_size[0]; - out_w = new_size[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) { - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_h, out_w}; - } else { - dim_out = {n, out_h, out_w, c}; - } - output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? 
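// The ratio computed here (and for every other axis in this file) follows a
// single convention: with align_corners the first and last sample of input and
// output coincide, so ratio = (in - 1) / (out - 1); otherwise ratio is
// 1 / scale when an explicit scale is known, or in / out. A worked example
// with hypothetical sizes in_h = 4, out_h = 8:
//   align_corners = true  -> ratio_h = 3 / 7 ~= 0.4286
//   align_corners = false -> ratio_h = 0.5, and with align_mode == 0 the
//   source row is sampled at ratio_h * (k + 0.5) - 0.5 (half-pixel centers).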
static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - if ("bilinear" == interp_method) { - BilinearInterpolation(input, output, ratio_h, ratio_w, in_h, in_w, n, c, - out_h, out_w, align_corners, align_mode, - data_layout); - } else if ("nearest" == interp_method) { - NearestNeighborInterpolate(input, output, ratio_h, ratio_w, n, c, out_h, - out_w, align_corners, data_layout); - } else if ("bicubic" == interp_method) { - BicubicInterpolation(input, output, ratio_h, ratio_w, in_h, in_w, n, c, - out_h, out_w, align_corners, data_layout); - } -} - -template -static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_d = ctx.Attr("out_d"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; - - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_d = new_size[0]; - out_h = new_size[1]; - out_w = new_size[2]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_d = scale_data[0]; - scale_h = scale_data[1]; - scale_w = scale_data[2]; - } else { - scale_d = scale_data[0]; - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } else { - if (scale.size() > 1) { - scale_d = scale[0]; - scale_h = scale[1]; - scale_w = scale[2]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } - } - if (scale_w > 0. && scale_h > 0. && scale_d > 0.) 
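// Output-size resolution in these forward helpers is, in order of precedence:
// the SizeTensor list (when present it alone decides), otherwise the Scale
// tensor / scale attribute (out_* = in_* * scale_*), with an OutSize tensor
// overwriting the scaled result when it is fed at runtime. Hypothetically,
// in_d/h/w = 4/8/8 with scale = {2, 2, 2} resolves to out_d/h/w = 8/16/16
// unless OutSize supplies explicit values.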
{ - out_d = static_cast(in_d * scale_d); - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_d = out_size_data[0]; - out_h = out_size_data[1]; - out_w = out_size_data[2]; - } - } - PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument( - "out_d in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_d, out_h, out_w}; - } else { - dim_out = {n, out_d, out_h, out_w, c}; - } - - output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_d == out_d && in_h == out_h && in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_d = 0.f; - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_d > 1) { - float new_scale_d = 0.f; - new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) - : static_cast(in_d) / out_d; - ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(new_scale_d); - } - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - if ("trilinear" == interp_method) { - TrilinearInterpolation(input, output, ratio_d, ratio_h, ratio_w, in_d, - in_h, in_w, n, c, out_d, out_h, out_w, - align_corners, align_mode, data_layout); - } else if ("nearest" == interp_method) { - NearestNeighbor3DInterpolate(input, output, ratio_d, ratio_h, ratio_w, n, - c, out_d, out_h, out_w, align_corners, - data_layout); - } -} - -template -static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, const Tensor& output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_w = ctx.Attr("out_w"); - float scale_w = -1.0; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } else { - if (scale.size() > 0) { - scale_w = scale[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } - } - if (scale_w > 0.) { - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_w = out_size_data[0]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_w = new_size[0]; - } - - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_w}; - } else { - dim_grad = {n, in_w, c}; - } - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_w = 0.f; - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
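// The CPU backward helpers that start here all share one skeleton: resize
// X@GRAD to the *input* shape, zero-fill it, copy Out@GRAD straight through
// when the sizes already match, and otherwise scatter-add every output
// gradient back to its source locations using the same ratios as the forward
// pass. A minimal sketch of that skeleton (names abbreviated):
//   input_grad->mutable_data<T>(dim_of_input, place);
//   phi::funcs::SetConstant<DeviceContext, T>()(dev_ctx, input_grad, T(0));
//   if (same_size) { TensorCopy(output_grad, place, input_grad); return; }
//   <Method>InterpolationGrad(output_grad, input_grad, ratios..., ...);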
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - if ("linear" == interp_method) { - LinearInterpolationGrad(output_grad, input_grad, ratio_w, in_w, n, c, - out_w, align_corners, align_mode, data_layout); - } -} - -template -static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, const Tensor& output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_w = scale_data[0]; - scale_h = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) { - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_h = new_size[0]; - out_w = new_size[1]; - } - - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_h, in_w}; - } else { - dim_grad = {n, in_h, in_w, c}; - } - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. 
/ scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - if ("bilinear" == interp_method) { - BilinearInterpolationGrad(output_grad, input_grad, ratio_h, ratio_w, - in_h, in_w, n, c, out_h, out_w, align_corners, - align_mode, data_layout); - } else if ("nearest" == interp_method) { - NearestNeighborInterpolateGrad(output_grad, input_grad, ratio_h, ratio_w, - n, c, out_h, out_w, align_corners, - data_layout); - } else if ("bicubic" == interp_method) { - BicubicInterpolationGrad(output_grad, input_grad, ratio_h, ratio_w, in_h, - in_w, n, c, out_h, out_w, align_corners, - data_layout); - } -} - -template -static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, const Tensor output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_d = ctx.Attr("out_d"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_d = scale_data[0]; - scale_h = scale_data[1]; - scale_w = scale_data[2]; - } else { - scale_d = scale_data[0]; - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } else { - if (scale.size() > 1) { - scale_d = scale[0]; - scale_h = scale[1]; - scale_w = scale[2]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } - } - if (scale_d > 0. && scale_h > 0. && scale_w > 0.) 
{ - out_d = static_cast(in_d * scale_d); - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_d = out_size_data[0]; - out_h = out_size_data[1]; - out_w = out_size_data[2]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_d = new_size[0]; - out_h = new_size[1]; - out_w = new_size[2]; - } - - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_d, in_h, in_w}; - } else { - dim_grad = {n, in_d, in_h, in_w, c}; - } - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_d == out_d && in_h == out_h && in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_d = 0.f; - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_d > 1) { - float new_scale_d = 0.f; - new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) - : static_cast(in_d) / out_d; - ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(new_scale_d); - } - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
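// The guard pattern around this assignment matters for every axis: when out_w
// (or out_h, out_d) equals 1 the ratio is simply left at 0, so each output
// element maps to source index 0 and the (out - 1) denominator of the
// align_corners branch is never evaluated as a division by zero.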
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - if ("trilinear" == interp_method) { - TrilinearInterpolationGrad( - output_grad, input_grad, ratio_d, ratio_h, ratio_w, in_d, in_h, in_w, n, - c, out_d, out_h, out_w, align_corners, align_mode, data_layout); - } else if ("nearest" == interp_method) { - NearestNeighbor3DInterpolateGrad(output_grad, input_grad, ratio_d, - ratio_h, ratio_w, n, c, out_d, out_h, - out_w, align_corners, data_layout); - } -} - -template -class InterpolateV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - - auto input_dims = input->dims(); - if (input_dims.size() == 3) { // 1D interpolation - Interpolate1DCPUFwd(ctx, *input, output); - } else if (input_dims.size() == 4) { // 2D interpolation - Interpolate2DCPUFwd(ctx, *input, output); - } else if (input_dims.size() == 5) { // 3D interpolation - Interpolate3DCPUFwd(ctx, *input, output); - } - } -}; - -template -class InterpolateV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - - auto output_grad_dims = output_grad->dims(); - if (output_grad_dims.size() == 3) { // 1D interpolation grad - Interpolate1DCPUBwd(ctx, input_grad, *output_grad); - } else if (output_grad_dims.size() == 4) { // 2D interpolation grad - Interpolate2DCPUBwd(ctx, input_grad, *output_grad); - } else if (output_grad_dims.size() == 5) { // 3D interpolation grad - Interpolate3DCPUBwd(ctx, input_grad, *output_grad); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc index bf29c2aabb..615b5ea142 100644 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/interpolate_v2_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/funcs/interpolate_function.h" + namespace paddle { namespace operators { @@ -401,7 +403,8 @@ class InterpolateV2NPUKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; - ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w); + phi::funcs::ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, + &in_w); auto interp_method = ctx.Attr("interp_method"); bool align_corners = ctx.Attr("align_corners"); @@ -431,14 +434,15 @@ class InterpolateV2NPUKernel : public framework::OpKernel { out_w = output_w[0]; } else if (ctx.HasInput("OutSize")) { auto out_size = ctx.Input("OutSize"); - auto out_size_data = get_new_data_from_tensor(out_size); + auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } else { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); + auto scale_data = + phi::funcs::get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; @@ -538,7 +542,8 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + phi::funcs::ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, + &in_w); auto interp_method = ctx.Attr("interp_method"); bool align_corners = ctx.Attr("align_corners"); @@ -567,14 +572,15 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel { out_w = output_w[0]; } else if (ctx.HasInput("OutSize")) { auto out_size = ctx.Input("OutSize"); - auto out_size_data = get_new_data_from_tensor(out_size); + auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } else { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); + auto scale_data = + phi::funcs::get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; diff --git a/paddle/fluid/operators/interpolate_v2_op_xpu.cc b/paddle/fluid/operators/interpolate_v2_op_xpu.cc index 850dbe025b..9cbfc95158 100644 --- a/paddle/fluid/operators/interpolate_v2_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_xpu.cc @@ -14,8 +14,7 @@ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/interpolate_v2_op.h" - +#include "paddle/phi/kernels/funcs/interpolate_function.h" #ifdef PADDLE_WITH_XPU namespace paddle { @@ -57,7 +56,8 @@ class InterpolateV2XPUKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; - ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w); + phi::funcs::ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, + &in_w); auto interp_method = ctx.Attr("interp_method"); bool align_corners = ctx.Attr("align_corners"); @@ -78,7 +78,8 @@ class InterpolateV2XPUKernel : public framework::OpKernel { auto scale_tensor = 
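// The NPU and XPU kernels in this patch keep their fluid-style bodies; the
// patch only repoints their helpers, because ExtractNCDWH,
// get_new_data_from_tensor and get_new_shape now live in
// paddle/phi/kernels/funcs/interpolate_function.h (namespace phi::funcs)
// instead of the removed interpolate_v2_op.h.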
ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); + auto scale_data = + phi::funcs::get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; @@ -107,7 +108,8 @@ class InterpolateV2XPUKernel : public framework::OpKernel { } auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); + auto out_size_data = + phi::funcs::get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } @@ -169,7 +171,8 @@ class InterpolateV2GradXPUKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + phi::funcs::ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, + &in_w); auto interp_method = ctx.Attr("interp_method"); bool align_corners = ctx.Attr("align_corners"); @@ -190,7 +193,8 @@ class InterpolateV2GradXPUKernel : public framework::OpKernel { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); + auto scale_data = + phi::funcs::get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; @@ -219,7 +223,8 @@ class InterpolateV2GradXPUKernel : public framework::OpKernel { } auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); + auto out_size_data = + phi::funcs::get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 41bc6bb47c..ea54083e81 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -179,6 +179,43 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, return config; } +static inline int GetLastPow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +inline GpuLaunchConfig GetGpuLaunchConfig3D(const phi::GPUContext& context, + int num_img, + int height, + int width) { + const int kThreadsPerBlock = 256; + int max_threads_per_block = context.GetMaxThreadsPerBlock(); // 1024 + int max_threads = std::min(kThreadsPerBlock, max_threads_per_block); + + int block_x = std::min(GetLastPow2(width), max_threads); + int block_y = std::min(GetLastPow2(height), max_threads / block_x); + int block_z = std::min(num_img, max_threads / block_x / block_y); + + auto max_grid_dim = context.GetCUDAMaxGridDimSize(); + int grid_x = + std::min(max_grid_dim[0], backends::gpu::DivUp(width, block_x)); + int grid_y = + std::min(max_grid_dim[1], backends::gpu::DivUp(height, block_y)); + int grid_z = std::min(max_grid_dim[2], + backends::gpu::DivUp(num_img, block_z * 4)); + + const int capability = context.GetComputeCapability(); + GpuLaunchConfig config; + config.compute_capability = capability; + config.thread_per_block = dim3(block_x, block_y, block_z); + config.block_per_grid = dim3(grid_x, grid_y, grid_z); + return config; +} + } // namespace gpu } // namespace backends } // namespace phi diff --git a/paddle/phi/core/infermeta_utils.cc 
b/paddle/phi/core/infermeta_utils.cc index 671ba2ec7d..0496d727e8 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -87,6 +87,23 @@ std::vector InferMetaContext::InputsBetween(size_t start, return result; } +paddle::optional> +InferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { + const auto& first = inputs_.at(start); + + if (first) { + std::vector result; + result.reserve(end - start); + + for (size_t i = start; i < end; ++i) { + result.push_back(inputs_.at(i).get()); + } + + return paddle::optional>(result); + } + return paddle::optional>(paddle::none); +} + MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) { return outputs_.at(idx).get(); } diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 0278e444e2..fad437f82c 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -54,6 +54,8 @@ class InferMetaContext { const MetaTensor& InputAt(size_t idx) const; paddle::optional OptionalInputAt(size_t idx) const; std::vector InputsBetween(size_t start, size_t end) const; + paddle::optional> + OptionalInputsBetween(size_t start, size_t end) const; MetaTensor* MutableOutputAt(size_t idx); std::vector MutableOutputBetween(size_t start, size_t end); @@ -174,6 +176,26 @@ struct InferMetaFnImpl { } }; + template + struct InferMetaFnCallHelper< + paddle::optional>, + Tail...> { + template + static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { + static_assert(attr_idx == 0, + "InferMeta's Input should appear before Attributes."); + static_assert(out_idx == 0, + "InferMeta's Input should appear before Outputs."); + const std::pair range = ctx->InputRangeAt(in_idx); + paddle::optional> arg = + ctx->OptionalInputsBetween(range.first, range.second); + InferMetaFnCallHelper< + Tail...>::template Call(ctx, + pargs..., + arg); + } + }; + // TODO(chenweihang): support other attr type later PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(bool); PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int); diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index d3ca1ffc61..ab4e044e62 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -97,6 +97,22 @@ class KernelContext { return v; } + template + paddle::optional> OptionalInputsBetween( + size_t start, size_t end) { + const auto& first = inputs_.at(start); + + if (first) { + std::vector v; + for (size_t i = start; i < end; ++i) { + auto* t = static_cast(inputs_.at(i)); + v.emplace_back(t); + } + return paddle::optional>(v); + } + return paddle::optional>(paddle::none); + } + template TensorType* MutableOutputAt(size_t idx) { return static_cast(outputs_.at(idx)); diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index fac4b1e827..b18fd9e05f 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -81,6 +81,13 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid( + paddle::optional< + const std::vector>))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else if (arg_type == std::type_index(typeid( paddle::optional))) { args_def->AppendInput(default_key.backend(), diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 5317288a2a..55574ea03a 100644 --- a/paddle/phi/core/kernel_utils.h +++ 
b/paddle/phi/core/kernel_utils.h @@ -126,6 +126,30 @@ namespace phi { } \ } +#define PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_MULTI_INPUT(tensor_type) \ + template \ + struct KernelCallHelper< \ + paddle::optional>, \ + Tail...> { \ + template \ + static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + const std::pair range = ctx->InputRangeAt(in_idx); \ + paddle::optional> arg = \ + ctx->OptionalInputsBetween(range.first, range.second); \ + KernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + #define PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \ template \ struct KernelCallHelper { \ @@ -224,6 +248,7 @@ struct KernelImpl { PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); + PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_MULTI_INPUT(DenseTensor); PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index c6940492ce..1f6cf1a688 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -890,6 +890,506 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +static void Interpolate1DInferShapeCheck( + const MetaTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + MetaTensor* output, + MetaConfig config) { + auto dim_x = x.dims(); + + PADDLE_ENFORCE_EQ("linear", + interp_method, + phi::errors::InvalidArgument( + "Interpolation method can only be \"linear\" when" + "Input(X) dimension is 3, but got method = %s .", + interp_method)); + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + for (int i = 0; i < dim_x.size(); ++i) { + PADDLE_ENFORCE_NE( + dim_x[i], + 0, + phi::errors::InvalidArgument("The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, + dim_x[i])); + } + if (size_tensor && size_tensor->size() > 0) { + // top prority size + auto inputs_name = size_tensor.get(); + PADDLE_ENFORCE_EQ( + inputs_name.size(), + 1, + phi::errors::InvalidArgument( + "Input(SizeTensor)'size of Op(interpolate) must be 1. 
" + "Attr(out_shape)'s length must be 1 for 3-D input tensor, but got " + "size = %d .", + inputs_name.size())); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_w}; + } else { + dim_out = {dim_x[0], out_w, dim_x[2]}; + } + output->set_dims(dim_out); + output->set_dtype(x.dtype()); + + return; + } + + int out_w_tmp; + if (scale_tensor) { + auto scale_tensor_dim = scale_tensor->dims(); + PADDLE_ENFORCE_EQ( + scale_tensor_dim.size(), + 1, + phi::errors::InvalidArgument( + "Scale's dimension size must be 1, but got dimension = %d .", + scale_tensor_dim.size())); + PADDLE_ENFORCE_EQ(scale_tensor_dim[0], + 1, + phi::errors::InvalidArgument( + "Scale's shape must be 1, but got shape = %d .", + scale_tensor_dim[0])); + out_w_tmp = -1; + } else { + if (scale.size() > 0) { + float scale_w = -1; + scale_w = scale[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + phi::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + if (scale_w > 0.) { + // round down + out_w_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[2] * scale_w) + : static_cast(dim_x[1] * scale_w)); + // protect when input shape is -1 + out_w_tmp = out_w_tmp > 0 ? out_w_tmp : -1; + } + } else { + out_w_tmp = out_w; + } + } + + if (out_size && config.is_runtime) { + auto out_size_dim = out_size->dims(); + PADDLE_ENFORCE_EQ( + out_size_dim.size(), + 1, + phi::errors::InvalidArgument( + "OutSize's dimension size must be 1, but got dimention = %d .", + out_size_dim.size())); + PADDLE_ENFORCE_EQ( + out_size_dim[0], + 1, + phi::errors::InvalidArgument( + "OutSize's 0-th dimension's value must be 1, but got value = %d .", + out_size_dim[0])); + + // dims will be seted in kernel + output->set_dtype(x.dtype()); + output->share_lod(x); + return; + } + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_w_tmp}; + } else { + dim_out = {dim_x[0], out_w_tmp, dim_x[2]}; + } + output->set_dims(dim_out); + output->set_dtype(x.dtype()); +} + +static void Interpolate2DInferShapeCheck( + const MetaTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + MetaTensor* output, + MetaConfig config) { + auto dim_x = x.dims(); + + PADDLE_ENFORCE( + "bilinear" == interp_method || "nearest" == interp_method || + "bicubic" == interp_method, + phi::errors::InvalidArgument( + "Interpolation method can only be \"bilinear\" or \"nearest\" when " + "Input(X) dimension is 4, but got method = %s.", + interp_method)); + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + for (int i = 0; i < dim_x.size(); ++i) { + PADDLE_ENFORCE_NE( + dim_x[i], + 0, + phi::errors::InvalidArgument("The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, + dim_x[i])); + } + + if (size_tensor && size_tensor->size()) { + // top prority size + auto inputs_name = size_tensor.get(); + PADDLE_ENFORCE_EQ( + inputs_name.size(), + 2, + phi::errors::InvalidArgument( + "Input(SizeTensor)'size of Op(interpolate) must be 2. 
" + "Attr(out_shape)'s length must be 2 for 4-D input " + "tensor, but got size = %d .", + inputs_name.size())); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_h, out_w}; + } else { + dim_out = {dim_x[0], out_h, out_w, dim_x[3]}; + } + output->set_dims(dim_out); + output->set_dtype(x.dtype()); + + return; + } + + int out_h_tmp, out_w_tmp; + if (scale_tensor) { + auto scale_tensor_dim = scale_tensor->dims(); + PADDLE_ENFORCE_EQ( + scale_tensor_dim.size(), + 1, + phi::errors::InvalidArgument( + "Scale's dimension size must be 1, but got dimension = %d .", + scale_tensor_dim.size())); + PADDLE_ENFORCE_EQ(scale_tensor_dim[0] == 2 || scale_tensor_dim[0] == 1, + true, + phi::errors::InvalidArgument( + "Scale's shape must be 2 or 1, but got shape = %d .", + scale_tensor_dim[0])); + out_h_tmp = -1; + out_w_tmp = -1; + } else { + if (scale.size() > 0) { + float scale_h = -1; + float scale_w = -1; + scale_h = scale[0]; + scale_w = scale[1]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + phi::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + phi::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + if (scale_h > 0. && scale_w > 0.) { + // round down + out_h_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[2] * scale_h) + : static_cast(dim_x[1] * scale_h)); + out_w_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[3] * scale_w) + : static_cast(dim_x[2] * scale_w)); + // protect when input shape is -1 + out_h_tmp = out_h_tmp > 0 ? out_h_tmp : -1; + out_w_tmp = out_w_tmp > 0 ? 
out_w_tmp : -1; + } + } else { + out_h_tmp = out_h; + out_w_tmp = out_w; + } + } + + if (out_size && config.is_runtime) { + auto out_size_dim = out_size->dims(); + PADDLE_ENFORCE_EQ( + out_size_dim.size(), + 1, + phi::errors::InvalidArgument( + "OutSize's dimension size must be 1, but got dimension = %d .", + out_size_dim.size())); + PADDLE_ENFORCE_EQ( + out_size_dim[0], + 2, + phi::errors::InvalidArgument( + "OutSize's dim[0] must be 2, but got dimention = %d .", + out_size_dim[0])); + // dims will be seted in kernel + output->set_dtype(x.dtype()); + output->share_lod(x); + return; + } + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_h_tmp, out_w_tmp}; + } else { + dim_out = {dim_x[0], out_h_tmp, out_w_tmp, dim_x[3]}; + } + + output->set_dims(dim_out); + output->set_dtype(x.dtype()); +} + +static void Interpolate3DInferShapeCheck( + const MetaTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + MetaTensor* output, + MetaConfig config) { + auto dim_x = x.dims(); + + PADDLE_ENFORCE("nearest" == interp_method || "trilinear" == interp_method, + phi::errors::InvalidArgument( + "Interpolation method can only be \"trilinear\" or " + "\"nearest\" when Input(X) " + "dimension is 5, but got method = %s .", + interp_method)); + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + for (int i = 0; i < dim_x.size(); ++i) { + PADDLE_ENFORCE_NE( + dim_x[i], + 0, + phi::errors::InvalidArgument("The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, + dim_x[i])); + } + + if (size_tensor && size_tensor->size() > 0) { + // top prority size + auto inputs_name = size_tensor.get(); + PADDLE_ENFORCE_EQ( + inputs_name.size(), + 3, + phi::errors::InvalidArgument( + "Input(SizeTensor)'s size of Op(interpolate) must be 3. 
" + "Attr(out_shape)'s length must be 3 for 5-D input " + "tensor, but got size = %d .", + inputs_name.size())); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w}; + } else { + dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]}; + } + output->set_dims(dim_out); + output->set_dtype(x.dtype()); + return; + } + + int out_d_tmp, out_h_tmp, out_w_tmp; + if (scale_tensor) { + auto scale_tensor_dim = scale_tensor->dims(); + PADDLE_ENFORCE_EQ( + scale_tensor_dim.size(), + 1, + phi::errors::InvalidArgument( + "Scale's dimension size must be 1, but got size = %d .", + scale_tensor_dim.size())); + PADDLE_ENFORCE_EQ(scale_tensor_dim[0] == 3 || scale_tensor_dim[0] == 1, + true, + phi::errors::InvalidArgument( + "Scale's shape must be 3 or 1, but got shape = %d .", + scale_tensor_dim[0])); + out_d_tmp = -1; + out_h_tmp = -1; + out_w_tmp = -1; + } else { + if (scale.size() > 0) { + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + phi::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + phi::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + phi::errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { + // round down + out_d_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[2] * scale_d) + : static_cast(dim_x[1] * scale_d)); + out_h_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[3] * scale_h) + : static_cast(dim_x[2] * scale_h)); + out_w_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[4] * scale_w) + : static_cast(dim_x[3] * scale_w)); + // protect when input shape is -1 + out_d_tmp = out_d_tmp > 0 ? out_d_tmp : -1; + out_h_tmp = out_h_tmp > 0 ? out_h_tmp : -1; + out_w_tmp = out_w_tmp > 0 ? 
out_w_tmp : -1; + } + } else { + out_d_tmp = out_d; + out_h_tmp = out_h; + out_w_tmp = out_w; + } + } + + if (out_size && config.is_runtime) { + auto out_size_dim = out_size->dims(); + PADDLE_ENFORCE_EQ( + out_size_dim.size(), + 1, + phi::errors::InvalidArgument( + "OutSize's dimension size must be 1, but got size is %d.", + out_size_dim.size())); + PADDLE_ENFORCE_EQ(out_size_dim[0], + 3, + phi::errors::InvalidArgument( + "OutSize's dim[0] must be 3, but got size is %d.", + out_size_dim[0])); + // dims will be seted in kernel + output->set_dtype(x.dtype()); + output->share_lod(x); + return; + } + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_d_tmp, out_h_tmp, out_w_tmp}; + } else { + dim_out = {dim_x[0], out_d_tmp, out_h_tmp, out_w_tmp, dim_x[4]}; + } + output->set_dims(dim_out); + output->set_dtype(x.dtype()); +} + +void InterpolateInferMeta( + const MetaTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + MetaTensor* output, + MetaConfig config) { + auto dim_x = x.dims(); // NCHW format + PADDLE_ENFORCE( + dim_x.size() == 3 || dim_x.size() == 4 || dim_x.size() == 5, + phi::errors::Unimplemented( + "Input(X) dimension must be 3, 4 or 5, but got dimension = %d .", + dim_x.size())); + if (dim_x.size() == 3) { + // shape check for 1D interpolate for input tensor shape NCHW + Interpolate1DInferShapeCheck(x, + out_size, + size_tensor, + scale_tensor, + data_layout_str, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output, + config); + } else if (dim_x.size() == 4) { + // shape check for 2D interpolate for input tensor shape NCHW + Interpolate2DInferShapeCheck(x, + out_size, + size_tensor, + scale_tensor, + data_layout_str, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output, + config); + } else { // dim_x.size() == 5 + // shape check for 3D interpolate for input tensor shape NCDHW + Interpolate3DInferShapeCheck(x, + out_size, + size_tensor, + scale_tensor, + data_layout_str, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output, + config); + } +} + void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { auto inputs_dims = GetMetaTensorsDim(x); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 4a8020aefc..b748d898c1 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -199,6 +199,22 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x, MetaTensor* pre_out, MetaTensor* w_out); +void InterpolateInferMeta( + const MetaTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + MetaTensor* output, + MetaConfig config = MetaConfig()); + void MultiDotInferMeta(const std::vector& x, MetaTensor* out); void MultiplexInferMeta(const std::vector& ins, diff --git a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc new file mode 100644 index 0000000000..550439a525 --- /dev/null +++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc @@ -0,0 +1,1067 @@ +// 
Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/interpolate_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/interpolate_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +static void LinearInterpolationGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_w, + const int in_w, + const int n, + const int c, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); + for (int l = 0; l < out_w; l++) { + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; // w + int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w; // w_id + + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; // w1lambda + float d_e = 1.f - d_w; // w2lambda + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + // linear interpolation grad + if (data_layout == DataLayout::kNCHW) { + const T grad = output_grad_t(i, j, l); + input_grad_t(i, j, x_w) += static_cast(grad * d_e); + input_grad_t(i, j, x_e) += static_cast(grad * d_w); + } else { + const T grad = output_grad_t(i, l, j); + input_grad_t(i, x_w, j) += static_cast(grad * d_e); + input_grad_t(i, x_e, j) += static_cast(grad * d_w); + } + } + } + } +} + +template +static void BilinearInterpolationGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_h, + const float ratio_w, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); + for (int k = 0; k < out_h; k++) { // loop for images + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + + for (int l = 0; l < out_w; l++) { + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? 
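// These CPU grad kernels are the transpose of the forward interpolation: the
// forward op reads neighbouring input points with weights d_e / d_w (and
// d_s / d_n), and the backward op adds the incoming gradient to the same
// points with the same weights. For the 1-D linear case above, a hypothetical
// d_w = 0.3 sends 0.7 * grad to x_w and 0.3 * grad to x_e.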
(x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + // bilinear interpolation grad + if (data_layout == DataLayout::kNCHW) { + const T grad = output_grad_t(i, j, k, l); + input_grad_t(i, j, y_n, x_w) += static_cast(grad * d_s * d_e); + input_grad_t(i, j, y_s, x_w) += static_cast(grad * d_n * d_e); + input_grad_t(i, j, y_n, x_e) += static_cast(grad * d_s * d_w); + input_grad_t(i, j, y_s, x_e) += static_cast(grad * d_n * d_w); + } else { + const T grad = output_grad_t(i, k, l, j); + input_grad_t(i, y_n, x_w, j) += static_cast(grad * d_s * d_e); + input_grad_t(i, y_s, x_w, j) += static_cast(grad * d_n * d_e); + input_grad_t(i, y_n, x_e, j) += static_cast(grad * d_s * d_w); + input_grad_t(i, y_s, x_e, j) += static_cast(grad * d_n * d_w); + } + } + } + } + } +} + +template +static void NearestNeighborInterpolateGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_h, + const float ratio_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int k = 0; k < out_h; k++) { // loop for images + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l); + } else { + input_grad_t(i, in_k, in_l, j) += output_grad_t(i, k, l, j); + } + } + } + } + } +} + +template +static void BicubicInterpolationGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_h, + const float ratio_w, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int k = 0; k < out_h; k++) { // loop for images + T y_n = align_corners ? static_cast(ratio_h * k) + : static_cast(ratio_h * (k + 0.5) - 0.5); + int input_y = floorf(y_n); + T y_t = y_n - input_y; + + for (int l = 0; l < out_w; l++) { + T x_n = align_corners ? 
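// Bicubic backward spreads each output gradient over a 4x4 input window. The
// per-axis weights come from funcs::get_cubic_upsample_coefficients, i.e. the
// cubic-convolution kernel (commonly parameterised with a = -0.75); the four
// coefficients for any fractional offset sum to 1, and out-of-range taps are
// clamped to the border by the std::max / std::min below.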
static_cast(ratio_w * l) + : static_cast(ratio_w * (l + 0.5) - 0.5); + int input_x = floorf(x_n); + T x_t = x_n - input_x; + + T x_coeffs[4]; + T y_coeffs[4]; + + funcs::get_cubic_upsample_coefficients(x_coeffs, x_t); + funcs::get_cubic_upsample_coefficients(y_coeffs, y_t); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + // bicubic interpolation grad + for (int ii = 0; ii < 4; ii++) { + for (int jj = 0; jj < 4; jj++) { + int access_x = std::max(std::min(input_x - 1 + ii, in_w - 1), + static_cast(0)); + int access_y = std::max(std::min(input_y - 1 + jj, in_h - 1), + static_cast(0)); + if (data_layout == DataLayout::kNCHW) { + T grad = output_grad_t(i, j, k, l); + input_grad_t(i, j, access_y, access_x) += + grad * y_coeffs[jj] * x_coeffs[ii]; + } else { + T grad = output_grad_t(i, k, l, j); + input_grad_t(i, access_y, access_x, j) += + grad * y_coeffs[jj] * x_coeffs[ii]; + } + } + } + } + } + } + } +} + +template +static void TrilinearInterpolationGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const int in_d, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_d, + const int out_h, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); + for (int j = 0; j < out_d; j++) { // loop for D + int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) + : static_cast(ratio_d * j); + t_f = (t_f > 0) ? t_f : 0; + int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); + float idx_src_t = ratio_d * (j + 0.5) - 0.5; + idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; + float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; + float d_b = 1.f - d_f; + + for (int k = 0; k < out_h; k++) { // loop for H + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + + for (int l = 0; l < out_w; l++) { // loop for W + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + + for (int b = 0; b < n; b++) { // loop for batches + for (int i = 0; i < c; i++) { // loop for channels + // trilinear interpolation grad + if (data_layout == DataLayout::kNCHW) { + const T grad = output_grad_t(b, i, j, k, l); + input_grad_t(b, i, t_f, y_n, x_w) += + static_cast(grad * d_b * d_s * d_e); + input_grad_t(b, i, t_f, y_n, x_e) += + static_cast(grad * d_b * d_s * d_w); + input_grad_t(b, i, t_f, y_s, x_w) += + static_cast(grad * d_b * d_n * d_e); + input_grad_t(b, i, t_f, y_s, x_e) += + static_cast(grad * d_b * d_n * d_w); + input_grad_t(b, i, t_b, y_n, x_w) += + static_cast(grad * d_f * d_s * d_e); + input_grad_t(b, i, t_b, y_n, x_e) += + static_cast(grad * d_f * d_s * d_w); + input_grad_t(b, i, t_b, y_s, x_w) += + static_cast(grad * d_f * d_n * d_e); + input_grad_t(b, i, t_b, y_s, x_e) += + static_cast(grad * d_f * d_n * d_w); + } else { + const T grad = output_grad_t(b, j, k, l, i); + input_grad_t(b, t_f, y_n, x_w, i) += + static_cast(grad * d_b * d_s * d_e); + input_grad_t(b, t_f, y_n, x_e, i) += + static_cast(grad * d_b * d_s * d_w); + input_grad_t(b, t_f, y_s, x_w, i) += + static_cast(grad * d_b * d_n * d_e); + input_grad_t(b, t_f, y_s, x_e, i) += + static_cast(grad * d_b * d_n * d_w); + input_grad_t(b, t_b, y_n, x_w, i) += + static_cast(grad * d_f * d_s * d_e); + input_grad_t(b, t_b, y_n, x_e, i) += + static_cast(grad * d_f * d_s * d_w); + input_grad_t(b, t_b, y_s, x_w, i) += + static_cast(grad * d_f * d_n * d_e); + input_grad_t(b, t_b, y_s, x_e, i) += + static_cast(grad * d_f * d_n * d_w); + } + } + } + } + } + } +} + +template +static void NearestNeighbor3DInterpolateGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const int n, + const int c, + const int out_d, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int d = 0; d < out_d; d++) { + int in_d = (align_corners) ? static_cast(ratio_d * d + 0.5) + : static_cast(ratio_d * d); + for (int k = 0; k < out_h; k++) { // loop for images + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? 
static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + input_grad_t(i, j, in_d, in_k, in_l) += + output_grad_t(i, j, d, k, l); + } else { + input_grad_t(i, in_d, in_k, in_l, j) += + output_grad_t(i, d, k, l, j); + } + } + } + } + } + } +} + +template +static void Interpolate1DCPUBwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1.0; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } + } + if (scale_w > 0.) { + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_w = out_size_data[0]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_w = new_size[0]; + } + + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_w}; + } else { + dim_grad = {n, in_w, c}; + } + + input_grad->Resize(dim_grad); + dev_ctx.template Alloc(input_grad); + + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + if ("linear" == interp_method) { + LinearInterpolationGrad(output_grad, + input_grad, + ratio_w, + in_w, + n, + c, + out_w, + align_corners, + align_mode, + data_layout); + } +} + +template +static void Interpolate2DCPUBwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_h = -1; + float scale_w = -1; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_w = scale_data[0]; + scale_h = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } else { + if (scale.size() > 1) { + scale_h = scale[0]; + scale_w = scale[1]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } + } + if (scale_h > 0. && scale_w > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_h = new_size[0]; + out_w = new_size[1]; + } + + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_h, in_w}; + } else { + dim_grad = {n, in_h, in_w, c}; + } + + input_grad->Resize(dim_grad); + dev_ctx.template Alloc(input_grad); + + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + if ("bilinear" == interp_method) { + BilinearInterpolationGrad(output_grad, + input_grad, + ratio_h, + ratio_w, + in_h, + in_w, + n, + c, + out_h, + out_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + NearestNeighborInterpolateGrad(output_grad, + input_grad, + ratio_h, + ratio_w, + n, + c, + out_h, + out_w, + align_corners, + data_layout); + } else if ("bicubic" == interp_method) { + BicubicInterpolationGrad(output_grad, + input_grad, + ratio_h, + ratio_w, + in_h, + in_w, + n, + c, + out_h, + out_w, + align_corners, + data_layout); + } +} + +template +static void Interpolate3DCPUBwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } + } + if (scale_d > 0. && scale_h > 0. && scale_w > 0.) 
{ + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_d = out_size_data[0]; + out_h = out_size_data[1]; + out_w = out_size_data[2]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } + + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_d, in_h, in_w}; + } else { + dim_grad = {n, in_d, in_h, in_w, c}; + } + input_grad->Resize(dim_grad); + dev_ctx.template Alloc(input_grad); + + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(new_scale_d); + } + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + if ("trilinear" == interp_method) { + TrilinearInterpolationGrad(output_grad, + input_grad, + ratio_d, + ratio_h, + ratio_w, + in_d, + in_h, + in_w, + n, + c, + out_d, + out_h, + out_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + NearestNeighbor3DInterpolateGrad(output_grad, + input_grad, + ratio_d, + ratio_h, + ratio_w, + n, + c, + out_d, + out_h, + out_w, + align_corners, + data_layout); + } +} + +template +void InterpolateGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + auto output_grad_dims = output_grad.dims(); + if (output_grad_dims.size() == 3) { // 1D interpolation grad + Interpolate1DCPUBwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + } else if (output_grad_dims.size() == 4) { // 2D interpolation grad + Interpolate2DCPUBwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + + } else if (output_grad_dims.size() == 5) { // 3D interpolation grad + Interpolate3DCPUBwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + } +} + +template +void BilinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& 
x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void NearestInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void TrilinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void LinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void BicubicInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(bilinear_interp_v2_grad, + CPU, + ALL_LAYOUT, + phi::BilinearInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(nearest_interp_v2_grad, + CPU, + ALL_LAYOUT, + phi::NearestInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(trilinear_interp_v2_grad, + CPU, + ALL_LAYOUT, + phi::TrilinearInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(linear_interp_v2_grad, + CPU, + ALL_LAYOUT, + phi::LinearInterpGradKernel, + float, + double) {} 
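+// Illustrative sketch (not in the original patch): every grad kernel above
+// maps an output coordinate back to a source coordinate with one shared
+// convention. A hypothetical single-axis helper mirroring that logic:
+//
+//   inline float SourceIndex(int dst, float ratio, bool align_flag) {
+//     // align_flag == (align_mode == 0 && !align_corners): half-pixel mode
+//     float src = align_flag ? ratio * (dst + 0.5f) - 0.5f : ratio * dst;
+//     return src > 0.f ? src : 0.f;  // clamp to the first input element
+//   }
+//
+// with ratio = (in - 1) / (out - 1) when align_corners is set, otherwise
+// 1 / scale (or in / out when no scale is given); e.g. in_w = 4, out_w = 8
+// gives ratio_w = 3.f / 7.f with align_corners and 0.5f without.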
+PD_REGISTER_KERNEL(bicubic_interp_v2_grad, + CPU, + ALL_LAYOUT, + phi::BicubicInterpGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/interpolate_kernel.cc b/paddle/phi/kernels/cpu/interpolate_kernel.cc new file mode 100644 index 0000000000..da9a54748f --- /dev/null +++ b/paddle/phi/kernels/cpu/interpolate_kernel.cc @@ -0,0 +1,1225 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/interpolate_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/interpolate_function.h" + +namespace phi { + +template +static inline T cubic_interp(T x0, T x1, T x2, T x3, T t) { + T coeffs[4]; + funcs::get_cubic_upsample_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +template +static void LinearInterpolation(const DenseTensor& input, + DenseTensor* output, + const float ratio_w, + const int in_w, + const int n, + const int c, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int l = 0; l < out_w; l++) { + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; // w + int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w; // w_id + + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
idx_src_x - x_w : ratio_w * l - x_w; // w1lambda + float d_e = 1.f - d_w; // w2lambda + { + vx_w[l] = x_w; + vx_e[l] = x_e; + vd_w[l] = d_w; + vd_e[l] = d_e; + } + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(3) +#endif + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + for (int l = 0; l < out_w; l++) { + // linear interpolation + T out_t; + if (data_layout == DataLayout::kNCHW) { + out_t = input_t(i, j, vx_w[l]) * vd_e[l] + + input_t(i, j, vx_e[l]) * vd_w[l]; + output_t(i, j, l) = out_t; + } else { + out_t = input_t(i, vx_w[l], j) * vd_e[l] + + input_t(i, vx_e[l], j) * vd_w[l]; + output_t(i, l, j) = out_t; + } + } + } + } +} + +template +static void BilinearInterpolation(const DenseTensor& input, + DenseTensor* output, + const float ratio_h, + const float ratio_w, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vy_n, vy_s; + std::vector vd_n, vd_s; + vy_n.reserve(out_h); + vy_s.reserve(out_h); + vd_n.reserve(out_h); + vd_s.reserve(out_h); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int k = 0; k < out_h; k++) { + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + { + vy_n[k] = y_n; + vy_s[k] = y_s; + vd_n[k] = d_n; + vd_s[k] = d_s; + } + } + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int l = 0; l < out_w; l++) { + int x_w = (align_mode == 0 && !align_corners) + ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + { + vx_w[l] = x_w; + vx_e[l] = x_e; + vd_w[l] = d_w; + vd_e[l] = d_e; + } + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(4) +#endif + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + for (int k = 0; k < out_h; k++) { // loop for images + for (int l = 0; l < out_w; l++) { + // bilinear interpolation + T out_t; + if (data_layout == DataLayout::kNCHW) { + out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] + + input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] + + input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] + + input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l]; + output_t(i, j, k, l) = out_t; + + } else { + out_t = input_t(i, vy_n[k], vx_w[l], j) * vd_s[k] * vd_e[l] + + input_t(i, vy_s[k], vx_w[l], j) * vd_n[k] * vd_e[l] + + input_t(i, vy_n[k], vx_e[l], j) * vd_s[k] * vd_w[l] + + input_t(i, vy_s[k], vx_e[l], j) * vd_n[k] * vd_w[l]; + output_t(i, k, l, j) = out_t; + } + } + } + } + } +} + +template +static void NearestNeighborInterpolate(const DenseTensor& input, + DenseTensor* output, + const float ratio_h, + const float ratio_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout& data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + for (int k = 0; k < out_h; k++) { // loop for images + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + output_t(i, j, k, l) = input_t(i, j, in_k, in_l); + } else { + output_t(i, k, l, j) = input_t(i, in_k, in_l, j); + } + } + } + } + } +} + +template +static void BicubicInterpolation(const DenseTensor& input, + DenseTensor* output, + const float ratio_h, + const float ratio_w, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + + for (int k = 0; k < out_h; k++) { // loop for images + T y_n = align_corners ? static_cast(ratio_h * k) + : static_cast(ratio_h * (k + 0.5) - 0.5); + int input_y = floorf(y_n); + const T y_t = y_n - input_y; + + for (int l = 0; l < out_w; l++) { + T x_n = align_corners ? 
static_cast(ratio_w * l) + : static_cast(ratio_w * (l + 0.5) - 0.5); + int input_x = floorf(x_n); + const T x_t = x_n - input_x; + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + T coefficients[4]; + // interp 4 times in x direction + for (int ii = 0; ii < 4; ii++) { + int access_y = std::max(std::min(input_y - 1 + ii, in_h - 1), + static_cast(0)); + int access_x_0 = + std::max(std::min(input_x - 1, in_w - 1), static_cast(0)); + int access_x_1 = + std::max(std::min(input_x + 0, in_w - 1), static_cast(0)); + int access_x_2 = + std::max(std::min(input_x + 1, in_w - 1), static_cast(0)); + int access_x_3 = + std::max(std::min(input_x + 2, in_w - 1), static_cast(0)); + if (data_layout == DataLayout::kNCHW) { + coefficients[ii] = + cubic_interp(input_t(i, j, access_y, access_x_0), + input_t(i, j, access_y, access_x_1), + input_t(i, j, access_y, access_x_2), + input_t(i, j, access_y, access_x_3), + x_t); + } else { + coefficients[ii] = + cubic_interp(input_t(i, access_y, access_x_0, j), + input_t(i, access_y, access_x_1, j), + input_t(i, access_y, access_x_2, j), + input_t(i, access_y, access_x_3, j), + x_t); + } + } + + // interp y direction + if (data_layout == DataLayout::kNCHW) { + output_t(i, j, k, l) = cubic_interp(coefficients[0], + coefficients[1], + coefficients[2], + coefficients[3], + y_t); + } else { + output_t(i, k, l, j) = cubic_interp(coefficients[0], + coefficients[1], + coefficients[2], + coefficients[3], + y_t); + } + } + } + } + } +} + +template +static void TrilinearInterpolation(const DenseTensor& input, + DenseTensor* output, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const int in_d, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_d, + const int out_h, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout& data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vt_f, vt_b; + std::vector vd_f, vd_b; + vt_f.reserve(out_d); + vt_b.reserve(out_d); + vd_f.reserve(out_d); + vd_b.reserve(out_d); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int j = 0; j < out_d; j++) { + int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) + : static_cast(ratio_d * j); + t_f = (t_f > 0) ? t_f : 0; + int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); + float idx_src_t = ratio_d * (j + 0.5) - 0.5; + idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; + float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; + float d_b = 1.f - d_f; + { + vt_f[j] = t_f; + vt_b[j] = t_b; + vd_f[j] = d_f; + vd_b[j] = d_b; + } + } + + std::vector vy_n, vy_s; + std::vector vd_n, vd_s; + vy_n.reserve(out_h); + vy_s.reserve(out_h); + vd_n.reserve(out_h); + vd_s.reserve(out_h); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int k = 0; k < out_h; k++) { + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? 
idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + { + vy_n[k] = y_n; + vy_s[k] = y_s; + vd_n[k] = d_n; + vd_s[k] = d_s; + } + } + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int l = 0; l < out_w; l++) { + int x_w = (align_mode == 0 && !align_corners) + ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + { + vx_w[l] = x_w; + vx_e[l] = x_e; + vd_w[l] = d_w; + vd_e[l] = d_e; + } + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(5) +#endif + for (int b = 0; b < n; b++) { // loop for batches + for (int i = 0; i < c; i++) { // loop for channels + for (int j = 0; j < out_d; j++) { // loop for D, H, W + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + // trilinear interpolation + if (data_layout == DataLayout::kNCHW) { + T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] * + vd_s[k] * vd_e[l] + + input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] * + vd_s[k] * vd_w[l] + + input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] * + vd_n[k] * vd_e[l] + + input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] * + vd_n[k] * vd_w[l] + + input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] * + vd_s[k] * vd_e[l] + + input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] * + vd_s[k] * vd_w[l] + + input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] * + vd_n[k] * vd_e[l] + + input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] * + vd_n[k] * vd_w[l]; + output_t(b, i, j, k, l) = out_t; + } else { + T out_t = input_t(b, vt_f[j], vy_n[k], vx_w[l], i) * vd_b[j] * + vd_s[k] * vd_e[l] + + input_t(b, vt_f[j], vy_n[k], vx_e[l], i) * vd_b[j] * + vd_s[k] * vd_w[l] + + input_t(b, vt_f[j], vy_s[k], vx_w[l], i) * vd_b[j] * + vd_n[k] * vd_e[l] + + input_t(b, vt_f[j], vy_s[k], vx_e[l], i) * vd_b[j] * + vd_n[k] * vd_w[l] + + input_t(b, vt_b[j], vy_n[k], vx_w[l], i) * vd_f[j] * + vd_s[k] * vd_e[l] + + input_t(b, vt_b[j], vy_n[k], vx_e[l], i) * vd_f[j] * + vd_s[k] * vd_w[l] + + input_t(b, vt_b[j], vy_s[k], vx_w[l], i) * vd_f[j] * + vd_n[k] * vd_e[l] + + input_t(b, vt_b[j], vy_s[k], vx_e[l], i) * vd_f[j] * + vd_n[k] * vd_w[l]; + output_t(b, j, k, l, i) = out_t; + } + } + } + } + } + } +} + +template +static void NearestNeighbor3DInterpolate(const DenseTensor& input, + DenseTensor* output, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const int n, + const int c, + const int out_d, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout& data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + for (int d = 0; d < out_d; d++) { // loop for images + int in_d = (align_corners) ? static_cast(ratio_d * d + 0.5) + : static_cast(ratio_d * d); + for (int k = 0; k < out_h; k++) { + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? 
static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + output_t(i, j, d, k, l) = input_t(i, j, in_d, in_k, in_l); + } else { // NDHWC + output_t(i, d, k, l, j) = input_t(i, in_d, in_k, in_l, j); + } + } + } + } + } + } +} + +template +static void Interpolate1DCPUFwd( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1.; + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_w = new_size[0]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } + } + if (scale_w > 0.) { + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_w = out_size_data[0]; + } + } + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_w}; + } else { + dim_out = {n, out_w, c}; + } + output->Resize(dim_out); + dev_ctx.template Alloc(output); + + if (in_w == out_w) { + paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + if ("linear" == interp_method) { + LinearInterpolation(x, + output, + ratio_w, + in_w, + n, + c, + out_w, + align_corners, + align_mode, + data_layout); + } +} + +template +static void Interpolate2DCPUFwd( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_h = -1; + float scale_w = -1; + + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } else { + if (scale.size() > 1) { + scale_h = scale[0]; + scale_w = scale[1]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } + } + if (scale_h > 0. && scale_w > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + } + PADDLE_ENFORCE_GT( + out_h, + 0, + errors::InvalidArgument("out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_h, out_w}; + } else { + dim_out = {n, out_h, out_w, c}; + } + output->Resize(dim_out); + dev_ctx.template Alloc(output); + + if (in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + if ("bilinear" == interp_method) { + BilinearInterpolation(x, + output, + ratio_h, + ratio_w, + in_h, + in_w, + n, + c, + out_h, + out_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + NearestNeighborInterpolate(x, + output, + ratio_h, + ratio_w, + n, + c, + out_h, + out_w, + align_corners, + data_layout); + } else if ("bicubic" == interp_method) { + BicubicInterpolation(x, + output, + ratio_h, + ratio_w, + in_h, + in_w, + n, + c, + out_h, + out_w, + align_corners, + data_layout); + } +} + +template +static void Interpolate3DCPUFwd( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } + } + if (scale_w > 0. && scale_h > 0. && scale_d > 0.) 
{ + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_d = out_size_data[0]; + out_h = out_size_data[1]; + out_w = out_size_data[2]; + } + } + PADDLE_ENFORCE_GT( + out_d, + 0, + errors::InvalidArgument("out_d in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_h, + 0, + errors::InvalidArgument("out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_d, out_h, out_w}; + } else { + dim_out = {n, out_d, out_h, out_w, c}; + } + + output->Resize(dim_out); + dev_ctx.template Alloc(output); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(new_scale_d); + } + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + if ("trilinear" == interp_method) { + TrilinearInterpolation(x, + output, + ratio_d, + ratio_h, + ratio_w, + in_d, + in_h, + in_w, + n, + c, + out_d, + out_h, + out_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + NearestNeighbor3DInterpolate(x, + output, + ratio_d, + ratio_h, + ratio_w, + n, + c, + out_d, + out_h, + out_w, + align_corners, + data_layout); + } +} + +template +void InterpolateKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + auto input_dims = x.dims(); + if (input_dims.size() == 3) { // 1D interpolation + Interpolate1DCPUFwd(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + + } else if (input_dims.size() == 4) { // 2D interpolation + Interpolate2DCPUFwd(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + } else if (input_dims.size() == 5) { // 3D interpolation + Interpolate3DCPUFwd(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + } +} + +template +void BilinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void NearestInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void TrilinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void LinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int 
out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void BicubicInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +} // namespace phi + +PD_REGISTER_KERNEL(bilinear_interp_v2, + CPU, + ALL_LAYOUT, + phi::BilinearInterpKernel, + float, + double, + uint8_t) {} +PD_REGISTER_KERNEL(nearest_interp_v2, + CPU, + ALL_LAYOUT, + phi::NearestInterpKernel, + float, + double, + int, + int64_t, + uint8_t) {} +PD_REGISTER_KERNEL(trilinear_interp_v2, + CPU, + ALL_LAYOUT, + phi::TrilinearInterpKernel, + float, + double, + uint8_t) {} +PD_REGISTER_KERNEL(linear_interp_v2, + CPU, + ALL_LAYOUT, + phi::LinearInterpKernel, + float, + double, + uint8_t) {} +PD_REGISTER_KERNEL(bicubic_interp_v2, + CPU, + ALL_LAYOUT, + phi::BicubicInterpKernel, + float, + double) {} diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h index 9382b03cf9..d71a61f107 100644 --- a/paddle/phi/kernels/funcs/aligned_vector.h +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#include #include "paddle/phi/core/hostdevice.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/interpolate_function.h b/paddle/phi/kernels/funcs/interpolate_function.h new file mode 100644 index 0000000000..453f9ea87c --- /dev/null +++ b/paddle/phi/kernels/funcs/interpolate_function.h @@ -0,0 +1,154 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
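+// This header gathers the helpers shared by the CPU and GPU interpolate
+// kernels: the Keys cubic-convolution weights (get_cubic_upsample_coefficients
+// with A = -0.75), ExtractNCDWH for reading N/C/D/H/W from an NCHW or NHWC
+// DDim, get_new_shape / get_new_data_from_tensor for fetching runtime output
+// sizes and scales from tensors (copied to CPU first when they live on a GPU,
+// NPU or XPU place), and, under NVCC/HIP, FastDivModForInterpolate for fast
+// index decomposition inside the CUDA kernels.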
+ +#pragma once + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/fluid/platform/fast_divmod.h" +#endif + +namespace phi { +namespace funcs { + +template +HOSTDEVICE inline T CubicConvolution1(T x, T A) { + return ((A + 2) * x - (A + 3)) * x * x + 1; +} + +template +HOSTDEVICE inline T CubicConvolution2(T x, T A) { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +template +HOSTDEVICE inline void get_cubic_upsample_coefficients(T coeffs[4], T t) { + T A = -0.75; + + T x1 = t; + coeffs[0] = CubicConvolution2(x1 + 1.0, A); + coeffs[1] = CubicConvolution1(x1, A); + + // opposite coefficients + T x2 = 1.0 - t; + coeffs[2] = CubicConvolution1(x2, A); + coeffs[3] = CubicConvolution2(x2 + 1.0, A); +} + +inline void ExtractNCDWH(const DDim& dims, + const DataLayout& data_layout, + int* N, + int* C, + int* D, + int* H, + int* W) { + *N = dims[0]; + + if (dims.size() == 3) { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[2]; + *D = 1; + *H = 1; + *W = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + } else if (dims.size() == 4) { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[3]; + *D = 1; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[4]; + *D = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *H = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; + *W = data_layout == DataLayout::kNCHW ? dims[4] : dims[3]; + } +} + +inline std::vector get_new_shape( + const std::vector& list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + PADDLE_ENFORCE_EQ( + tensor->dims(), + phi::make_ddim({1}), + errors::InvalidArgument("The shape of dimension tensor should be [1]," + "but received d%.", + tensor->dims())); + if (paddle::platform::is_gpu_place(tensor->place())) { + DenseTensor temp; + paddle::framework::TensorCopySync( + *tensor, paddle::platform::CPUPlace(), &temp); + vec_new_shape.push_back(static_cast(*temp.data())); + } else { + vec_new_shape.push_back(static_cast(*tensor->data())); + } + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor( + const DenseTensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + DenseTensor cpu_starts_tensor; + if (paddle::platform::is_gpu_place(new_data_tensor->place())) { + paddle::framework::TensorCopySync( + *new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } +#ifdef PADDLE_WITH_ASCEND_CL + if (paddle::platform::is_npu_place(new_data_tensor->place())) { + paddle::framework::TensorCopySync( + *new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } +#endif +#ifdef PADDLE_WITH_XPU + if (paddle::platform::is_xpu_place(new_data_tensor->place())) { + paddle::framework::TensorCopySync( + *new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } +#endif + vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); + return vec_new_data; +} + +#if defined(__NVCC__) || defined(__HIPCC__) +using paddle::platform::FastDivMod; 
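+// FastDivMod replaces integer '/' and '%' by a fixed divisor with a
+// precomputed multiply-and-shift, which is much cheaper inside a CUDA kernel.
+// The wrapper below bundles the three divisors the interpolate kernels use to
+// decompose a flattened NHWC output offset into image, row, column and
+// channel indices, so the hot loops can call Divmod() instead of dividing.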
+ +struct FastDivModForInterpolate { + public: + FastDivMod channels_div; + FastDivMod output_w_div; + FastDivMod output_wc_div; + + explicit HOSTDEVICE FastDivModForInterpolate(const int channels, + const int output_w, + const int outout_wc) + : channels_div(FastDivMod(channels)), + output_w_div(FastDivMod(output_w)), + output_wc_div(FastDivMod(outout_wc)) {} +}; + +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu new file mode 100644 index 0000000000..73334d9c38 --- /dev/null +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -0,0 +1,1601 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/interpolate_grad_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/interpolate_function.h" +#include "paddle/phi/kernels/funcs/math_cuda_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +__forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( + int* in_img_idx, + int* x_id, + T* lambda1, + T* lambda2, + T src_x, + const int in_img_x) { + src_x = (src_x > 0) ? src_x : 0.f; + *in_img_idx = static_cast(src_x); + *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0; + *lambda1 = src_x - *in_img_idx; + *lambda2 = 1.f - *lambda1; +} + +template +__global__ void KeLinearInterpBw(T* in, + const size_t in_img_w, + const size_t input_w, + const T* out, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const T ratio_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idx = tid % out_img_w; + } else { + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 + : ratio_w * out_img_idx; + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id + + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? 
src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + T* in_pos; + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + in_img_idx]; + } else { + in_pos = &in[out_id_h * input_w + in_img_idx * num_channels + channel_id]; + } + const T* out_pos = &out[out_id_w]; + + if (data_layout == DataLayout::kNCHW) { + paddle::platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); + } else { + paddle::platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + w1lambda * out_pos[0]); + } + } +} + +template +__global__ void KeNearestNeighborInterpNCHWBw(T* in, + const size_t in_img_h, + const size_t in_img_w, + const T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t nc, + const float ratio_h, + const float ratio_w, + const bool align_corners) { + int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; + int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; + int nc_id = threadIdx.z + blockIdx.z * blockDim.z; + int nc_stride = blockDim.z * gridDim.z; + + // nearest_sampling by multiple read in_addr and write to out_addr + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + T* in_pos = &in[in_index]; + const T out_pos = out[out_index]; + paddle::platform::CudaAtomicAdd(in_pos, out_pos); + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + +template +__global__ void KeNearestNeighborInterpBw( + T* in, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + const T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_h, + const float ratio_w, + const bool align_corners, + funcs::FastDivModForInterpolate divmods) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int in_img_size = in_img_h * in_img_w; + int out_img_size = out_img_h * out_img_w; + + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? 
static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + T* in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + + const T out_pos = out[tid]; + paddle::platform::CudaAtomicAdd(in_pos, out_pos); + } +} + +/* Calculate the minimum of partial elements in a block */ +template +__inline__ __device__ T PartialBlockMin(T val, + size_t threads_num_in_block, + unsigned mask) { + __shared__ T shared[WARP_SIZE]; + __shared__ T shared_last_val; + __shared__ int shared_last_idx; + int lane = threadIdx.x & 0x1f; + int wid = threadIdx.x >> 5; + int threshold = (threads_num_in_block & (-WARP_SIZE)); + + if (threadIdx.x < threshold) { + shared_last_idx = (threshold >> 5) - 1; + val = phi::funcs::warpReduceMin(val, mask); + if (lane == 0) { + shared[wid] = val; + } + } else { + shared_last_val = std::numeric_limits::max(); + paddle::platform::CudaAtomicMin(&shared_last_val, val); + shared[wid] = shared_last_val; + shared_last_idx = wid; + } + __syncthreads(); + + if (threadIdx.x < threshold) { + val = (lane <= shared_last_idx) ? shared[lane] + : std::numeric_limits::max(); + val = phi::funcs::warpReduceMin(val, mask); + shared_last_val = val; + } + __syncthreads(); + if (threadIdx.x >= threshold) { + val = shared_last_val; + } + return val; +} + +template +__global__ void KeBilinearInterpBwShareMemory(T* in, + const int in_h, + const int in_w, + const T* __restrict__ out, + const int out_h, + const int out_w, + const int n, + const int num_channels, + float ratio_h, + float ratio_w, + const T align_type_value, + bool is_nchw) { + __shared__ T s_data[2][1024]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int in_chw = in_h * in_w * num_channels; + int out_chw = num_channels * out_h * out_w; + int nthreads = n * out_chw; + + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / out_chw; + int out_id_w = tid % out_chw; + const int in_img_size = in_h * in_w; + const int out_img_size = out_h * out_w; + T value = out[out_id_h * out_chw + out_id_w]; + + int channel_id = out_id_w / out_img_size; + int out_img_idy = (out_id_w % out_img_size) / out_w; + int out_img_idx = tid % out_w; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex( + &in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_w); + PreCalculatorForLinearInterpInputIndex( + &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_h); + + // top_left_index is just input_index. 
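// Editor's note (descriptive only, not part of the original patch): this
// kernel reduces global atomic traffic. A block-wide min/max reduction over
// the four neighbor indices first determines the contiguous input span the
// block touches; each thread then scatters its four bilinear gradient
// contributions into two shared-memory rows (s_data[0] for the upper input
// row, s_data[1] for the lower one) using atomics on shared memory, and
// after __syncthreads() each thread flushes at most one element from each
// shared row to global memory with a single CudaAtomicAdd instead of four.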
+ int input_index = out_id_h * in_chw + channel_id * in_img_size + + in_img_idy * in_w + in_img_idx; + int top_right_index = input_index + w_id; + int bot_left_index = input_index + h_id * in_w; + int bot_right_index = input_index + h_id * in_w + w_id; + int in_top_min_index, in_bot_min_index; + + s_data[0][threadIdx.x] = 0.f; + s_data[1][threadIdx.x] = 0.f; + int remain = nthreads - (tid & (-blockDim.x)); + int in_top_max_index = + phi::funcs::blockReduceMax(top_right_index, FINAL_MASK); + int in_bot_max_index = + phi::funcs::blockReduceMax(bot_right_index, FINAL_MASK); + + if (remain > blockDim.x) { + in_top_min_index = phi::funcs::blockReduceMin(input_index, FINAL_MASK); + in_bot_min_index = phi::funcs::blockReduceMin(bot_left_index, FINAL_MASK); + } else { + in_top_min_index = PartialBlockMin(input_index, remain, FINAL_MASK); + in_bot_min_index = PartialBlockMin(bot_left_index, remain, FINAL_MASK); + } + int upper_limit_share_idx = (in_top_max_index - in_top_min_index) > + (in_bot_max_index - in_bot_min_index) + ? (in_top_max_index - in_top_min_index) + : (in_bot_max_index - in_bot_min_index); + if (h_id != 0) { + paddle::platform::CudaAtomicAdd( + &s_data[0][input_index - in_top_min_index], + h2lambda * w2lambda * value); + paddle::platform::CudaAtomicAdd( + &s_data[0][top_right_index - in_top_min_index], + h2lambda * w1lambda * value); + paddle::platform::CudaAtomicAdd( + &s_data[1][bot_left_index - in_bot_min_index], + h1lambda * w2lambda * value); + paddle::platform::CudaAtomicAdd( + &s_data[1][bot_right_index - in_bot_min_index], + h1lambda * w1lambda * value); + } else { + paddle::platform::CudaAtomicAdd( + &s_data[0][top_right_index - in_top_min_index], + (h2lambda + h1lambda) * w1lambda * value); + paddle::platform::CudaAtomicAdd( + &s_data[1][bot_left_index - in_bot_min_index], + (h1lambda + h2lambda) * w2lambda * value); + } + __syncthreads(); + + if (threadIdx.x <= upper_limit_share_idx) { + paddle::platform::CudaAtomicAdd(&in[in_top_min_index + threadIdx.x], + s_data[0][threadIdx.x]); + paddle::platform::CudaAtomicAdd(&in[in_bot_min_index + threadIdx.x], + s_data[1][threadIdx.x]); + } + } +} + +__device__ __forceinline__ int GetInputIndex(const size_t nc, + const int height, + const int width, + const int h, + const int w) { + return (nc * height + h) * width + w; +} + +template +__global__ void KeBilinearInterpNCHWBw(T* in, + const int in_h, + const int in_w, + const int out_h, + const int out_w, + const int n, + const int num_channels, + float ratio_h, + float ratio_w, + const T* __restrict__ out, + const T align_type_value) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + int stride = blockDim.x * gridDim.x; + int num_out = n * num_channels * out_h * out_w; + int num_in = n * num_channels * in_h * in_w; + + for (; index < num_out; index += stride) { + int index_tmp = index; + int w2 = index_tmp % out_w; + index_tmp /= out_w; + int h2 = index_tmp % out_h; + int nc = index_tmp / out_h; + + int h1, y_id; + T h1lambda, h0lambda; + T src_y = ratio_h * (h2 + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex( + &h1, &y_id, &h1lambda, &h0lambda, src_y, in_h); + int w1, x_id; + T w1lambda, w0lambda; + T src_x = ratio_w * (w2 + align_type_value) - align_type_value; + PreCalculatorForLinearInterpInputIndex( + &w1, &x_id, &w1lambda, &w0lambda, src_x, in_w); + + T d2val = out[index]; + + paddle::platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), + h0lambda * w0lambda * d2val); + paddle::platform::CudaAtomicAdd( + in + 
GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), + h0lambda * w1lambda * d2val); + paddle::platform::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), + h1lambda * w0lambda * d2val); + paddle::platform::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), + h1lambda * w1lambda * d2val); + } +} + +template +__global__ void KeBilinearInterpBw(T* in, + const int in_h, + const int in_w, + const T* __restrict__ out, + const int out_h, + const int out_w, + const int n, + const int out_chw, + const int num_channels, + float ratio_h, + float ratio_w, + const T align_type_value, + funcs::FastDivModForInterpolate divmods) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int in_chw = in_h * in_w * num_channels; + int nthreads = n * out_chw; + + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex( + &in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_w); + PreCalculatorForLinearInterpInputIndex( + &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_h); + + T value = out[tid]; + T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + + in_img_idx * num_channels + channel_id]; + paddle::platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); + paddle::platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * value); + paddle::platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], + h1lambda * w2lambda * value); + paddle::platform::CudaAtomicAdd( + &in_pos[h_id * in_w * num_channels + w_id * num_channels], + h1lambda * w1lambda * value); + } +} + +template +__global__ void KeBicubicInterpBw(T* in, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + const T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_h, + const float ratio_w, + const bool align_corners, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + T in_img_idy = align_corners + ? 
static_cast(ratio_h * out_img_idy) + : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); + int input_y = floorf(in_img_idy); + const T y_t = in_img_idy - input_y; + + T in_img_idx = align_corners + ? static_cast(ratio_w * out_img_idx) + : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); + int input_x = floorf(in_img_idx); + + const T x_t = in_img_idx - input_x; + + T x_coeffs[4]; + T y_coeffs[4]; + + funcs::get_cubic_upsample_coefficients(x_coeffs, x_t); + funcs::get_cubic_upsample_coefficients(y_coeffs, y_t); + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + T* in_pos; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + int access_y = max(min(static_cast(input_y - 1 + j), + static_cast(in_img_h - 1)), + 0); + int access_x = max(min(static_cast(input_x - 1 + i), + static_cast(in_img_w - 1)), + 0); + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x]; + } else { + in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x * num_channels + channel_id]; + } + paddle::platform::CudaAtomicAdd( + &in_pos[0], (out_pos[0] * y_coeffs[j] * x_coeffs[i])); + } + } + } +} + +template +__global__ void KeTrilinearInterpBw(T* in, + const size_t in_img_d, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + const T* out, + const size_t out_img_d, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const T ratio_d, + const T ratio_h, + const T ratio_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = align_flag + ? static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) + : static_cast(ratio_d * out_img_idt); + in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; + int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; + T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; + src_d = (src_d > 0) ? src_d : 0; + T d1lambda = + align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; + T d2lambda = 1.f - d1lambda; + + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = (src_h > 0) ? src_h : 0; + T h1lambda = + align_flag ? 
src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + + (in_img_idt * in_img_h + in_img_idy) * in_img_w + + in_img_idx; + T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; + T* in_pos2 = &in[in_pos2_idx]; + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + + // trilinear interpolation grad + paddle::platform::CudaAtomicAdd( + &in_pos1[0], d2lambda * h2lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[w_id], d2lambda * h2lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[h_id * in_img_w], + d2lambda * h1lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[h_id * in_img_w + w_id], + d2lambda * h1lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[0], d1lambda * h2lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[w_id], d1lambda * h2lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[h_id * in_img_w], + d1lambda * h1lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[h_id * in_img_w + w_id], + d1lambda * h1lambda * w1lambda * out_pos[0]); + } else { + int in_pos1_idx = out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id; + T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; + T* in_pos2 = &in[in_pos2_idx]; + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + + // trilinear interpolation grad + paddle::platform::CudaAtomicAdd( + &in_pos1[0], d2lambda * h2lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[w_id * num_channels], + d2lambda * h2lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[h_id * in_img_w * num_channels], + d2lambda * h1lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels], + d2lambda * h1lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[0], d1lambda * h2lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[w_id * num_channels], + d1lambda * h2lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[h_id * in_img_w * num_channels], + d1lambda * h1lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels], + d1lambda * h1lambda * w1lambda * out_pos[0]); + } + } +} + +template +__global__ void KeNearestNeighbor3DInterpBw(T* in, + const size_t in_img_d, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + const T* out, + const size_t out_img_d, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const 
float ratio_d, + const float ratio_h, + const float ratio_w, + const bool align_corners, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = (align_corners) + ? static_cast(ratio_d * out_img_idt + 0.5) + : static_cast(ratio_d * out_img_idt); + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + T* in_pos; + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idt * in_img_h * in_img_w + in_img_idy * in_img_w + + in_img_idx]; + } else { + in_pos = &in[out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + const T out_pos = out[out_id_h * output_w + out_id_w]; + paddle::platform::CudaAtomicAdd(in_pos, out_pos); + } +} + +template +static void Interpolate1DCUDABwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } + } + if (scale_w > 0.) 
{ + out_w = static_cast(in_w * scale_w); + } + + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_w = size_data[0]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_w = new_size[0]; + } + + auto* output_grad_data = output_grad.data(); + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_w}; + } else { + dim_grad = {n, in_w, c}; + } + input_grad->Resize(dim_grad); + auto* input_grad_data = dev_ctx.template Alloc(input_grad); + + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + int64_t in_cw = c * in_w; + int64_t out_cw = c * out_w; + auto pixelNum = n * out_cw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("linear" == interp_method) { + KeLinearInterpBw<<>>(input_grad_data, + in_w, + in_cw, + output_grad_data, + out_w, + n, + out_cw, + c, + ratio_w, + align_corners, + align_mode, + data_layout); + } +} + +template +static void Interpolate2DCUDABwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_h = -1; + float scale_w = -1; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } else { + if (scale.size() > 1) { + scale_w = scale[1]; + scale_h = scale[0]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } + } + if (scale_w > 0. && scale_h > 0.) 
{ + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_h = size_data[0]; + out_w = size_data[1]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_h = new_size[0]; + out_w = new_size[1]; + } + + auto* output_grad_data = output_grad.data(); + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_h, in_w}; + } else { + dim_grad = {n, in_h, in_w, c}; + } + input_grad->Resize(dim_grad); + auto* input_grad_data = dev_ctx.template Alloc(input_grad); + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + int64_t in_hw = in_h * in_w; + int64_t out_hw = out_h * out_w; + int64_t in_chw = c * in_hw; + int64_t out_chw = c * out_hw; + auto pixelNum = n * out_chw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("nearest" == interp_method) { + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + backends::gpu::GpuLaunchConfig config_3d = + backends::gpu::GetGpuLaunchConfig3D(dev_ctx, nc, out_h, out_w); + KeNearestNeighborInterpNCHWBw<<>>(input_grad_data, + in_h, + in_w, + output_grad_data, + out_h, + out_w, + nc, + ratio_h, + ratio_w, + align_corners); + } else { + int64_t cw = c * out_w; + auto interp_divmods = funcs::FastDivModForInterpolate(c, out_chw, cw); + KeNearestNeighborInterpBw<<>>(input_grad_data, + in_h, + in_w, + n, + in_chw, + output_grad_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_corners, + interp_divmods); + } + } else if ("bilinear" == interp_method) { + const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0; + bool is_nchw = (data_layout == DataLayout::kNCHW) ? true : false; + bool optimize_flag = false; +#ifndef __HIPCC__ + optimize_flag = (in_h < (out_h >> 6) && in_w < (out_w >> 6)) + ? true + : ((in_h == 1 && in_w == 1) ? 
true : false); +#endif + + if (optimize_flag & is_nchw) { + KeBilinearInterpBwShareMemory<<>>(input_grad_data, + in_h, + in_w, + output_grad_data, + out_h, + out_w, + n, + c, + ratio_h, + ratio_w, + align_type_value, + is_nchw); + } else if (!optimize_flag & is_nchw) { + const int num_kernels = n * c * out_h * out_w; + const int num_threads = std::min(dev_ctx.GetMaxThreadsPerBlock(), 1024); + KeBilinearInterpNCHWBw< + T><<>>(input_grad_data, + in_h, + in_w, + out_h, + out_w, + n, + c, + ratio_h, + ratio_w, + output_grad_data, + align_type_value); + } else { + int64_t cw = c * out_w; + auto interp_divmods = funcs::FastDivModForInterpolate(c, out_chw, cw); + KeBilinearInterpBw<<>>(input_grad_data, + in_h, + in_w, + output_grad_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_type_value, + interp_divmods); + } + } else if ("bicubic" == interp_method) { +#ifdef __HIPCC__ + constexpr int thread_per_block = 256; +#else + constexpr int thread_per_block = 512; +#endif + KeBicubicInterpBw< + T><<>>( + input_grad_data, + in_h, + in_w, + n, + in_chw, + output_grad_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_corners, + data_layout); + } +} + +template +static void Interpolate3DCUDABwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } + } + if 
(scale_d > 0. && scale_h > 0. && scale_w > 0.) { + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_d = size_data[0]; + out_h = size_data[1]; + out_w = size_data[2]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } + + auto* output_grad_data = output_grad.data(); + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_d, in_h, in_w}; + } else { + dim_grad = {n, in_d, in_h, in_w, c}; + } + input_grad->Resize(dim_grad); + auto* input_grad_data = dev_ctx.template Alloc(input_grad); + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(new_scale_d); + } + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + int64_t in_dhw = in_d * in_h * in_w; + int64_t out_dhw = out_d * out_h * out_w; + int64_t in_cdhw = c * in_dhw; + int64_t out_cdhw = c * out_dhw; + + auto pixelNum = n * out_cdhw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("trilinear" == interp_method) { + KeTrilinearInterpBw<<>>(input_grad_data, + in_d, + in_h, + in_w, + n, + in_cdhw, + output_grad_data, + out_d, + out_h, + out_w, + n, + out_cdhw, + c, + ratio_d, + ratio_h, + ratio_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + KeNearestNeighbor3DInterpBw<<>>(input_grad_data, + in_d, + in_h, + in_w, + n, + in_cdhw, + output_grad_data, + out_d, + out_h, + out_w, + n, + out_cdhw, + c, + ratio_d, + ratio_h, + ratio_w, + align_corners, + data_layout); + } +} + +template +void InterpolateGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + auto output_grad_dims = output_grad.dims(); + if (output_grad_dims.size() == 3) { // 1D interpolation grad + Interpolate1DCUDABwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + } else if (output_grad_dims.size() == 4) { // 2D interpolation grad + Interpolate2DCUDABwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + + } else if (output_grad_dims.size() == 5) { // 3D interpolation grad + Interpolate3DCUDABwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + } +} + +template +void BilinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void NearestInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void TrilinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + 
paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void LinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void BicubicInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(bilinear_interp_v2_grad, + GPU, + ALL_LAYOUT, + phi::BilinearInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(nearest_interp_v2_grad, + GPU, + ALL_LAYOUT, + phi::NearestInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(trilinear_interp_v2_grad, + GPU, + ALL_LAYOUT, + phi::TrilinearInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(linear_interp_v2_grad, + GPU, + ALL_LAYOUT, + phi::LinearInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(bicubic_interp_v2_grad, + GPU, + ALL_LAYOUT, + phi::BicubicInterpGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu new file mode 100644 index 0000000000..6e609aa116 --- /dev/null +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -0,0 +1,1479 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
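// ---------------------------------------------------------------------------
// Editor's note (summary only, not part of the original patch): this file is
// the forward counterpart of interpolate_grad_kernel.cu above. The host-side
// Interpolate{1,2,3}DCUDAFwd helpers resolve the output size with the same
// priority order as the old fluid op (SizeTensor > OutSize > Scale tensor >
// scale attribute) and, when the output dim is greater than 1, compute the
// resampling ratio as
//
//   ratio = align_corners ? (in - 1) / (out - 1)
//                         : (scale > 0 ? 1 / scale : in / out);
//
// In the linear-family kernels (linear/bilinear/trilinear), with
// align_flag = (align_mode == 0 && !align_corners), the source coordinate of
// an output pixel is
//
//   src = align_flag ? ratio * (dst + 0.5) - 0.5   // clamped at 0
//                    : ratio * dst;
//
// which mirrors the backward kernels so forward and gradient stay consistent.
// ---------------------------------------------------------------------------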
+ +#include "paddle/phi/kernels/interpolate_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/interpolate_function.h" + +namespace phi { +using paddle::platform::FastDivMod; + +template +__forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( + int* in_img_idx, + int* x_id, + T* lambda1, + T* lambda2, + T src_x, + const int in_img_x) { + src_x = (src_x > 0) ? src_x : 0.f; + *in_img_idx = static_cast(src_x); + *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0; + *lambda1 = src_x - *in_img_idx; + *lambda2 = 1.f - *lambda1; +} + +template +__global__ void KeLinearInterpFw(const T* in, + const size_t in_img_w, + const size_t input_w, + T* out, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idx = tid % out_img_w; + } else { + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id + + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + const T* in_pos = + &in[out_id_h * out_id_w + channel_id * in_img_size + in_img_idx]; + // linear interpolation + out[out_id_h * output_w + out_id_w] = + w2lambda * in_pos[0] + w1lambda * in_pos[w_id]; + + } else { + const T* in_pos = + &in[out_id_h * input_w + in_img_idx * num_channels + channel_id]; + // linear interpolation + out[out_id_h * output_w + out_id_w] = + w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]; + } + } +} + +template +__global__ void KeNearestNeighborInterpNCHWFw(const T* in, + const size_t in_img_h, + const size_t in_img_w, + T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t nc, + const float ratio_h, + const float ratio_w, + const bool align_corners) { + int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; + int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; + int nc_id = threadIdx.z + blockIdx.z * blockDim.z; + int nc_stride = blockDim.z * gridDim.z; + + // nearest_sampling by multiple read in_addr and write to out_addr + int in_img_idx = (align_corners) + ? 
static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + out[out_index] = in[in_index]; + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + +template +__global__ void KeNearestNeighborInterpFw( + const T* in, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_h, + const float ratio_w, + const bool align_corners, + funcs::FastDivModForInterpolate divmods) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int in_img_size = in_img_h * in_img_w; + int out_img_size = out_img_h * out_img_w; + + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? 
static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } +} + +template +__global__ void KeBilinearInterpFw(const T* in, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_h, + const float ratio_w, + const T align_type_value, + funcs::FastDivModForInterpolate divmods) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idx, in_img_idy, h_id, w_id; + T h1lambda, w1lambda, h2lambda, w2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex( + &in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_img_w); + PreCalculatorForLinearInterpInputIndex( + &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_img_h); + + // bilinear interpolation + const T* in_pos = + &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + out[tid] = + h2lambda * + (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) + + h1lambda * + (w2lambda * in_pos[h_id * in_img_w * num_channels] + + w1lambda * + in_pos[h_id * in_img_w * num_channels + w_id * num_channels]); + } +} + +template +__global__ void KeBilinearInterpNCHWFw(const T* in, + const size_t in_img_h, + const size_t in_img_w, + T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t nc, + const float ratio_h, + const float ratio_w, + const T align_type_value) { + int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; + int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; + int nc_id = threadIdx.z + blockIdx.z * blockDim.z; + int nc_stride = blockDim.z * gridDim.z; + + int in_img_idx, in_img_idy, h_id, w_id; + T h1lambda, w1lambda, h2lambda, w2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex( + &in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_img_w); + PreCalculatorForLinearInterpInputIndex( + &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_img_h); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + const T* in_pos = &in[in_index]; + out[out_index] = + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w] + + 
w1lambda * in_pos[h_id * in_img_w + w_id]); + + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + +template +__device__ __forceinline__ static T Kecubic_interp( + const T x0, const T x1, const T x2, const T x3, T t) { + T coeffs[4]; + T a = -0.75; + T x_1 = t; + T x_2 = 1.0 - t; + coeffs[0] = funcs::CubicConvolution2(x_1 + 1.0, a); + coeffs[1] = funcs::CubicConvolution1(x_1, a); + coeffs[2] = funcs::CubicConvolution1(x_2, a); + coeffs[3] = funcs::CubicConvolution2(x_2 + 1.0, a); + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +template +__global__ void KeBicubicInterpFw(const T* in, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_h, + const float ratio_w, + const bool align_corners, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + T in_img_idy = align_corners + ? static_cast(ratio_h * out_img_idy) + : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); + int input_y = floorf(in_img_idy); + const T y_t = in_img_idy - input_y; + + T in_img_idx = align_corners + ? 
static_cast(ratio_w * out_img_idx) + : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); + int input_x = floorf(in_img_idx); + const T x_t = in_img_idx - input_x; + + T coefficients[4]; + const T* in_pos_0; + const T* in_pos_1; + const T* in_pos_2; + const T* in_pos_3; + int access_x_0; + if (data_layout == DataLayout::kNCHW) { + for (int k = 0; k < 4; k++) { + int access_y = + max(min(input_y - 1 + k, static_cast(in_img_h - 1)), 0); + access_x_0 = max(min(input_x - 1, static_cast(in_img_w - 1)), 0); + int access_x_1 = + max(min(input_x + 0, static_cast(in_img_w - 1)), 0); + int access_x_2 = + max(min(input_x + 1, static_cast(in_img_w - 1)), 0); + int access_x_3 = + max(min(input_x + 2, static_cast(in_img_w - 1)), 0); + + in_pos_0 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_0]; + in_pos_1 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_1]; + in_pos_2 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_2]; + in_pos_3 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_3]; + + coefficients[k] = Kecubic_interp( + in_pos_0[0], in_pos_1[0], in_pos_2[0], in_pos_3[0], x_t); + } + + out[out_id_h * output_w + out_id_w] = Kecubic_interp(coefficients[0], + coefficients[1], + coefficients[2], + coefficients[3], + y_t); + + } else { + for (int k = 0; k < 4; k++) { + int access_y = + max(min(input_y - 1 + k, static_cast((in_img_h - 1))), 0); + int access_x_0 = + max(min(input_x - 1, static_cast((in_img_w - 1))), 0); + int access_x_1 = + max(min(input_x + 0, static_cast((in_img_w - 1))), 0); + int access_x_2 = + max(min(input_x + 1, static_cast((in_img_w - 1))), 0); + int access_x_3 = + max(min(input_x + 2, static_cast((in_img_w - 1))), 0); + + const T* in_pos_0 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_0 * num_channels + channel_id]; + const T* in_pos_1 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_1 * num_channels + channel_id]; + const T* in_pos_2 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_2 * num_channels + channel_id]; + const T* in_pos_3 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_3 * num_channels + channel_id]; + + coefficients[k] = Kecubic_interp( + in_pos_0[0], in_pos_1[0], in_pos_2[0], in_pos_3[0], x_t); + } + + out[out_id_h * output_w + out_id_w] = + static_cast(Kecubic_interp(coefficients[0], + coefficients[1], + coefficients[2], + coefficients[3], + y_t)); + } + } +} + +template +__global__ void KeTrilinearInterpFw(const T* in, + const size_t in_img_d, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + T* out, + const size_t out_img_d, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if 
(data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = align_flag + ? static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) + : static_cast(ratio_d * out_img_idt); + in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; + int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; + T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; + src_d = (src_d > 0) ? src_d : 0; + T d1lambda = + align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; + T d2lambda = 1.f - d1lambda; + + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = (src_h > 0) ? src_h : 0; + T h1lambda = + align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + + (in_img_idt * in_img_h + in_img_idy) * in_img_w + + in_img_idx; + const T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; + const T* in_pos2 = &in[in_pos2_idx]; + + // trilinear interpolation + out[out_id_h * output_w + out_id_w] = + d2lambda * + (h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) + + h1lambda * (w2lambda * in_pos1[h_id * in_img_w] + + w1lambda * in_pos1[h_id * in_img_w + w_id])) + + d1lambda * + (h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) + + h1lambda * (w2lambda * in_pos2[h_id * in_img_w] + + w1lambda * in_pos2[h_id * in_img_w + w_id])); + + } else { + int in_pos1_idx = out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id; + const T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; + const T* in_pos2 = &in[in_pos2_idx]; + + // trilinear interpolation + out[out_id_h * output_w + out_id_w] = + d2lambda * + (h2lambda * (w2lambda * in_pos1[0] + + w1lambda * in_pos1[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos1[h_id * in_img_w * num_channels] + + w1lambda * in_pos1[h_id * in_img_w * num_channels + + w_id * num_channels])) + + d1lambda * + (h2lambda * (w2lambda * in_pos2[0] + + w1lambda * in_pos2[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos2[h_id * in_img_w * num_channels] + + w1lambda * in_pos2[h_id * in_img_w * num_channels + + w_id * num_channels])); + } + } +} + +template +__global__ void KeNearestNeighbor3DInterpFw(const T* in, + const 
size_t in_img_d, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + T* out, + const size_t out_img_d, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const bool align_corners, + const DataLayout data_layout) { + int nthreads = output_h * output_w; // ncdhw + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = (align_corners) + ? static_cast(ratio_d * out_img_idt + 0.5) + : static_cast(ratio_d * out_img_idt); + + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + if (data_layout == DataLayout::kNCHW) { + out[tid] = in[out_id_h * input_w + channel_id * in_img_size + + in_img_idt * in_img_h * in_img_w + in_img_idy * in_img_w + + in_img_idx]; + } else { + out[tid] = in[out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + } +} + +template +static void Interpolate1DCUDAFwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + auto* input_data = input.data(); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1; + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_w = new_size[0]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } + } + if (scale_w > 0.) 
{ + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_w = size_data[0]; + } + } + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_w}; + } else { + dim_out = {n, out_w, c}; + } + output->Resize(dim_out); + auto output_data = dev_ctx.template Alloc(output); + + if (in_w == out_w) { + paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? static_cast(in_w - 1.0) / (out_w - 1.0) + : static_cast(new_scale_w); + } + + int64_t in_cw = c * in_w; + int64_t out_cw = c * out_w; + auto pixelNum = n * out_cw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("linear" == interp_method) { + KeLinearInterpFw<<>>(input_data, + in_w, + in_cw, + output_data, + out_w, + n, + out_cw, + c, + ratio_w, + align_corners, + align_mode, + data_layout); + } +} + +template +static void Interpolate2DCUDAFwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + auto* input_data = input.data(); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1; + float scale_h = -1; + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } else { + if (scale.size() > 1) { + scale_w = scale[1]; + scale_h = scale[0]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } + } + if (scale_w > 0. && scale_h > 0.) 
{ + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_h = size_data[0]; + out_w = size_data[1]; + } + } + PADDLE_ENFORCE_GT( + out_h, + 0, + errors::InvalidArgument("out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_h, out_w}; + } else { + dim_out = {n, out_h, out_w, c}; + } + output->Resize(dim_out); + auto output_data = dev_ctx.template Alloc(output); + + if (in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + int64_t in_hw = in_h * in_w; + int64_t out_hw = out_h * out_w; + int64_t in_chw = c * in_hw; + int64_t out_chw = c * out_hw; + + auto pixelNum = n * out_chw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("nearest" == interp_method) { + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + backends::gpu::GpuLaunchConfig config_3d = + backends::gpu::GetGpuLaunchConfig3D(dev_ctx, nc, out_h, out_w); + KeNearestNeighborInterpNCHWFw<<>>(input_data, + in_h, + in_w, + output_data, + out_h, + out_w, + nc, + ratio_h, + ratio_w, + align_corners); + } else { + int64_t cw = c * out_w; + auto interp_divmods = funcs::FastDivModForInterpolate(c, out_chw, cw); + KeNearestNeighborInterpFw<<>>(input_data, + in_h, + in_w, + n, + in_chw, + output_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_corners, + interp_divmods); + } + } else if ("bilinear" == interp_method) { + dim3 thread_num = config.thread_per_block; +#ifdef WITH_NV_JETSON + if (config.compute_capability == 53 || config.compute_capability == 62) { + thread_num = 512; + } +#endif + const T align_type_value = (align_mode == 0 && !align_corners) ? 
0.5f : 0; + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + backends::gpu::GpuLaunchConfig config_3d = + backends::gpu::GetGpuLaunchConfig3D(dev_ctx, nc, out_h, out_w); + KeBilinearInterpNCHWFw<<>>(input_data, + in_h, + in_w, + output_data, + out_h, + out_w, + nc, + ratio_h, + ratio_w, + align_type_value); + } else { + int64_t cw = c * out_w; + auto interp_divmods = funcs::FastDivModForInterpolate(c, out_chw, cw); + KeBilinearInterpFw< + T><<>>( + input_data, + in_h, + in_w, + n, + in_chw, + output_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_type_value, + interp_divmods); + } + } else if ("bicubic" == interp_method) { +#ifdef __HIPCC__ + constexpr int thread_per_block = 256; +#else + constexpr int thread_per_block = 512; +#endif + KeBicubicInterpFw< + T><<>>( + input_data, + in_h, + in_w, + n, + in_chw, + output_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_corners, + data_layout); + } +} + +template +static void Interpolate3DCUDAFwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + auto* input_data = input.data(); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1; + float scale_d = -1; + float scale_h = -1; + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } + } 
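For readers tracing the size-resolution logic above (and its 1-D/2-D counterparts earlier in this file), the precedence is: a non-empty SizeTensor wins outright; otherwise the Scale tensor, then Attr(scale), supplies a scale factor that rescales the input extent; a supplied OutSize tensor then overrides the scale-derived result. The standalone sketch below restates that precedence for a single width dimension; it is illustrative only — the function name and the std::optional parameters are not part of the Paddle API.

```cpp
// Illustrative sketch (not Paddle code) of the output-size precedence used by
// the Interpolate{1,2,3}DCUDAFwd helpers: SizeTensor > Scale tensor >
// Attr(scale) > OutSize-override of the scale-derived size.
#include <optional>
#include <vector>

int ResolveOutW(int in_w,
                int out_w_attr,                        // Attr(out_w)
                const std::vector<int>& size_tensor,   // SizeTensor values, if fed
                std::optional<float> scale_tensor,     // Scale tensor value, if fed
                const std::vector<float>& scale_attr,  // Attr(scale)
                std::optional<int> out_size) {         // OutSize value, if fed
  if (!size_tensor.empty()) {
    return size_tensor.back();  // SizeTensor wins; scale and OutSize are ignored
  }
  float scale_w = -1.f;
  if (scale_tensor.has_value()) {
    scale_w = *scale_tensor;    // Scale tensor takes precedence over Attr(scale)
  } else if (!scale_attr.empty()) {
    scale_w = scale_attr.back();
  }
  int out_w = out_w_attr;
  if (scale_w > 0.f) {
    out_w = static_cast<int>(in_w * scale_w);
  }
  if (out_size.has_value()) {
    out_w = *out_size;          // OutSize overrides a scale-derived size
  }
  return out_w;                 // the real helpers still enforce out_w > 0 afterwards
}
```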
+ if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_d = size_data[0]; + out_h = size_data[1]; + out_w = size_data[2]; + } + } + PADDLE_ENFORCE_GT( + out_d, + 0, + errors::InvalidArgument("out_d in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_h, + 0, + errors::InvalidArgument("out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_d, out_h, out_w}; + } else { + dim_out = {n, out_d, out_h, out_w, c}; + } + output->Resize(dim_out); + auto output_data = dev_ctx.template Alloc(output); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(new_scale_d); + } + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + int64_t in_dhw = in_d * in_h * in_w; + int64_t out_dhw = out_d * out_h * out_w; + int64_t in_cdhw = c * in_dhw; + int64_t out_cdhw = c * out_dhw; + + auto pixelNum = n * out_cdhw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("trilinear" == interp_method) { + KeTrilinearInterpFw<<>>(input_data, + in_d, + in_h, + in_w, + n, + in_cdhw, + output_data, + out_d, + out_h, + out_w, + n, + out_cdhw, + c, + ratio_d, + ratio_h, + ratio_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + KeNearestNeighbor3DInterpFw<<>>(input_data, + in_d, + in_h, + in_w, + n, + in_cdhw, + output_data, + out_d, + out_h, + out_w, + n, + out_cdhw, + c, + ratio_d, + ratio_h, + ratio_w, + align_corners, + data_layout); + } +} + +template +void InterpolateKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + auto input_dims = x.dims(); + if (input_dims.size() == 3) { // 1D interpolation + Interpolate1DCUDAFwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + } else if (input_dims.size() == 4) { // 2D interpolation + Interpolate2DCUDAFwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + } else if (input_dims.size() == 5) { // 3D interpolation + Interpolate3DCUDAFwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + } +} + +template +void BilinearInterpKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void NearestInterpKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void TrilinearInterpKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* 
output) { + InterpolateKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void LinearInterpKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void BicubicInterpKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +} // namespace phi + +PD_REGISTER_KERNEL(bilinear_interp_v2, + GPU, + ALL_LAYOUT, + phi::BilinearInterpKernel, + float, + double, + int) {} +PD_REGISTER_KERNEL(nearest_interp_v2, + GPU, + ALL_LAYOUT, + phi::NearestInterpKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_KERNEL(trilinear_interp_v2, + GPU, + ALL_LAYOUT, + phi::TrilinearInterpKernel, + float, + double, + int) {} +PD_REGISTER_KERNEL(linear_interp_v2, + GPU, + ALL_LAYOUT, + phi::LinearInterpKernel, + float, + double, + int) {} +PD_REGISTER_KERNEL(bicubic_interp_v2, + GPU, + ALL_LAYOUT, + phi::BicubicInterpKernel, + float, + double, + int) {} diff --git a/paddle/phi/kernels/interpolate_grad_kernel.h b/paddle/phi/kernels/interpolate_grad_kernel.h new file mode 100644 index 0000000000..59d2dddd87 --- /dev/null +++ b/paddle/phi/kernels/interpolate_grad_kernel.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
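Before moving on to the gradient kernels, it may help to spell out how the ratio values computed in the forward helpers and the align_flag / half-pixel correction inside the CUDA kernels fit together. The sketch below is standalone and illustrative (the function names are not Paddle APIs): align_corners maps output corners onto input corners, while align_mode == 0 without align_corners applies the half-pixel offset and clamps negative source coordinates to zero, matching the index math in KeTrilinearInterpFw and the other forward kernels.

```cpp
// Illustrative sketch (not Paddle code) of the coordinate mapping used by the
// forward kernels above. align_flag corresponds to
// (align_mode == 0 && !align_corners) in the CUDA kernels.
#include <algorithm>

float SourceRatio(int in_w, int out_w, float scale_w, bool align_corners) {
  if (out_w <= 1) return 0.f;  // degenerate axis: everything maps to index 0
  if (align_corners) {
    return static_cast<float>(in_w - 1) / (out_w - 1);
  }
  // Prefer the user-provided scale; otherwise fall back to the size ratio.
  return scale_w > 0.f ? 1.f / scale_w : static_cast<float>(in_w) / out_w;
}

float SourceCoord(int out_idx, float ratio, bool align_flag) {
  // Half-pixel correction when align_mode == 0 and align_corners is false.
  float src = align_flag ? ratio * (out_idx + 0.5f) - 0.5f : ratio * out_idx;
  return std::max(src, 0.f);  // kernels clamp negative coordinates to zero
}
```

The integer part of SourceCoord selects the left/top/front neighbor, and its fractional part is what becomes the w1lambda / h1lambda / d1lambda blend weights in the bilinear and trilinear kernels.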
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BilinearInterpGradKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/interpolate_kernel.h b/paddle/phi/kernels/interpolate_kernel.h new file mode 100644 index 0000000000..4623657f5a --- /dev/null +++ b/paddle/phi/kernels/interpolate_kernel.h @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BilinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output); + +template +void NearestInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output); + +template +void TrilinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output); + +template +void LinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output); + +template +void BicubicInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output); + +} // namespace phi diff --git a/paddle/phi/ops/compat/interpolate_sig.cc b/paddle/phi/ops/compat/interpolate_sig.cc new file mode 100644 index 0000000000..ba0e971e4a --- /dev/null 
+++ b/paddle/phi/ops/compat/interpolate_sig.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BilinearInterpOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("bilinear_interp_v2", + {"X", "OutSize", "SizeTensor", "Scale"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"Out"}); +} + +KernelSignature NearestInterpOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("nearest_interp_v2", + {"X", "OutSize", "SizeTensor", "Scale"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"Out"}); +} +KernelSignature TrilinearInterpOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("trilinear_interp_v2", + {"X", "OutSize", "SizeTensor", "Scale"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"Out"}); +} + +KernelSignature LinearInterpOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("linear_interp_v2", + {"X", "OutSize", "SizeTensor", "Scale"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"Out"}); +} + +KernelSignature BicubicInterpOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("bicubic_interp_v2", + {"X", "OutSize", "SizeTensor", "Scale"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"Out"}); +} + +KernelSignature BilinearInterpGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "bilinear_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {GradVarName("X")}); +} + +KernelSignature NearestInterpGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "nearest_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {GradVarName("X")}); +} +KernelSignature TrilinearInterpGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "trilinear_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {GradVarName("X")}); +} + +KernelSignature LinearInterpGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "linear_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", 
GradVarName("Out")}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {GradVarName("X")}); +} + +KernelSignature BicubicInterpGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "bicubic_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(bilinear_interp_v2, + phi::BilinearInterpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nearest_interp_v2, + phi::NearestInterpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trilinear_interp_v2, + phi::TrilinearInterpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(linear_interp_v2, + phi::LinearInterpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bicubic_interp_v2, + phi::BicubicInterpOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(bilinear_interp_v2_grad, + phi::BilinearInterpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nearest_interp_v2_grad, + phi::NearestInterpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trilinear_interp_v2_grad, + phi::TrilinearInterpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(linear_interp_v2_grad, + phi::LinearInterpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bicubic_interp_v2_grad, + phi::BicubicInterpGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py index 57d7d70c66..cf8b7b3516 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py @@ -41,7 +41,9 @@ class TrtConvertNearestInterpV2Test(TrtLayerAutoScanTest): "data_layout": "NCHW", "interp_method": "nearest", "align_corners": False, + "align_mode": 1, "scale": [2., 2.], + "out_d": 0, "out_h": 0, "out_w": 0 } -- GitLab
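Taken together, the compat signatures above route the legacy *_interp_v2 operators (and their grads) onto the new phi kernels, and the TensorRT test gains the align_mode and out_d attributes, presumably because the phi kernel signature now consumes the full attribute set even for 2-D nearest interpolation. The routing itself keys off tensor rank, as in phi::InterpolateKernel earlier in this patch; the standalone sketch below restates that dispatch (the names and the main driver are illustrative, not Paddle code).

```cpp
// Illustrative sketch (not Paddle code) of the rank-based routing performed by
// phi::InterpolateKernel: 3-D input -> 1-D interpolation, 4-D -> 2-D,
// 5-D -> 3-D. Other ranks are not handled by the forward helpers.
#include <cstdio>

enum class InterpPath { k1D, k2D, k3D, kUnsupported };

InterpPath RouteByRank(int tensor_rank) {
  switch (tensor_rank) {
    case 3: return InterpPath::k1D;  // linear_interp_v2 (NCW / NWC)
    case 4: return InterpPath::k2D;  // bilinear / nearest / bicubic (NCHW / NHWC)
    case 5: return InterpPath::k3D;  // trilinear / nearest 3-D (NCDHW / NDHWC)
    default: return InterpPath::kUnsupported;
  }
}

int main() {
  std::printf("rank 4 routes to path %d (2-D interpolation)\n",
              static_cast<int>(RouteByRank(4)));
  return 0;
}
```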