// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/phi/kernels/pad3d_kernel.h" #include #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { using paddle::platform::PADDLE_CUDA_NUM_THREADS; template __global__ void Pad3DConstNCDHW(const int nthreads, const T* in_data, const int num, const int channels, const int in_depth, const int in_height, const int in_width, const int out_depth, const int out_height, const int out_width, const int pad_front, const int pad_top, const int pad_left, T value, T* out_data) { CUDA_KERNEL_LOOP(index, nthreads) { int nc = index / out_width; const int out_w = index % out_width; const int out_h = nc % out_height; nc /= out_height; const int out_d = nc % out_depth; nc /= out_depth; int in_d = out_d - pad_front; int in_h = out_h - pad_top; int in_w = out_w - pad_left; out_data[index] = (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || in_h >= in_height || in_w >= in_width) ? value : in_data[nc * in_depth * in_height * in_width + in_d * in_height * in_width + in_h * in_width + in_w]; } } template __global__ void Pad3DConstNDHWC(const int nthreads, const T* in_data, const int num, const int channels, const int in_depth, const int in_height, const int in_width, const int out_depth, const int out_height, const int out_width, const int pad_front, const int pad_top, const int pad_left, T value, T* out_data) { CUDA_KERNEL_LOOP(index, nthreads) { int n = index / channels; const int c = index % channels; const int out_w = n % out_width; n /= out_width; const int out_h = n % out_height; n /= out_height; const int out_d = n % out_depth; n /= out_depth; const int in_d = out_d - pad_front; const int in_h = out_h - pad_top; const int in_w = out_w - pad_left; out_data[index] = (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || in_h >= in_height || in_w >= in_width) ? value : in_data[n * in_depth * in_height * in_width * channels + in_d * in_height * in_width * channels + in_h * in_width * channels + in_w * channels + c]; } } template __global__ void Pad3DReflectNCDHW(const int nthreads, const T* in_data, const int num, const int channels, const int in_depth, const int in_height, const int in_width, const int out_depth, const int out_height, const int out_width, const int pad_front, const int pad_top, const int pad_left, T* out_data) { CUDA_KERNEL_LOOP(index, nthreads) { int nc = index / out_width; const int out_w = index % out_width; const int out_h = nc % out_height; nc /= out_height; const int out_d = nc % out_depth; nc /= out_depth; int in_d = out_d - pad_front; int in_h = out_h - pad_top; int in_w = out_w - pad_left; in_d = max(in_d, -in_d); // reflect by 0 in_d = min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth in_h = max(in_h, -in_h); // reflect by 0 in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height in_w = max(in_w, -in_w); // reflect by 0 in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width out_data[index] = in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * in_width + in_w]; } } template __global__ void Pad3DReflectNDHWC(const int nthreads, const T* in_data, const int num, const int channels, const int in_depth, const int in_height, const int in_width, const int out_depth, const int out_height, const int out_width, const int pad_front, const int pad_top, const int pad_left, T* out_data) { CUDA_KERNEL_LOOP(index, nthreads) { int n = index / channels; const int c = index % channels; const int out_w = n % out_width; n /= out_width; const int out_h = n % out_height; n /= out_height; const int out_d = n % out_depth; n /= out_depth; int in_d = out_d - pad_front; int in_h = out_h - pad_top; int in_w = out_w - pad_left; in_d = max(in_d, -in_d); in_d = min(in_d, 2 * in_depth - in_d - 2); in_h = max(in_h, -in_h); in_h = min(in_h, 2 * in_height - in_h - 2); in_w = max(in_w, -in_w); in_w = min(in_w, 2 * in_width - in_w - 2); out_data[index] = in_data[n * in_depth * in_height * in_width * channels + in_d * in_height * in_width * channels + in_h * in_width * channels + in_w * channels + c]; } } template __global__ void Pad3DReplicateNCDHW(const int nthreads, const T* in_data, const int num, const int channels, const int in_depth, const int in_height, const int in_width, const int out_depth, const int out_height, const int out_width, const int pad_front, const int pad_top, const int pad_left, T* out_data) { CUDA_KERNEL_LOOP(index, nthreads) { int nc = index / out_width; const int out_w = index % out_width; const int out_h = nc % out_height; nc /= out_height; const int out_d = nc % out_depth; nc /= out_depth; int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); int in_h = min(in_height - 1, max(out_h - pad_top, 0)); int in_w = min(in_width - 1, max(out_w - pad_left, 0)); out_data[index] = in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * in_width + in_w]; } } template __global__ void Pad3DReplicateNDHWC(const int nthreads, const T* in_data, const int num, const int channels, const int in_depth, const int in_height, const int in_width, const int out_depth, const int out_height, const int out_width, const int pad_front, const int pad_top, const int pad_left, T* out_data) { CUDA_KERNEL_LOOP(index, nthreads) { int n = index / channels; const int c = index % channels; const int out_w = n % out_width; n /= out_width; const int out_h = n % out_height; n /= out_height; const int out_d = n % out_depth; n /= out_depth; int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); int in_h = min(in_height - 1, max(out_h - pad_top, 0)); int in_w = min(in_width - 1, max(out_w - pad_left, 0)); out_data[index] = in_data[n * in_depth * in_height * in_width * channels + in_d * in_height * in_width * channels + in_h * in_width * channels + in_w * channels + c]; } } template __global__ void Pad3DCircularNCDHW(const int nthreads, const T* in_data, const int num, const int channels, const int in_depth, const int in_height, const int in_width, const int out_depth, const int out_height, const int out_width, const int pad_front, const int pad_top, const int pad_left, T* out_data) { CUDA_KERNEL_LOOP(index, nthreads) { int nc = index / out_width; const int out_w = index % out_width; const int out_h = nc % out_height; nc /= out_height; const int out_d = nc % out_depth; nc /= out_depth; int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; out_data[index] = in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * in_width + in_w]; } } template __global__ void Pad3DCircularNDHWC(const int nthreads, const T* in_data, const int num, const int channels, const int in_depth, const int in_height, const int in_width, const int out_depth, const int out_height, const int out_width, const int pad_front, const int pad_top, const int pad_left, T* out_data) { CUDA_KERNEL_LOOP(index, nthreads) { int n = index / channels; const int c = index % channels; const int out_w = n % out_width; n /= out_width; const int out_h = n % out_height; n /= out_height; const int out_d = n % out_depth; n /= out_depth; int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; out_data[index] = in_data[n * in_depth * in_height * in_width * channels + in_d * in_height * in_width * channels + in_h * in_width * channels + in_w * channels + c]; } } template void Pad3dKernel(const Context& dev_ctx, const DenseTensor& x, const ScalarArray& paddings, const std::string& mode, float pad_value, const std::string& data_format, DenseTensor* out) { std::vector pads = paddings.GetData(); auto in_dims = x.dims(); const T* in_data = x.data(); auto out_dims = out->dims(); T value = static_cast(pad_value); if (data_format == "NCDHW") { out_dims[0] = in_dims[0]; out_dims[1] = in_dims[1]; out_dims[2] = in_dims[2] + pads[4] + pads[5]; out_dims[3] = in_dims[3] + pads[2] + pads[3]; out_dims[4] = in_dims[4] + pads[0] + pads[1]; } else { out_dims[0] = in_dims[0]; out_dims[1] = in_dims[1] + pads[4] + pads[5]; out_dims[2] = in_dims[2] + pads[2] + pads[3]; out_dims[3] = in_dims[3] + pads[0] + pads[1]; out_dims[4] = in_dims[4]; } out->Resize(out_dims); T* out_data = dev_ctx.template Alloc(out); int channels = in_dims[1]; int in_depth = in_dims[2]; int in_height = in_dims[3]; int in_width = in_dims[4]; int out_depth = out_dims[2]; int out_height = out_dims[3]; int out_width = out_dims[4]; if (data_format == "NDHWC") { channels = in_dims[4]; in_depth = in_dims[1]; in_height = in_dims[2]; in_width = in_dims[3]; out_depth = out_dims[1]; out_height = out_dims[2]; out_width = out_dims[3]; } if (mode == "reflect") { PADDLE_ENFORCE_GT( in_depth, pads[4], errors::InvalidArgument("The depth of Input(X)'s dimension should be " "greater than pad_front" " in reflect mode" ", but received depth(%d) and pad_front(%d).", in_depth, pads[4])); PADDLE_ENFORCE_GT( in_depth, pads[5], errors::InvalidArgument("The depth of Input(X)'s dimension should be " "greater than pad_back" " in reflect mode" ", but received depth(%d) and pad_back(%d).", in_depth, pads[5])); PADDLE_ENFORCE_GT( in_height, pads[2], errors::InvalidArgument("The height of Input(X)'s dimension should be " "greater than pad_top" " in reflect mode" ", but received depth(%d) and pad_top(%d).", in_height, pads[2])); PADDLE_ENFORCE_GT( in_height, pads[3], errors::InvalidArgument("The height of Input(X)'s dimension should be " "greater than pad_bottom" " in reflect mode" ", but received depth(%d) and pad_bottom(%d).", in_height, pads[3])); PADDLE_ENFORCE_GT( in_width, pads[0], errors::InvalidArgument("The width of Input(X)'s dimension should be " "greater than pad_left" " in reflect mode" ", but received depth(%d) and pad_left(%d).", in_width, pads[0])); PADDLE_ENFORCE_GT( in_width, pads[1], errors::InvalidArgument("The width of Input(X)'s dimension should be " "greater than pad_right" " in reflect mode" ", but received depth(%d) and pad_right(%d).", in_width, pads[1])); } else if (mode == "circular" || mode == "replicate") { PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, errors::InvalidArgument( "The input tensor size can not be 0 for circular " "or replicate padding mode.")); } const int pad_left = pads[0]; const int pad_top = pads[2]; const int pad_front = pads[4]; const int num = in_dims[0]; auto stream = dev_ctx.stream(); int block = PADDLE_CUDA_NUM_THREADS; const int out_size = out->numel(); int grid = (out_size + block - 1) / block; if (data_format == "NCDHW") { if (mode == "reflect") { Pad3DReflectNCDHW<<>>(out_size, in_data, num, channels, in_depth, in_height, in_width, out_depth, out_height, out_width, pad_front, pad_top, pad_left, out_data); } else if (mode == "replicate") { Pad3DReplicateNCDHW<<>>(out_size, in_data, num, channels, in_depth, in_height, in_width, out_depth, out_height, out_width, pad_front, pad_top, pad_left, out_data); } else if (mode == "circular") { Pad3DCircularNCDHW<<>>(out_size, in_data, num, channels, in_depth, in_height, in_width, out_depth, out_height, out_width, pad_front, pad_top, pad_left, out_data); } else { Pad3DConstNCDHW<<>>(out_size, in_data, num, channels, in_depth, in_height, in_width, out_depth, out_height, out_width, pad_front, pad_top, pad_left, value, out_data); } } else { if (mode == "reflect") { Pad3DReflectNDHWC<<>>(out_size, in_data, num, channels, in_depth, in_height, in_width, out_depth, out_height, out_width, pad_front, pad_top, pad_left, out_data); } else if (mode == "replicate") { Pad3DReplicateNDHWC<<>>(out_size, in_data, num, channels, in_depth, in_height, in_width, out_depth, out_height, out_width, pad_front, pad_top, pad_left, out_data); } else if (mode == "circular") { Pad3DCircularNDHWC<<>>(out_size, in_data, num, channels, in_depth, in_height, in_width, out_depth, out_height, out_width, pad_front, pad_top, pad_left, out_data); } else { Pad3DConstNDHWC<<>>(out_size, in_data, num, channels, in_depth, in_height, in_width, out_depth, out_height, out_width, pad_front, pad_top, pad_left, value, out_data); } } } } // namespace phi PD_REGISTER_KERNEL(pad3d, GPU, ALL_LAYOUT, phi::Pad3dKernel, phi::dtype::float16, float, double, int, int64_t) {}