From 5e0614a112a8c1fd47ae902cbbaaa5ce673ff7fa Mon Sep 17 00:00:00 2001 From: Rayman Date: Mon, 10 Oct 2022 10:15:19 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90Hackathon=20No.56&38=E3=80=91deformabl?= =?UTF-8?q?e=5Fconv=5Fv1=20=E7=AE=97=E5=AD=90=E5=AE=9E=E7=8E=B0=20float16?= =?UTF-8?q?=20=E6=95=B0=E6=8D=AE=E7=B1=BB=E5=9E=8B=E6=94=AF=E6=8C=81&?= =?UTF-8?q?=E5=89=8D=E5=90=91=E8=BF=90=E8=A1=8C=E5=8A=A0=E9=80=9F=20(#4611?= =?UTF-8?q?1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit support fp16 for deformable conv --- .../cpu/deformable_conv_grad_kernel.cc | 34 +++---- .../kernels/funcs/deformable_conv_functor.cc | 6 +- .../kernels/funcs/deformable_conv_functor.cu | 41 ++++++--- .../kernels/funcs/deformable_conv_functor.h | 55 ++++++------ .../gpu/deformable_conv_grad_kernel.cu | 90 ++++++++++--------- .../phi/kernels/gpu/deformable_conv_kernel.cu | 3 +- .../impl/deformable_conv_grad_kernel_impl.h | 56 +++++++----- .../impl/deformable_conv_kernel_impl.h | 46 ++++++++-- .../unittests/test_deformable_conv_v1_op.py | 81 +++++++++++++++-- 9 files changed, 275 insertions(+), 137 deletions(-) diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc index a4d43ef8fbe..050c61596fe 100644 --- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc @@ -97,14 +97,14 @@ inline void ModulatedDeformableCol2imCPUKernel( width); *(grad_im + cur_bottom_grad_pos) = - *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad; + *(grad_im + cur_bottom_grad_pos) + (weight * cur_top_grad); } } } } } -template +template void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col, const T* data_offset, @@ -116,7 +116,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, const std::vector& stride, const std::vector& dilation, const int deformable_group, - T* grad_im) { + MT* grad_im) { int channel_per_deformable_group = im_shape[0] / deformable_group; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; @@ -222,22 +222,22 @@ void ModulatedDeformableCol2imCoordCPUKernel( if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { - mval += data_col_ptr[col_pos] * - funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += data_col_ptr[col_pos] * funcs::DmcnIm2colBilinear( + data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } const T weight = - DmcnGetCoordinateWeight(inv_h, - inv_w, - height, - width, - data_im_ptr + cnt * height * width, - width, - bp_dir); + DmcnGetCoordinateWeight(inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); if (data_mask_ptr) { const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc index 48858fa5939..253a66adfc6 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" namespace phi { namespace funcs { @@ -82,8 +82,8 @@ inline void ModulatedDeformableIm2colCPUKernel( const T h_im = h_in + i * dilation_h + offset_h; const T w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { - val = - DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + val = DmcnIm2colBilinear( + data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val; if (data_mask_ptr) { diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu index bebea5dcb74..0d5076a4937 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu @@ -12,8 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/device_context.h" namespace phi { namespace funcs { @@ -51,6 +54,8 @@ __global__ void ModulatedDeformableIm2colGpuKernel( T* data_col) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; + + using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t i = index; i < nthreads; i += offset) { const int w_col = i % width_col; const int h_col = (i / width_col) % height_col; @@ -85,22 +90,22 @@ __global__ void ModulatedDeformableIm2colGpuKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; - const T offset_h = data_offset_ptr[data_offset_h_ptr]; - const T offset_w = data_offset_ptr[data_offset_w_ptr]; - T val = static_cast(0); - const T h_im = h_in + i * dilation_h + offset_h; - const T w_im = w_in + j * dilation_w + offset_w; + const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); + const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); + MT val = static_cast(0); + const MT h_im = h_in + i * dilation_h + offset_h; + const MT w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { - val = - DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + val = DmcnIm2colBilinear( + data_im_ptr, width, height, width, h_im, w_im); } - *data_col_ptr = val; if (data_mask_ptr) { const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; - const T mask = data_mask_ptr[data_mask_hw_ptr]; - *data_col_ptr *= mask; + const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); + val *= mask; } + *data_col_ptr = static_cast(val); data_col_ptr += batch_size * height_col * width_col; } } @@ -164,6 +169,20 @@ template void ModulatedDeformableIm2col( const int deformable_groups, float* data_col); +template void ModulatedDeformableIm2col( + const phi::GPUContext& dev_ctx, + const phi::dtype::float16* data_im, + const phi::dtype::float16* data_offset, + const phi::dtype::float16* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + phi::dtype::float16* data_col); + template void ModulatedDeformableIm2col( const phi::GPUContext& dev_ctx, const double* data_im, diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.h b/paddle/phi/kernels/funcs/deformable_conv_functor.h index eecda729275..62e42cd5833 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.h +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.h @@ -14,44 +14,47 @@ #pragma once +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { namespace funcs { -template -HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, - const int data_width, - const int height, - const int width, - T h, - T w) { +template +HOSTDEVICE MT DmcnIm2colBilinear(const T* bottom_data, + const int data_width, + const int height, + const int width, + MT h, + MT w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; - T lh = h - h_low; - T lw = w - w_low; - T hh = 1 - lh; - T hw = 1 - lw; + MT lh = h - h_low; + MT lw = w - w_low; + MT hh = 1 - lh; + MT hw = 1 - lw; - T v1 = - (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0; - T v2 = (h_low >= 0 && w_high <= width - 1) - ? bottom_data[h_low * data_width + w_high] - : 0; - T v3 = (h_high <= height - 1 && w_low >= 0) - ? bottom_data[h_high * data_width + w_low] - : 0; - T v4 = (h_high <= height - 1 && w_high <= width - 1) - ? bottom_data[h_high * data_width + w_high] - : 0; + MT v1 = (h_low >= 0 && w_low >= 0) + ? static_cast(bottom_data[h_low * data_width + w_low]) + : 0; + MT v2 = (h_low >= 0 && w_high <= width - 1) + ? static_cast(bottom_data[h_low * data_width + w_high]) + : 0; + MT v3 = (h_high <= height - 1 && w_low >= 0) + ? static_cast(bottom_data[h_high * data_width + w_low]) + : 0; + MT v4 = (h_high <= height - 1 && w_high <= width - 1) + ? static_cast(bottom_data[h_high * data_width + w_high]) + : 0; - T w1 = hh * hw; - T w2 = hh * lw; - T w3 = lh * hw; - T w4 = lh * lw; + MT w1 = hh * hw; + MT w2 = hh * lw; + MT w3 = lh * hw; + MT w4 = lh * lw; return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; } diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index b46f1f4a331..5d2f4727c53 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -29,7 +29,7 @@ static inline int NumBlocks(const int N) { kNumMaximumNumBlocks); } -template +template __global__ void ModulatedDeformableCol2imGpuKernel( const int nthreads, const T* data_col, @@ -51,9 +51,10 @@ __global__ void ModulatedDeformableCol2imGpuKernel( const int deformable_group, const int height_col, const int width_col, - T* grad_im) { + MT* grad_im) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; + // using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t thread = index; thread < nthreads; thread += offset) { const int j = (thread / width_col / height_col / batch_size) % kernel_w; const int i = @@ -78,17 +79,17 @@ __global__ void ModulatedDeformableCol2imGpuKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; - const T offset_h = data_offset_ptr[data_offset_h_ptr]; - const T offset_w = data_offset_ptr[data_offset_w_ptr]; - const T cur_inv_h_data = h_in + i * dilation_h + offset_h; - const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); + const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); + const MT cur_inv_h_data = h_in + i * dilation_h + offset_h; + const MT cur_inv_w_data = w_in + j * dilation_w + offset_w; - T cur_top_grad = data_col[thread]; + MT cur_top_grad = static_cast(data_col[thread]); if (data_mask) { const T* data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; - const T mask = data_mask_ptr[data_mask_hw_ptr]; + const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); cur_top_grad *= mask; } const int cur_h = static_cast(cur_inv_h_data); @@ -100,13 +101,12 @@ __global__ void ModulatedDeformableCol2imGpuKernel( abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; - T weight = DmcnGetGradientWeight(cur_inv_h_data, - cur_inv_w_data, - cur_h + dy, - cur_w + dx, - height, - width); - + MT weight = DmcnGetGradientWeight(cur_inv_h_data, + cur_inv_w_data, + cur_h + dy, + cur_w + dx, + height, + width); paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } @@ -115,7 +115,7 @@ __global__ void ModulatedDeformableCol2imGpuKernel( } } -template +template void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col, const T* data_offset, @@ -127,13 +127,13 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, const std::vector& stride, const std::vector& dilation, const int deformable_group, - T* grad_im) { + MT* grad_im) { int channel_per_deformable_group = im_shape[0] / deformable_group; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; int blocks = NumBlocks(num_kernels); int threads = kNumCUDAThreads; - ModulatedDeformableCol2imGpuKernel + ModulatedDeformableCol2imGpuKernel <<>>(num_kernels, data_col, data_offset, @@ -185,8 +185,9 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( T* grad_mask) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; + using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t i = index; i < nthreads; i += offset) { - T val = 0, mval = 0; + MT val = 0, mval = 0; const int w = i % width_col; const int h = (i / width_col) % height_col; const int c = (i / width_col / height_col) % offset_channels; @@ -231,40 +232,42 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); - const T offset_h = data_offset_ptr[data_offset_h_ptr]; - const T offset_w = data_offset_ptr[data_offset_w_ptr]; - T inv_h = h_in + i * dilation_h + offset_h; - T inv_w = w_in + j * dilation_w + offset_w; + + const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); + const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); + MT inv_h = h_in + i * dilation_h + offset_h; + MT inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { - mval += data_col_ptr[col_pos] * - funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += + static_cast(data_col_ptr[col_pos]) * + funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } - const T weight = - DmcnGetCoordinateWeight(inv_h, - inv_w, - height, - width, - data_im_ptr + cnt * height * width, - width, - bp_dir); + const MT weight = + DmcnGetCoordinateWeight(inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); if (data_mask_ptr) { const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); - const T mask = data_mask_ptr[data_mask_hw_ptr]; - val += weight * data_col_ptr[col_pos] * mask; + const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); + val += weight * static_cast(data_col_ptr[col_pos]) * mask; } else { - val += weight * data_col_ptr[col_pos]; + val += weight * static_cast(data_col_ptr[col_pos]); } cnt += 1; } - grad_offset[i] = val; + grad_offset[i] = static_cast(val); if (grad_mask && offset_c % 2 == 0) grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + @@ -359,4 +362,5 @@ PD_REGISTER_KERNEL(deformable_conv_grad, ALL_LAYOUT, phi::DeformableConvGradKernel, float, - double) {} + double, + paddle::platform::float16) {} diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu index 2476dcbafb9..021791ca930 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu @@ -23,4 +23,5 @@ PD_REGISTER_KERNEL(deformable_conv, ALL_LAYOUT, phi::DeformableConvKernel, float, - double) {} + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h index 744c48b2bfb..7402a227336 100644 --- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -14,8 +14,10 @@ #pragma once +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -58,14 +60,14 @@ HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, return weight; } -template -HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, - T argmax_w, - const int height, - const int width, - const T* im_data, - const int data_width, - const int bp_dir) { +template +HOSTDEVICE MT DmcnGetCoordinateWeight(MT argmax_h, + MT argmax_w, + const int height, + const int width, + const T* im_data, + const int data_width, + const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { return 0; @@ -76,43 +78,51 @@ HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; - T weight = 0; + MT weight = 0; if (bp_dir == 0) { weight += (argmax_h_low >= 0 && argmax_w_low >= 0) ? -1 * (argmax_w_low + 1 - argmax_w) * - im_data[argmax_h_low * data_width + argmax_w_low] + static_cast( + im_data[argmax_h_low * data_width + argmax_w_low]) : 0; weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) ? -1 * (argmax_w - argmax_w_low) * - im_data[argmax_h_low * data_width + argmax_w_high] + static_cast( + im_data[argmax_h_low * data_width + argmax_w_high]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) ? (argmax_w_low + 1 - argmax_w) * - im_data[argmax_h_high * data_width + argmax_w_low] + static_cast( + im_data[argmax_h_high * data_width + argmax_w_low]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) ? (argmax_w - argmax_w_low) * - im_data[argmax_h_high * data_width + argmax_w_high] + static_cast( + im_data[argmax_h_high * data_width + argmax_w_high]) : 0; } else if (bp_dir == 1) { weight += (argmax_h_low >= 0 && argmax_w_low >= 0) ? -1 * (argmax_h_low + 1 - argmax_h) * - im_data[argmax_h_low * data_width + argmax_w_low] + static_cast( + im_data[argmax_h_low * data_width + argmax_w_low]) : 0; weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) ? (argmax_h_low + 1 - argmax_h) * - im_data[argmax_h_low * data_width + argmax_w_high] + static_cast( + im_data[argmax_h_low * data_width + argmax_w_high]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) ? -1 * (argmax_h - argmax_h_low) * - im_data[argmax_h_high * data_width + argmax_w_low] + static_cast( + im_data[argmax_h_high * data_width + argmax_w_low]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) ? (argmax_h - argmax_h_low) * - im_data[argmax_h_high * data_width + argmax_w_high] + static_cast( + im_data[argmax_h_high * data_width + argmax_w_high]) : 0; } @@ -135,7 +145,7 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx, T* grad_offset, T* grad_mask); -template +template void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col, const T* data_offset, @@ -147,7 +157,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, const std::vector& stride, const std::vector& dilation, const int deformable_group, - T* grad_im); + MT* grad_im); template void FilterGradAddup(const Context& dev_ctx, @@ -176,7 +186,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, DenseTensor* filter_grad, DenseTensor* mask_grad) { const int batch_size = static_cast(x.dims()[0]); - + using MT = typename phi::dtype::MPTypeTrait::Type; DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size()); std::vector input_shape_vec = phi::vectorize(input_shape); std::vector filter_shape_vec(phi::vectorize(filter.dims())); @@ -292,8 +302,8 @@ void DeformableConvGradKernel(const Context& dev_ctx, mask_grad_data_ptr); } if (dx) { - T* dx_ptr = dx->data(); - // get grad of input + MT* mt_dx_ptr = dev_ctx.template Alloc(dx); + ModulatedDeformableCol2im(dev_ctx, col_buffer_ptr, offset_ptr + i * im2col_step * input_offset_dim, @@ -305,7 +315,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, strides, dilations, deformable_groups, - dx_ptr + i * im2col_step * input_dim); + mt_dx_ptr + i * im2col_step * input_dim); dx->Resize(x.dims()); } diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index f864c2e5f0e..d66f4e58e5b 100644 --- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h @@ -14,11 +14,13 @@ #pragma once +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" +#include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/utils/optional.h" namespace phi { @@ -38,6 +40,12 @@ void DeformableConvKernel(const Context& dev_ctx, DenseTensor* out) { const int batch_size = static_cast(x.dims()[0]); + int temp_step = std::min(64, batch_size); + if (batch_size % temp_step == 0) { + im2col_step = temp_step; + } + + using MT = typename phi::dtype::MPTypeTrait::Type; std::vector filter_shape_vec(phi::vectorize(filter.dims())); std::vector output_shape_vec(phi::vectorize(out->dims())); @@ -101,8 +109,11 @@ void DeformableConvKernel(const Context& dev_ctx, dilations, deformable_groups, col_buffer_ptr); - DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); + DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(phi::slice_ddim( + output_4d.dims(), + 1, + output_4d.dims().size())); // group * C/group * (im2step * H * W) + // get the product of pixel and weight for (int g = 0; g < groups; ++g) { DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( @@ -110,8 +121,11 @@ void DeformableConvKernel(const Context& dev_ctx, DenseTensor col_buffer_3d_slice = col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - DenseTensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); + DenseTensor output_3d_slice = + output_3d.Slice(g, g + 1).Resize(phi::slice_ddim( + output_3d.dims(), + 1, + output_3d.dims().size())); // C * ((im2col_step)*H*W)) blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, @@ -121,7 +135,29 @@ void DeformableConvKernel(const Context& dev_ctx, T(0.0)); } } - out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec)); + + // swap axis to get the right result when im2col_step is greater than 1 + if (im2col_step > 1) { + std::vector axis(4); + axis[0] = 0; + axis[1] = 2; + axis[2] = 1; + axis[3] = 3; + + DenseTensor real_output_buffer = phi::Transpose( + dev_ctx, + output_4d.Resize( + phi::make_ddim({batch_size / im2col_step, + output_shape_vec[1], + im2col_step, + output_shape_vec[2] * output_shape_vec[3]})), + axis); + + out->ShareDataWith(real_output_buffer) + .Resize(phi::make_ddim(output_shape_vec)); + } else { + out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec)); + } } } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index 3625dc6a53c..982bc5a3799 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -17,9 +17,11 @@ import unittest import numpy as np import paddle.fluid as fluid import paddle.fluid.core as core -from op_test import OpTest +from op_test import OpTest, skip_check_grad_ci from paddle.fluid.framework import _test_eager_guard +paddle.enable_static() + def dmc_bilinear(data_im, height, width, h, w): h_low = int(np.floor(h)) @@ -59,8 +61,8 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param): assert f_c * group == in_c assert np.mod(out_c, group) == 0 - stride, pad, dilation = conv_param['stride'], conv_param['pad'],\ - conv_param['dilation'] + stride, pad, dilation = conv_param['stride'], conv_param['pad'], \ + conv_param['dilation'] out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0] out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1] assert out_h == in_h @@ -74,18 +76,18 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param): for kh in range(f_h): for kw in range(f_w): offset_h_table = \ - offset[n, ::2, h, w].reshape(f_h, f_w) + offset[n, ::2, h, w].reshape(f_h, f_w) offset_w_table = \ - offset[n, 1::2, h, w].reshape(f_h, f_w) + offset[n, 1::2, h, w].reshape(f_h, f_w) offset_h = offset_h_table[kh, kw] offset_w = offset_w_table[kh, kw] val = 0 im_h = h * stride[0] + kh * dilation[0] \ - + offset_h - pad[0] + + offset_h - pad[0] im_w = w * stride[0] + kw * dilation[0] \ - + offset_w - pad[1] + + offset_w - pad[1] if im_h > -1 and im_w > -1 and \ - im_h < in_h and im_w < in_h: + im_h < in_h and im_w < in_h: val = dmc_bilinear(input[n, c], in_h, in_w, im_h, im_w) val_out = val @@ -284,6 +286,69 @@ class TestWithDouble(TestModulatedDeformableConvOp): self.dtype = np.float64 +class TestFP16(unittest.TestCase): + + def check_main(self, input_np, offset_np, filter_np, dtype): + paddle.disable_static() + input_np = input_np.astype(dtype) + offset_np = offset_np.astype(dtype) + filter_np = filter_np.astype(dtype) + + input = paddle.to_tensor(input_np) + offset = paddle.to_tensor(offset_np) + filter = paddle.to_tensor(filter_np) + + input.stop_gradient = False + offset.stop_gradient = False + filter.stop_gradient = False + + y = paddle.vision.ops.deform_conv2d(input, offset, filter) + input_grad, offset_grad, filter_grad = paddle.grad( + y, [input, offset, filter]) + y_np = y.numpy().astype('float32') + input_grad_np = input_grad.numpy().astype('float32') + offset_grad_np = offset_grad.numpy().astype('float32') + filter_grad_np = filter_grad.numpy().astype('float32') + paddle.enable_static() + return y_np, input_grad_np, offset_grad_np, filter_grad_np + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [40, f_c, 1, 1] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = 2 * self.deformable_groups * self.filter_size[ + 2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], offset_c, self.input_size[2], self.input_size[3] + ] + + input = np.random.random(self.input_size) + offset = 10 * np.random.random(self.offset_size) + filter = np.random.random(self.filter_size) + + y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main( + input, offset, filter, 'float16') + y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main( + input, offset, filter, 'float32') + + def assert_equal(x, y): + np.testing.assert_allclose(x, y, atol=3e-2) + + assert_equal(y_np_1, y_np_2) + assert_equal(input_g_np_1, input_g_np_2) + assert_equal(offset_g_np_1, offset_g_np_2) + assert_equal(filter_g_np_1, filter_g_np_2) + + class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): def test_error(self): -- GitLab