未验证 提交 5e0614a1 编写于 作者: R Rayman 提交者: GitHub

【Hackathon No.56&38】deformable_conv_v1 算子实现 float16 数据类型支持&前向运行加速 (#46111)

support fp16 for deformable conv
上级 a7e1b9d2
...@@ -97,14 +97,14 @@ inline void ModulatedDeformableCol2imCPUKernel( ...@@ -97,14 +97,14 @@ inline void ModulatedDeformableCol2imCPUKernel(
width); width);
*(grad_im + cur_bottom_grad_pos) = *(grad_im + cur_bottom_grad_pos) =
*(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad; *(grad_im + cur_bottom_grad_pos) + (weight * cur_top_grad);
} }
} }
} }
} }
} }
template <typename T, typename Context> template <typename T, typename MT, typename Context>
void ModulatedDeformableCol2im(const Context& dev_ctx, void ModulatedDeformableCol2im(const Context& dev_ctx,
const T* data_col, const T* data_col,
const T* data_offset, const T* data_offset,
...@@ -116,7 +116,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, ...@@ -116,7 +116,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
const std::vector<int>& stride, const std::vector<int>& stride,
const std::vector<int>& dilation, const std::vector<int>& dilation,
const int deformable_group, const int deformable_group,
T* grad_im) { MT* grad_im) {
int channel_per_deformable_group = im_shape[0] / deformable_group; int channel_per_deformable_group = im_shape[0] / deformable_group;
int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
...@@ -222,22 +222,22 @@ void ModulatedDeformableCol2imCoordCPUKernel( ...@@ -222,22 +222,22 @@ void ModulatedDeformableCol2imCoordCPUKernel(
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2; inv_h = inv_w = -2;
} else { } else {
mval += data_col_ptr[col_pos] * mval += data_col_ptr[col_pos] * funcs::DmcnIm2colBilinear<T, T>(
funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, data_im_ptr + cnt * height * width,
width, width,
height, height,
width, width,
inv_h, inv_h,
inv_w); inv_w);
} }
const T weight = const T weight =
DmcnGetCoordinateWeight(inv_h, DmcnGetCoordinateWeight<T, T>(inv_h,
inv_w, inv_w,
height, height,
width, width,
data_im_ptr + cnt * height * width, data_im_ptr + cnt * height * width,
width, width,
bp_dir); bp_dir);
if (data_mask_ptr) { if (data_mask_ptr) {
const int data_mask_hw_ptr = const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out); (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/funcs/deformable_conv_functor.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
namespace phi { namespace phi {
namespace funcs { namespace funcs {
...@@ -82,8 +82,8 @@ inline void ModulatedDeformableIm2colCPUKernel( ...@@ -82,8 +82,8 @@ inline void ModulatedDeformableIm2colCPUKernel(
const T h_im = h_in + i * dilation_h + offset_h; const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w; const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
val = val = DmcnIm2colBilinear<T, T>(
DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); data_im_ptr, width, height, width, h_im, w_im);
} }
*data_col_ptr = val; *data_col_ptr = val;
if (data_mask_ptr) { if (data_mask_ptr) {
......
...@@ -12,8 +12,11 @@ ...@@ -12,8 +12,11 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/deformable_conv_functor.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/device_context.h"
namespace phi { namespace phi {
namespace funcs { namespace funcs {
...@@ -51,6 +54,8 @@ __global__ void ModulatedDeformableIm2colGpuKernel( ...@@ -51,6 +54,8 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
T* data_col) { T* data_col) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x; int offset = blockDim.x * gridDim.x;
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
for (size_t i = index; i < nthreads; i += offset) { for (size_t i = index; i < nthreads; i += offset) {
const int w_col = i % width_col; const int w_col = i % width_col;
const int h_col = (i / width_col) % height_col; const int h_col = (i / width_col) % height_col;
...@@ -85,22 +90,22 @@ __global__ void ModulatedDeformableIm2colGpuKernel( ...@@ -85,22 +90,22 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col; w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr]; const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
const T offset_w = data_offset_ptr[data_offset_w_ptr]; const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
T val = static_cast<T>(0); MT val = static_cast<MT>(0);
const T h_im = h_in + i * dilation_h + offset_h; const MT h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w; const MT w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
val = val = DmcnIm2colBilinear<T, MT>(
DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); data_im_ptr, width, height, width, h_im, w_im);
} }
*data_col_ptr = val;
if (data_mask_ptr) { if (data_mask_ptr) {
const int data_mask_hw_ptr = const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col; ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const T mask = data_mask_ptr[data_mask_hw_ptr]; const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
*data_col_ptr *= mask; val *= mask;
} }
*data_col_ptr = static_cast<T>(val);
data_col_ptr += batch_size * height_col * width_col; data_col_ptr += batch_size * height_col * width_col;
} }
} }
...@@ -164,6 +169,20 @@ template void ModulatedDeformableIm2col( ...@@ -164,6 +169,20 @@ template void ModulatedDeformableIm2col(
const int deformable_groups, const int deformable_groups,
float* data_col); float* data_col);
template void ModulatedDeformableIm2col(
const phi::GPUContext& dev_ctx,
const phi::dtype::float16* data_im,
const phi::dtype::float16* data_offset,
const phi::dtype::float16* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& filter_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
phi::dtype::float16* data_col);
template void ModulatedDeformableIm2col( template void ModulatedDeformableIm2col(
const phi::GPUContext& dev_ctx, const phi::GPUContext& dev_ctx,
const double* data_im, const double* data_im,
......
...@@ -14,44 +14,47 @@ ...@@ -14,44 +14,47 @@
#pragma once #pragma once
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
namespace phi { namespace phi {
namespace funcs { namespace funcs {
template <typename T> template <typename T, typename MT>
HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, HOSTDEVICE MT DmcnIm2colBilinear(const T* bottom_data,
const int data_width, const int data_width,
const int height, const int height,
const int width, const int width,
T h, MT h,
T w) { MT w) {
int h_low = floor(h); int h_low = floor(h);
int w_low = floor(w); int w_low = floor(w);
int h_high = h_low + 1; int h_high = h_low + 1;
int w_high = w_low + 1; int w_high = w_low + 1;
T lh = h - h_low; MT lh = h - h_low;
T lw = w - w_low; MT lw = w - w_low;
T hh = 1 - lh; MT hh = 1 - lh;
T hw = 1 - lw; MT hw = 1 - lw;
T v1 = MT v1 = (h_low >= 0 && w_low >= 0)
(h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0; ? static_cast<MT>(bottom_data[h_low * data_width + w_low])
T v2 = (h_low >= 0 && w_high <= width - 1) : 0;
? bottom_data[h_low * data_width + w_high] MT v2 = (h_low >= 0 && w_high <= width - 1)
: 0; ? static_cast<MT>(bottom_data[h_low * data_width + w_high])
T v3 = (h_high <= height - 1 && w_low >= 0) : 0;
? bottom_data[h_high * data_width + w_low] MT v3 = (h_high <= height - 1 && w_low >= 0)
: 0; ? static_cast<MT>(bottom_data[h_high * data_width + w_low])
T v4 = (h_high <= height - 1 && w_high <= width - 1) : 0;
? bottom_data[h_high * data_width + w_high] MT v4 = (h_high <= height - 1 && w_high <= width - 1)
: 0; ? static_cast<MT>(bottom_data[h_high * data_width + w_high])
: 0;
T w1 = hh * hw; MT w1 = hh * hw;
T w2 = hh * lw; MT w2 = hh * lw;
T w3 = lh * hw; MT w3 = lh * hw;
T w4 = lh * lw; MT w4 = lh * lw;
return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
} }
......
...@@ -29,7 +29,7 @@ static inline int NumBlocks(const int N) { ...@@ -29,7 +29,7 @@ static inline int NumBlocks(const int N) {
kNumMaximumNumBlocks); kNumMaximumNumBlocks);
} }
template <typename T> template <typename T, typename MT>
__global__ void ModulatedDeformableCol2imGpuKernel( __global__ void ModulatedDeformableCol2imGpuKernel(
const int nthreads, const int nthreads,
const T* data_col, const T* data_col,
...@@ -51,9 +51,10 @@ __global__ void ModulatedDeformableCol2imGpuKernel( ...@@ -51,9 +51,10 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
const int deformable_group, const int deformable_group,
const int height_col, const int height_col,
const int width_col, const int width_col,
T* grad_im) { MT* grad_im) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x; int offset = blockDim.x * gridDim.x;
// using MT = typename phi::dtype::MPTypeTrait<T>::Type;
for (size_t thread = index; thread < nthreads; thread += offset) { for (size_t thread = index; thread < nthreads; thread += offset) {
const int j = (thread / width_col / height_col / batch_size) % kernel_w; const int j = (thread / width_col / height_col / batch_size) % kernel_w;
const int i = const int i =
...@@ -78,17 +79,17 @@ __global__ void ModulatedDeformableCol2imGpuKernel( ...@@ -78,17 +79,17 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr = const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_out) * width_col + w_out; ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr]; const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
const T offset_w = data_offset_ptr[data_offset_w_ptr]; const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
const T cur_inv_h_data = h_in + i * dilation_h + offset_h; const MT cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w; const MT cur_inv_w_data = w_in + j * dilation_w + offset_w;
T cur_top_grad = data_col[thread]; MT cur_top_grad = static_cast<MT>(data_col[thread]);
if (data_mask) { if (data_mask) {
const T* data_mask_ptr = const T* data_mask_ptr =
data_mask + (b * deformable_group + deformable_group_index) * data_mask + (b * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col; kernel_h * kernel_w * height_col * width_col;
const T mask = data_mask_ptr[data_mask_hw_ptr]; const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
cur_top_grad *= mask; cur_top_grad *= mask;
} }
const int cur_h = static_cast<int>(cur_inv_h_data); const int cur_h = static_cast<int>(cur_inv_h_data);
...@@ -100,13 +101,12 @@ __global__ void ModulatedDeformableCol2imGpuKernel( ...@@ -100,13 +101,12 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
abs(cur_inv_w_data - (cur_w + dx)) < 1) { abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos = int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight = DmcnGetGradientWeight(cur_inv_h_data, MT weight = DmcnGetGradientWeight(cur_inv_h_data,
cur_inv_w_data, cur_inv_w_data,
cur_h + dy, cur_h + dy,
cur_w + dx, cur_w + dx,
height, height,
width); width);
paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
weight * cur_top_grad); weight * cur_top_grad);
} }
...@@ -115,7 +115,7 @@ __global__ void ModulatedDeformableCol2imGpuKernel( ...@@ -115,7 +115,7 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
} }
} }
template <typename T, typename Context> template <typename T, typename MT, typename Context>
void ModulatedDeformableCol2im(const Context& dev_ctx, void ModulatedDeformableCol2im(const Context& dev_ctx,
const T* data_col, const T* data_col,
const T* data_offset, const T* data_offset,
...@@ -127,13 +127,13 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, ...@@ -127,13 +127,13 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
const std::vector<int>& stride, const std::vector<int>& stride,
const std::vector<int>& dilation, const std::vector<int>& dilation,
const int deformable_group, const int deformable_group,
T* grad_im) { MT* grad_im) {
int channel_per_deformable_group = im_shape[0] / deformable_group; int channel_per_deformable_group = im_shape[0] / deformable_group;
int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
int blocks = NumBlocks(num_kernels); int blocks = NumBlocks(num_kernels);
int threads = kNumCUDAThreads; int threads = kNumCUDAThreads;
ModulatedDeformableCol2imGpuKernel<T> ModulatedDeformableCol2imGpuKernel<T, MT>
<<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels, <<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
data_col, data_col,
data_offset, data_offset,
...@@ -185,8 +185,9 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( ...@@ -185,8 +185,9 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
T* grad_mask) { T* grad_mask) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x; int offset = blockDim.x * gridDim.x;
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
for (size_t i = index; i < nthreads; i += offset) { for (size_t i = index; i < nthreads; i += offset) {
T val = 0, mval = 0; MT val = 0, mval = 0;
const int w = i % width_col; const int w = i % width_col;
const int h = (i / width_col) % height_col; const int h = (i / width_col) % height_col;
const int c = (i / width_col / height_col) % offset_channels; const int c = (i / width_col / height_col) % offset_channels;
...@@ -231,40 +232,42 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( ...@@ -231,40 +232,42 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
const int data_offset_w_ptr = const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out); w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr]; const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
T inv_h = h_in + i * dilation_h + offset_h; const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
T inv_w = w_in + j * dilation_w + offset_w; MT inv_h = h_in + i * dilation_h + offset_h;
MT inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2; inv_h = inv_w = -2;
} else { } else {
mval += data_col_ptr[col_pos] * mval +=
funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, static_cast<MT>(data_col_ptr[col_pos]) *
width, funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
height, width,
width, height,
inv_h, width,
inv_w); inv_h,
inv_w);
} }
const T weight = const MT weight =
DmcnGetCoordinateWeight(inv_h, DmcnGetCoordinateWeight<T, MT>(inv_h,
inv_w, inv_w,
height, height,
width, width,
data_im_ptr + cnt * height * width, data_im_ptr + cnt * height * width,
width, width,
bp_dir); bp_dir);
if (data_mask_ptr) { if (data_mask_ptr) {
const int data_mask_hw_ptr = const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out); (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const T mask = data_mask_ptr[data_mask_hw_ptr]; const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
val += weight * data_col_ptr[col_pos] * mask; val += weight * static_cast<MT>(data_col_ptr[col_pos]) * mask;
} else { } else {
val += weight * data_col_ptr[col_pos]; val += weight * static_cast<MT>(data_col_ptr[col_pos]);
} }
cnt += 1; cnt += 1;
} }
grad_offset[i] = val; grad_offset[i] = static_cast<T>(val);
if (grad_mask && offset_c % 2 == 0) if (grad_mask && offset_c % 2 == 0)
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w + kernel_w +
...@@ -359,4 +362,5 @@ PD_REGISTER_KERNEL(deformable_conv_grad, ...@@ -359,4 +362,5 @@ PD_REGISTER_KERNEL(deformable_conv_grad,
ALL_LAYOUT, ALL_LAYOUT,
phi::DeformableConvGradKernel, phi::DeformableConvGradKernel,
float, float,
double) {} double,
paddle::platform::float16) {}
...@@ -23,4 +23,5 @@ PD_REGISTER_KERNEL(deformable_conv, ...@@ -23,4 +23,5 @@ PD_REGISTER_KERNEL(deformable_conv,
ALL_LAYOUT, ALL_LAYOUT,
phi::DeformableConvKernel, phi::DeformableConvKernel,
float, float,
double) {} double,
phi::dtype::float16) {}
...@@ -14,8 +14,10 @@ ...@@ -14,8 +14,10 @@
#pragma once #pragma once
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
...@@ -58,14 +60,14 @@ HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, ...@@ -58,14 +60,14 @@ HOSTDEVICE T DmcnGetGradientWeight(T argmax_h,
return weight; return weight;
} }
template <typename T> template <typename T, typename MT>
HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, HOSTDEVICE MT DmcnGetCoordinateWeight(MT argmax_h,
T argmax_w, MT argmax_w,
const int height, const int height,
const int width, const int width,
const T* im_data, const T* im_data,
const int data_width, const int data_width,
const int bp_dir) { const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) { argmax_w >= width) {
return 0; return 0;
...@@ -76,43 +78,51 @@ HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, ...@@ -76,43 +78,51 @@ HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h,
int argmax_h_high = argmax_h_low + 1; int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1; int argmax_w_high = argmax_w_low + 1;
T weight = 0; MT weight = 0;
if (bp_dir == 0) { if (bp_dir == 0) {
weight += (argmax_h_low >= 0 && argmax_w_low >= 0) weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
? -1 * (argmax_w_low + 1 - argmax_w) * ? -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low] static_cast<MT>(
im_data[argmax_h_low * data_width + argmax_w_low])
: 0; : 0;
weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
? -1 * (argmax_w - argmax_w_low) * ? -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high] static_cast<MT>(
im_data[argmax_h_low * data_width + argmax_w_high])
: 0; : 0;
weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
? (argmax_w_low + 1 - argmax_w) * ? (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low] static_cast<MT>(
im_data[argmax_h_high * data_width + argmax_w_low])
: 0; : 0;
weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
? (argmax_w - argmax_w_low) * ? (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high] static_cast<MT>(
im_data[argmax_h_high * data_width + argmax_w_high])
: 0; : 0;
} else if (bp_dir == 1) { } else if (bp_dir == 1) {
weight += (argmax_h_low >= 0 && argmax_w_low >= 0) weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
? -1 * (argmax_h_low + 1 - argmax_h) * ? -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low] static_cast<MT>(
im_data[argmax_h_low * data_width + argmax_w_low])
: 0; : 0;
weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
? (argmax_h_low + 1 - argmax_h) * ? (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high] static_cast<MT>(
im_data[argmax_h_low * data_width + argmax_w_high])
: 0; : 0;
weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
? -1 * (argmax_h - argmax_h_low) * ? -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low] static_cast<MT>(
im_data[argmax_h_high * data_width + argmax_w_low])
: 0; : 0;
weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
? (argmax_h - argmax_h_low) * ? (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high] static_cast<MT>(
im_data[argmax_h_high * data_width + argmax_w_high])
: 0; : 0;
} }
...@@ -135,7 +145,7 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx, ...@@ -135,7 +145,7 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
T* grad_offset, T* grad_offset,
T* grad_mask); T* grad_mask);
template <typename T, typename Context> template <typename T, typename MT, typename Context>
void ModulatedDeformableCol2im(const Context& dev_ctx, void ModulatedDeformableCol2im(const Context& dev_ctx,
const T* data_col, const T* data_col,
const T* data_offset, const T* data_offset,
...@@ -147,7 +157,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, ...@@ -147,7 +157,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
const std::vector<int>& stride, const std::vector<int>& stride,
const std::vector<int>& dilation, const std::vector<int>& dilation,
const int deformable_group, const int deformable_group,
T* grad_im); MT* grad_im);
template <typename T, typename Context> template <typename T, typename Context>
void FilterGradAddup(const Context& dev_ctx, void FilterGradAddup(const Context& dev_ctx,
...@@ -176,7 +186,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, ...@@ -176,7 +186,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
DenseTensor* filter_grad, DenseTensor* filter_grad,
DenseTensor* mask_grad) { DenseTensor* mask_grad) {
const int batch_size = static_cast<int>(x.dims()[0]); const int batch_size = static_cast<int>(x.dims()[0]);
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size()); DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size());
std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape); std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims())); std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
...@@ -292,8 +302,8 @@ void DeformableConvGradKernel(const Context& dev_ctx, ...@@ -292,8 +302,8 @@ void DeformableConvGradKernel(const Context& dev_ctx,
mask_grad_data_ptr); mask_grad_data_ptr);
} }
if (dx) { if (dx) {
T* dx_ptr = dx->data<T>(); MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(dx);
// get grad of input
ModulatedDeformableCol2im(dev_ctx, ModulatedDeformableCol2im(dev_ctx,
col_buffer_ptr, col_buffer_ptr,
offset_ptr + i * im2col_step * input_offset_dim, offset_ptr + i * im2col_step * input_offset_dim,
...@@ -305,7 +315,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, ...@@ -305,7 +315,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
strides, strides,
dilations, dilations,
deformable_groups, deformable_groups,
dx_ptr + i * im2col_step * input_dim); mt_dx_ptr + i * im2col_step * input_dim);
dx->Resize(x.dims()); dx->Resize(x.dims());
} }
......
...@@ -14,11 +14,13 @@ ...@@ -14,11 +14,13 @@
#pragma once #pragma once
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/deformable_conv_functor.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
#include "paddle/phi/kernels/transpose_kernel.h"
#include "paddle/utils/optional.h" #include "paddle/utils/optional.h"
namespace phi { namespace phi {
...@@ -38,6 +40,12 @@ void DeformableConvKernel(const Context& dev_ctx, ...@@ -38,6 +40,12 @@ void DeformableConvKernel(const Context& dev_ctx,
DenseTensor* out) { DenseTensor* out) {
const int batch_size = static_cast<int>(x.dims()[0]); const int batch_size = static_cast<int>(x.dims()[0]);
int temp_step = std::min(64, batch_size);
if (batch_size % temp_step == 0) {
im2col_step = temp_step;
}
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims())); std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(phi::vectorize(out->dims())); std::vector<int64_t> output_shape_vec(phi::vectorize(out->dims()));
...@@ -101,8 +109,11 @@ void DeformableConvKernel(const Context& dev_ctx, ...@@ -101,8 +109,11 @@ void DeformableConvKernel(const Context& dev_ctx,
dilations, dilations,
deformable_groups, deformable_groups,
col_buffer_ptr); col_buffer_ptr);
DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize( DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(phi::slice_ddim(
phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); output_4d.dims(),
1,
output_4d.dims().size())); // group * C/group * (im2step * H * W)
// get the product of pixel and weight // get the product of pixel and weight
for (int g = 0; g < groups; ++g) { for (int g = 0; g < groups; ++g) {
DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
...@@ -110,8 +121,11 @@ void DeformableConvKernel(const Context& dev_ctx, ...@@ -110,8 +121,11 @@ void DeformableConvKernel(const Context& dev_ctx,
DenseTensor col_buffer_3d_slice = DenseTensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
DenseTensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( DenseTensor output_3d_slice =
phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); output_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
output_3d.dims(),
1,
output_3d.dims().size())); // C * ((im2col_step)*H*W))
blas.MatMul(weight_3d_slice, blas.MatMul(weight_3d_slice,
false, false,
col_buffer_3d_slice, col_buffer_3d_slice,
...@@ -121,7 +135,29 @@ void DeformableConvKernel(const Context& dev_ctx, ...@@ -121,7 +135,29 @@ void DeformableConvKernel(const Context& dev_ctx,
T(0.0)); T(0.0));
} }
} }
out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec));
// swap axis to get the right result when im2col_step is greater than 1
if (im2col_step > 1) {
std::vector<int> axis(4);
axis[0] = 0;
axis[1] = 2;
axis[2] = 1;
axis[3] = 3;
DenseTensor real_output_buffer = phi::Transpose<T, Context>(
dev_ctx,
output_4d.Resize(
phi::make_ddim({batch_size / im2col_step,
output_shape_vec[1],
im2col_step,
output_shape_vec[2] * output_shape_vec[3]})),
axis);
out->ShareDataWith(real_output_buffer)
.Resize(phi::make_ddim(output_shape_vec));
} else {
out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec));
}
} }
} // namespace phi } // namespace phi
...@@ -17,9 +17,11 @@ import unittest ...@@ -17,9 +17,11 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest, skip_check_grad_ci
from paddle.fluid.framework import _test_eager_guard from paddle.fluid.framework import _test_eager_guard
paddle.enable_static()
def dmc_bilinear(data_im, height, width, h, w): def dmc_bilinear(data_im, height, width, h, w):
h_low = int(np.floor(h)) h_low = int(np.floor(h))
...@@ -59,8 +61,8 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param): ...@@ -59,8 +61,8 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param):
assert f_c * group == in_c assert f_c * group == in_c
assert np.mod(out_c, group) == 0 assert np.mod(out_c, group) == 0
stride, pad, dilation = conv_param['stride'], conv_param['pad'],\ stride, pad, dilation = conv_param['stride'], conv_param['pad'], \
conv_param['dilation'] conv_param['dilation']
out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0] out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0]
out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1] out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1]
assert out_h == in_h assert out_h == in_h
...@@ -74,18 +76,18 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param): ...@@ -74,18 +76,18 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param):
for kh in range(f_h): for kh in range(f_h):
for kw in range(f_w): for kw in range(f_w):
offset_h_table = \ offset_h_table = \
offset[n, ::2, h, w].reshape(f_h, f_w) offset[n, ::2, h, w].reshape(f_h, f_w)
offset_w_table = \ offset_w_table = \
offset[n, 1::2, h, w].reshape(f_h, f_w) offset[n, 1::2, h, w].reshape(f_h, f_w)
offset_h = offset_h_table[kh, kw] offset_h = offset_h_table[kh, kw]
offset_w = offset_w_table[kh, kw] offset_w = offset_w_table[kh, kw]
val = 0 val = 0
im_h = h * stride[0] + kh * dilation[0] \ im_h = h * stride[0] + kh * dilation[0] \
+ offset_h - pad[0] + offset_h - pad[0]
im_w = w * stride[0] + kw * dilation[0] \ im_w = w * stride[0] + kw * dilation[0] \
+ offset_w - pad[1] + offset_w - pad[1]
if im_h > -1 and im_w > -1 and \ if im_h > -1 and im_w > -1 and \
im_h < in_h and im_w < in_h: im_h < in_h and im_w < in_h:
val = dmc_bilinear(input[n, c], in_h, in_w, val = dmc_bilinear(input[n, c], in_h, in_w,
im_h, im_w) im_h, im_w)
val_out = val val_out = val
...@@ -284,6 +286,69 @@ class TestWithDouble(TestModulatedDeformableConvOp): ...@@ -284,6 +286,69 @@ class TestWithDouble(TestModulatedDeformableConvOp):
self.dtype = np.float64 self.dtype = np.float64
class TestFP16(unittest.TestCase):
def check_main(self, input_np, offset_np, filter_np, dtype):
paddle.disable_static()
input_np = input_np.astype(dtype)
offset_np = offset_np.astype(dtype)
filter_np = filter_np.astype(dtype)
input = paddle.to_tensor(input_np)
offset = paddle.to_tensor(offset_np)
filter = paddle.to_tensor(filter_np)
input.stop_gradient = False
offset.stop_gradient = False
filter.stop_gradient = False
y = paddle.vision.ops.deform_conv2d(input, offset, filter)
input_grad, offset_grad, filter_grad = paddle.grad(
y, [input, offset, filter])
y_np = y.numpy().astype('float32')
input_grad_np = input_grad.numpy().astype('float32')
offset_grad_np = offset_grad.numpy().astype('float32')
filter_grad_np = filter_grad.numpy().astype('float32')
paddle.enable_static()
return y_np, input_grad_np, offset_grad_np, filter_grad_np
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
self.pad = [1, 1]
self.stride = [1, 1]
self.dilations = [1, 1]
self.groups = 1
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [40, f_c, 1, 1]
self.im2col_step = 1
self.deformable_groups = 1
offset_c = 2 * self.deformable_groups * self.filter_size[
2] * self.filter_size[3]
self.offset_size = [
self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
]
input = np.random.random(self.input_size)
offset = 10 * np.random.random(self.offset_size)
filter = np.random.random(self.filter_size)
y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main(
input, offset, filter, 'float16')
y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main(
input, offset, filter, 'float32')
def assert_equal(x, y):
np.testing.assert_allclose(x, y, atol=3e-2)
assert_equal(y_np_1, y_np_2)
assert_equal(input_g_np_1, input_g_np_2)
assert_equal(offset_g_np_1, offset_g_np_2)
assert_equal(filter_g_np_1, filter_g_np_2)
class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
def test_error(self): def test_error(self):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册