From 663ebd5f39c17fcfbe4dd250c90e1b873d83db0c Mon Sep 17 00:00:00 2001 From: duanyanhui <45005871+YanhuiDua@users.noreply.github.com> Date: Wed, 31 Aug 2022 20:07:47 +0800 Subject: [PATCH] enhance grid_sampler cpu kernel to 5D input (#45578) * enhance grid_sampler cpu kernel to 5D input * fix bug when 5D input tensor running on the cudnn kernel --- .../kernels/cpu/grid_sample_grad_kernel.cc | 502 ++++++++++++++++-- paddle/phi/kernels/cpu/grid_sample_kernel.cc | 244 ++++++++- paddle/phi/kernels/cpu/grid_sample_utils.h | 190 ++++++- .../tests/unittests/test_grid_sampler_op.py | 97 +--- python/paddle/nn/functional/vision.py | 3 + 5 files changed, 898 insertions(+), 138 deletions(-) diff --git a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc index 32fa0d5aaf..40d42b1346 100644 --- a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc @@ -82,6 +82,67 @@ static inline void ClipWithMask(const CPUContext& ctx, } } +template +static inline void ClipWithMask3D(const CPUContext& ctx, + const int max_val, // height-1 or width-1 + bool align_corners, + std::string padding_mode, + DenseTensor* grid_slice, + DenseTensor* grid_scale) { + auto& place = *ctx.eigen_device(); + grid_scale->Resize(grid_slice->dims()); + ctx.Alloc(grid_scale); + + auto grid_slice_t = EigenTensor::From(*grid_slice); + auto factor = static_cast(max_val * 0.5); + if (!align_corners) { + factor = static_cast((max_val + 1) * 0.5); + } + auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); + + if (padding_mode == "border") { + // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); + auto res = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + + auto in_bound = (res == grid_slice_t); + grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); + grid_slice_t.device(place) = res; + } else if (padding_mode == "reflection") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto is_neg = (grid_slice_t < static_cast(0)); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + grid_scale_t.device(place) = + grid_scale_t * ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()); + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + auto reflected = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + auto clipped = reflected.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + auto in_bound = (clipped == reflected).template cast(); + grid_scale_t.device(place) = + grid_scale_t * + ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()) * + in_bound; + grid_slice_t.device(place) = clipped; + } + } +} + template static void CalcGridLocationsWithGrad(const CPUContext& ctx, const DenseTensor& grid, @@ -118,6 +179,52 @@ static void CalcGridLocationsWithGrad(const CPUContext& ctx, ctx, in_h - 1, align_corners, padding_mode, 
grid_y, grid_y_scale); } +template +static void Calc3DGridLocationsWithGrad(const CPUContext& ctx, + const DenseTensor& grid, + const int in_d, + const int in_h, + const int in_w, + bool align_corners, + std::string padding_mode, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* grid_z, + DenseTensor* grid_x_scale, + DenseTensor* grid_y_scale, + DenseTensor* grid_z_scale) { + const int n = grid.dims()[0]; + const int out_d = grid.dims()[1]; + const int out_h = grid.dims()[2]; + const int out_w = grid.dims()[3]; + + // split grid with shape (n, d, h, w, 3) into (x, y, z) by the 3rd Dim + grid_x->Resize({n, out_d, out_h, out_w}); + grid_y->Resize({n, out_d, out_h, out_w}); + grid_z->Resize({n, out_d, out_h, out_w}); + T* grid_x_data = ctx.Alloc(grid_x); + T* grid_y_data = ctx.Alloc(grid_y); + T* grid_z_data = ctx.Alloc(grid_z); + + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_d * out_h * out_w; i++) { + grid_x_data[i] = grid_data[3 * i]; + grid_y_data[i] = grid_data[(3 * i) + 1]; + grid_z_data[i] = grid_data[(3 * i) + 2]; + } + + Unnormalize3D(ctx, grid_x, in_w - 1, align_corners); + Unnormalize3D(ctx, grid_y, in_h - 1, align_corners); + Unnormalize3D(ctx, grid_z, in_d - 1, align_corners); + + ClipWithMask3D( + ctx, in_w - 1, align_corners, padding_mode, grid_x, grid_x_scale); + ClipWithMask3D( + ctx, in_h - 1, align_corners, padding_mode, grid_y, grid_y_scale); + ClipWithMask3D( + ctx, in_d - 1, align_corners, padding_mode, grid_z, grid_z_scale); +} + template static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, DenseTensor* input_grad, @@ -156,6 +263,58 @@ static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, } } +template +static void Gather3DOutputGradToInputGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& z, + const DenseTensor& d1, + const DenseTensor& d2, + const DenseTensor& d3) { + const int n = output_grad.dims()[0]; + const int c = output_grad.dims()[1]; + const int out_d = output_grad.dims()[2]; + const int out_h = output_grad.dims()[3]; + const int out_w = output_grad.dims()[4]; + const int in_d = input_grad->dims()[2]; + const int in_h = input_grad->dims()[3]; + const int in_w = input_grad->dims()[4]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto z_t = EigenTensor::From(z); + auto d1_t = EigenTensor::From(d1); + auto d2_t = EigenTensor::From(d2); + auto d3_t = EigenTensor::From(d3); + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int i = 0; i < n; i++) { + for (int m = 0; m < out_d; m++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound3D(x_t(i, m, k, l), + y_t(i, m, k, l), + z_t(i, m, k, l), + (T)(in_w - 1), + (T)(in_h - 1), + (T)(in_d - 1))) { + for (int j = 0; j < c; j++) { + input_grad_t(i, + j, + static_cast(round(z_t(i, m, k, l))), + static_cast(round(y_t(i, m, k, l))), + static_cast(round(x_t(i, m, k, l)))) += + output_grad_t(i, j, m, k, l) * d1_t(i, m, k, l) * + d2_t(i, m, k, l) * d3_t(i, m, k, l); + } + } + } + } + } + } +} + template static void GatherBilinearGrad(const CPUContext& ctx, const DenseTensor& input, @@ -256,6 +415,163 @@ static void GatherBilinearGrad(const CPUContext& ctx, } } +template +static void Gather3DBilinearGrad(const CPUContext& ctx, + const DenseTensor& input, + const DenseTensor& output_grad, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* 
grid_z, + DenseTensor* grid_x_scale, + DenseTensor* grid_y_scale, + DenseTensor* grid_z_scale, + DenseTensor* input_grad, + DenseTensor* grid_grad) { + const int n = grid_x->dims()[0]; + const int out_d = grid_x->dims()[1]; + const int out_h = grid_x->dims()[2]; + const int out_w = grid_x->dims()[3]; + const int c = input.dims()[1]; + + DenseTensor x_w, x_e, y_n, y_s, z_t, z_b; + DenseTensor d_w, d_e, d_n, d_s, d_t, d_b; + DenseTensor v_twn, v_ten, v_tws, v_tes, v_bwn, v_ben, v_bws, v_bes; + + All3DNeigbors(ctx, + input, + grid_x, + grid_y, + grid_z, + &x_w, + &x_e, + &y_n, + &y_s, + &z_t, + &z_b, + &d_w, + &d_e, + &d_n, + &d_s, + &d_t, + &d_b, + &v_twn, + &v_ten, + &v_tws, + &v_tes, + &v_bwn, + &v_ben, + &v_bws, + &v_bes); + // gather output grad value to input grad by corner point coords and weight + Gather3DOutputGradToInputGrad( + output_grad, input_grad, x_w, y_n, z_t, d_e, d_s, d_b); + Gather3DOutputGradToInputGrad( + output_grad, input_grad, x_w, y_s, z_t, d_e, d_n, d_b); + Gather3DOutputGradToInputGrad( + output_grad, input_grad, x_e, y_n, z_t, d_w, d_s, d_b); + Gather3DOutputGradToInputGrad( + output_grad, input_grad, x_e, y_s, z_t, d_w, d_n, d_b); + Gather3DOutputGradToInputGrad( + output_grad, input_grad, x_w, y_n, z_b, d_e, d_s, d_t); + Gather3DOutputGradToInputGrad( + output_grad, input_grad, x_w, y_s, z_b, d_e, d_n, d_t); + Gather3DOutputGradToInputGrad( + output_grad, input_grad, x_e, y_n, z_b, d_w, d_s, d_t); + Gather3DOutputGradToInputGrad( + output_grad, input_grad, x_e, y_s, z_b, d_w, d_n, d_t); + + auto v_twn_t = EigenTensor::From(v_twn); + auto v_ten_t = EigenTensor::From(v_ten); + auto v_tws_t = EigenTensor::From(v_tws); + auto v_tes_t = EigenTensor::From(v_tes); + auto v_bwn_t = EigenTensor::From(v_bwn); + auto v_ben_t = EigenTensor::From(v_ben); + auto v_bws_t = EigenTensor::From(v_bws); + auto v_bes_t = EigenTensor::From(v_bes); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + auto d_t_t = EigenTensor::From(d_t); + auto d_b_t = EigenTensor::From(d_b); + + auto output_grad_t = EigenTensor::From(output_grad); + if (grid_grad != nullptr) { + DenseTensor grid_grad_x, grid_grad_y, grid_grad_z; + grid_grad_x.Resize({n, out_d, out_h, out_w}); + grid_grad_y.Resize({n, out_d, out_h, out_w}); + grid_grad_z.Resize({n, out_d, out_h, out_w}); + ctx.Alloc(&grid_grad_x); + ctx.Alloc(&grid_grad_y); + ctx.Alloc(&grid_grad_z); + auto grid_grad_x_t = + EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); + auto grid_grad_y_t = + EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); + auto grid_grad_z_t = + EigenTensor::From(grid_grad_z).setConstant(static_cast(0.0)); + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + for (int m = 0; m < out_d; m++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + grid_grad_x_t(i, m, k, l) += + ((v_ten_t(i, j, m, k, l) - v_twn_t(i, j, m, k, l)) * + d_s_t(i, m, k, l) * d_b_t(i, m, k, l) + + (v_tes_t(i, j, m, k, l) - v_tws_t(i, j, m, k, l)) * + d_n_t(i, m, k, l) * d_b_t(i, m, k, l) + + (v_ben_t(i, j, m, k, l) - v_bwn_t(i, j, m, k, l)) * + d_s_t(i, m, k, l) * d_t_t(i, m, k, l) + + (v_bes_t(i, j, m, k, l) - v_bws_t(i, j, m, k, l)) * + d_n_t(i, m, k, l) * d_t_t(i, m, k, l)) * + output_grad_t(i, j, m, k, l); + grid_grad_y_t(i, m, k, l) += + ((v_tws_t(i, j, m, k, l) - v_twn_t(i, j, m, k, l)) * + d_e_t(i, m, k, l) * d_b_t(i, m, k, l) + + (v_tes_t(i, j, m, k, l) - v_ten_t(i, j, m, k, l)) * + 
d_w_t(i, m, k, l) * d_b_t(i, m, k, l) + + (v_bws_t(i, j, m, k, l) - v_bwn_t(i, j, m, k, l)) * + d_e_t(i, m, k, l) * d_t_t(i, m, k, l) + + (v_bes_t(i, j, m, k, l) - v_ben_t(i, j, m, k, l)) * + d_w_t(i, m, k, l) * d_t_t(i, m, k, l)) * + output_grad_t(i, j, m, k, l); + grid_grad_z_t(i, m, k, l) += + ((v_bws_t(i, j, m, k, l) - v_tws_t(i, j, m, k, l)) * + d_e_t(i, m, k, l) * d_n_t(i, m, k, l) + + (v_bes_t(i, j, m, k, l) - v_tes_t(i, j, m, k, l)) * + d_w_t(i, m, k, l) * d_n_t(i, m, k, l) + + (v_bwn_t(i, j, m, k, l) - v_twn_t(i, j, m, k, l)) * + d_e_t(i, m, k, l) * d_s_t(i, m, k, l) + + (v_ben_t(i, j, m, k, l) - v_ten_t(i, j, m, k, l)) * + d_w_t(i, m, k, l) * d_s_t(i, m, k, l)) * + output_grad_t(i, j, m, k, l); + } + } + } + } + } + + auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); + auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); + auto grid_z_scale_t = EigenTensor::From(*grid_z_scale); + + grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; + grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; + grid_grad_z_t = grid_grad_z_t * grid_z_scale_t; + // gather grid_grad [x, y, z] in 4rd Dim + T* grid_grad_data = grid_grad->data(); + T* grid_grad_x_data = grid_grad_x.data(); + T* grid_grad_y_data = grid_grad_y.data(); + T* grid_grad_z_data = grid_grad_z.data(); + for (int i = 0; i < n * out_d * out_h * out_w; i++) { + grid_grad_data[3 * i] = grid_grad_x_data[i]; + grid_grad_data[3 * i + 1] = grid_grad_y_data[i]; + grid_grad_data[3 * i + 2] = grid_grad_z_data[i]; + } + } +} + template static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, DenseTensor* input_grad, @@ -289,6 +605,50 @@ static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, } } +template +static void Gather3DOutputGradToInputGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& z) { + const int n = output_grad.dims()[0]; + const int c = output_grad.dims()[1]; + const int out_d = output_grad.dims()[2]; + const int out_h = output_grad.dims()[3]; + const int out_w = output_grad.dims()[4]; + const int in_d = input_grad->dims()[2]; + const int in_h = input_grad->dims()[3]; + const int in_w = input_grad->dims()[4]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto z_t = EigenTensor::From(z); + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + for (int i = 0; i < n; i++) { + for (int m = 0; m < out_d; m++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound3D(x_t(i, m, k, l), + y_t(i, m, k, l), + z_t(i, m, k, l), + (T)(in_w - 1), + (T)(in_h - 1), + (T)(in_d - 1))) { + for (int j = 0; j < c; j++) { + input_grad_t(i, + j, + static_cast(round(z_t(i, m, k, l))), + static_cast(round(y_t(i, m, k, l))), + static_cast(round(x_t(i, m, k, l)))) += + output_grad_t(i, j, m, k, l); + } + } + } + } + } + } +} + template void GridSampleGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -299,52 +659,106 @@ void GridSampleGradKernel(const Context& dev_ctx, bool align_corners, DenseTensor* x_grad, DenseTensor* grid_grad) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - const int c = x.dims()[1]; - const int in_h = x.dims()[2]; - const int in_w = x.dims()[3]; - - x_grad->Resize({n, c, in_h, in_w}); - dev_ctx.template Alloc(x_grad); - phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); - - if (grid_grad != nullptr) { - grid_grad->Resize({n, out_h, 
out_w, 2}); - dev_ctx.template Alloc(grid_grad); - phi::funcs::SetConstant()( - dev_ctx, grid_grad, static_cast(0)); - } + if (x.dims().size() == 4) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + x_grad->Resize({n, c, in_h, in_w}); + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + if (grid_grad != nullptr) { + grid_grad->Resize({n, out_h, out_w, 2}); + dev_ctx.template Alloc(grid_grad); + phi::funcs::SetConstant()( + dev_ctx, grid_grad, static_cast(0)); + } - DenseTensor grid_x, grid_y; - DenseTensor grid_x_scale, grid_y_scale; - CalcGridLocationsWithGrad(dev_ctx, - grid, - in_h, - in_w, - align_corners, - padding_mode, - &grid_x, - &grid_y, - &grid_x_scale, - &grid_y_scale); - if (mode == "bilinear") { - GatherBilinearGrad(dev_ctx, - x, - out_grid, - &grid_x, - &grid_y, - &grid_x_scale, - &grid_y_scale, - x_grad, - grid_grad); + DenseTensor grid_x, grid_y; + DenseTensor grid_x_scale, grid_y_scale; + CalcGridLocationsWithGrad(dev_ctx, + grid, + in_h, + in_w, + align_corners, + padding_mode, + &grid_x, + &grid_y, + &grid_x_scale, + &grid_y_scale); + if (mode == "bilinear") { + GatherBilinearGrad(dev_ctx, + x, + out_grid, + &grid_x, + &grid_y, + &grid_x_scale, + &grid_y_scale, + x_grad, + grid_grad); + } else { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + GatherOutputGradToInputGrad(out_grid, x_grad, grid_x, grid_y); + } } else { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - GatherOutputGradToInputGrad(out_grid, x_grad, grid_x, grid_y); + const int n = grid.dims()[0]; + const int out_d = grid.dims()[1]; + const int out_h = grid.dims()[2]; + const int out_w = grid.dims()[3]; + const int c = x.dims()[1]; + const int in_d = x.dims()[2]; + const int in_h = x.dims()[3]; + const int in_w = x.dims()[4]; + + x_grad->Resize({n, c, in_d, in_h, in_w}); + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + if (grid_grad != nullptr) { + grid_grad->Resize({n, out_d, out_h, out_w, 3}); + dev_ctx.template Alloc(grid_grad); + phi::funcs::SetConstant()( + dev_ctx, grid_grad, static_cast(0)); + } + DenseTensor grid_x, grid_y, grid_z; + DenseTensor grid_x_scale, grid_y_scale, grid_z_scale; + + Calc3DGridLocationsWithGrad(dev_ctx, + grid, + in_d, + in_h, + in_w, + align_corners, + padding_mode, + &grid_x, + &grid_y, + &grid_z, + &grid_x_scale, + &grid_y_scale, + &grid_z_scale); + if (mode == "bilinear") { + Gather3DBilinearGrad(dev_ctx, + x, + out_grid, + &grid_x, + &grid_y, + &grid_z, + &grid_x_scale, + &grid_y_scale, + &grid_z_scale, + x_grad, + grid_grad); + } else { + Gather3DOutputGradToInputGrad( + out_grid, x_grad, grid_x, grid_y, grid_z); + } } } diff --git a/paddle/phi/kernels/cpu/grid_sample_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_kernel.cc index 92a528cdda..14a199e6d2 100644 --- a/paddle/phi/kernels/cpu/grid_sample_kernel.cc +++ b/paddle/phi/kernels/cpu/grid_sample_kernel.cc @@ -22,6 +22,7 @@ namespace phi { using Array4 = Eigen::DSizes; +using Array5 = Eigen::DSizes; template static inline void Clip(const CPUContext& ctx, @@ -55,6 +56,38 @@ static inline void Clip(const CPUContext& ctx, } } +template +static inline void Clip3D(const 
CPUContext& ctx, + DenseTensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners, + std::string padding_mode) { + auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + if (padding_mode == "border") { + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } else if (padding_mode == "reflection") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } + } +} + template static void CalcGridLocations(const CPUContext& ctx, const DenseTensor& grid, @@ -86,6 +119,45 @@ static void CalcGridLocations(const CPUContext& ctx, Clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); } +template +static void Calc3DGridLocations(const CPUContext& ctx, + const DenseTensor& grid, + const int in_d, + const int in_h, + const int in_w, + bool align_corners, + std::string padding_mode, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* grid_z) { + const int n = grid.dims()[0]; + const int out_d = grid.dims()[1]; + const int out_h = grid.dims()[2]; + const int out_w = grid.dims()[3]; + + // split grid with shape (n, d, h, w, 3) into (x, y, z) by the 3rd Dim + grid_x->Resize({n, out_d, out_h, out_w}); + grid_y->Resize({n, out_d, out_h, out_w}); + grid_z->Resize({n, out_d, out_h, out_w}); + T* grid_x_data = ctx.Alloc(grid_x); + T* grid_y_data = ctx.Alloc(grid_y); + T* grid_z_data = ctx.Alloc(grid_z); + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_d * out_h * out_w; i++) { + grid_x_data[i] = grid_data[3 * i]; + grid_y_data[i] = grid_data[(3 * i) + 1]; + grid_z_data[i] = grid_data[(3 * i) + 2]; + } + + Unnormalize3D(ctx, grid_x, in_w - 1, align_corners); + Unnormalize3D(ctx, grid_y, in_h - 1, align_corners); + Unnormalize3D(ctx, grid_z, in_d - 1, align_corners); + + Clip3D(ctx, grid_x, in_w - 1, align_corners, padding_mode); + Clip3D(ctx, grid_y, in_h - 1, align_corners, padding_mode); + Clip3D(ctx, grid_z, in_d - 1, align_corners, padding_mode); +} + template static void BilinearInter(const CPUContext& ctx, const DenseTensor& input, @@ -144,6 +216,94 @@ static void BilinearInter(const CPUContext& ctx, v_es_t * d_w_scaled_t * d_n_scaled_t; } +template +static void Bilinear3DInter(const CPUContext& ctx, + const DenseTensor& input, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* grid_z, + DenseTensor* out) { + auto& place = *ctx.eigen_device(); + const int n = grid_x->dims()[0]; + const int out_d = grid_x->dims()[1]; + const int out_h = grid_x->dims()[2]; + const int out_w = grid_x->dims()[3]; + const int c = input.dims()[1]; + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + DenseTensor x_w, x_e, y_n, y_s, z_t, z_b; + DenseTensor d_w, d_e, d_n, d_s, d_t, d_b; + DenseTensor v_twn, v_ten, v_tws, v_tes, v_bwn, 
v_ben, v_bws, v_bes; + + All3DNeigbors(ctx, + input, + grid_x, + grid_y, + grid_z, + &x_w, + &x_e, + &y_n, + &y_s, + &z_t, + &z_b, + &d_w, + &d_e, + &d_n, + &d_s, + &d_t, + &d_b, + &v_twn, + &v_ten, + &v_tws, + &v_tes, + &v_bwn, + &v_ben, + &v_bws, + &v_bes); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + auto d_t_t = EigenTensor::From(d_t); + auto d_b_t = EigenTensor::From(d_b); + + auto d_w_scaled_t = d_w_t.reshape(Array5(n, 1, out_d, out_h, out_w)) + .broadcast(Array5(1, c, 1, 1, 1)); + auto d_e_scaled_t = d_e_t.reshape(Array5(n, 1, out_d, out_h, out_w)) + .broadcast(Array5(1, c, 1, 1, 1)); + auto d_n_scaled_t = d_n_t.reshape(Array5(n, 1, out_d, out_h, out_w)) + .broadcast(Array5(1, c, 1, 1, 1)); + auto d_s_scaled_t = d_s_t.reshape(Array5(n, 1, out_d, out_h, out_w)) + .broadcast(Array5(1, c, 1, 1, 1)); + auto d_t_scaled_t = d_t_t.reshape(Array5(n, 1, out_d, out_h, out_w)) + .broadcast(Array5(1, c, 1, 1, 1)); + auto d_b_scaled_t = d_b_t.reshape(Array5(n, 1, out_d, out_h, out_w)) + .broadcast(Array5(1, c, 1, 1, 1)); + + auto v_twn_t = EigenTensor::From(v_twn); + auto v_ten_t = EigenTensor::From(v_ten); + auto v_tws_t = EigenTensor::From(v_tws); + auto v_tes_t = EigenTensor::From(v_tes); + auto v_bwn_t = EigenTensor::From(v_bwn); + auto v_ben_t = EigenTensor::From(v_ben); + auto v_bws_t = EigenTensor::From(v_bws); + auto v_bes_t = EigenTensor::From(v_bes); + auto output_t = EigenTensor::From(*out); + // bilinear interpolaetion by 4 corner points + output_t.device(place) = + v_twn_t * d_e_scaled_t * d_s_scaled_t * d_b_scaled_t + + v_ten_t * d_w_scaled_t * d_s_scaled_t * d_b_scaled_t + + v_tws_t * d_e_scaled_t * d_n_scaled_t * d_b_scaled_t + + v_tes_t * d_w_scaled_t * d_n_scaled_t * d_b_scaled_t + + v_bwn_t * d_e_scaled_t * d_s_scaled_t * d_t_scaled_t + + v_ben_t * d_w_scaled_t * d_s_scaled_t * d_t_scaled_t + + v_bws_t * d_e_scaled_t * d_n_scaled_t * d_t_scaled_t + + v_bes_t * d_w_scaled_t * d_n_scaled_t * d_t_scaled_t; +} + template void GridSampleKernel(const Context& dev_ctx, const DenseTensor& x, @@ -152,29 +312,67 @@ void GridSampleKernel(const Context& dev_ctx, const std::string& padding_mode, bool align_corners, DenseTensor* out) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - const int c = x.dims()[1]; - const int in_h = x.dims()[2]; - const int in_w = x.dims()[3]; - - out->Resize(phi::make_ddim({n, c, out_h, out_w})); - dev_ctx.template Alloc(out); - phi::funcs::SetConstant()(dev_ctx, out, static_cast(0)); - - DenseTensor grid_x, grid_y; - CalcGridLocations( - dev_ctx, grid, in_h, in_w, align_corners, padding_mode, &grid_x, &grid_y); - - if (mode == "bilinear") { - BilinearInter(dev_ctx, x, &grid_x, &grid_y, out); - } else if (mode == "nearest") { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - GetGridPointValue(x, out, grid_x, grid_y); + if (x.dims().size() == 4) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + out->Resize(phi::make_ddim({n, c, out_h, out_w})); + dev_ctx.template Alloc(out); + phi::funcs::SetConstant()(dev_ctx, out, static_cast(0)); + + DenseTensor grid_x, grid_y; + CalcGridLocations(dev_ctx, + grid, + in_h, + in_w, + align_corners, + 
padding_mode, + &grid_x, + &grid_y); + + if (mode == "bilinear") { + BilinearInter(dev_ctx, x, &grid_x, &grid_y, out); + } else if (mode == "nearest") { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + GetGridPointValue(x, out, grid_x, grid_y); + } + } else { + const int n = grid.dims()[0]; + const int out_d = grid.dims()[1]; + const int out_h = grid.dims()[2]; + const int out_w = grid.dims()[3]; + const int c = x.dims()[1]; + const int in_d = x.dims()[2]; + const int in_h = x.dims()[3]; + const int in_w = x.dims()[4]; + + out->Resize(phi::make_ddim({n, c, out_d, out_h, out_w})); + dev_ctx.template Alloc(out); + phi::funcs::SetConstant()(dev_ctx, out, static_cast(0)); + + DenseTensor grid_x, grid_y, grid_z; + Calc3DGridLocations(dev_ctx, + grid, + in_d, + in_h, + in_w, + align_corners, + padding_mode, + &grid_x, + &grid_y, + &grid_z); + if (mode == "bilinear") { + Bilinear3DInter(dev_ctx, x, &grid_x, &grid_y, &grid_z, out); + } else if (mode == "nearest") { + Get3DGridPointValue(x, out, grid_x, grid_y, grid_z); + } } } diff --git a/paddle/phi/kernels/cpu/grid_sample_utils.h b/paddle/phi/kernels/cpu/grid_sample_utils.h index 53a16446d7..b9fca30ce1 100644 --- a/paddle/phi/kernels/cpu/grid_sample_utils.h +++ b/paddle/phi/kernels/cpu/grid_sample_utils.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -37,6 +36,24 @@ void Unnormalize(const CPUContext& ctx, } } +template +void Unnormalize3D(const CPUContext& ctx, + DenseTensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners) { + auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + + if (!align_corners) { + auto factor = static_cast((max_val + 1) * 0.5); + grid_slice_t.device(place) = + (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); + } else { + auto factor = static_cast(max_val * 0.5); + grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; + } +} + template inline bool IsInBound(T x, T y, T x_max, T y_max) { if (x < 0 || x > x_max || y < 0 || y > y_max) { @@ -45,6 +62,14 @@ inline bool IsInBound(T x, T y, T x_max, T y_max) { return true; } +template +inline bool IsInBound3D(T x, T y, T z, T x_max, T y_max, T z_max) { + if (x < 0 || x > x_max || y < 0 || y > y_max || z < 0 || z > z_max) { + return false; + } + return true; +} + template void GetGridPointValue(const DenseTensor& input, DenseTensor* output, @@ -157,4 +182,167 @@ void AllNeigbors(const CPUContext& ctx, GetGridPointValue(input, v_es, *x_e, *y_s); } +template +void Get3DGridPointValue(const DenseTensor& input, + DenseTensor* output, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& z) { + const int n = input.dims()[0]; + const int c = input.dims()[1]; + const int in_d = input.dims()[2]; + const int in_h = input.dims()[3]; + const int in_w = input.dims()[4]; + const int out_d = x.dims()[1]; + const int out_h = x.dims()[2]; + const int out_w = x.dims()[3]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto z_t = EigenTensor::From(z); + auto output_t = + EigenTensor::From(*output).setConstant(static_cast(0.0)); + auto input_t = EigenTensor::From(input); + + for (int i = 0; i < n; i++) { + for (int m = 0; m < out_d; m++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound3D(x_t(i, m, k, 
l), + y_t(i, m, k, l), + z_t(i, m, k, l), + (T)(in_w - 1), + (T)(in_h - 1), + (T)(in_d - 1))) { + for (int j = 0; j < c; j++) { + output_t(i, j, m, k, l) = + input_t(i, + j, + static_cast(round(z_t(i, m, k, l))), + static_cast(round(y_t(i, m, k, l))), + static_cast(round(x_t(i, m, k, l)))); + } + } + } + } + } + } +} + +template +void All3DNeigbors(const CPUContext& ctx, + const DenseTensor& input, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* grid_z, + DenseTensor* x_w, + DenseTensor* x_e, + DenseTensor* y_n, + DenseTensor* y_s, + DenseTensor* z_t, + DenseTensor* z_b, // positions + DenseTensor* d_w, + DenseTensor* d_e, + DenseTensor* d_n, + DenseTensor* d_s, + DenseTensor* d_t, + DenseTensor* d_b, // distance + DenseTensor* v_twn, + DenseTensor* v_ten, + DenseTensor* v_tws, + DenseTensor* v_tes, + DenseTensor* v_bwn, + DenseTensor* v_ben, + DenseTensor* v_bws, + DenseTensor* v_bes) { // values + auto& place = *ctx.eigen_device(); + + const int c = input.dims()[1]; + const int n = grid_x->dims()[0]; + const int out_d = grid_x->dims()[1]; + const int out_h = grid_x->dims()[2]; + const int out_w = grid_x->dims()[3]; + // calculate coords of 6 corner points + x_w->Resize({n, out_d, out_h, out_w}); + x_e->Resize({n, out_d, out_h, out_w}); + y_n->Resize({n, out_d, out_h, out_w}); + y_s->Resize({n, out_d, out_h, out_w}); + z_t->Resize({n, out_d, out_h, out_w}); + z_b->Resize({n, out_d, out_h, out_w}); + ctx.Alloc(x_w); + ctx.Alloc(x_e); + ctx.Alloc(y_n); + ctx.Alloc(y_s); + ctx.Alloc(z_t); + ctx.Alloc(z_b); + auto x_w_t = EigenTensor::From(*x_w); + auto x_e_t = EigenTensor::From(*x_e); + auto y_n_t = EigenTensor::From(*y_n); + auto y_s_t = EigenTensor::From(*y_s); + auto z_t_t = EigenTensor::From(*z_t); + auto z_b_t = EigenTensor::From(*z_b); + + auto grid_x_t = EigenTensor::From(*grid_x); + auto grid_y_t = EigenTensor::From(*grid_y); + auto grid_z_t = EigenTensor::From(*grid_z); + + x_w_t.device(place) = grid_x_t.floor(); + x_e_t.device(place) = x_w_t + static_cast(1); + y_n_t.device(place) = grid_y_t.floor(); + y_s_t.device(place) = y_n_t + static_cast(1); + z_t_t.device(place) = grid_z_t.floor(); + z_b_t.device(place) = z_t_t + static_cast(1); + + // calculate distances to 6 sides + d_w->Resize({n, out_d, out_h, out_w}); + d_e->Resize({n, out_d, out_h, out_w}); + d_n->Resize({n, out_d, out_h, out_w}); + d_s->Resize({n, out_d, out_h, out_w}); + d_t->Resize({n, out_d, out_h, out_w}); + d_b->Resize({n, out_d, out_h, out_w}); + ctx.Alloc(d_w); + ctx.Alloc(d_e); + ctx.Alloc(d_n); + ctx.Alloc(d_s); + ctx.Alloc(d_t); + ctx.Alloc(d_b); + auto d_w_t = EigenTensor::From(*d_w); + auto d_e_t = EigenTensor::From(*d_e); + auto d_n_t = EigenTensor::From(*d_n); + auto d_s_t = EigenTensor::From(*d_s); + auto d_t_t = EigenTensor::From(*d_t); + auto d_b_t = EigenTensor::From(*d_b); + d_w_t.device(place) = grid_x_t - x_w_t; + d_e_t.device(place) = x_e_t - grid_x_t; + d_n_t.device(place) = grid_y_t - y_n_t; + d_s_t.device(place) = y_s_t - grid_y_t; + d_t_t.device(place) = grid_z_t - z_t_t; + d_b_t.device(place) = z_b_t - grid_z_t; + + // calc 8 corner points value + v_twn->Resize({n, c, out_d, out_h, out_w}); + v_ten->Resize({n, c, out_d, out_h, out_w}); + v_tws->Resize({n, c, out_d, out_h, out_w}); + v_tes->Resize({n, c, out_d, out_h, out_w}); + v_bwn->Resize({n, c, out_d, out_h, out_w}); + v_ben->Resize({n, c, out_d, out_h, out_w}); + v_bws->Resize({n, c, out_d, out_h, out_w}); + v_bes->Resize({n, c, out_d, out_h, out_w}); + ctx.Alloc(v_twn); + ctx.Alloc(v_ten); + ctx.Alloc(v_tws); + 
ctx.Alloc(v_tes); + ctx.Alloc(v_bwn); + ctx.Alloc(v_ben); + ctx.Alloc(v_bws); + ctx.Alloc(v_bes); + Get3DGridPointValue(input, v_twn, *x_w, *y_n, *z_t); + Get3DGridPointValue(input, v_ten, *x_e, *y_n, *z_t); + Get3DGridPointValue(input, v_tws, *x_w, *y_s, *z_t); + Get3DGridPointValue(input, v_tes, *x_e, *y_s, *z_t); + Get3DGridPointValue(input, v_bwn, *x_w, *y_n, *z_b); + Get3DGridPointValue(input, v_ben, *x_e, *y_n, *z_b); + Get3DGridPointValue(input, v_bws, *x_w, *y_s, *z_b); + Get3DGridPointValue(input, v_bes, *x_e, *y_s, *z_b); +} + } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py index 8f9be0ddca..c162da6152 100644 --- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py +++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py @@ -20,15 +20,6 @@ from op_test import OpTest, skip_check_grad_ci paddle.enable_static() -from white_list import ( - op_accuracy_white_list, - check_shape_white_list, - compile_vs_runtime_white_list, - no_check_set_white_list, - op_threshold_white_list, - no_grad_set_white_list, -) - def AffineGrid(theta, grid_shape): n = grid_shape[0] @@ -118,7 +109,6 @@ def getGridPointValue3D(data, x, y, z): out_H = x.shape[2] out_W = x.shape[3] - #out = np.zeros(data_shape, dtype='float64') out = np.zeros([N, C, out_D, out_H, out_W], dtype='float64') for i in range(N): for j in range(out_D): @@ -334,51 +324,15 @@ class TestGridSamplerOp(OpTest): self.padding_mode) } - def get_places(self): - places = [] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - return places - def test_check_output(self): - if len(self.grid_shape) == 4: - self.check_output(check_eager=True) - else: - check_eager_flag = True - check_dygraph_flag = False - for place in self.get_places(): - res = self.check_output_with_place( - place, - atol=1e-5, - check_dygraph=check_dygraph_flag, - check_eager=check_eager_flag) - if check_eager_flag: - assert check_dygraph_flag == False - outs, eager_dygraph_outs, fetch_list = res - elif check_dygraph_flag: - uts, dygraph_outs, fetch_list = res - else: - outs, fetch_list = res - if self.op_type not in compile_vs_runtime_white_list.COMPILE_RUN_OP_WHITE_LIST: - self.check_compile_vs_runtime(fetch_list, outs) + self.check_output(check_eager=True) def test_check_grad_normal(self): - if len(self.grid_shape) == 4: - self.check_grad(['X', 'Grid'], - 'Output', - max_relative_error=0.01, - numeric_grad_delta=self.numeric_grad_delta, - check_eager=True) - else: - self._check_grad_helper() - for place in self.get_places(): - self.check_grad_with_place( - place, ['X'], - 'Output', - numeric_grad_delta=self.numeric_grad_delta, - max_relative_error=0.01, - check_eager=True, - check_dygraph=False) + self.check_grad(['X', 'Grid'], + 'Output', + max_relative_error=0.01, + numeric_grad_delta=self.numeric_grad_delta, + check_eager=True) def initTestCase(self): self.x_shape = (2, 3, 8, 8) @@ -493,63 +447,67 @@ class Case6(TestGridSamplerOp): self.align_corners = False self.padding_mode = "zeros" self.mode = "bilinear" + self.numeric_grad_delta = 0.000001 class Case6_(TestGridSamplerOp): - def get_places(self): - places = [] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - return places - def initTestCase(self): - self.x_shape = (2, 3, 5, 6, 7) - self.grid_shape = (2, 8, 9, 10, 3) + self.x_shape = (2, 3, 4, 5, 6) + self.grid_shape = (2, 7, 8, 9, 3) self.theta_shape = (2, 3, 4) self.align_corners = False self.padding_mode = 
"border" self.mode = "bilinear" + self.numeric_grad_delta = 0.000001 class Case7(TestGridSamplerOp): def initTestCase(self): - self.x_shape = (2, 3, 5, 6, 7) - self.grid_shape = (2, 8, 9, 10, 3) + self.x_shape = (2, 3, 4, 5, 6) + self.grid_shape = (2, 7, 8, 9, 3) self.theta_shape = (2, 3, 4) self.align_corners = False self.padding_mode = "reflection" self.mode = "bilinear" + self.numeric_grad_delta = 0.000001 class Case8(TestGridSamplerOp): def initTestCase(self): - self.x_shape = (2, 3, 5, 6, 7) - self.grid_shape = (2, 8, 9, 10, 3) + self.x_shape = (2, 3, 4, 5, 6) + self.grid_shape = (2, 7, 8, 9, 3) self.theta_shape = (2, 3, 4) self.align_corners = True self.padding_mode = "reflection" self.mode = "bilinear" + self.numeric_grad_delta = 0.000001 class Case9(TestGridSamplerOp): def initTestCase(self): - self.x_shape = (2, 3, 5, 6, 7) - self.grid_shape = (2, 8, 9, 10, 3) + self.x_shape = (2, 3, 4, 5, 6) + self.grid_shape = (2, 7, 8, 9, 3) self.theta_shape = (2, 3, 4) self.align_corners = False self.padding_mode = "reflection" self.mode = "nearest" - self.numeric_grad_delta = 0.0001 + self.numeric_grad_delta = 0.000001 @skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " + "however it is desirable to cover the forward pass") class LargeInput3DCase(TestGridSamplerOp): + def get_places(self): + places = [] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + def initTestCase(self): self.no_need_check_grad = True self.x_shape = (2, 3, 24, 24, 12) @@ -558,8 +516,8 @@ class LargeInput3DCase(TestGridSamplerOp): self.align_corners = False self.padding_mode = "reflection" self.mode = "bilinear" + self.numeric_grad_delta = 0.000001 self.use_cudnn = False - self.__class__.op_type = 'grid_sampler' def test_check_grad_normal(self): pass @@ -577,8 +535,7 @@ class Case10(LargeInput3DCase): self.align_corners = True self.padding_mode = "zeros" self.mode = "bilinear" - self.use_cudnn = False - self.__class__.op_type = 'grid_sampler' + self.numeric_grad_delta = 0.000001 if __name__ == "__main__": diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index e4e47aa682..7b01420441 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -275,6 +275,9 @@ def grid_sample(x, x.stop_gradient = False grid.stop_gradient = False + if len(grid.shape) == 5: + use_cudnn = False + if in_dygraph_mode(): return _C_ops.grid_sample(x, grid, mode, padding_mode, align_corners) elif in_dynamic_mode(): -- GitLab