Unverified · Commit 663ebd5f authored by duanyanhui, committed by GitHub

enhance grid_sampler cpu kernel to 5D input (#45578)

* enhance grid_sampler cpu kernel to 5D input

* fix bug when a 5D input tensor runs on the cudnn kernel
Parent: 6f2bac7c
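For context, the new CPU path accepts a 5D input of shape (N, C, D, H, W) together with a grid of shape (N, D_out, H_out, W_out, 3), alongside the existing 4D case. A minimal usage sketch on CPU, with illustrative shapes that mirror the new test cases (not part of the diff itself):

import paddle
import paddle.nn.functional as F

paddle.set_device('cpu')  # exercise the CPU kernel touched by this commit

x = paddle.rand([2, 3, 4, 5, 6], dtype='float32')             # 5D input (N, C, D, H, W)
grid = paddle.rand([2, 7, 8, 9, 3], dtype='float32') * 2 - 1  # (N, D_out, H_out, W_out, 3), values in [-1, 1]

out = F.grid_sample(x, grid, mode='bilinear',
                    padding_mode='zeros', align_corners=False)
print(out.shape)  # [2, 3, 7, 8, 9]

The 4D case is dispatched unchanged; in the kernel below only x.dims().size() decides which branch runs.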
@@ -22,6 +22,7 @@
namespace phi {
using Array4 = Eigen::DSizes<int64_t, 4>;
using Array5 = Eigen::DSizes<int64_t, 5>;
template <typename T>
static inline void Clip(const CPUContext& ctx,
@@ -55,6 +56,38 @@ static inline void Clip(const CPUContext& ctx,
}
}
template <typename T>
static inline void Clip3D(const CPUContext& ctx,
DenseTensor* grid_slice,
const int max_val,  // depth-1, height-1 or width-1
bool align_corners,
std::string padding_mode) {
auto& place = *ctx.eigen_device();
auto grid_slice_t = EigenTensor<T, 4>::From(*grid_slice);
if (padding_mode == "border") {
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
} else if (padding_mode == "reflection") {
if (align_corners) {
auto double_range = static_cast<T>(max_val * 2);
auto grid_abs = grid_slice_t.abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
if (max_val == 0) {
grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
}
} else {
auto double_range = static_cast<T>((max_val + 1) * 2);
auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) =
extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
}
}
}
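The "reflection" branch above folds out-of-range coordinates back into the valid range before sampling. A small NumPy sketch of the same arithmetic for a single axis (a hypothetical helper name, shown for illustration only):

import numpy as np

def reflect(coord, max_val, align_corners):
    # Mirrors the "reflection" branch of Clip3D for one coordinate axis.
    coord = np.asarray(coord, dtype=np.float64)
    if align_corners:
        if max_val == 0:
            return np.zeros_like(coord)
        double_range = 2.0 * max_val                    # fold into [0, max_val]
        extra = np.mod(np.abs(coord), double_range)
        return np.minimum(extra, double_range - extra)
    double_range = 2.0 * (max_val + 1)                  # fold into [-0.5, max_val + 0.5]
    extra = np.mod(np.abs(coord + 0.5), double_range)
    out = np.minimum(extra, double_range - extra) - 0.5
    return np.clip(out, 0.0, max_val)                   # final border clamp, as above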
template <typename T>
static void CalcGridLocations(const CPUContext& ctx,
const DenseTensor& grid,
@@ -86,6 +119,45 @@ static void CalcGridLocations(const CPUContext& ctx,
Clip<T>(ctx, grid_y, in_h - 1, align_corners, padding_mode);
}
template <typename T>
static void Calc3DGridLocations(const CPUContext& ctx,
const DenseTensor& grid,
const int in_d,
const int in_h,
const int in_w,
bool align_corners,
std::string padding_mode,
DenseTensor* grid_x,
DenseTensor* grid_y,
DenseTensor* grid_z) {
const int n = grid.dims()[0];
const int out_d = grid.dims()[1];
const int out_h = grid.dims()[2];
const int out_w = grid.dims()[3];
// split grid with shape (n, d, h, w, 3) into (x, y, z) along the last dim
grid_x->Resize({n, out_d, out_h, out_w});
grid_y->Resize({n, out_d, out_h, out_w});
grid_z->Resize({n, out_d, out_h, out_w});
T* grid_x_data = ctx.Alloc<T>(grid_x);
T* grid_y_data = ctx.Alloc<T>(grid_y);
T* grid_z_data = ctx.Alloc<T>(grid_z);
const T* grid_data = grid.data<T>();
for (int i = 0; i < n * out_d * out_h * out_w; i++) {
grid_x_data[i] = grid_data[3 * i];
grid_y_data[i] = grid_data[(3 * i) + 1];
grid_z_data[i] = grid_data[(3 * i) + 2];
}
Unnormalize3D<T>(ctx, grid_x, in_w - 1, align_corners);
Unnormalize3D<T>(ctx, grid_y, in_h - 1, align_corners);
Unnormalize3D<T>(ctx, grid_z, in_d - 1, align_corners);
Clip3D<T>(ctx, grid_x, in_w - 1, align_corners, padding_mode);
Clip3D<T>(ctx, grid_y, in_h - 1, align_corners, padding_mode);
Clip3D<T>(ctx, grid_z, in_d - 1, align_corners, padding_mode);
}
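The copy loop above de-interleaves the last grid dimension into separate x/y/z tensors; in NumPy terms (illustrative sizes) it is equivalent to:

import numpy as np

n, d, h, w = 2, 4, 5, 6                                    # illustrative sizes
grid = np.random.uniform(-1.0, 1.0, size=(n, d, h, w, 3))

grid_x = grid[..., 0]   # element 3*i     of the flattened loop
grid_y = grid[..., 1]   # element 3*i + 1
grid_z = grid[..., 2]   # element 3*i + 2
assert grid_x.shape == (n, d, h, w)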
template <typename T>
static void BilinearInter(const CPUContext& ctx,
const DenseTensor& input,
@@ -144,6 +216,94 @@ static void BilinearInter(const CPUContext& ctx,
v_es_t * d_w_scaled_t * d_n_scaled_t;
}
template <typename T>
static void Bilinear3DInter(const CPUContext& ctx,
const DenseTensor& input,
DenseTensor* grid_x,
DenseTensor* grid_y,
DenseTensor* grid_z,
DenseTensor* out) {
auto& place = *ctx.eigen_device();
const int n = grid_x->dims()[0];
const int out_d = grid_x->dims()[1];
const int out_h = grid_x->dims()[2];
const int out_w = grid_x->dims()[3];
const int c = input.dims()[1];
// get corner pixel values from (x, y, z)
// for the 4D case the corners are named west/east (x) and north/south (y);
// the 5D case adds top/bottom along depth (z)
DenseTensor x_w, x_e, y_n, y_s, z_t, z_b;
DenseTensor d_w, d_e, d_n, d_s, d_t, d_b;
DenseTensor v_twn, v_ten, v_tws, v_tes, v_bwn, v_ben, v_bws, v_bes;
All3DNeigbors<T>(ctx,
input,
grid_x,
grid_y,
grid_z,
&x_w,
&x_e,
&y_n,
&y_s,
&z_t,
&z_b,
&d_w,
&d_e,
&d_n,
&d_s,
&d_t,
&d_b,
&v_twn,
&v_ten,
&v_tws,
&v_tes,
&v_bwn,
&v_ben,
&v_bws,
&v_bes);
auto d_w_t = EigenTensor<T, 4>::From(d_w);
auto d_e_t = EigenTensor<T, 4>::From(d_e);
auto d_n_t = EigenTensor<T, 4>::From(d_n);
auto d_s_t = EigenTensor<T, 4>::From(d_s);
auto d_t_t = EigenTensor<T, 4>::From(d_t);
auto d_b_t = EigenTensor<T, 4>::From(d_b);
auto d_w_scaled_t = d_w_t.reshape(Array5(n, 1, out_d, out_h, out_w))
.broadcast(Array5(1, c, 1, 1, 1));
auto d_e_scaled_t = d_e_t.reshape(Array5(n, 1, out_d, out_h, out_w))
.broadcast(Array5(1, c, 1, 1, 1));
auto d_n_scaled_t = d_n_t.reshape(Array5(n, 1, out_d, out_h, out_w))
.broadcast(Array5(1, c, 1, 1, 1));
auto d_s_scaled_t = d_s_t.reshape(Array5(n, 1, out_d, out_h, out_w))
.broadcast(Array5(1, c, 1, 1, 1));
auto d_t_scaled_t = d_t_t.reshape(Array5(n, 1, out_d, out_h, out_w))
.broadcast(Array5(1, c, 1, 1, 1));
auto d_b_scaled_t = d_b_t.reshape(Array5(n, 1, out_d, out_h, out_w))
.broadcast(Array5(1, c, 1, 1, 1));
auto v_twn_t = EigenTensor<T, 5>::From(v_twn);
auto v_ten_t = EigenTensor<T, 5>::From(v_ten);
auto v_tws_t = EigenTensor<T, 5>::From(v_tws);
auto v_tes_t = EigenTensor<T, 5>::From(v_tes);
auto v_bwn_t = EigenTensor<T, 5>::From(v_bwn);
auto v_ben_t = EigenTensor<T, 5>::From(v_ben);
auto v_bws_t = EigenTensor<T, 5>::From(v_bws);
auto v_bes_t = EigenTensor<T, 5>::From(v_bes);
auto output_t = EigenTensor<T, 5>::From(*out);
// trilinear interpolation over the 8 corner points
output_t.device(place) =
v_twn_t * d_e_scaled_t * d_s_scaled_t * d_b_scaled_t +
v_ten_t * d_w_scaled_t * d_s_scaled_t * d_b_scaled_t +
v_tws_t * d_e_scaled_t * d_n_scaled_t * d_b_scaled_t +
v_tes_t * d_w_scaled_t * d_n_scaled_t * d_b_scaled_t +
v_bwn_t * d_e_scaled_t * d_s_scaled_t * d_t_scaled_t +
v_ben_t * d_w_scaled_t * d_s_scaled_t * d_t_scaled_t +
v_bws_t * d_e_scaled_t * d_n_scaled_t * d_t_scaled_t +
v_bes_t * d_w_scaled_t * d_n_scaled_t * d_t_scaled_t;
}
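The output expression above is standard trilinear interpolation: each of the 8 corner values is weighted by the product of distances to the opposite faces along x (w/e), y (n/s) and z (t/b). A scalar Python sketch of the same weighting (a hypothetical helper, for illustration only):

def trilinear(v, d_w, d_e, d_n, d_s, d_t, d_b):
    # v maps corner names to values, e.g. v["twn"] is the top-west-north corner.
    # Since d_w + d_e == d_n + d_s == d_t + d_b == 1, the eight weights sum to 1.
    return (v["twn"] * d_e * d_s * d_b + v["ten"] * d_w * d_s * d_b +
            v["tws"] * d_e * d_n * d_b + v["tes"] * d_w * d_n * d_b +
            v["bwn"] * d_e * d_s * d_t + v["ben"] * d_w * d_s * d_t +
            v["bws"] * d_e * d_n * d_t + v["bes"] * d_w * d_n * d_t)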
template <typename T, typename Context>
void GridSampleKernel(const Context& dev_ctx,
const DenseTensor& x,
@@ -152,29 +312,67 @@ void GridSampleKernel(const Context& dev_ctx,
const std::string& padding_mode,
bool align_corners,
DenseTensor* out) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
out->Resize(phi::make_ddim({n, c, out_h, out_w}));
dev_ctx.template Alloc<T>(out);
phi::funcs::SetConstant<Context, T>()(dev_ctx, out, static_cast<T>(0));
DenseTensor grid_x, grid_y;
CalcGridLocations<T>(
dev_ctx, grid, in_h, in_w, align_corners, padding_mode, &grid_x, &grid_y);
if (mode == "bilinear") {
BilinearInter<T>(dev_ctx, x, &grid_x, &grid_y, out);
} else if (mode == "nearest") {
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
GetGridPointValue<T>(x, out, grid_x, grid_y);
if (x.dims().size() == 4) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
out->Resize(phi::make_ddim({n, c, out_h, out_w}));
dev_ctx.template Alloc<T>(out);
phi::funcs::SetConstant<Context, T>()(dev_ctx, out, static_cast<T>(0));
DenseTensor grid_x, grid_y;
CalcGridLocations<T>(dev_ctx,
grid,
in_h,
in_w,
align_corners,
padding_mode,
&grid_x,
&grid_y);
if (mode == "bilinear") {
BilinearInter<T>(dev_ctx, x, &grid_x, &grid_y, out);
} else if (mode == "nearest") {
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
GetGridPointValue<T>(x, out, grid_x, grid_y);
}
} else {
const int n = grid.dims()[0];
const int out_d = grid.dims()[1];
const int out_h = grid.dims()[2];
const int out_w = grid.dims()[3];
const int c = x.dims()[1];
const int in_d = x.dims()[2];
const int in_h = x.dims()[3];
const int in_w = x.dims()[4];
out->Resize(phi::make_ddim({n, c, out_d, out_h, out_w}));
dev_ctx.template Alloc<T>(out);
phi::funcs::SetConstant<Context, T>()(dev_ctx, out, static_cast<T>(0));
DenseTensor grid_x, grid_y, grid_z;
Calc3DGridLocations<T>(dev_ctx,
grid,
in_d,
in_h,
in_w,
align_corners,
padding_mode,
&grid_x,
&grid_y,
&grid_z);
if (mode == "bilinear") {
Bilinear3DInter<T>(dev_ctx, x, &grid_x, &grid_y, &grid_z, out);
} else if (mode == "nearest") {
Get3DGridPointValue<T>(x, out, grid_x, grid_y, grid_z);
}
}
}
@@ -13,7 +13,6 @@
// limitations under the License.
#pragma once
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
@@ -37,6 +36,24 @@ void Unnormalize(const CPUContext& ctx,
}
}
template <typename T>
void Unnormalize3D(const CPUContext& ctx,
DenseTensor* grid_slice,
const int max_val,  // depth-1, height-1 or width-1
bool align_corners) {
auto& place = *ctx.eigen_device();
auto grid_slice_t = EigenTensor<T, 4>::From(*grid_slice);
if (!align_corners) {
auto factor = static_cast<T>((max_val + 1) * 0.5);
grid_slice_t.device(place) =
(grid_slice_t + static_cast<T>(1)) * factor - static_cast<T>(0.5);
} else {
auto factor = static_cast<T>(max_val * 0.5);
grid_slice_t.device(place) = (grid_slice_t + static_cast<T>(1)) * factor;
}
}
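Unnormalize3D maps normalized grid coordinates from [-1, 1] to pixel indices. A NumPy sketch with a couple of worked values (illustrative helper name, not kernel code):

import numpy as np

def unnormalize(coord, max_val, align_corners):
    # max_val is size - 1 along the corresponding axis (depth, height or width).
    coord = np.asarray(coord, dtype=np.float64)
    if align_corners:
        return (coord + 1.0) * (max_val * 0.5)            # -1 -> 0,    +1 -> max_val
    return (coord + 1.0) * ((max_val + 1) * 0.5) - 0.5    # -1 -> -0.5, +1 -> max_val + 0.5

# e.g. for a width-8 axis (max_val = 7):
#   unnormalize(-1.0, 7, True)  == 0.0,  unnormalize(1.0, 7, True)  == 7.0
#   unnormalize(-1.0, 7, False) == -0.5, unnormalize(1.0, 7, False) == 7.5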
template <typename T>
inline bool IsInBound(T x, T y, T x_max, T y_max) {
if (x < 0 || x > x_max || y < 0 || y > y_max) {
@@ -45,6 +62,14 @@ inline bool IsInBound(T x, T y, T x_max, T y_max) {
return true;
}
template <typename T>
inline bool IsInBound3D(T x, T y, T z, T x_max, T y_max, T z_max) {
if (x < 0 || x > x_max || y < 0 || y > y_max || z < 0 || z > z_max) {
return false;
}
return true;
}
template <typename T>
void GetGridPointValue(const DenseTensor& input,
DenseTensor* output,
@@ -157,4 +182,167 @@ void AllNeigbors(const CPUContext& ctx,
GetGridPointValue<T>(input, v_es, *x_e, *y_s);
}
template <typename T>
void Get3DGridPointValue(const DenseTensor& input,
DenseTensor* output,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& z) {
const int n = input.dims()[0];
const int c = input.dims()[1];
const int in_d = input.dims()[2];
const int in_h = input.dims()[3];
const int in_w = input.dims()[4];
const int out_d = x.dims()[1];
const int out_h = x.dims()[2];
const int out_w = x.dims()[3];
auto x_t = EigenTensor<T, 4>::From(x);
auto y_t = EigenTensor<T, 4>::From(y);
auto z_t = EigenTensor<T, 4>::From(z);
auto output_t =
EigenTensor<T, 5>::From(*output).setConstant(static_cast<T>(0.0));
auto input_t = EigenTensor<T, 5>::From(input);
for (int i = 0; i < n; i++) {
for (int m = 0; m < out_d; m++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (IsInBound3D(x_t(i, m, k, l),
y_t(i, m, k, l),
z_t(i, m, k, l),
(T)(in_w - 1),
(T)(in_h - 1),
(T)(in_d - 1))) {
for (int j = 0; j < c; j++) {
output_t(i, j, m, k, l) =
input_t(i,
j,
static_cast<int>(round(z_t(i, m, k, l))),
static_cast<int>(round(y_t(i, m, k, l))),
static_cast<int>(round(x_t(i, m, k, l))));
}
}
}
}
}
}
}
template <typename T>
void All3DNeigbors(const CPUContext& ctx,
const DenseTensor& input,
DenseTensor* grid_x,
DenseTensor* grid_y,
DenseTensor* grid_z,
DenseTensor* x_w,
DenseTensor* x_e,
DenseTensor* y_n,
DenseTensor* y_s,
DenseTensor* z_t,
DenseTensor* z_b, // positions
DenseTensor* d_w,
DenseTensor* d_e,
DenseTensor* d_n,
DenseTensor* d_s,
DenseTensor* d_t,
DenseTensor* d_b, // distance
DenseTensor* v_twn,
DenseTensor* v_ten,
DenseTensor* v_tws,
DenseTensor* v_tes,
DenseTensor* v_bwn,
DenseTensor* v_ben,
DenseTensor* v_bws,
DenseTensor* v_bes) { // values
auto& place = *ctx.eigen_device();
const int c = input.dims()[1];
const int n = grid_x->dims()[0];
const int out_d = grid_x->dims()[1];
const int out_h = grid_x->dims()[2];
const int out_w = grid_x->dims()[3];
// calculate the 6 neighboring coordinates: floor and floor+1 along x, y, z
x_w->Resize({n, out_d, out_h, out_w});
x_e->Resize({n, out_d, out_h, out_w});
y_n->Resize({n, out_d, out_h, out_w});
y_s->Resize({n, out_d, out_h, out_w});
z_t->Resize({n, out_d, out_h, out_w});
z_b->Resize({n, out_d, out_h, out_w});
ctx.Alloc<T>(x_w);
ctx.Alloc<T>(x_e);
ctx.Alloc<T>(y_n);
ctx.Alloc<T>(y_s);
ctx.Alloc<T>(z_t);
ctx.Alloc<T>(z_b);
auto x_w_t = EigenTensor<T, 4>::From(*x_w);
auto x_e_t = EigenTensor<T, 4>::From(*x_e);
auto y_n_t = EigenTensor<T, 4>::From(*y_n);
auto y_s_t = EigenTensor<T, 4>::From(*y_s);
auto z_t_t = EigenTensor<T, 4>::From(*z_t);
auto z_b_t = EigenTensor<T, 4>::From(*z_b);
auto grid_x_t = EigenTensor<T, 4>::From(*grid_x);
auto grid_y_t = EigenTensor<T, 4>::From(*grid_y);
auto grid_z_t = EigenTensor<T, 4>::From(*grid_z);
x_w_t.device(place) = grid_x_t.floor();
x_e_t.device(place) = x_w_t + static_cast<T>(1);
y_n_t.device(place) = grid_y_t.floor();
y_s_t.device(place) = y_n_t + static_cast<T>(1);
z_t_t.device(place) = grid_z_t.floor();
z_b_t.device(place) = z_t_t + static_cast<T>(1);
// calculate distances to 6 sides
d_w->Resize({n, out_d, out_h, out_w});
d_e->Resize({n, out_d, out_h, out_w});
d_n->Resize({n, out_d, out_h, out_w});
d_s->Resize({n, out_d, out_h, out_w});
d_t->Resize({n, out_d, out_h, out_w});
d_b->Resize({n, out_d, out_h, out_w});
ctx.Alloc<T>(d_w);
ctx.Alloc<T>(d_e);
ctx.Alloc<T>(d_n);
ctx.Alloc<T>(d_s);
ctx.Alloc<T>(d_t);
ctx.Alloc<T>(d_b);
auto d_w_t = EigenTensor<T, 4>::From(*d_w);
auto d_e_t = EigenTensor<T, 4>::From(*d_e);
auto d_n_t = EigenTensor<T, 4>::From(*d_n);
auto d_s_t = EigenTensor<T, 4>::From(*d_s);
auto d_t_t = EigenTensor<T, 4>::From(*d_t);
auto d_b_t = EigenTensor<T, 4>::From(*d_b);
d_w_t.device(place) = grid_x_t - x_w_t;
d_e_t.device(place) = x_e_t - grid_x_t;
d_n_t.device(place) = grid_y_t - y_n_t;
d_s_t.device(place) = y_s_t - grid_y_t;
d_t_t.device(place) = grid_z_t - z_t_t;
d_b_t.device(place) = z_b_t - grid_z_t;
// gather the values at the 8 corner points
v_twn->Resize({n, c, out_d, out_h, out_w});
v_ten->Resize({n, c, out_d, out_h, out_w});
v_tws->Resize({n, c, out_d, out_h, out_w});
v_tes->Resize({n, c, out_d, out_h, out_w});
v_bwn->Resize({n, c, out_d, out_h, out_w});
v_ben->Resize({n, c, out_d, out_h, out_w});
v_bws->Resize({n, c, out_d, out_h, out_w});
v_bes->Resize({n, c, out_d, out_h, out_w});
ctx.Alloc<T>(v_twn);
ctx.Alloc<T>(v_ten);
ctx.Alloc<T>(v_tws);
ctx.Alloc<T>(v_tes);
ctx.Alloc<T>(v_bwn);
ctx.Alloc<T>(v_ben);
ctx.Alloc<T>(v_bws);
ctx.Alloc<T>(v_bes);
Get3DGridPointValue<T>(input, v_twn, *x_w, *y_n, *z_t);
Get3DGridPointValue<T>(input, v_ten, *x_e, *y_n, *z_t);
Get3DGridPointValue<T>(input, v_tws, *x_w, *y_s, *z_t);
Get3DGridPointValue<T>(input, v_tes, *x_e, *y_s, *z_t);
Get3DGridPointValue<T>(input, v_bwn, *x_w, *y_n, *z_b);
Get3DGridPointValue<T>(input, v_ben, *x_e, *y_n, *z_b);
Get3DGridPointValue<T>(input, v_bws, *x_w, *y_s, *z_b);
Get3DGridPointValue<T>(input, v_bes, *x_e, *y_s, *z_b);
}
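Per axis, the neighbor and distance computation above reduces to taking the floor, its successor, and the fractional offsets toward each. A one-axis NumPy sketch (illustrative values):

import numpy as np

grid_x = np.array([0.25, 2.7, 5.0])   # already unnormalized x coordinates
x_w = np.floor(grid_x)                # "west" neighbor
x_e = x_w + 1.0                       # "east" neighbor
d_w = grid_x - x_w                    # fractional distance to the west neighbor
d_e = x_e - grid_x                    # fractional distance to the east neighbor
# d_w + d_e == 1, so each pair later acts as complementary interpolation weights.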
} // namespace phi
@@ -20,15 +20,6 @@ from op_test import OpTest, skip_check_grad_ci
paddle.enable_static()
from white_list import (
op_accuracy_white_list,
check_shape_white_list,
compile_vs_runtime_white_list,
no_check_set_white_list,
op_threshold_white_list,
no_grad_set_white_list,
)
def AffineGrid(theta, grid_shape):
n = grid_shape[0]
@@ -118,7 +109,6 @@ def getGridPointValue3D(data, x, y, z):
out_H = x.shape[2]
out_W = x.shape[3]
#out = np.zeros(data_shape, dtype='float64')
out = np.zeros([N, C, out_D, out_H, out_W], dtype='float64')
for i in range(N):
for j in range(out_D):
@@ -334,51 +324,15 @@ class TestGridSamplerOp(OpTest):
self.padding_mode)
}
def get_places(self):
places = []
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
def test_check_output(self):
if len(self.grid_shape) == 4:
self.check_output(check_eager=True)
else:
check_eager_flag = True
check_dygraph_flag = False
for place in self.get_places():
res = self.check_output_with_place(
place,
atol=1e-5,
check_dygraph=check_dygraph_flag,
check_eager=check_eager_flag)
if check_eager_flag:
assert check_dygraph_flag == False
outs, eager_dygraph_outs, fetch_list = res
elif check_dygraph_flag:
uts, dygraph_outs, fetch_list = res
else:
outs, fetch_list = res
if self.op_type not in compile_vs_runtime_white_list.COMPILE_RUN_OP_WHITE_LIST:
self.check_compile_vs_runtime(fetch_list, outs)
self.check_output(check_eager=True)
def test_check_grad_normal(self):
if len(self.grid_shape) == 4:
self.check_grad(['X', 'Grid'],
'Output',
max_relative_error=0.01,
numeric_grad_delta=self.numeric_grad_delta,
check_eager=True)
else:
self._check_grad_helper()
for place in self.get_places():
self.check_grad_with_place(
place, ['X'],
'Output',
numeric_grad_delta=self.numeric_grad_delta,
max_relative_error=0.01,
check_eager=True,
check_dygraph=False)
self.check_grad(['X', 'Grid'],
'Output',
max_relative_error=0.01,
numeric_grad_delta=self.numeric_grad_delta,
check_eager=True)
def initTestCase(self):
self.x_shape = (2, 3, 8, 8)
@@ -493,63 +447,67 @@ class Case6(TestGridSamplerOp):
self.align_corners = False
self.padding_mode = "zeros"
self.mode = "bilinear"
self.numeric_grad_delta = 0.000001
class Case6_(TestGridSamplerOp):
def get_places(self):
places = []
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
def initTestCase(self):
self.x_shape = (2, 3, 5, 6, 7)
self.grid_shape = (2, 8, 9, 10, 3)
self.x_shape = (2, 3, 4, 5, 6)
self.grid_shape = (2, 7, 8, 9, 3)
self.theta_shape = (2, 3, 4)
self.align_corners = False
self.padding_mode = "border"
self.mode = "bilinear"
self.numeric_grad_delta = 0.000001
class Case7(TestGridSamplerOp):
def initTestCase(self):
self.x_shape = (2, 3, 5, 6, 7)
self.grid_shape = (2, 8, 9, 10, 3)
self.x_shape = (2, 3, 4, 5, 6)
self.grid_shape = (2, 7, 8, 9, 3)
self.theta_shape = (2, 3, 4)
self.align_corners = False
self.padding_mode = "reflection"
self.mode = "bilinear"
self.numeric_grad_delta = 0.000001
class Case8(TestGridSamplerOp):
def initTestCase(self):
self.x_shape = (2, 3, 5, 6, 7)
self.grid_shape = (2, 8, 9, 10, 3)
self.x_shape = (2, 3, 4, 5, 6)
self.grid_shape = (2, 7, 8, 9, 3)
self.theta_shape = (2, 3, 4)
self.align_corners = True
self.padding_mode = "reflection"
self.mode = "bilinear"
self.numeric_grad_delta = 0.000001
class Case9(TestGridSamplerOp):
def initTestCase(self):
self.x_shape = (2, 3, 5, 6, 7)
self.grid_shape = (2, 8, 9, 10, 3)
self.x_shape = (2, 3, 4, 5, 6)
self.grid_shape = (2, 7, 8, 9, 3)
self.theta_shape = (2, 3, 4)
self.align_corners = False
self.padding_mode = "reflection"
self.mode = "nearest"
self.numeric_grad_delta = 0.0001
self.numeric_grad_delta = 0.000001
@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
"however it is desirable to cover the forward pass")
class LargeInput3DCase(TestGridSamplerOp):
def get_places(self):
places = []
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
def initTestCase(self):
self.no_need_check_grad = True
self.x_shape = (2, 3, 24, 24, 12)
@@ -558,8 +516,8 @@ class LargeInput3DCase(TestGridSamplerOp):
self.align_corners = False
self.padding_mode = "reflection"
self.mode = "bilinear"
self.numeric_grad_delta = 0.000001
self.use_cudnn = False
self.__class__.op_type = 'grid_sampler'
def test_check_grad_normal(self):
pass
@@ -577,8 +535,7 @@ class Case10(LargeInput3DCase):
self.align_corners = True
self.padding_mode = "zeros"
self.mode = "bilinear"
self.use_cudnn = False
self.__class__.op_type = 'grid_sampler'
self.numeric_grad_delta = 0.000001
if __name__ == "__main__":
@@ -275,6 +275,9 @@ def grid_sample(x,
x.stop_gradient = False
grid.stop_gradient = False
if len(grid.shape) == 5:
use_cudnn = False
if in_dygraph_mode():
return _C_ops.grid_sample(x, grid, mode, padding_mode, align_corners)
elif in_dynamic_mode():