未验证 提交 1773fbba 编写于 作者: D duanyanhui 提交者: GitHub

enhance grid_sampler to support 3d input (#45015)

* enhance grid_sampler to support 3d input
上级 1cb883da
......@@ -54,13 +54,19 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override {
AddInput("X",
"(Tensor) The input data of GridSampleOp, "
"This is a 4-D tensor with shape of [N, C, H, W]");
"This is a 4-D tensor with shape of [N, C, H, W] or"
" a 5-D tensot with shape of [N, C, D, H, W]");
AddInput(
"Grid",
"(Tensor) The input grid of GridSampleOp generated by AffineGridOp, "
"This is a 4-D tensor with shape of [N, H, W, 2] is the concatenation "
"of x and y coordinates with shape [N, H, W] in last dimension");
AddOutput("Output", "(Tensor) Output tensor with shape [N, C, H, W]");
"of x and y coordinates with shape [N, H, W] in last dimension or "
"a 5-D tensor with shape of [N, D, H, W, 3] is the concatenation "
"of depth, x and y coordinates with shape [N, D, H, W] in last "
"dimension ");
AddOutput("Output",
"(Tensor) Output tensor with shape [N, C, H, W] or shape [N,C, "
"D, H ,W]");
AddAttr<bool>(
"use_cudnn",
"(bool, default true) Only used in cudnn kernel, need install cudnn")
......
......@@ -1288,19 +1288,31 @@ void GridSampleBaseInferMeta(const MetaTensor& x,
MetaConfig config) {
auto x_dims = x.dims();
auto grid_dims = grid.dims();
PADDLE_ENFORCE_EQ(x_dims.size(),
PADDLE_ENFORCE_GE(x_dims.size(),
4,
phi::errors::InvalidArgument(
"Input(X) of GridSampleOp should be 4-D Tensor, but "
"received X dimension size(%d)",
x_dims.size()));
PADDLE_ENFORCE_EQ(grid_dims.size(),
PADDLE_ENFORCE_LE(x_dims.size(),
5,
phi::errors::InvalidArgument(
"Input(X) of GridSampleOp should be 4-D Tensor, but "
"received X dimension size(%d)",
x_dims.size()));
PADDLE_ENFORCE_GE(grid_dims.size(),
4,
phi::errors::InvalidArgument(
"Input(Grid) of GridSampleOp should be 4-D Tensor, "
"but received X dimension size(%d)",
grid_dims.size()));
if (config.is_runtime || grid_dims[3] > 0) {
PADDLE_ENFORCE_LE(grid_dims.size(),
5,
phi::errors::InvalidArgument(
"Input(Grid) of GridSampleOp should be 4-D Tensor, "
"but received X dimension size(%d)",
grid_dims.size()));
if (grid_dims.size() == 4 && (config.is_runtime || grid_dims[3] > 0)) {
PADDLE_ENFORCE_EQ(
grid_dims[3],
2,
......@@ -1308,6 +1320,14 @@ void GridSampleBaseInferMeta(const MetaTensor& x,
"Input(Grid) dimension[3] should be 2, but received %d",
grid_dims[3]));
}
if (grid_dims.size() == 5 && (config.is_runtime || grid_dims[4] > 0)) {
PADDLE_ENFORCE_EQ(
grid_dims[4],
3,
phi::errors::InvalidArgument(
"Input(Grid) dimension[4] should be 3, but received %d",
grid_dims[4]));
}
if (config.is_runtime) {
PADDLE_ENFORCE_EQ(
grid_dims[0],
......@@ -1318,8 +1338,12 @@ void GridSampleBaseInferMeta(const MetaTensor& x,
x_dims[0],
grid_dims[0]));
}
out->set_dims({x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
if (grid_dims.size() == 4) {
out->set_dims({x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
} else {
out->set_dims(
{x_dims[0], x_dims[1], grid_dims[1], grid_dims[2], grid_dims[3]});
}
out->set_dtype(x.dtype());
out->share_lod(x);
}
......
......@@ -32,6 +32,23 @@ static __forceinline__ __device__ void AtomicAdd(
}
}
// Bounds-checked atomic accumulation into a 3-D (D, H, W) strided buffer.
// Out-of-range coordinates are silently ignored, which lets callers scatter
// gradients for sample points that fall outside the input volume.
template <typename T>
static __forceinline__ __device__ void AtomicAdd3D(T* data,
                                                   int64_t d,
                                                   int64_t h,
                                                   int64_t w,
                                                   int64_t sD,
                                                   int64_t sH,
                                                   int64_t sW,
                                                   int64_t D,
                                                   int64_t H,
                                                   int64_t W,
                                                   T delta) {
  // Guard clause: skip the add entirely when (d, h, w) is out of bounds.
  if (!InBounds3D(d, h, w, D, H, W)) {
    return;
  }
  const int64_t offset = d * sD + h * sH + w * sW;
  atomicAdd(data + offset, delta);
}
template <typename T>
static __forceinline__ __device__ T
UnnormalizeWithMask(T coord, int size, bool align_corners, T* grad_in) {
......@@ -249,6 +266,305 @@ __global__ void GridSamplerCudaBackwardKernel(const int nthreads,
}
}
// Backward kernel for 5-D (volumetric) grid_sample.
// Scatters gradients from grad_output (NCDHW) into grad_input (NCDHW) and,
// when grad_grid is non-null, writes per-point gradients for the
// [N, D, H, W, 3] sampling grid. Each loop iteration handles one output
// location (n, d, h, w) across all channels.
// NOTE(review): assumes all tensors are contiguous in the layouts above —
// strides are derived from the extents, not passed in.
template <typename T>
__global__ void GridSampler3DCudaBackwardKernel(const int nthreads,
                                                const T* grad_output,
                                                const T* input,
                                                const T* grid,
                                                int out_c,
                                                int out_d,
                                                int out_h,
                                                int out_w,
                                                int in_d,
                                                int in_h,
                                                int in_w,
                                                T* grad_input,
                                                T* grad_grid,
                                                const Mode mode,
                                                const PaddingMode padding_mode,
                                                bool align_corners) {
  // Element strides of the contiguous NCDHW input / grad_input tensors.
  int inp_sW = 1;
  int inp_sH = in_w;
  int inp_sD = in_h * in_w;
  int inp_sC = in_d * inp_sD;
  int inp_sN = out_c * inp_sC;
  // Element strides of the contiguous [N, D, H, W, 3] grid tensor.
  int grid_sCoor = 1;
  int grid_sW = 3;
  int grid_sH = out_w * grid_sW;
  int grid_sD = out_h * grid_sH;
  int grid_sN = out_d * grid_sD;
  // Element strides of the contiguous NCDHW grad_output tensor.
  int gOut_sW = 1;
  int gOut_sH = out_w;
  int gOut_sD = out_h * out_w;
  int gOut_sC = out_d * gOut_sD;
  int gOut_sN = out_c * gOut_sC;
  CUDA_KERNEL_LOOP_TYPE(index, nthreads, int) {
    // Decompose the linear index into (n, d, h, w) of the output volume.
    const int w = index % out_w;
    const int h = (index / out_w) % out_h;
    const int d = (index / (out_h * out_w)) % out_d;
    const int n = index / (out_d * out_h * out_w);
    const auto grid_offset =
        n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
    // get the corresponding input x, y, z co-ordinates from grid
    T ix = grid[grid_offset];
    T iy = grid[grid_offset + grid_sCoor];
    T iz = grid[grid_offset + 2 * grid_sCoor];
    // multipliers for gradients on ix, iy, and iz
    T gix_mult, giy_mult, giz_mult;
    // Map normalized coordinates to input pixel space, also capturing the
    // chain-rule factor of the unnormalize/padding transform per axis.
    ix = ComputePositionsWithMask(
        ix, in_w, padding_mode, align_corners, &gix_mult);
    iy = ComputePositionsWithMask(
        iy, in_h, padding_mode, align_corners, &giy_mult);
    iz = ComputePositionsWithMask(
        iz, in_d, padding_mode, align_corners, &giz_mult);
    if (mode == Mode::bilinear) {
      // get corner pixel values from (x, y, z)
      // for 4d, we used north-east-south-west
      // for 5d, we add top-bottom
      // Naming: t/b = top/bottom (depth), n/s = north/south (height),
      // w/e = west/east (width). tnw is the floor corner of the cube.
      int ix_tnw = static_cast<int>(std::floor(ix));
      int iy_tnw = static_cast<int>(std::floor(iy));
      int iz_tnw = static_cast<int>(std::floor(iz));
      int ix_tne = ix_tnw + 1;
      int iy_tne = iy_tnw;
      int iz_tne = iz_tnw;
      int ix_tsw = ix_tnw;
      int iy_tsw = iy_tnw + 1;
      int iz_tsw = iz_tnw;
      int ix_tse = ix_tnw + 1;
      int iy_tse = iy_tnw + 1;
      int iz_tse = iz_tnw;
      int ix_bnw = ix_tnw;
      int iy_bnw = iy_tnw;
      int iz_bnw = iz_tnw + 1;
      int ix_bne = ix_tnw + 1;
      int iy_bne = iy_tnw;
      int iz_bne = iz_tnw + 1;
      int ix_bsw = ix_tnw;
      int iy_bsw = iy_tnw + 1;
      int iz_bsw = iz_tnw + 1;
      int ix_bse = ix_tnw + 1;
      int iy_bse = iy_tnw + 1;
      int iz_bse = iz_tnw + 1;
      // get surfaces to each neighbor:
      // Trilinear weight of each corner = volume of the sub-box opposite it.
      T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
      T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
      T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
      T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
      T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
      T bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
      T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
      T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
      // Accumulators for the gradient w.r.t. the (unnormalized) grid coords.
      T gix = static_cast<T>(0), giy = static_cast<T>(0),
        giz = static_cast<T>(0);
      int gOut_offset = n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW;
      int inp_offset_NC = n * inp_sN;
      T* gInp_ptr_NC = grad_input + n * inp_sN;
      // Walk every channel, scattering the weighted output gradient into the
      // eight corner cells of grad_input (AtomicAdd3D drops out-of-bounds).
      for (int c = 0; c < out_c; ++c,
               gOut_offset += gOut_sC,
               gInp_ptr_NC += inp_sC,
               inp_offset_NC += inp_sC) {
        T gOut = grad_output[gOut_offset];
        AtomicAdd3D(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, tnw * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, tne * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, tsw * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, tse * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, bnw * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, bne * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, bsw * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, bse * gOut);
        // calculate grad_grid
        // d(output)/d(ix|iy|iz) contribution from each in-bounds corner:
        // derivative of the trilinear weight times the corner's input value.
        if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) {
          T tnw_val = input[inp_offset_NC + iz_tnw * inp_sD + iy_tnw * inp_sH +
                            ix_tnw * inp_sW];
          gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut;
          giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut;
          giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut;
        }
        if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) {
          T tne_val = input[inp_offset_NC + iz_tne * inp_sD + iy_tne * inp_sH +
                            ix_tne * inp_sW];
          gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut;
          giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut;
          giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut;
        }
        if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) {
          T tsw_val = input[inp_offset_NC + iz_tsw * inp_sD + iy_tsw * inp_sH +
                            ix_tsw * inp_sW];
          gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut;
          giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut;
          giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut;
        }
        if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) {
          T tse_val = input[inp_offset_NC + iz_tse * inp_sD + iy_tse * inp_sH +
                            ix_tse * inp_sW];
          gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut;
          giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut;
          giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut;
        }
        if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) {
          T bnw_val = input[inp_offset_NC + iz_bnw * inp_sD + iy_bnw * inp_sH +
                            ix_bnw * inp_sW];
          gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut;
          giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut;
          giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut;
        }
        if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) {
          T bne_val = input[inp_offset_NC + iz_bne * inp_sD + iy_bne * inp_sH +
                            ix_bne * inp_sW];
          gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut;
          giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut;
          giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut;
        }
        if (InBounds3D(iz_bsw, iy_bsw, ix_bsw, in_d, in_h, in_w)) {
          T bsw_val = input[inp_offset_NC + iz_bsw * inp_sD + iy_bsw * inp_sH +
                            ix_bsw * inp_sW];
          gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut;
          giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut;
          giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut;
        }
        if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) {
          T bse_val = input[inp_offset_NC + iz_bse * inp_sD + iy_bse * inp_sH +
                            ix_bse * inp_sW];
          gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut;
          giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut;
          giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut;
        }
      }
      if (grad_grid != nullptr) {
        // grid_sW == 3, so index * grid_sW addresses this point's (x, y, z)
        // slot; multiply by the unnormalize chain-rule factors.
        T* gGrid_ptr_NDHW = grad_grid + index * grid_sW;
        gGrid_ptr_NDHW[0] = gix_mult * gix;
        gGrid_ptr_NDHW[1] = giy_mult * giy;
        gGrid_ptr_NDHW[2] = giz_mult * giz;
      }
    } else if (mode == Mode::nearest) {
      auto ix_nearest = static_cast<int>(std::round(ix));
      auto iy_nearest = static_cast<int>(std::round(iy));
      auto iz_nearest = static_cast<int>(std::round(iz));
      // assign nearest neighor pixel value to output pixel
      int gOut_offset = n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW;
      T* gInp_ptr_NC = grad_input + n * inp_sN;
      for (int c = 0; c < out_c;
           ++c, gOut_offset += gOut_sC, gInp_ptr_NC += inp_sC) {
        AtomicAdd3D(gInp_ptr_NC,
                    iz_nearest,
                    iy_nearest,
                    ix_nearest,
                    inp_sD,
                    inp_sH,
                    inp_sW,
                    in_d,
                    in_h,
                    in_w,
                    grad_output[gOut_offset]);
      }
      if (grad_grid != nullptr) {
        // Nearest sampling is piecewise-constant in the coordinates, so the
        // grid gradient is zero (almost everywhere).
        T* gGrid_ptr_NDHW = grad_grid + index * grid_sW;
        gGrid_ptr_NDHW[0] = static_cast<T>(0);
        gGrid_ptr_NDHW[1] = static_cast<T>(0);
        gGrid_ptr_NDHW[2] = static_cast<T>(0);
      }
    }
  }
}
template <typename T, typename Context>
void GridSampleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
......@@ -275,42 +591,84 @@ void GridSampleGradKernel(const Context& dev_ctx,
enum_mode = Mode::bilinear;
}
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
if (x.dims().size() == 4) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
dev_ctx.template Alloc<T>(x_grad);
phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
dev_ctx.template Alloc<T>(x_grad);
phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
T* grid_grad_data = nullptr;
if (grid_grad != nullptr) {
grid_grad_data = dev_ctx.template Alloc<T>(grid_grad);
}
T* grid_grad_data = nullptr;
if (grid_grad != nullptr) {
grid_grad_data = dev_ctx.template Alloc<T>(grid_grad);
}
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSamplerCudaBackwardKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
out_grad.data<T>(),
x.data<T>(),
grid.data<T>(),
n,
c,
out_h,
out_w,
in_h,
in_w,
x_grad->data<T>(),
grid_grad_data,
enum_mode,
enum_padding_mode,
align_corners);
} else {
const int out_d = grid.dims()[1];
const int out_h = grid.dims()[2];
const int out_w = grid.dims()[3];
const int n = x.dims()[0];
const int c = x.dims()[1];
const int in_d = x.dims()[2];
const int in_h = x.dims()[3];
const int in_w = x.dims()[4];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSamplerCudaBackwardKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
out_grad.data<T>(),
x.data<T>(),
grid.data<T>(),
n,
c,
out_h,
out_w,
in_h,
in_w,
x_grad->data<T>(),
grid_grad_data,
enum_mode,
enum_padding_mode,
align_corners);
dev_ctx.template Alloc<T>(x_grad);
phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
T* grid_grad_data = nullptr;
if (grid_grad != nullptr) {
grid_grad_data = dev_ctx.template Alloc<T>(grid_grad);
}
int count = static_cast<int>(n * out_d * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSampler3DCudaBackwardKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
out_grad.data<T>(),
x.data<T>(),
grid.data<T>(),
c,
out_d,
out_h,
out_w,
in_d,
in_h,
in_w,
x_grad->data<T>(),
grid_grad_data,
enum_mode,
enum_padding_mode,
align_corners);
}
}
} // namespace phi
......
......@@ -169,6 +169,169 @@ __global__ void GridSampleCudaKernel(const int nthreads,
}
}
// Forward kernel for 5-D (volumetric) grid_sample.
// For every output location (n, d, h, w) it reads a normalized (x, y, z)
// coordinate from the [N, D, H, W, 3] grid and samples the NCDHW input,
// either trilinearly (Mode::bilinear) or by nearest neighbor. Out-of-bounds
// corners contribute zero.
// NOTE(review): assumes all tensors are contiguous in the layouts above —
// strides are derived from the extents, not passed in.
template <typename T>
__global__ void GridSample3DCudaKernel(const int nthreads,
                                       int out_c,
                                       int out_d,
                                       int out_h,
                                       int out_w,
                                       int in_d,
                                       int in_h,
                                       int in_w,
                                       const T* input,
                                       const T* grid,
                                       T* output,
                                       const Mode interpolation_mode,
                                       const PaddingMode padding_mode,
                                       bool align_corners) {
  // Element strides of the contiguous NCDHW input tensor.
  int inp_sW = 1;
  int inp_sH = in_w;
  int inp_sD = in_h * in_w;
  int inp_sC = in_d * inp_sD;
  int inp_sN = out_c * inp_sC;
  // Element strides of the contiguous [N, D, H, W, 3] grid tensor.
  int grid_sCoor = 1;
  int grid_sW = 3;
  int grid_sH = out_w * grid_sW;
  int grid_sD = out_h * grid_sH;
  int grid_sN = out_d * grid_sD;
  // Element strides of the contiguous NCDHW output tensor.
  int out_sW = 1;
  int out_sH = out_w;
  int out_sD = out_h * out_w;
  int out_sC = out_d * out_sD;
  int out_sN = out_c * out_sC;
  CUDA_KERNEL_LOOP_TYPE(index, nthreads, int) {
    // Decompose the linear index into (n, d, h, w) of the output volume.
    const int w = index % out_w;
    const int h = (index / out_w) % out_h;
    const int d = (index / (out_h * out_w)) % out_d;
    const int n = index / (out_d * out_h * out_w);
    const int grid_offset =
        n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
    // get the corresponding input x, y, z co-ordinates from grid
    T ix = grid[grid_offset];
    T iy = grid[grid_offset + grid_sCoor];
    T iz = grid[grid_offset + 2 * grid_sCoor];
    // Map normalized [-1, 1] coordinates into input pixel space, applying
    // the configured padding mode.
    ix = ComputePositions(ix, in_w, padding_mode, align_corners);
    iy = ComputePositions(iy, in_h, padding_mode, align_corners);
    iz = ComputePositions(iz, in_d, padding_mode, align_corners);
    if (interpolation_mode == Mode::bilinear) {
      // get corner pixel values from (x, y, z)
      // for 4d, we used north-east-south-west
      // for 5d, we add top-bottom
      // Naming: t/b = top/bottom (depth), n/s = north/south (height),
      // w/e = west/east (width). tnw is the floor corner of the cube.
      int ix_tnw = static_cast<int>(std::floor(ix));
      int iy_tnw = static_cast<int>(std::floor(iy));
      int iz_tnw = static_cast<int>(std::floor(iz));
      int ix_tne = ix_tnw + 1;
      int iy_tne = iy_tnw;
      int iz_tne = iz_tnw;
      int ix_tsw = ix_tnw;
      int iy_tsw = iy_tnw + 1;
      int iz_tsw = iz_tnw;
      int ix_tse = ix_tnw + 1;
      int iy_tse = iy_tnw + 1;
      int iz_tse = iz_tnw;
      int ix_bnw = ix_tnw;
      int iy_bnw = iy_tnw;
      int iz_bnw = iz_tnw + 1;
      int ix_bne = ix_tnw + 1;
      int iy_bne = iy_tnw;
      int iz_bne = iz_tnw + 1;
      int ix_bsw = ix_tnw;
      int iy_bsw = iy_tnw + 1;
      int iz_bsw = iz_tnw + 1;
      int ix_bse = ix_tnw + 1;
      int iy_bse = iy_tnw + 1;
      int iz_bse = iz_tnw + 1;
      // get surfaces to each neighbor:
      // Trilinear weight of each corner = volume of the sub-box opposite it.
      T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
      T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
      T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
      T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
      T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
      T bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
      T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
      T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCDHW =
          output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
      // Accumulate the weighted sum of all in-bounds corners per channel;
      // corners outside the input contribute zero.
      for (int c = 0; c < out_c;
           ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
        *out_ptr_NCDHW = static_cast<T>(0);
        if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] *
              tnw;
        }
        if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] *
              tne;
        }
        if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] *
              tsw;
        }
        if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] *
              tse;
        }
        if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] *
              bnw;
        }
        if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] *
              bne;
        }
        if (InBounds3D(iz_bsw, iy_bsw, ix_bsw, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] *
              bsw;
        }
        if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] *
              bse;
        }
      }
    } else if (interpolation_mode == Mode::nearest) {
      int ix_nearest = static_cast<int>(std::round(ix));
      int iy_nearest = static_cast<int>(std::round(iy));
      int iz_nearest = static_cast<int>(std::round(iz));
      // assign nearest neighor pixel value to output pixel
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCDHW =
          output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
      for (int c = 0; c < out_c;
           ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
        if (InBounds3D(iz_nearest, iy_nearest, ix_nearest, in_d, in_h, in_w)) {
          *out_ptr_NCDHW =
              inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH +
                         ix_nearest * inp_sW];
        } else {
          // Nearest point fell outside the input volume: emit zero.
          *out_ptr_NCDHW = static_cast<T>(0);
        }
      }
    }
  }
}
template <typename T, typename Context>
void GridSampleKernel(const Context& dev_ctx,
const DenseTensor& x,
......@@ -193,38 +356,78 @@ void GridSampleKernel(const Context& dev_ctx,
enum_mode = Mode::bilinear;
}
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
<< "; out_w: " << out_w;
auto* output_data = dev_ctx.template Alloc<T>(out);
VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; "
<< out->dims()[2] << "; " << out->dims()[3];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSampleCudaKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
n,
c,
out_h,
out_w,
in_h,
in_w,
x.data<T>(),
grid.data<T>(),
output_data,
enum_mode,
enum_padding_mode,
align_corners);
if (x.dims().size() == 4) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
<< "; out_w: " << out_w;
auto* output_data = dev_ctx.template Alloc<T>(out);
VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; "
<< out->dims()[2] << "; " << out->dims()[3];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSampleCudaKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
n,
c,
out_h,
out_w,
in_h,
in_w,
x.data<T>(),
grid.data<T>(),
output_data,
enum_mode,
enum_padding_mode,
align_corners);
} else {
const int n = grid.dims()[0];
const int out_d = grid.dims()[1];
const int out_h = grid.dims()[2];
const int out_w = grid.dims()[3];
const int c = x.dims()[1];
const int in_d = x.dims()[2];
const int in_h = x.dims()[3];
const int in_w = x.dims()[4];
VLOG(3) << "n: " << n << "; c: " << c << "; out_d: " << out_d
<< "; out_h: " << out_h << "; out_w: " << out_w;
auto* output_data = dev_ctx.template Alloc<T>(out);
VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; "
<< out->dims()[2] << "; " << out->dims()[3] << "; "
<< out->dims()[4];
int count = static_cast<int>(n * out_d * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSample3DCudaKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
c,
out_d,
out_h,
out_w,
in_d,
in_h,
in_w,
x.data<T>(),
grid.data<T>(),
output_data,
enum_mode,
enum_padding_mode,
align_corners);
}
}
} // namespace phi
......
......@@ -27,4 +27,9 @@ static __forceinline__ __device__ bool InBounds(int h, int w, int H, int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
// True when (d, h, w) indexes a valid cell of a D x H x W volume.
static __forceinline__ __device__ bool InBounds3D(
    int d, int h, int w, int D, int H, int W) {
  const bool depth_ok = (d >= 0) && (d < D);
  const bool height_ok = (h >= 0) && (h < H);
  const bool width_ok = (w >= 0) && (w < W);
  return depth_ok && height_ok && width_ok;
}
} // namespace phi
......@@ -20,6 +20,15 @@ from op_test import OpTest, skip_check_grad_ci
paddle.enable_static()
from white_list import (
op_accuracy_white_list,
check_shape_white_list,
compile_vs_runtime_white_list,
no_check_set_white_list,
op_threshold_white_list,
no_grad_set_white_list,
)
def AffineGrid(theta, grid_shape):
n = grid_shape[0]
......@@ -64,6 +73,68 @@ def getGridPointValue(data, x, y):
return out
def AffineGrid3D(theta, grid_shape):
    # Build a normalized homogeneous 3-D sampling grid and transform it by
    # the per-sample affine matrices `theta` (shape [n, 3, 4]).
    # Returns an [n, d, h, w, 3] float64 grid of (x, y, z) coordinates.
    n = grid_shape[0]
    d = grid_shape[1]
    h = grid_shape[2]
    w = grid_shape[3]
    # Normalized coordinates in [-1, 1] along each spatial axis, broadcast
    # to the full (d, h, w) volume with a trailing singleton channel.
    d_idx = np.broadcast_to(
        np.linspace(-1, 1, d)[:, np.newaxis, np.newaxis],
        (d, h, w))[:, :, :, np.newaxis]
    h_idx = np.broadcast_to(
        np.linspace(-1, 1, h)[np.newaxis, :, np.newaxis],
        (d, h, w))[:, :, :, np.newaxis]
    w_idx = np.broadcast_to(
        np.linspace(-1, 1, w)[np.newaxis, np.newaxis, :],
        (d, h, w))[:, :, :, np.newaxis]
    # Homogeneous coordinate order is (x, y, z, 1) -> (w, h, d, 1).
    grid = np.concatenate(
        [w_idx, h_idx, d_idx, np.ones([d, h, w, 1])], axis=3)  # d * h * w * 4
    grid = np.repeat(grid[np.newaxis, :], n, axis=0)  # n * d * h * w * 4
    ret = np.zeros([n, d * h * w, 3])
    # Apply theta per sample: (d*h*w, 4) @ (4, 3) -> (d*h*w, 3).
    theta = theta.transpose([0, 2, 1])
    for i in range(len(theta)):
        ret[i] = np.dot(grid[i].reshape([d * h * w, 4]), theta[i])
    return ret.reshape([n, d, h, w, 3]).astype("float64")
def getGridPointValue3D(data, x, y, z):
    # Gather data[n, :, z, y, x] for every output location, returning an
    # [N, C, out_D, out_H, out_W] float64 array. Points whose (x, y, z)
    # index falls outside the input volume yield 0 for all channels.
    #
    # data: [N, C, in_D, in_H, in_W]; x, y, z: integer index arrays of
    # shape [N, out_D, out_H, out_W].
    #
    # Vectorized with fancy indexing instead of the original quadruple
    # Python loop (same results, much faster on the test sizes).
    data_shape = data.shape
    N = data_shape[0]
    in_D = data_shape[2]
    in_H = data_shape[3]
    in_W = data_shape[4]
    # Mark samples that the original loop would zero out.
    out_of_bounds = ((x < 0) | (x > in_W - 1)
                     | (y < 0) | (y > in_H - 1)
                     | (z < 0) | (z > in_D - 1))
    # Clamp so the gather itself never indexes out of range; clamped
    # values are overwritten with 0 below.
    x_safe = np.clip(x, 0, in_W - 1).astype('int64')
    y_safe = np.clip(y, 0, in_H - 1).astype('int64')
    z_safe = np.clip(z, 0, in_D - 1).astype('int64')
    n_idx = np.arange(N).reshape(N, 1, 1, 1)
    # Advanced indices separated by ':' put the broadcast dims first:
    # result is (N, out_D, out_H, out_W, C); move C back to axis 1.
    gathered = data[n_idx, :, z_safe, y_safe, x_safe]
    out = np.moveaxis(gathered, -1, 1).astype('float64')
    return np.where(out_of_bounds[:, np.newaxis], 0.0, out)
def clip(x, min_n, max_n):
    # Elementwise clamp of x into [min_n, max_n]: cap from above first,
    # then floor from below (so min_n wins if the bounds are inverted).
    capped = np.minimum(x, max_n)
    return np.maximum(capped, min_n)
......@@ -138,6 +209,80 @@ def GridSampler(data,
return out
def GridSampler3D(data,
                  grid,
                  align_corners=True,
                  mode="bilinear",
                  padding_mode="zeros"):
    # NumPy reference implementation of 5-D grid_sample used to generate
    # expected outputs for the operator tests.
    #
    # data: [N, C, in_D, in_H, in_W] input volume.
    # grid: [N, out_D, out_H, out_W, 3] normalized (x, y, z) coordinates.
    # Returns: [N, C, out_D, out_H, out_W] float64 samples.
    # Relies on the sibling helpers unnormalizeAndClip / getGridPointValue3D.
    dims = data.shape
    N = dims[0]
    in_C = dims[1]
    in_D = dims[2]
    in_H = dims[3]
    in_W = dims[4]
    out_D = grid.shape[1]
    out_H = grid.shape[2]
    out_W = grid.shape[3]
    # Split the last grid axis into per-axis coordinate volumes.
    x = grid[:, :, :, :, 0]
    y = grid[:, :, :, :, 1]
    z = grid[:, :, :, :, 2]
    z_max = in_D - 1
    y_max = in_H - 1
    x_max = in_W - 1
    # Map normalized [-1, 1] coordinates to pixel space, applying the
    # padding mode (zeros / border / reflection).
    x = unnormalizeAndClip(x, x_max, align_corners, padding_mode)
    y = unnormalizeAndClip(y, y_max, align_corners, padding_mode)
    z = unnormalizeAndClip(z, z_max, align_corners, padding_mode)
    if mode == "bilinear":
        # Floor/ceil corner indices of the surrounding unit cube.
        x0 = np.floor(x).astype('int32')
        x1 = x0 + 1
        y0 = np.floor(y).astype('int32')
        y1 = y0 + 1
        z0 = np.floor(z).astype('int32')
        z1 = z0 + 1
        # Trilinear weight of each corner = volume of the opposite sub-box,
        # tiled across the channel axis.
        w_tnw = np.tile(((x1 - x) * (y1 - y) * (z1 - z)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_tne = np.tile(((x - x0) * (y1 - y) * (z1 - z)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_tsw = np.tile(((x1 - x) * (y - y0) * (z1 - z)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_tse = np.tile(((x - x0) * (y - y0) * (z1 - z)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_bnw = np.tile(((x1 - x) * (y1 - y) * (z - z0)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_bne = np.tile(((x - x0) * (y1 - y) * (z - z0)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_bsw = np.tile(((x1 - x) * (y - y0) * (z - z0)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_bse = np.tile(((x - x0) * (y - y0) * (z - z0)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        # Gather the eight corner values (out-of-bounds corners read as 0).
        v_tnw = getGridPointValue3D(data, x0, y0, z0)
        v_tne = getGridPointValue3D(data, x1, y0, z0)
        v_tsw = getGridPointValue3D(data, x0, y1, z0)
        v_tse = getGridPointValue3D(data, x1, y1, z0)
        v_bnw = getGridPointValue3D(data, x0, y0, z1)
        v_bne = getGridPointValue3D(data, x1, y0, z1)
        v_bsw = getGridPointValue3D(data, x0, y1, z1)
        v_bse = getGridPointValue3D(data, x1, y1, z1)
        # Weighted sum over the eight corners.
        out = (w_tnw * v_tnw + w_tne * v_tne + w_tsw * v_tsw + w_tse * v_tse +
               w_bnw * v_bnw + w_bne * v_bne + w_bsw * v_bsw +
               w_bse * v_bse).astype('float64')
    elif mode == "nearest":
        # Round to the nearest cell and gather directly.
        x = np.round(x).astype('int32')
        y = np.round(y).astype('int32')
        z = np.round(z).astype('int32')
        out = getGridPointValue3D(data, x, y, z)
    return out
class TestGridSamplerOp(OpTest):
def setUp(self):
......@@ -150,36 +295,90 @@ class TestGridSamplerOp(OpTest):
self.mode = "bilinear"
self.initTestCase()
x = np.random.randint(0, 255, self.x_shape).astype('float64')
theta = np.zeros(self.theta_shape).astype('float64')
for i in range(self.theta_shape[0]):
for j in range(2):
for k in range(3):
theta[i, j, k] = np.random.rand(1)[0]
grid = AffineGrid(theta, self.grid_shape)
self.inputs = {'X': x, 'Grid': grid}
self.attrs = {
'use_cudnn': self.use_cudnn,
"align_corners": self.align_corners,
"padding_mode": self.padding_mode,
"mode": self.mode
}
self.outputs = {
'Output':
GridSampler(x, grid, self.align_corners, self.mode,
self.padding_mode)
}
if len(self.grid_shape) == 4:
for i in range(self.theta_shape[0]):
for j in range(2):
for k in range(3):
theta[i, j, k] = np.random.rand(1)[0]
grid = AffineGrid(theta, self.grid_shape)
self.inputs = {'X': x, 'Grid': grid}
self.attrs = {
'use_cudnn': self.use_cudnn,
"align_corners": self.align_corners,
"padding_mode": self.padding_mode,
"mode": self.mode
}
self.outputs = {
'Output':
GridSampler(x, grid, self.align_corners, self.mode,
self.padding_mode)
}
else:
for i in range(self.theta_shape[0]):
for j in range(3):
for k in range(4):
theta[i, j, k] = np.random.rand(1)[0]
grid = AffineGrid3D(theta, self.grid_shape)
self.inputs = {'X': x, 'Grid': grid}
self.attrs = {
'use_cudnn': self.use_cudnn,
"align_corners": self.align_corners,
"padding_mode": self.padding_mode,
"mode": self.mode
}
self.outputs = {
'Output':
GridSampler3D(x, grid, self.align_corners, self.mode,
self.padding_mode)
}
def get_places(self):
    # Restrict this test to CUDA devices; an empty list skips execution
    # on CPU-only builds.
    if not core.is_compiled_with_cuda():
        return []
    return [core.CUDAPlace(0)]
def test_check_output(self):
self.check_output(check_eager=True)
if len(self.grid_shape) == 4:
self.check_output(check_eager=True)
else:
check_eager_flag = True
check_dygraph_flag = False
for place in self.get_places():
res = self.check_output_with_place(
place,
atol=1e-5,
check_dygraph=check_dygraph_flag,
check_eager=check_eager_flag)
if check_eager_flag:
assert check_dygraph_flag == False
outs, eager_dygraph_outs, fetch_list = res
elif check_dygraph_flag:
uts, dygraph_outs, fetch_list = res
else:
outs, fetch_list = res
if self.op_type not in compile_vs_runtime_white_list.COMPILE_RUN_OP_WHITE_LIST:
self.check_compile_vs_runtime(fetch_list, outs)
def test_check_grad_normal(self):
self.check_grad(['X', 'Grid'],
'Output',
max_relative_error=0.01,
numeric_grad_delta=self.numeric_grad_delta,
check_eager=True)
if len(self.grid_shape) == 4:
self.check_grad(['X', 'Grid'],
'Output',
max_relative_error=0.01,
numeric_grad_delta=self.numeric_grad_delta,
check_eager=True)
else:
self._check_grad_helper()
for place in self.get_places():
self.check_grad_with_place(
place, ['X'],
'Output',
numeric_grad_delta=self.numeric_grad_delta,
max_relative_error=0.01,
check_eager=True,
check_dygraph=False)
def initTestCase(self):
self.x_shape = (2, 3, 8, 8)
......@@ -285,5 +484,102 @@ class Case5(LargeInputCase):
self.use_cudnn = False if core.is_compiled_with_rocm() else True
class Case6(TestGridSamplerOp):
    # 5-D (volumetric) input: bilinear sampling, zero padding,
    # align_corners disabled.

    def initTestCase(self):
        self.x_shape = (2, 3, 5, 6, 7)      # NCDHW input
        self.grid_shape = (2, 8, 9, 10, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)        # affine matrix per sample
        self.align_corners = False
        self.padding_mode = "zeros"
        self.mode = "bilinear"
class Case6_(TestGridSamplerOp):
    # 5-D input with border padding; overrides get_places to run on
    # CUDA only.

    def get_places(self):
        # CUDA-only: an empty list skips the test on CPU builds.
        places = []
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        return places

    def initTestCase(self):
        self.x_shape = (2, 3, 5, 6, 7)      # NCDHW input
        self.grid_shape = (2, 8, 9, 10, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = False
        self.padding_mode = "border"
        self.mode = "bilinear"
class Case7(TestGridSamplerOp):
    # 5-D input: bilinear sampling, reflection padding,
    # align_corners disabled.

    def initTestCase(self):
        self.x_shape = (2, 3, 5, 6, 7)      # NCDHW input
        self.grid_shape = (2, 8, 9, 10, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = False
        self.padding_mode = "reflection"
        self.mode = "bilinear"
class Case8(TestGridSamplerOp):
    # 5-D input: bilinear sampling, reflection padding,
    # align_corners enabled.

    def initTestCase(self):
        self.x_shape = (2, 3, 5, 6, 7)      # NCDHW input
        self.grid_shape = (2, 8, 9, 10, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = True
        self.padding_mode = "reflection"
        self.mode = "bilinear"
class Case9(TestGridSamplerOp):
    # 5-D input: nearest-neighbor sampling, reflection padding; uses a
    # smaller numeric gradient delta for the finite-difference check.

    def initTestCase(self):
        self.x_shape = (2, 3, 5, 6, 7)      # NCDHW input
        self.grid_shape = (2, 8, 9, 10, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = False
        self.padding_mode = "reflection"
        self.mode = "nearest"
        self.numeric_grad_delta = 0.0001
@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
                    "however it is desirable to cover the forward pass")
class LargeInput3DCase(TestGridSamplerOp):
    # Forward-only coverage for a large 5-D input; the gradient check is
    # skipped (see decorator) and test_check_grad_normal is a no-op.

    def initTestCase(self):
        self.no_need_check_grad = True
        self.x_shape = (2, 3, 24, 24, 12)    # large NCDHW input
        self.grid_shape = (2, 25, 25, 12, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = False
        self.padding_mode = "reflection"
        self.mode = "bilinear"
        self.use_cudnn = False
        self.__class__.op_type = 'grid_sampler'

    def test_check_grad_normal(self):
        # Intentionally empty: gradient checking is skipped for large inputs.
        pass
@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
                    "however it is desirable to cover the forward pass")
class Case10(LargeInput3DCase):
    # Large 5-D input variant: zero padding with align_corners enabled;
    # inherits the skipped gradient check from LargeInput3DCase.

    def initTestCase(self):
        self.no_need_check_grad = True
        self.x_shape = (2, 3, 24, 24, 12)    # large NCDHW input
        self.grid_shape = (2, 25, 25, 12, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = True
        self.padding_mode = "zeros"
        self.mode = "bilinear"
        self.use_cudnn = False
        self.__class__.op_type = 'grid_sampler'
if __name__ == "__main__":
    # Run the grid_sampler unit tests when this file is executed directly.
    unittest.main()
......@@ -127,12 +127,21 @@ def grid_sample(x,
"""
This operation samples input X by using bilinear interpolation or
nearest interpolation based on flow field grid, which is usually
generated by :code:`affine_grid` . The grid of shape [N, H, W, 2]
is the concatenation of (x, y) coordinates with shape [N, H, W] each,
where x is indexing the 4th dimension (in width dimension) of input
data x and y is indexing the 3rd dimension (in height dimension),
finally results is the bilinear interpolation or nearest value of 4 nearest corner
points. The output tensor shape will be [N, C, H, W].
generated by :code:`affine_grid` . When the input X is 4-D Tensor,
the grid of shape [N, H, W, 2] is the concatenation of (x, y)
coordinates with shape [N, H, W] each, where x is indexing the 4th
dimension (in width dimension) of input data x and y is indexing
the 3rd dimension (in height dimension), finally results is the
bilinear interpolation or nearest value of 4 nearest corner
points. The output tensor shape will be [N, C, H, W]. When the input X
is a 5-D Tensor, the grid of shape [N, D, H, W, 3] is the concatenation
of (x, y, z) coordinates with shape [N, D, H, W] each, where x is
indexing the 5th dimension (in width dimension) of input data x, y is
indexing the 4th dimension (in height dimension) and z is indexing the
3rd dimension (in depth dimension); finally, the result is the bilinear
interpolation or nearest value of the 8 nearest corner points. The output
tensor shape will be [N, C, D, H, W].
Step 1:
......@@ -181,11 +190,13 @@ def grid_sample(x,
Args:
x(Tensor): The input tensor, which is a 4-d tensor with shape
[N, C, H, W], N is the batch size, C is the channel
number, H and W is the feature height and width.
[N, C, H, W] or a 5-d tensor with shape [N, C, D, H, W],
N is the batch size, C is the channel number,
D, H and W is the feature depth, height and width.
The data type is float32 or float64.
grid(Tensor): Input grid tensor of shape [N, grid_H, grid_W, 2]. The
data type is float32 or float64.
grid(Tensor): Input grid tensor, which is a 4-d tensor with shape [N, grid_H,
grid_W, 2] or a 5-d tensor with shape [N, grid_D, grid_H,
grid_W, 3]. The data type is float32 or float64.
mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'.
Default: 'bilinear'.
padding_mode(str, optional): The padding method used when source index
......@@ -199,7 +210,8 @@ def grid_sample(x,
None by default.
Returns:
Tensor, The shape of output is [N, C, grid_H, grid_W] in which `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor.
Tensor, The shape of output is [N, C, grid_H, grid_W] or [N, C, grid_D, grid_H, grid_W] in which `grid_D` is the depth of grid,
`grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor.
Examples:
......@@ -207,31 +219,24 @@ def grid_sample(x,
import paddle
import paddle.nn.functional as F
import numpy as np
# shape=[1, 1, 3, 3]
x = np.array([[[[-0.6, 0.8, -0.5],
[-0.5, 0.2, 1.2],
[ 1.4, 0.3, -0.2]]]]).astype("float64")
# x shape=[1, 1, 3, 3]
x = paddle.to_tensor([[[[-0.6, 0.8, -0.5],
[-0.5, 0.2, 1.2],
[ 1.4, 0.3, -0.2]]]],dtype='float64')
# grid shape = [1, 3, 4, 2]
grid = np.array(
[[[[ 0.2, 0.3],
[-0.4, -0.3],
[-0.9, 0.3],
[-0.9, -0.6]],
[[ 0.4, 0.1],
[ 0.9, -0.8],
[ 0.4, 0.5],
[ 0.5, -0.2]],
[[ 0.1, -0.8],
[-0.3, -1. ],
[ 0.7, 0.4],
[ 0.2, 0.8]]]]).astype("float64")
x = paddle.to_tensor(x)
grid = paddle.to_tensor(grid)
grid = paddle.to_tensor([[[[ 0.2, 0.3],
[-0.4, -0.3],
[-0.9, 0.3],
[-0.9, -0.6]],
[[ 0.4, 0.1],
[ 0.9, -0.8],
[ 0.4, 0.5],
[ 0.5, -0.2]],
[[ 0.1, -0.8],
[-0.3, -1. ],
[ 0.7, 0.4],
[ 0.2, 0.8]]]],dtype='float64')
y_t = F.grid_sample(
x,
grid,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册