【2.0 API】Add CUDA kernel and enhance options for grid_sample (#26576)

This PR enhances the CPU kernel and adds a new CUDA kernel so that grid_sample supports: - align_corners: a bool. - padding mode: one of ['zeros', 'reflect', 'border']. - Interpolation mode: one of ['bilinear', 'nearest']. The old CPU and CUDNN versions only support align_corners=true, padding_mode='zeros' and interpolation_mode='bilinear'. The behavior of the new version of the op in its default mode is compatible with the old version.

【2.0 API】Add CUDA kernel and enhance options for grid_sample (#26576)
This PR enhances the CPU kernel and adds a new CUDA kernel so that grid_sample supports: - align_corners: a bool. - padding mode: one of ['zeros', 'reflect', 'border']. - Interpolation mode: one of ['bilinear', 'nearest']. The old CPU and CUDNN versions only support align_corners=true, padding_mode='zeros' and interpolation_mode='bilinear'. The behavior of the new version of the op in its default mode is compatible with the old version.
79539cf1 · whs · GitHub · 39fe0d35 · 79539cf1 · 79539cf1
8 changed files
--- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
@@ -41,13 +41,14 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {

    int n = input->dims()[0];
    int c = input->dims()[1];
-    int h = input->dims()[2];
-    int w = input->dims()[3];
-    const int size[4] = {n, c, h, w};
+    int out_h = grid->dims()[1];
+    int out_w = grid->dims()[2];
+    const int size[4] = {n, c, out_h, out_w};

    const T* input_data = input->data<T>();
    const T* grid_data = grid->data<T>();
-    T* output_data = output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    T* output_data =
+        output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());

    ScopedSpatialTransformerDescriptor st_desc;
    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
@@ -97,7 +98,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
    const T* grid_data = grid->data<T>();
    const T* output_grad_data = output_grad->data<T>();
    T* input_grad_data =
-        input_grad->mutable_data<T>(output_grad_dims, ctx.GetPlace());
+        input_grad->mutable_data<T>(input->dims(), ctx.GetPlace());
    T* grid_grad_data =
        grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());


--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */

 #include "paddle/fluid/operators/grid_sampler_op.h"
 #include <memory>
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
@@ -58,21 +59,10 @@ class GridSampleOp : public framework::OperatorWithKernel {
              "Input(X) and Input(Grid) dimension[0] should be equal, but "
              "received X dimension[0](%d) != Grid dimension[0](%d)",
              x_dims[0], grid_dims[0]));
-      PADDLE_ENFORCE_EQ(
-          grid_dims[1], x_dims[2],
-          platform::errors::InvalidArgument(
-              "Input(X) dims[2] and Input(Grid) dims[1] should be equal, but "
-              "received X dimension[2](%d) != Grid dimension[1](%d)",
-              x_dims[2], grid_dims[1]));
-      PADDLE_ENFORCE_EQ(
-          grid_dims[2], x_dims[3],
-          platform::errors::InvalidArgument(
-              "Input(X) dims[3] and Input(Grid) dims[2] should be equal, but "
-              "received X dimension[3](%d) != Grid dimension[2](%d)",
-              x_dims[3], grid_dims[2]));
    }

-    ctx->SetOutputDim("Output", x_dims);
+    ctx->SetOutputDim("Output",
+                      {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
    ctx->ShareLoD("X", "Output");
  }

@@ -108,15 +98,37 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
        "(bool, default true) Only used in cudnn kernel, need install cudnn")
        .SetDefault(true);

+    AddAttr<bool>(
+        "align_corners",
+        "(bool, default true) If align_corners is true, it will project "
+        "-1 and 1 to the centers of the corner pixels. Otherwise, it will "
+        "project "
+        "-1 and 1 to the image edges.")
+        .SetDefault(true);
+
+    AddAttr<std::string>(
+        "mode",
+        "(string, default bilinear) The interpolation method which can be "
+        "'bilinear' or 'nearest'.")
+        .SetDefault("bilinear");
+
+    AddAttr<std::string>(
+        "padding_mode",
+        "(string, default zeros) The padding method used when source "
+        "index is out of input images. It can be 'zeros', 'reflect' and "
+        "'border'.")
+        .SetDefault("zeros");
+
    AddComment(R"DOC(
-      This operation samples input X by using bilinear interpolation based on 
+      This operation samples input X by using bilinear or nearest interpolation based on 
      flow field grid, which is usually generated by affine_grid. The grid of
      shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates 
      with shape [N, H, W] each, where grid_x is indexing the 4th dimension 
      (in width dimension) of input data x and grid_y is indexing the 3rd 
      dimension (in height dimension), finally results is the bilinear 
-      interpolation value of 4 nearest corner points.
+      interpolation value or nearest value of 4 nearest corner points.

+      For bilinear interpolation mode:
      Step 1:
        Get (x, y) grid coordinates and scale to [0, H-1/W-1].


--- a/paddle/fluid/operators/grid_sampler_op.cu
+++ b/paddle/fluid/operators/grid_sampler_op.cu
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/grid_sampler_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+// Returns true when (h, w) lies inside an H x W plane, i.e. 0 <= h < H and
+// 0 <= w < W.
+static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) {
+  const bool row_ok = (h >= 0) && (h < H);
+  const bool col_ok = (w >= 0) && (w < W);
+  return row_ok && col_ok;
+}
+
+// Atomically adds `delta` to the element at row `h`, column `w` of the plane
+// that starts at `data`, using row stride `sH` and column stride `sW`.
+// Positions outside the H x W plane are silently skipped.
+template <typename T>
+static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH,
+                                                  int sW, int H, int W,
+                                                  T delta) {
+  if (!in_bounds(h, w, H, W)) {
+    return;
+  }
+  T* target = data + h * sH + w * sW;
+  atomicAdd(target, delta);
+}
+
+// Maps a normalized coordinate to a pixel coordinate along an axis of length
+// `size`.
+//   align_corners == true : -1 -> 0     and 1 -> size - 1
+//   align_corners == false: -1 -> -0.5  and 1 -> size - 0.5
+template <typename T>
+static __forceinline__ __device__ T _unnormalize(T coord, int size,
+                                                 bool align_corners) {
+  const T shifted = coord + 1.f;
+  return align_corners ? (shifted / 2) * (size - 1)
+                       : (shifted * size - 1) / 2;
+}
+
+// Clamps `in` to the closed range [0, max_value].
+template <typename T>
+static __forceinline__ __device__ T clip_indexes(T in, int max_value) {
+  const T lower = static_cast<T>(0);
+  const T upper = static_cast<T>(max_value);
+  const T raised = max(in, lower);
+  return min(upper, raised);
+}
+
+// Reflects `in` back into the interval [twice_low / 2, twice_high / 2].
+// The bounds are passed doubled so that half-integer limits (e.g. -0.5) can be
+// given as ints. A degenerate interval (twice_low == twice_high) yields 0.
+template <typename T>
+static __forceinline__ __device__ T reflect_indexes(T in, int twice_low,
+                                                    int twice_high) {
+  if (twice_low == twice_high) {
+    return static_cast<T>(0);
+  }
+  const T low = static_cast<T>(twice_low) / 2;
+  const T span = static_cast<T>(twice_high - twice_low) / 2;
+  // Distance from the lower bound; the sign is irrelevant for reflection.
+  const T dist = fabs(in - low);
+  const T extra = fmod(dist, span);
+  // An even number of whole spans keeps the direction, an odd number mirrors.
+  const int flips = static_cast<int>(floor(dist / span));
+  const bool even = (flips % 2 == 0);
+  return even ? extra + low : span - extra + low;
+}
+
+// Converts a normalized grid coordinate into a source pixel coordinate along
+// an axis of length `size`, applying the requested padding policy:
+//   border  -> clamp to [0, size - 1]
+//   reflect -> reflect into the valid interval, then clamp to [0, size - 1]
+//   other   -> coordinate is returned unnormalized but otherwise untouched
+template <typename T>
+static __forceinline__ __device__ T compute_positions(T coord, int size,
+                                                      PaddingMode padding_mode,
+                                                      bool align_corners) {
+  coord = _unnormalize<T>(coord, size, align_corners);
+
+  const bool is_reflect = (padding_mode == PaddingMode::reflect);
+  const bool is_border = (padding_mode == PaddingMode::border);
+
+  if (is_reflect) {
+    // The reflection interval depends on where -1 / 1 were mapped to.
+    coord = align_corners ? reflect_indexes(coord, 0, 2 * (size - 1))
+                          : reflect_indexes(coord, -1, 2 * size - 1);
+  }
+  if (is_border || is_reflect) {
+    coord = clip_indexes(coord, size - 1);
+  }
+  return coord;
+}
+
+// Same mapping as _unnormalize, and additionally stores in `*grad_in` the
+// derivative of the returned pixel coordinate with respect to `coord`
+// ((size - 1) / 2 when align_corners is true, size / 2 otherwise).
+template <typename T>
+static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size,
+                                                           bool align_corners,
+                                                           T* grad_in) {
+  const T shifted = coord + 1.f;
+  if (!align_corners) {
+    *grad_in = static_cast<T>(size) / 2;
+    return (shifted * size - 1) / 2;
+  }
+  *grad_in = static_cast<T>(size - 1) / 2;
+  return (shifted / 2) * (size - 1);
+}
+
+// Clamps `in` to [0, clip_limit - 1] and reports through `*grad_in` whether the
+// value passed through unchanged (1) or was clamped (0).
+template <typename T>
+static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit,
+                                                           T* grad_in) {
+  const T zero = static_cast<T>(0);
+  if (in <= zero) {
+    *grad_in = zero;
+    return zero;
+  }
+  const T upper = static_cast<T>(clip_limit - 1);
+  if (in >= upper) {
+    *grad_in = zero;
+    return upper;
+  }
+  *grad_in = static_cast<T>(1);
+  return in;
+}
+
+// Reflects `in` into [twice_low / 2, twice_high / 2] like reflect_indexes and
+// writes the sign of the mapping's slope (+1 or -1) to `*grad_in`. For a
+// degenerate interval both the result and the gradient are 0.
+template <typename T>
+static __forceinline__ __device__ T
+reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) {
+  if (twice_low == twice_high) {
+    *grad_in = static_cast<T>(0);
+    return static_cast<T>(0);
+  }
+  const T low = static_cast<T>(twice_low) / 2;
+  const T span = static_cast<T>(twice_high - twice_low) / 2;
+
+  // Work with the absolute distance from the lower bound and remember the
+  // sign that was dropped.
+  T dist = in - low;
+  int sign = 1;
+  if (dist < static_cast<T>(0)) {
+    sign = -1;
+    dist = -dist;
+  }
+
+  const T extra = fmod(dist, span);
+  const int flips = static_cast<int>(floor(dist / span));
+  if (flips % 2 != 0) {
+    // Odd number of reflections: direction is mirrored.
+    *grad_in = static_cast<T>(-sign);
+    return span - extra + low;
+  }
+  *grad_in = static_cast<T>(sign);
+  return extra + low;
+}
+
+// Gradient-aware variant of compute_positions: returns the source pixel
+// coordinate and stores in `*grad_in` the product of the derivatives of every
+// step applied (unnormalize, optional reflect, optional clip).
+template <typename T>
+static __forceinline__ __device__ T
+compute_positions_with_mask(T coord, int size, PaddingMode padding_mode,
+                            bool align_corners, T* grad_in) {
+  coord = _unnormalize_with_mask<T>(coord, size, align_corners, grad_in);
+
+  if (padding_mode == PaddingMode::border) {
+    T clip_grad;
+    coord = clip_indexes_with_mask(coord, size, &clip_grad);
+    *grad_in = (*grad_in) * clip_grad;
+    return coord;
+  }
+
+  if (padding_mode == PaddingMode::reflect) {
+    T refl_grad;
+    T clip_grad;
+    coord = align_corners
+                ? reflect_indexes_with_mask(coord, 0, 2 * (size - 1),
+                                            &refl_grad)
+                : reflect_indexes_with_mask(coord, -1, 2 * size - 1,
+                                            &refl_grad);
+    coord = clip_indexes_with_mask(coord, size, &clip_grad);
+    *grad_in = (*grad_in) * refl_grad * clip_grad;
+  }
+
+  return coord;
+}
+
+// Forward grid-sample kernel.
+//
+// Each loop index identifies one output location (n, h, w); the channel loop
+// inside the body fills that location for all `out_c` channels.
+//
+//   nthreads      number of (n, h, w) locations to process
+//   n             batch size (shadowed inside the loop by the per-index n)
+//   out_c         channel count shared by input and output
+//   out_h, out_w  spatial size of grid/output
+//   in_h, in_w    spatial size of input
+//   input         read with offsets n*C*H*W + c*H*W + y*W + x
+//   grid          read with offsets n*out_h*out_w*2 + h*out_w*2 + w*2 + {0,1},
+//                 where element 0 is x and element 1 is y
+//   output        written with offsets n*C*out_h*out_w + c*out_h*out_w + ...
+//   mode          bilinear or nearest sampling
+//   padding_mode / align_corners  forwarded to compute_positions
+template <typename T>
+__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
+                                        int out_h, int out_w, int in_h,
+                                        int in_w, const T* input, const T* grid,
+                                        T* output, const Mode mode,
+                                        const PaddingMode padding_mode,
+                                        bool align_corners) {
+  // Element strides, computed as for densely packed buffers.
+  int inp_sN = out_c * in_h * in_w;
+
+  int inp_sC = in_h * in_w;
+  int inp_sH = in_w;
+  int inp_sW = 1;
+  int grid_sN = out_h * out_w * 2;
+  int grid_sH = out_w * 2;
+  int grid_sW = 2;
+  int grid_sCoor = 1;
+  int out_sN = out_c * out_h * out_w;
+  int out_sC = out_h * out_w;
+  int out_sH = out_w;
+  int out_sW = 1;
+
+  // CUDA_KERNEL_LOOP is defined outside this file; presumably it iterates
+  // `index` over [0, nthreads) for this thread -- confirm against its
+  // definition.
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    // Decompose the flat index into (n, h, w) of the output location.
+    const int w = index % out_w;
+    const int h = (index / out_w) % out_h;
+    const int n = index / (out_h * out_w);
+    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
+
+    // Normalized sampling coordinates: x first, then y.
+    T ix = grid[grid_offset];
+    T iy = grid[grid_offset + grid_sCoor];
+
+    // Convert to source pixel coordinates (with padding handling).
+    ix = compute_positions(ix, in_w, padding_mode, align_corners);
+    iy = compute_positions(iy, in_h, padding_mode, align_corners);
+
+    if (mode == Mode::bilinear) {
+      // Integer coordinates of the four pixels surrounding (ix, iy).
+      int ix_nw = static_cast<int>(floor(ix));
+      int iy_nw = static_cast<int>(floor(iy));
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;
+
+      // Bilinear weights: each corner is weighted by the area of the
+      // rectangle spanned by (ix, iy) and the diagonally opposite corner.
+      T nw = (ix_se - ix) * (iy_se - iy);
+      T ne = (ix - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix) * (iy - iy_ne);
+      T se = (ix - ix_nw) * (iy - iy_nw);
+
+      auto inp_offset_NC = n * inp_sN;
+      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
+      // Walk over channels, advancing the input offset and output pointer by
+      // one channel plane per iteration.
+      for (int c = 0; c < out_c;
+           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
+        // Corners outside the input contribute nothing.
+        *out_ptr_NCHW = static_cast<T>(0);
+        if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
+        }
+        if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
+        }
+        if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
+        }
+        if (in_bounds(iy_se, ix_se, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
+        }
+      }
+    } else if (mode == Mode::nearest) {
+      // Pick the single closest source pixel.
+      int ix_nearest = static_cast<int>(round(ix));
+      int iy_nearest = static_cast<int>(round(iy));
+
+      auto inp_offset_NC = n * inp_sN;
+      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
+      for (int c = 0; c < out_c;
+           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
+        if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) {
+          *out_ptr_NCHW =
+              input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
+        } else {
+          // Out-of-range samples produce zero.
+          *out_ptr_NCHW = static_cast<T>(0);
+        }
+      }
+    }
+  }
+}
+
+// CUDA forward kernel of the grid_sampler op.
+//
+// Reads inputs "X" (indexed here as [N, C, in_h, in_w]) and "Grid" (indexed as
+// [N, out_h, out_w, ...]) plus the attributes "align_corners", "padding_mode"
+// and "mode", and writes "Output" by launching grid_sample_cuda_kernel with
+// one thread per (n, out_h, out_w) location.
+template <typename T>
+class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
+    auto mode_s = ctx.Attr<std::string>("mode");
+
+    // Translate the string attributes into enums; any unrecognized value
+    // falls back to zeros padding / bilinear interpolation.
+    PaddingMode padding_mode;
+    Mode mode;
+    if (padding_mode_s == "border") {
+      padding_mode = PaddingMode::border;
+    } else if (padding_mode_s == "reflect") {
+      padding_mode = PaddingMode::reflect;
+    } else {
+      padding_mode = PaddingMode::zeros;
+    }
+
+    if (mode_s == "nearest") {
+      mode = Mode::nearest;
+    } else {
+      mode = Mode::bilinear;
+    }
+
+    auto* input = ctx.Input<Tensor>("X");
+    auto* grid = ctx.Input<Tensor>("Grid");
+    const int n = grid->dims()[0];
+    const int out_h = grid->dims()[1];
+    const int out_w = grid->dims()[2];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
+    VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
+            << "; out_w: " << out_w;
+    auto* output = ctx.Output<Tensor>("Output");
+    auto* output_data = output->mutable_data<T>(ctx.GetPlace());
+
+    VLOG(3) << "set constant";
+    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
+        dev_ctx, output, static_cast<T>(0));
+    int count = static_cast<int>(n * out_h * out_w);
+    if (count <= 0) {
+      // Nothing to sample; a launch with zero blocks would be an invalid
+      // configuration.
+      return;
+    }
+
+    auto cu_stream = dev_ctx.stream();
+
+    // The execution configuration is <<<blocks, threads per block, ...>>>.
+    // The block count must come first: swapping the two makes the launch
+    // fail as soon as the block count exceeds the per-block thread limit.
+    const int block = 512;
+    const int grid_size = (count + block - 1) / block;
+    grid_sample_cuda_kernel<T><<<grid_size, block, 0, cu_stream>>>(
+        count, n, c, out_h, out_w, in_h, in_w, input->data<T>(),
+        grid->data<T>(), output_data, mode, padding_mode, align_corners);
+  }
+};
+
+// Backward grid-sample kernel.
+//
+// For every output location (n, h, w) it
+//   * scatters grad_output into grad_input with atomic adds (several output
+//     locations may sample the same input pixel), and
+//   * writes the gradient w.r.t. the two grid coordinates into grad_grid.
+//
+//   nthreads      number of (n, h, w) locations to process
+//   grad_output   gradient of the op output, indexed like the forward output
+//   input, grid   the forward inputs
+//   n             batch size (shadowed inside the loop by the per-index n)
+//   out_c         channel count; out_h/out_w and in_h/in_w are the spatial
+//                 sizes of grid/output and input respectively
+//   grad_input    accumulated into; the caller is expected to have zeroed it
+//   grad_grid     two values written per location (d/dx, d/dy)
+//   mode / padding_mode / align_corners  same meaning as in the forward pass
+template <typename T>
+__global__ void grid_sampler_cuda_backward_kernel(
+    const int nthreads, const T* grad_output, const T* input, const T* grid,
+    int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input,
+    T* grad_grid, const Mode mode, const PaddingMode padding_mode,
+    bool align_corners) {
+  // Element strides, computed as for densely packed buffers.
+  int inp_sN = out_c * in_h * in_w;
+  int inp_sC = in_h * in_w;
+  int inp_sH = in_w;
+  int inp_sW = 1;
+  int grid_sN = out_h * out_w * 2;
+  int grid_sH = out_w * 2;
+  int grid_sW = 2;
+  int grid_sCoor = 1;
+
+  int gOut_sN = out_c * out_h * out_w;
+  int gOut_sC = out_h * out_w;
+  int gOut_sH = out_w;
+  int gOut_sW = 1;
+
+  // CUDA_KERNEL_LOOP is defined outside this file; presumably it iterates
+  // `index` over [0, nthreads) for this thread -- confirm against its
+  // definition.
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    // Decompose the flat index into (n, h, w) of the output location.
+    const int w = index % out_w;
+    const int h = (index / out_w) % out_h;
+    const int n = index / (out_h * out_w);
+    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
+
+    T ix = grid[grid_offset];
+    T iy = grid[grid_offset + grid_sCoor];
+
+    // gix_mult / giy_mult receive d(pixel coordinate) / d(grid value) for the
+    // unnormalize + padding transformation.
+    T gix_mult, giy_mult;
+    ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners,
+                                     &gix_mult);
+    iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners,
+                                     &giy_mult);
+
+    if (mode == Mode::bilinear) {
+      // Integer coordinates of the four pixels surrounding (ix, iy).
+      int ix_nw = static_cast<int>(floor(ix));
+      int iy_nw = static_cast<int>(floor(iy));
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;
+
+      // Same bilinear weights as in the forward kernel.
+      T nw = (ix_se - ix) * (iy_se - iy);
+      T ne = (ix - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix) * (iy - iy_ne);
+      T se = (ix - ix_nw) * (iy - iy_nw);
+
+      // Gradient w.r.t. the pixel coordinates, summed over all channels.
+      T gix = static_cast<T>(0), giy = static_cast<T>(0);
+      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
+      T* gInp_ptr_NC = grad_input + n * inp_sN;
+      int inp_offset_NC = n * inp_sN;
+      for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC,
+               gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
+        T gOut = grad_output[gOut_offset];
+
+        // d(output)/d(input pixel) is the pixel's bilinear weight;
+        // atomic_add skips corners that fall outside the input.
+        atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w,
+                   nw * gOut);
+        atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w,
+                   ne * gOut);
+        atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w,
+                   sw * gOut);
+        atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w,
+                   se * gOut);
+
+        // d(output)/d(ix) and d(output)/d(iy): derivative of each corner's
+        // weight times that corner's input value (in-bounds corners only).
+        if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
+          T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
+          gix -= nw_val * (iy_se - iy) * gOut;
+          giy -= nw_val * (ix_se - ix) * gOut;
+        }
+        if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
+          T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
+          gix += ne_val * (iy_sw - iy) * gOut;
+          giy -= ne_val * (ix - ix_sw) * gOut;
+        }
+        if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
+          T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
+          gix -= sw_val * (iy - iy_ne) * gOut;
+          giy += sw_val * (ix_ne - ix) * gOut;
+        }
+        if (in_bounds(iy_se, ix_se, in_h, in_w)) {
+          T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
+          gix += se_val * (iy - iy_nw) * gOut;
+          giy += se_val * (ix - ix_nw) * gOut;
+        }
+      }
+
+      // Chain rule: scale by the derivative of the coordinate transformation.
+      T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
+      gGrid_ptr_NHW[0] = gix_mult * gix;
+      gGrid_ptr_NHW[1] = giy_mult * giy;
+    } else if (mode == Mode::nearest) {
+      int ix_nearest = static_cast<int>(::round(ix));
+      int iy_nearest = static_cast<int>(::round(iy));
+
+      // The whole output gradient flows to the single selected pixel.
+      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
+      T* gInp_ptr_NC = grad_input + n * inp_sN;
+      for (int c = 0; c < out_c;
+           ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
+        atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h,
+                   in_w, grad_output[gOut_offset]);
+      }
+
+      // Nearest sampling is written with a zero gradient w.r.t. the grid.
+      T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
+      gGrid_ptr_NHW[0] = static_cast<T>(0);
+      gGrid_ptr_NHW[1] = static_cast<T>(0);
+    }
+  }
+}
+
+// CUDA backward kernel of the grid_sampler op.
+//
+// Reads "X", "Grid" and the gradient of "Output", zero-fills the gradients of
+// "X" and "Grid", then launches grid_sampler_cuda_backward_kernel with one
+// thread per (n, out_h, out_w) location to fill them in.
+template <typename T>
+class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
+    auto mode_s = ctx.Attr<std::string>("mode");
+
+    // Translate the string attributes into enums; any unrecognized value
+    // falls back to zeros padding / bilinear interpolation.
+    PaddingMode padding_mode;
+    Mode mode;
+    if (padding_mode_s == "border") {
+      padding_mode = PaddingMode::border;
+    } else if (padding_mode_s == "reflect") {
+      padding_mode = PaddingMode::reflect;
+    } else {
+      padding_mode = PaddingMode::zeros;
+    }
+
+    if (mode_s == "nearest") {
+      mode = Mode::nearest;
+    } else {
+      mode = Mode::bilinear;
+    }
+
+    auto* input = ctx.Input<Tensor>("X");
+    auto* grid = ctx.Input<Tensor>("Grid");
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+
+    const int n = grid->dims()[0];
+    const int out_h = grid->dims()[1];
+    const int out_w = grid->dims()[2];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
+
+    // grad_input is accumulated with atomic adds in the kernel, so it must
+    // start from zero.
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    input_grad->mutable_data<T>(ctx.GetPlace());
+    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
+        dev_ctx, input_grad, static_cast<T>(0));
+    auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
+    grid_grad->mutable_data<T>(ctx.GetPlace());
+    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
+        dev_ctx, grid_grad, static_cast<T>(0));
+
+    int count = static_cast<int>(n * out_h * out_w);
+    if (count <= 0) {
+      // Nothing to back-propagate; a launch with zero blocks would be an
+      // invalid configuration. Both gradients are already zero-filled.
+      return;
+    }
+    auto cu_stream = dev_ctx.stream();
+
+    // The execution configuration is <<<blocks, threads per block, ...>>>.
+    // The block count must come first: swapping the two makes the launch
+    // fail as soon as the block count exceeds the per-block thread limit.
+    const int block = 512;
+    const int grid_size = (count + block - 1) / block;
+    grid_sampler_cuda_backward_kernel<T><<<grid_size, block, 0, cu_stream>>>(
+        count, output_grad->data<T>(), input->data<T>(), grid->data<T>(), n, c,
+        out_h, out_w, in_h, in_w, input_grad->data<T>(), grid_grad->data<T>(),
+        mode, padding_mode, align_corners);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Register the CUDA forward and backward kernels of grid_sampler for float
+// and double.
+REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel<float>,
+                        ops::GridSampleOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(grid_sampler_grad,
+                        ops::GridSampleGradOpCUDAKernel<float>,
+                        ops::GridSampleGradOpCUDAKernel<double>);
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ b/paddle/fluid/operators/grid_sampler_op.h
--- a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import unittest
+
+
+class GridSampleTestCase(unittest.TestCase):
+    def __init__(self,
+                 methodName='runTest',
+                 x_shape=[2, 2, 3, 3],
+                 grid_shape=[2, 3, 3, 2],
+                 mode="bilinear",
+                 padding_mode="zeros",
+                 align_corners=False):
+        super(GridSampleTestCase, self).__init__(methodName)
+        self.padding_mode = padding_mode
+        self.x_shape = x_shape
+        self.grid_shape = grid_shape
+        self.mode = mode
+        self.padding_mode = padding_mode
+        self.align_corners = align_corners
+        self.dtype = "float64"
+
+    def setUp(self):
+        self.x = np.random.randn(*(self.x_shape)).astype(self.dtype)
+        self.grid = np.random.uniform(-1, 1, self.grid_shape).astype(self.dtype)
+
+    def static_functional(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                x = fluid.data("x", self.x_shape, dtype=self.dtype)
+                grid = fluid.data("grid", self.grid_shape, dtype=self.dtype)
+                y_var = F.grid_sample(
+                    x,
+                    grid,
+                    mode=self.mode,
+                    padding_mode=self.padding_mode,
+                    align_corners=self.align_corners)
+        feed_dict = {"x": self.x, "grid": self.grid}
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
+        return y_np
+
+    def dynamic_functional(self):
+        x_t = paddle.to_tensor(self.x)
+        grid_t = paddle.to_tensor(self.grid)
+        y_t = F.grid_sample(
+            x_t,
+            grid_t,
+            mode=self.mode,
+            padding_mode=self.padding_mode,
+            align_corners=self.align_corners)
+        y_np = y_t.numpy()
+        return y_np
+
+    def _test_equivalence(self, place):
+        result1 = self.static_functional(place)
+        with dg.guard(place):
+            result2 = self.dynamic_functional()
+        np.testing.assert_array_almost_equal(result1, result2)
+
+    def runTest(self):
+        place = fluid.CPUPlace()
+        self._test_equivalence(place)
+
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self._test_equivalence(place)
+
+
+class GridSampleErrorTestCase(GridSampleTestCase):
+    def runTest(self):
+        place = fluid.CPUPlace()
+        with self.assertRaises(ValueError):
+            self.static_functional(place)
+
+
+def add_cases(suite):
+    suite.addTest(GridSampleTestCase(methodName='runTest'))
+    suite.addTest(
+        GridSampleTestCase(
+            methodName='runTest',
+            mode='bilinear',
+            padding_mode='reflect',
+            align_corners=True))
+    suite.addTest(
+        GridSampleTestCase(
+            methodName='runTest',
+            mode='bilinear',
+            padding_mode='zeros',
+            align_corners=True))
+
+
+def add_error_cases(suite):
+    suite.addTest(
+        GridSampleErrorTestCase(
+            methodName='runTest', padding_mode="VALID"))
+    suite.addTest(
+        GridSampleErrorTestCase(
+            methodName='runTest', align_corners="VALID"))
+    suite.addTest(GridSampleErrorTestCase(methodName='runTest', mode="VALID"))
+
+
+def load_tests(loader, standard_tests, pattern):
+    suite = unittest.TestSuite()
+    add_cases(suite)
+    add_error_cases(suite)
+    return suite
+
+
+# Run the tests when the file is executed directly.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
@@ -17,17 +17,17 @@ import numpy as np
 from op_test import OpTest


-def AffineGrid(theta, size):
-    n = size[0]
-    h = size[2]
-    w = size[3]
+def AffineGrid(theta, grid_shape):
+    n = grid_shape[0]
+    h = grid_shape[1]
+    w = grid_shape[2]
    h_idx = np.repeat(
        np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
    w_idx = np.repeat(
        np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
    grid = np.concatenate(
        [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
-    grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w *3
+    grid = np.repeat(grid[np.newaxis, :], n, axis=0)  # n * h * w *3

    ret = np.zeros([n, h * w, 2])
    theta = theta.transpose([0, 2, 1])
@@ -40,15 +40,19 @@ def AffineGrid(theta, size):
 def getGridPointValue(data, x, y):
    data_shape = data.shape
    N = data_shape[0]
-    H = data_shape[2]
-    W = data_shape[3]
-
-    out = np.zeros(data_shape, dtype='float64')
+    C = data_shape[1]
+    in_H = data_shape[2]
+    in_W = data_shape[3]
+    out_H = x.shape[1]
+    out_W = x.shape[2]
+
+    #out = np.zeros(data_shape, dtype='float64')
+    out = np.zeros([N, C, out_H, out_W], dtype='float64')
    for i in range(N):
-        for j in range(H):
-            for k in range(W):
-                if y[i, j, k] < 0 or y[i, j, k] > H - 1 or x[i, j, k] < 0 or x[
-                        i, j, k] > W - 1:
+        for j in range(out_H):
+            for k in range(out_W):
+                if y[i, j, k] < 0 or y[i, j, k] > in_H - 1 or x[
+                        i, j, k] < 0 or x[i, j, k] > in_W - 1:
                    out[i, :, j, k] = 0
                else:
                    out[i, :, j, k] = data[i, :, y[i, j, k], x[i, j, k]]
@@ -56,44 +60,89 @@ def getGridPointValue(data, x, y):
    return out


-def GridSampler(data, grid):
-    dims = data.shape
-    N = dims[0]
-    C = dims[1]
-    H = dims[2]
-    W = dims[3]
+def clip(x, min_n, max_n):
+    return np.maximum(np.minimum(x, max_n), min_n)

-    x = grid[:, :, :, 0]
-    y = grid[:, :, :, 1]
-    y_max = H - 1
-    x_max = W - 1

-    x = 0.5 * ((x.astype('float64') + 1.0) * x_max)
-    y = 0.5 * ((y.astype('float64') + 1.0) * y_max)
+def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode):
+    if align_corners:
+        grid_slice = 0.5 * ((grid_slice.astype('float64') + 1.0) * max_val)
+    else:
+        grid_slice = 0.5 * (
+            (grid_slice.astype('float64') + 1.0) * (max_val + 1)) - 0.5
+
+    if padding_mode == "border":
+        grid_slice = clip(grid_slice, 0, max_val)
+    elif padding_mode == "reflect":
+        double_range = 2 * max_val if align_corners else (max_val + 1) * 2
+        grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice +
+                                                                   0.5)
+        extra = grid_abs - np.floor(grid_abs / double_range) * double_range
+        grid_slice = np.minimum(extra, double_range - extra)
+        grid_slice = grid_slice if align_corners else clip(grid_slice - 0.5, 0,
+                                                           max_val)
+    return grid_slice

-    x0 = np.floor(x).astype('int32')
-    x1 = x0 + 1
-    y0 = np.floor(y).astype('int32')
-    y1 = y0 + 1

-    wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1))
+def GridSampler(data,
+                grid,
+                align_corners=True,
+                mode="bilinear",
+                padding_mode="zeros"):
+    dims = data.shape
+    N = dims[0]
+    in_C = dims[1]
+    in_H = dims[2]
+    in_W = dims[3]

-    va = getGridPointValue(data, x0, y0)
-    vb = getGridPointValue(data, x0, y1)
-    vc = getGridPointValue(data, x1, y0)
-    vd = getGridPointValue(data, x1, y1)
+    out_H = grid.shape[1]
+    out_W = grid.shape[2]

-    out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float64')
+    x = grid[:, :, :, 0]
+    y = grid[:, :, :, 1]
+    y_max = in_H - 1
+    x_max = in_W - 1
+
+    x = unnormalizeAndClip(x, x_max, align_corners, padding_mode)
+    y = unnormalizeAndClip(y, y_max, align_corners, padding_mode)
+
+    if mode == "bilinear":
+        x0 = np.floor(x).astype('int32')
+        x1 = x0 + 1
+        y0 = np.floor(y).astype('int32')
+        y1 = y0 + 1
+
+        wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+        wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+        wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+        wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+
+        va = getGridPointValue(data, x0, y0)
+        vb = getGridPointValue(data, x0, y1)
+        vc = getGridPointValue(data, x1, y0)
+        vd = getGridPointValue(data, x1, y1)
+
+        out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float64')
+    elif mode == "nearest":
+        x = np.round(x).astype('int32')
+        y = np.round(y).astype('int32')
+        out = getGridPointValue(data, x, y)
    return out


 class TestGridSamplerOp(OpTest):
    def setUp(self):
-        self.initTestCase()
+        self.use_cudnn = False
+        self.numeric_grad_delta = 0.0001
        self.op_type = 'grid_sampler'
+        self.align_corners = True
+        self.padding_mode = "zeros"
+        self.mode = "bilinear"
+        self.initTestCase()
        x = np.random.randint(0, 255, self.x_shape).astype('float64')

        theta = np.zeros(self.theta_shape).astype('float64')
@@ -101,22 +150,90 @@ class TestGridSamplerOp(OpTest):
            for j in range(2):
                for k in range(3):
                    theta[i, j, k] = np.random.rand(1)[0]
-        grid = AffineGrid(theta, self.x_shape)
+        grid = AffineGrid(theta, self.grid_shape)

        self.inputs = {'X': x, 'Grid': grid}
-        self.attrs = {'use_cudnn': True}
-        self.outputs = {'Output': GridSampler(x, grid)}
+        self.attrs = {
+            'use_cudnn': self.use_cudnn,
+            "align_corners": self.align_corners,
+            "padding_mode": self.padding_mode,
+            "mode": self.mode
+        }
+        # The expected output comes from the numpy reference GridSampler above.
+        self.outputs = {
+            'Output': GridSampler(x, grid, self.align_corners, self.mode,
+                                  self.padding_mode)
+        }

    def test_check_output(self):
        self.check_output()

    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61)
+        self.check_grad(
+            ['X', 'Grid'],
+            'Output',
+            max_relative_error=0.01,
+            numeric_grad_delta=self.numeric_grad_delta)
+
+    def initTestCase(self):
+        self.x_shape = (2, 3, 8, 8)
+        self.grid_shape = (2, 7, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = True
+        self.padding_mode = "zeros"
+        self.mode = "bilinear"
+        self.use_cudnn = True
+
+
+class Case1Zeros(TestGridSamplerOp):  # renamed: a second "Case1" below shadowed this case
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "zeros"
+        self.mode = "bilinear"
+
+
+class Case1(TestGridSamplerOp):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "border"
+        self.mode = "bilinear"
+
+
+class Case2(TestGridSamplerOp):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "reflect"
+        self.mode = "bilinear"
+
+
+class Case3(TestGridSamplerOp):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = True
+        self.padding_mode = "reflect"
+        self.mode = "bilinear"
+

+class Case4(TestGridSamplerOp):
    def initTestCase(self):
-        self.x_shape = (2, 5, 7, 3)
-        self.grid_shape = (2, 7, 3, 2)
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
        self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "reflect"
+        self.mode = "nearest"
+        self.numeric_grad_delta = 0.0001


 if __name__ == "__main__":

--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -192,7 +192,7 @@ from .vision import fsp_matrix  #DEFINE_ALIAS
 from .vision import generate_mask_labels  #DEFINE_ALIAS
 from .vision import generate_proposal_labels  #DEFINE_ALIAS
 from .vision import generate_proposals  #DEFINE_ALIAS
-from .vision import grid_sampler  #DEFINE_ALIAS
+from .vision import grid_sample  #DEFINE_ALIAS
 from .vision import image_resize  #DEFINE_ALIAS
 from .vision import image_resize_short  #DEFINE_ALIAS
 # from .vision import multi_box_head        #DEFINE_ALIAS

--- a/python/paddle/nn/functional/vision.py
+++ b/python/paddle/nn/functional/vision.py
@@ -28,7 +28,6 @@ from ...fluid.layers import distribute_fpn_proposals  #DEFINE_ALIAS
 from ...fluid.layers import generate_mask_labels  #DEFINE_ALIAS
 from ...fluid.layers import generate_proposal_labels  #DEFINE_ALIAS
 from ...fluid.layers import generate_proposals  #DEFINE_ALIAS
-from ...fluid.layers import grid_sampler  #DEFINE_ALIAS
 from ...fluid.layers import image_resize  #DEFINE_ALIAS
 from ...fluid.layers import prior_box  #DEFINE_ALIAS
 from ...fluid.layers import prroi_pool  #DEFINE_ALIAS
@@ -68,7 +67,7 @@ __all__ = [
    'generate_mask_labels',
    'generate_proposal_labels',
    'generate_proposals',
-    'grid_sampler',
+    'grid_sample',
    'image_resize',
    'image_resize_short',
    #       'multi_box_head',
@@ -89,3 +88,187 @@ __all__ = [
    'yolo_box',
    'yolov3_loss'
 ]
+
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.data_feeder import check_variable_and_dtype
+from ...fluid import core, dygraph_utils
+from ...fluid.framework import Variable, in_dygraph_mode
+from ...device import get_cudnn_version
+import numpy as np
+
+
+def grid_sample(x,
+                grid,
+                mode='bilinear',
+                padding_mode='zeros',
+                align_corners=True,
+                name=None):
+    """
+    This operation samples input X by using bilinear interpolation or
+    nearest interpolation based on flow field grid, which is usually
+    generated by :code:`affine_grid` . The grid of shape [N, H, W, 2]
+    is the concatenation of (x, y) coordinates with shape [N, H, W] each,
+    where x is indexing the 4th dimension (in width dimension) of input
+    data x and y is indexing the 3rd dimension (in height dimension). The
+    result is the bilinear interpolation of the 4 nearest corner points or the value
+    of the nearest point. The output shape is [N, C, H, W] with H, W taken from the grid.
+
+    .. code-block:: text
+
+        Step 1:
+        Get (x, y) grid coordinates and scale them to [0, W-1] / [0, H-1] of input x (align_corners=True).
+
+        .. code-block:: text
+
+            grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
+            grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
+
+        Step 2:
+        Indices input data X with grid (x, y) in each [H, W] area, and bilinear
+        interpolate point value by 4 nearest points or nearest interpolate point value
+        by nearest point.
+
+          wn ------- y_n ------- en
+          |           |           |
+          |          d_n          |
+          |           |           |
+         x_w --d_w-- grid--d_e-- x_e
+          |           |           |
+          |          d_s          |
+          |           |           |
+          ws ------- y_s ------- es
+
+        For bilinear interpolation:
+
+        x_w = floor(x)              // west side x coord
+        x_e = x_w + 1               // east side x coord
+        y_n = floor(y)              // north side y coord
+        y_s = y_n + 1               // south side y coord
+
+        d_w = grid_x - x_w          // distance to west side
+        d_e = x_e - grid_x          // distance to east side
+        d_n = grid_y - y_n          // distance to north side
+        d_s = y_s - grid_y          // distance to south side
+
+        wn = X[:, :, y_n, x_w]      // north-west point value
+        en = X[:, :, y_n, x_e]      // north-east point value
+        ws = X[:, :, y_s, x_w]      // south-west point value
+        es = X[:, :, y_s, x_e]      // south-east point value
+
+        output = wn * d_e * d_s + en * d_w * d_s
+               + ws * d_e * d_n + es * d_w * d_n
+
+    Args:
+        x(Tensor): The input tensor, which is a 4-d tensor with shape
+                     [N, C, H, W], N is the batch size, C is the channel
+                     number, H and W are the feature height and width.
+                     The data type is float32 or float64.
+        grid(Tensor): Input grid tensor of shape [N, grid_H, grid_W, 2]. The
+                        data type is float32 or float64.
+        mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'.
+                         Default: 'bilinear'.
+        padding_mode(str, optional): The padding method used when the source index
+                   is out of the input image. It can be 'zeros', 'reflect' or 'border'.
+                   Default: 'zeros'.
+        align_corners(bool, optional): If `align_corners` is true, -1 and 1 are
+                   projected to the centers of the corner pixels. Otherwise, they are
+                   projected to the image edges. Default: True.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually there is no need to set it;
+                             it is None by default.
+
+    Returns: Tensor, The shape of output is [N, C, grid_H, grid_W] in which `grid_H` is the height of grid
+                 and `grid_W` is the width of grid. The data type is same as input tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+            
+            # shape=[1, 1, 3, 3]
+            x = np.array([[[[-0.6,  0.8, -0.5],
+                            [-0.5,  0.2,  1.2],
+                            [ 1.4,  0.3, -0.2]]]]).astype("float64")
+            
+            # grid shape = [1, 3, 4, 2]
+            grid = np.array(
+                         [[[[ 0.2,  0.3],
+                            [-0.4, -0.3],
+                            [-0.9,  0.3],
+                            [-0.9, -0.6]],
+                           [[ 0.4,  0.1],
+                            [ 0.9, -0.8],
+                            [ 0.4,  0.5],
+                            [ 0.5, -0.2]],
+                           [[ 0.1, -0.8],
+                            [-0.3, -1. ],
+                            [ 0.7,  0.4],
+                            [ 0.2,  0.8]]]]).astype("float64")
+            
+            paddle.disable_static()
+            x = paddle.to_tensor(x)
+            grid = paddle.to_tensor(grid)
+            y_t = F.grid_sample(
+                x,
+                grid,
+                mode='bilinear',
+                padding_mode='border',
+                align_corners=True)
+            print(y_t.numpy())
+            
+            # output shape = [1, 1, 3, 4]
+            # [[[[ 0.34   0.016  0.086 -0.448]
+            #    [ 0.55  -0.076  0.35   0.59 ]
+            #    [ 0.596  0.38   0.52   0.24 ]]]]
+    """
+    helper = LayerHelper("grid_sample", **locals())
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sampler')
+    check_variable_and_dtype(grid, 'grid', ['float32', 'float64'],
+                             'grid_sampler')
+    if not isinstance(x, Variable):
+        raise ValueError("The x should be a Variable")
+    if not isinstance(grid, Variable):
+        raise ValueError("The grid should be a Variable")
+    _modes = ['bilinear', 'nearest']
+    _padding_modes = ['zeros', 'reflect', 'border']
+    if mode not in _modes:
+        raise ValueError(
+            "The mode of grid sample function should be in {}, but got: {}".
+            format(_modes, mode))
+    if padding_mode not in _padding_modes:
+        raise ValueError(
+            "The padding mode of grid sample function should be in {}, but got: {}".
+            format(_padding_modes, padding_mode))
+
+    if not isinstance(align_corners, bool):
+        raise ValueError("The align corners should be bool, but got: {}".format(
+            align_corners))
+
+    cudnn_version = get_cudnn_version()
+    use_cudnn = False
+    if (cudnn_version is not None
+        ) and align_corners and mode == 'bilinear' and padding_mode == 'zeros':
+        use_cudnn = True
+    ipts = {'X': x, 'Grid': grid}
+    attrs = {
+        'mode': mode,
+        'padding_mode': padding_mode,
+        'align_corners': align_corners,
+        'use_cudnn': use_cudnn
+    }
+
+    if in_dygraph_mode():
+        attrs = ('mode', mode, 'padding_mode', padding_mode, 'align_corners',
+                 align_corners, 'use_cudnn', use_cudnn)
+        out = getattr(core.ops, 'grid_sampler')(x, grid, *attrs)
+    else:
+        out = helper.create_variable_for_type_inference(x.dtype)
+        helper.append_op(
+            type='grid_sampler',
+            inputs=ipts,
+            attrs=attrs,
+            outputs={'Output': out})
+    return out