diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu
index e9b0a0108afc2336aa3bf350173ea4fa38635593..762d14096a5ab4d094894ad7c0ec822f5cc25d3b 100644
--- a/paddle/fluid/operators/grid_sampler_op.cu
+++ b/paddle/fluid/operators/grid_sampler_op.cu
@@ -187,7 +187,6 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
   int out_sC = out_h * out_w;
   int out_sH = out_w;
   int out_sW = 1;
-
   CUDA_KERNEL_LOOP(index, nthreads) {
     const int w = index % out_w;
     const int h = (index / out_w) % out_h;
@@ -199,7 +198,6 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
 
     ix = compute_positions(ix, in_w, padding_mode, align_corners);
     iy = compute_positions(iy, in_h, padding_mode, align_corners);
-
     if (mode == Mode::bilinear) {
       int ix_nw = static_cast<int>(floor(ix));
       int iy_nw = static_cast<int>(floor(iy));
@@ -216,6 +214,7 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
       T se = (ix - ix_nw) * (iy - iy_nw);
 
       auto inp_offset_NC = n * inp_sN;
+
       auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
       for (int c = 0; c < out_c;
            ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
@@ -291,17 +290,17 @@ class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
             << "; out_w: " << out_w;
     auto* output = ctx.Output<Tensor>("Output");
     auto* output_data = output->mutable_data<T>(ctx.GetPlace());
-
-    VLOG(3) << "set constant";
+    VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1]
+            << "; " << output->dims()[2] << "; " << output->dims()[3];
     math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
         dev_ctx, output, static_cast<T>(0));
     int count = static_cast<int>(n * out_h * out_w);
-
     auto cu_stream = dev_ctx.stream();
-
-    int block = 512;
-    int grid_size = (count + block - 1) / block;
-    grid_sample_cuda_kernel<T><<<block, grid_size, 0, cu_stream>>>(
+    int block_size = 512;
+    int grid_size = (count + block_size - 1) / block_size;
+    VLOG(3) << "cuda launch - grid dims: " << grid_size << "; block dims"
+            << block_size;
+    grid_sample_cuda_kernel<T><<<grid_size, block_size, 0, cu_stream>>>(
         count, n, c, out_h, out_w, in_h, in_w, input->data<T>(),
         grid->data<T>(), output_data, mode, padding_mode, align_corners);
   }
@@ -475,9 +474,12 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
 
     int count = static_cast<int>(n * out_h * out_w);
     auto cu_stream = dev_ctx.stream();
-    int block = 512;
-    int grid_size = (count + block - 1) / block;
-    grid_sampler_cuda_backward_kernel<T><<<block, grid_size, 0, cu_stream>>>(
+    int block_size = 512;
+    int grid_size = (count + block_size - 1) / block_size;
+    VLOG(3) << "cuda launch grad kernel - grid dims: " << grid_size
+            << "; block dims" << block_size << "; count: " << count;
+    grid_sampler_cuda_backward_kernel<
+        T><<<grid_size, block_size, 0, cu_stream>>>(
         count, output_grad->data<T>(), input->data<T>(), grid->data<T>(), n, c,
         out_h, out_w, in_h, in_w, input_grad->data<T>(), grid_grad_data, mode,
         padding_mode, align_corners);
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
index 287e85cb271f817b3581431a804685608b8cd91a..083b671c283a0f5fe0302837d60a414f3061632a 100755
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -19,6 +19,8 @@ import numpy as np
 from op_test import OpTest
 import paddle.fluid.core as core
 import paddle.fluid as fluid
+import paddle
+paddle.enable_static()
 
 
 def bilinear_interp_np(input,
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
index d5056bd11cf0845e8694e75d805f4197a5fb2024..49a3dedbf26a2f104ea5e7a480447b938ed8b19b 100644
--- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
 import unittest
 import numpy as np
 import paddle.fluid.core as core
-from op_test import OpTest
+from op_test import OpTest, skip_check_grad_ci
+paddle.enable_static()
 
 
 def AffineGrid(theta, grid_shape):
@@ -160,7 +162,6 @@ class TestGridSamplerOp(OpTest):
             "padding_mode": self.padding_mode,
             "mode": self.mode
         }
-        #    print("X: {}".format(x))
         self.outputs = {
             'Output': GridSampler(x, grid, self.align_corners, self.mode,
                                   self.padding_mode)
@@ -237,5 +238,41 @@ class Case4(TestGridSamplerOp):
         self.numeric_grad_delta = 0.0001
 
 
+@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
+                    "however it is desirable to cover the forward pass")
+class LargeInputCase(TestGridSamplerOp):
+    def get_places(self):
+        places = []
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    def initTestCase(self):
+        self.no_need_check_grad = True
+        self.x_shape = (2, 3, 128, 128)
+        self.grid_shape = (2, 130, 130, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "reflection"
+        self.mode = "bilinear"
+
+    def test_check_grad_normal(self):
+        pass
+
+
+@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
+                    "however it is desirable to cover the forward pass")
+class Case5(LargeInputCase):
+    def initTestCase(self):
+        self.no_need_check_grad = True
+        self.x_shape = (2, 3, 128, 128)
+        self.grid_shape = (2, 130, 130, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = True
+        self.padding_mode = "zeros"
+        self.mode = "bilinear"
+        self.use_cudnn = False if core.is_compiled_with_rocm() else True
+
+
 if __name__ == "__main__":
     unittest.main()