diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
index 3da8af332bb5f6137b99ec3cbfba66509127e54b..0e8ca01eba7ce139dfea7882db0af3054825f68b 100644
--- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
@@ -1,13 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
index 3f28ed5df7de58d1d815d878660a4582795ee0db..599ff9a9c1ef878c12c9b56c5b79d6f143099203 100644
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -67,23 +67,66 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
     void Make() override {
       AddInput(
           "X",
-          "(Tensor) The input tensor of GridSampleOp, "
+          "(Tensor) The input data of GridSampleOp, "
           "This is a 4-D tensor with shape of [N, C, H, W]");
       AddInput(
           "Grid",
-          "(Tensor) The output of AffineGridOp, "
-          "This is a 4-D tensor with shape of [N, H, W, 2]");
+          "(Tensor) The input grid of GridSampleOp generated by AffineGridOp. "
+          "This is a 4-D tensor with shape of [N, H, W, 2], the concatenation "
+          "of x and y coordinates with shape [N, H, W] in the last dimension");
       AddOutput(
           "Output",
           "(Tensor) Output tensor with shape [N, C, H, W]");
       AddAttr<bool>(
           "use_cudnn",
-          "(bool, default false) Only used in cudnn kernel, need install cudnn")
+          "(bool, default true) Only used in cudnn kernel, need install cudnn")
           .SetDefault(true);
 
       AddComment(R"DOC(
-      It sample input X by grid gennerate by AffineGridOp.
-      )DOC");
+      It samples input X using the grid generated by AffineGridOp. The grid of
+      shape [N, H, W, 2] is the concatenation of (x, y) coordinates with shape
+      [N, H, W] each, with x indexing the 4th dimension (W) of the input feature
+      map and y indexing the 3rd dimension (H). The final result is the bilinear
+      interpolation value of the 4 nearest corner points.
+
+      Step 1:
+        Get (x, y) grid coordinates and scale them to [0, W-1] and [0, H-1].
+
+        grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
+        grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
+
+      Step 2:
+        Index input data X with grid (x, y) in each [H, W] area, and bilinearly
+        interpolate the point value from the 4 nearest points.
+
+          wn ------- y_n ------- en
+          |           |           |
+          |          d_n          |
+          |           |           |
+         x_w --d_w-- grid--d_e-- x_e
+          |           |           |
+          |          d_s          |
+          |           |           |
+          ws ------- y_s ------- es
+
+        x_w = floor(grid_x)         // west side x coord
+        x_e = x_w + 1               // east side x coord
+        y_n = floor(grid_y)         // north side y coord
+        y_s = y_n + 1               // south side y coord
+
+        d_w = grid_x - x_w          // distance to west side
+        d_e = x_e - grid_x          // distance to east side
+        d_n = grid_y - y_n          // distance to north side
+        d_s = y_s - grid_y          // distance to south side
+
+        wn = X[:, :, y_n, x_w]      // north-west point value
+        en = X[:, :, y_n, x_e]      // north-east point value
+        ws = X[:, :, y_s, x_w]      // south-west point value
+        es = X[:, :, y_s, x_e]      // south-east point value
+
+        output = wn * d_e * d_s + en * d_w * d_s
+               + ws * d_e * d_n + es * d_w * d_n
+        )DOC");
     }
 };
 
@@ -91,7 +134,14 @@ class GridSampleOpGrad : public framework::OperatorWithKernel {
   public:
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
-    //TO DO
+    auto input_dims = ctx->GetInputDim("X");
+    auto grid_dims = ctx->GetInputDim("Grid");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
+    }
+    if (ctx->HasOutput(framework::GradVarName("Grid"))) {
+      ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims);
+    }
   }
 
   protected:
diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h
index 7f42fa66ca16527156136c907c636e71371cc97d..1e8f36567f498dfaa44068950d63f4f8c49c6e1c 100644
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ b/paddle/fluid/operators/grid_sampler_op.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -33,7 +33,7 @@ using Array4 = Eigen::DSizes<int64_t, 4>;
 
 
 template <typename T>
-inline bool isInBound(T x, T y, T x_max, T y_max) {
+static inline bool isInBound(T x, T y, T x_max, T y_max) {
   if (x < 0 || x > x_max || y < 0 || y > y_max) {
     return false;
   }
@@ -41,10 +41,10 @@ inline bool isInBound(T x, T y, T x_max, T y_max) {
 }
 
 template <typename DeviceContext, typename T>
-void CalcGridLocations(const framework::ExecutionContext& ctx, const Tensor& grid,
+static void CalcGridLocations(const DeviceContext& ctx, const Tensor& grid,
     Tensor* x_w, Tensor* x_e, Tensor* y_n, Tensor* y_s,
     Tensor* d_w, Tensor* d_e, Tensor* d_n, Tensor* d_s) {
-  auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+  auto& place = *ctx.eigen_device();
   const int n = grid.dims()[0];
   const int h = grid.dims()[1];
   const int w = grid.dims()[2];
@@ -71,6 +71,7 @@ void CalcGridLocations(const framework::ExecutionContext& ctx, const Tensor& gri
   grid_x_t.device(place) = 0.5 * ((grid_x_t + ones_t) * x_max);
   grid_y_t.device(place) = 0.5 * ((grid_y_t + ones_t) * y_max);
 
+  // calculate coords of 4 corner points
   x_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
   x_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
   y_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
@@ -84,6 +85,7 @@ void CalcGridLocations(const framework::ExecutionContext& ctx, const Tensor& gri
   y_n_t.device(place) = grid_y_t.floor();
   y_s_t.device(place) = y_n_t + ones_t;
 
+  // calculate distances to 4 sides
   d_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
   d_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
   d_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
@@ -99,7 +101,7 @@ void CalcGridLocations(const framework::ExecutionContext& ctx, const Tensor& gri
 }
 
 template <typename T>
-void GetGridPointValue(const Tensor& input, Tensor* output,
+static void GetGridPointValue(const Tensor& input, Tensor* output,
                         const Tensor& x, const Tensor& y) {
   const int n = input.dims()[0];
   const int c = input.dims()[1];
@@ -124,7 +126,7 @@ void GetGridPointValue(const Tensor& input, Tensor* output,
 }
 
 template <typename T>
-void GatherOutputGradToInputGrad(const Tensor& output_grad, Tensor* input_grad,
+static void GatherOutputGradToInputGrad(const Tensor& output_grad, Tensor* input_grad,
     const Tensor& x, const Tensor& y, 
     const Tensor& d1, const Tensor& d2) {
   const int n = output_grad.dims()[0];
@@ -170,9 +172,10 @@ class GridSampleOpKernel : public framework::OpKernel<T> {
       // calc locations and distances of 4 corner points
       Tensor x_w, x_e, y_n, y_s;
       Tensor d_w, d_e, d_n, d_s;
-      CalcGridLocations<DeviceContext, T>(ctx, *grid, 
-                                          &x_w, &x_e, &y_n, &y_s,
-                                          &d_w, &d_e, &d_n, &d_s);
+      CalcGridLocations<DeviceContext, T>(ctx.template device_context<DeviceContext>(), 
+                            *grid, 
+                            &x_w, &x_e, &y_n, &y_s,
+                            &d_w, &d_e, &d_n, &d_s);
 
       auto* output = ctx.Output<Tensor>("Output");
       output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
@@ -239,9 +242,10 @@ class GridSampleGradOpKernel : public framework::OpKernel<T> {
       
       Tensor x_w, x_e, y_n, y_s;
       Tensor d_w, d_e, d_n, d_s;
-      CalcGridLocations<DeviceContext, T>(ctx, *grid, 
-                                          &x_w, &x_e, &y_n, &y_s,
-                                          &d_w, &d_e, &d_n, &d_s);
+      CalcGridLocations<DeviceContext, T>(ctx.template device_context<DeviceContext>(), 
+                            *grid, 
+                            &x_w, &x_e, &y_n, &y_s,
+                            &d_w, &d_e, &d_n, &d_s);
 
       // gather output grad value to input grad by corner point coords and weight
       GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_w, y_n, d_e, d_s);
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 6770f74211cd2e63a9a92b41a6db34b280c2ad5c..f4c2c2813f118dbad9ffa54ea6b285231852a5ca 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -7584,17 +7584,59 @@ def hash(input, hash_size, num_hash=1, name=None):
 
 
 @templatedoc()
-def grid_sampler(x, grid):
-    """
-    It sample data from input x by the given grid, insert data of each
-    point by bilinear interp.
+def grid_sampler(x, grid, name=None):
+    """
+    It samples input X using the grid generated by AffineGridOp. The grid of
+    shape [N, H, W, 2] is the concatenation of (x, y) coordinates with shape
+    [N, H, W] each, with x indexing the 4th dimension (W) of the input feature
+    map and y indexing the 3rd dimension (H). The final result is the bilinear
+    interpolation value of the 4 nearest corner points.
+
+    Step 1:
+    Get (x, y) grid coordinates and scale them to [0, W-1] and [0, H-1].
+
+    grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
+    grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
+
+    Step 2:
+    Index input data X with grid (x, y) in each [H, W] area, and bilinearly
+    interpolate the point value from the 4 nearest points.
+
+      wn ------- y_n ------- en
+      |           |           |
+      |          d_n          |
+      |           |           |
+     x_w --d_w-- grid--d_e-- x_e
+      |           |           |
+      |          d_s          |
+      |           |           |
+      ws ------- y_s ------- es
+
+    x_w = floor(grid_x)         // west side x coord
+    x_e = x_w + 1               // east side x coord
+    y_n = floor(grid_y)         // north side y coord
+    y_s = y_n + 1               // south side y coord
+
+    d_w = grid_x - x_w          // distance to west side
+    d_e = x_e - grid_x          // distance to east side
+    d_n = grid_y - y_n          // distance to north side
+    d_s = y_s - grid_y          // distance to south side
+
+    wn = X[:, :, y_n, x_w]      // north-west point value
+    en = X[:, :, y_n, x_e]      // north-east point value
+    ws = X[:, :, y_s, x_w]      // south-west point value
+    es = X[:, :, y_s, x_e]      // south-east point value
+
+    output = wn * d_e * d_s + en * d_w * d_s
+           + ws * d_e * d_n + es * d_w * d_n
 
     Args:
-        x(Variable): Input data of shape [N, H, W, C]
-        grid(Variable): Input grid tensor of shape [N, H, W, 2]
+        x(Variable): Input data of shape [N, C, H, W].
+        grid(Variable): Input grid tensor of shape [N, H, W, 2].
+        name (str, default None): The name of this layer.
 
     Returns:
-        out(Variable): Output data indices by grid from x of shape [N, H, W, C]
+        out(Variable): Output data indexed by grid from x, of shape [N, C, H, W].
     """
     helper = LayerHelper("grid_sampler", **locals())
 
@@ -7606,13 +7648,11 @@ def grid_sampler(x, grid):
 
     out = helper.create_tmp_variable(x.dtype)
     ipts = {'X': x, 'Grid': grid}
-    attrs = {}
 
     helper.apppend_op(
             type='grid_sampler',
             inputs=ipts,
-            outputs={'Output', out},
-            attrs = None if len(attrs) == 0 else attrs)
+            outputs={'Output', out})
 
-    return 0
+    return out
 
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
index 958573c08586bc4675936a24f90009e18d4959e8..5a0b2d41b293d84bbfe06f5cdfb0a4d17ef6fb2b 100644
--- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
@@ -35,7 +35,6 @@ def AffineGrid(theta, size):
     for i in range(len(theta)):
         ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i])
 
-    # print ret.reshape([n, h * w, 2]).astype("float32")    
     return ret.reshape([n, h, w, 2]).astype("float32")
 
 def getGridPointValue(data, x, y):
@@ -104,13 +103,12 @@ class TestGridSamplerOp(OpTest):
         self.inputs = {'X': x, 'Grid': grid}
         self.attrs = {'use_cudnn': True}
         self.outputs = {'Output': GridSampler(x, grid)}
-        # print self.outputs
 
     def test_check_output(self):
         self.check_output(atol=1e-3)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.6)
+        self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61)
 
     def initTestCase(self):
         self.x_shape = (2, 5, 7, 3)