diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc index 15adcdedaed6c517bfbcf7788b9e1fd429beded4..3aa766559a530bc31fbb277f2bcd474da776e63b 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cc +++ b/paddle/fluid/operators/detection/box_clip_op.cc @@ -41,14 +41,6 @@ class BoxClipOp : public framework::OperatorWithKernel { ctx->ShareDim("Input", /*->*/ "Output"); ctx->ShareLoD("Input", /*->*/ "Output"); } - /* - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Input")); - return framework::OpKernelType(data_type, platform::CPUPlace()); - } - */ }; class BoxClipOpMaker : public framework::OpProtoAndCheckerMaker { @@ -68,11 +60,17 @@ class BoxClipOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator clips input boxes to original input images. -The formula is given as follows: +For each input box, the formula is given as follows: - $$height_out = \max(\min(height_loc, im_h), 0)$$ - $$width_out = \max(\min(width_loc, im_w), 0)$$ + $$xmin = \max(\min(xmin, im_w - 1), 0)$$ + $$ymin = \max(\min(ymin, im_h - 1), 0)$$ + $$xmax = \max(\min(xmax, im_w - 1), 0)$$ + $$ymax = \max(\min(ymax, im_h - 1), 0)$$ +where im_w and im_h are computed from ImInfo, the formula is given as follows: + + $$im_w = \mathrm{round}(width / im_scale)$$ + $$im_h = \mathrm{round}(height / im_scale)$$ )DOC"); } }; diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index f10c92366de4a1306eedbc193b14dfd674560b9e..b727da5f7b736b6f22407d1dfbca708ed0cf04d9 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -30,13 +30,13 @@ template static __global__ void GPUBoxClip(const T *input, const size_t *lod, const size_t width, const T *im_info, T *output) { + T im_w = 
round(im_info[blockIdx.x * ImInfoSize + 1] / + im_info[blockIdx.x * ImInfoSize + 2]); + T im_h = round(im_info[blockIdx.x * ImInfoSize] / + im_info[blockIdx.x * ImInfoSize + 2]); for (int i = threadIdx.x; i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * width; i += BlockSize) { int idx = lod[blockIdx.x] * width + i; - T im_w = round(im_info[blockIdx.x * ImInfoSize + 1] / - im_info[blockIdx.x * ImInfoSize + 2]); - T im_h = round(im_info[blockIdx.x * ImInfoSize] / - im_info[blockIdx.x * ImInfoSize + 2]); T im_size = (idx % 2 == 0) ? im_w : im_h; output[idx] = max(min(input[idx], im_size - 1), T(0.)); } @@ -57,9 +57,9 @@ class GPUBoxClipKernel : public framework::OpKernel { framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); auto &dev_ctx = context.template device_context(); auto stream = dev_ctx.stream(); - const size_t num_lod = lod.back().size() - 1; + const size_t batch_size = lod.back().size() - 1; T *output_data = output->mutable_data(dev_ctx.GetPlace()); - GPUBoxClip<<>>( + GPUBoxClip<<>>( input->data(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()), bbox_width, im_info->data(), output_data); } diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3e2882ea3ce5d4420a5cf184485e61d9fa4f7e0d..9fc23da70ebd04bbbb77b1fdb8a44a097f89c186 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -1816,26 +1816,35 @@ def generate_proposals(scores, def box_clip(input, im_info, inplace=False, name=None): """ Clip the box into the size given by im_info - The formula is given as follows: + For each input box, The formula is given as follows: .. code-block:: text - height_out = max(min(height_loc, im_h), 0) - width_out = max(min(width_loc, im_w), 0) + xmin = max(min(xmin, im_w - 1), 0) + ymin = max(min(ymin, im_h - 1), 0) + xmax = max(min(xmax, im_w - 1), 0) + ymax = max(min(ymax, im_h - 1), 0) + + where im_w and im_h are computed from im_info: + + .. 
code-block:: text + + im_h = round(height / scale) + im_w = round(width / scale) Args: - input_box(variable): The input box, the last dimension is 4. + input(variable): The input box, the last dimension is 4. im_info(variable): The information of image with shape [N, 3] with layout (height, width, scale). height and width is the input size and scale is the ratio of input size and original size. - inplace(bool): Must use :attr:`False` if :attr:`input_box` is used in + inplace(bool): Must use :attr:`False` if :attr:`input` is used in multiple operators. If this flag is set :attr:`True`, - reuse input :attr:`input_box` to clip, which will - change the value of tensor variable :attr:`input_box` - and might cause errors when :attr:`input_box` is used + reuse input :attr:`input` to clip, which will + change the value of tensor variable :attr:`input` + and might cause errors when :attr:`input` is used in multiple operators. If :attr:`False`, preserve the - value pf :attr:`input_box` and create a new output + value of :attr:`input` and create a new output tensor variable whose data is copied from input x but cliped. name (str): The name of this layer. It is optional. @@ -1850,16 +1859,13 @@ def box_clip(input, im_info, inplace=False, name=None): name='data', shape=[8, 4], dtype='float32', lod_level=1) im_info = fluid.layers.data(name='im_info', shape=[3]) out = fluid.layers.box_clip( - input_box=boxes, im_info=im_info, inplace=True) + input=boxes, im_info=im_info, inplace=True) """ helper = LayerHelper("box_clip", **locals()) - output = helper.create_variable_for_type_inference(dtype=input.dtype) + output = input if inplace else helper.create_variable_for_type_inference(\ + dtype=input.dtype) inputs = {"Input": input, "ImInfo": im_info} - helper.append_op( - type="box_clip", - inputs=inputs, - attrs={"inplace:": inplace}, - outputs={"Output": output}) + helper.append_op(type="box_clip", inputs=inputs, outputs={"Output": output}) return output