diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 9756754260d46519d181f95e000f39ba92d22ef0..4dc3de54deef71e42d73e0943021691c5e39f7c7 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -37,9 +37,8 @@ class Vector {
 
   // Fill vector with value. The vector size is `count`.
   explicit Vector(size_t count, const T& value = T()) {
-    if (count == 0) {
-      InitEmpty();
-    } else {
+    InitEmpty();
+    if (count != 0) {
       resize(count);
       T* ptr = begin();
       for (size_t i = 0; i < count; ++i) {
@@ -122,6 +121,10 @@ class Vector {
   const T* begin() const { return &this->operator[](0); }
   const T* end() const { return &this->operator[](size()); }
 
+  const T* cbegin() const { return begin(); }
+
+  const T* cend() const { return end(); }
+
   const T& back() const {
     auto it = end();
     --it;
@@ -244,7 +247,9 @@ class Vector {
 
   bool operator==(const Vector<T>& other) const {
     if (size() != other.size()) return false;
-    for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) {
+    auto it1 = cbegin();
+    auto it2 = other.cbegin();
+    for (; it1 < cend(); ++it1, ++it2) {
       if (*it1 != *it2) {
         return false;
       }
diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu
index a89064525661af71b22f18f835fd7b111956847b..0d5a914eac78096bd41814080bf4b2105d25e187 100644
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -26,10 +26,10 @@ TEST(mixed_vector, CPU_VECTOR) {
   for (int i = 0; i < 10; ++i) {
     tmp.push_back(i);
   }
-  ASSERT_EQ(tmp.size(), 10);
+  ASSERT_EQ(tmp.size(), 10UL);
   vec<int> tmp2;
   tmp2 = tmp;
-  ASSERT_EQ(tmp2.size(), 10);
+  ASSERT_EQ(tmp2.size(), 10UL);
   for (int i = 0; i < 10; ++i) {
     ASSERT_EQ(tmp2[i], i);
     ASSERT_EQ(tmp2[i], tmp[i]);
@@ -58,7 +58,7 @@ TEST(mixed_vector, GPU_VECTOR) {
   for (int i = 0; i < 10; ++i) {
     tmp.push_back(i);
   }
-  ASSERT_EQ(tmp.size(), 10);
+  ASSERT_EQ(tmp.size(), 10UL);
   paddle::platform::CUDAPlace gpu(0);
 
   multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));
@@ -79,7 +79,7 @@ TEST(mixed_vector, MultiGPU) {
   for (int i = 0; i < 10; ++i) {
     tmp.push_back(i);
   }
-  ASSERT_EQ(tmp.size(), 10);
+  ASSERT_EQ(tmp.size(), 10UL);
   paddle::platform::CUDAPlace gpu0(0);
   paddle::platform::SetDeviceId(0);
   multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
@@ -91,3 +91,13 @@ TEST(mixed_vector, MultiGPU) {
     ASSERT_EQ(tmp[i], i * 100);
   }
 }
+
+// Vector(count, value) must yield exactly `count` elements equal to `value`.
+TEST(mixed_vector, InitWithCount) {
+  // Named `filled` so the local does not shadow the `vec<...>` template name.
+  paddle::framework::Vector<int> filled(10, 10);
+  ASSERT_EQ(filled.size(), 10UL);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(filled[i], 10);
+  }
+}
diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc
index b2934f69cc9b2e50bdd5cbdf04deeaf5ca120e2c..168e6f85d6ac9f8d6522afe871d82e708da63227 100644
--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
@@ -38,22 +38,22 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
     auto box_dims = ctx->GetInputDim("BBoxes");
     auto score_dims = ctx->GetInputDim("Scores");
 
-    PADDLE_ENFORCE_EQ(box_dims.size(), 2,
-                      "The rank of Input(BBoxes) must be 2.");
+    PADDLE_ENFORCE_EQ(box_dims.size(), 3,
+                      "The rank of Input(BBoxes) must be 3.");
     PADDLE_ENFORCE_EQ(score_dims.size(), 3,
                       "The rank of Input(Scores) must be 3.");
-    PADDLE_ENFORCE_EQ(box_dims[1], 4,
-                      "The 2nd dimension of Input(BBoxes) must be 4, "
+    PADDLE_ENFORCE_EQ(box_dims[2], 4,
+                      "The 3rd dimension of Input(BBoxes) must be 4, "
                       "represents the layout of coordinate "
                       "[xmin, ymin, xmax, ymax]");
-    PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2],
-                      "The 1st dimensiong of Input(BBoxes) must be equal to "
+    PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2],
+                      "The 2nd dimension of Input(BBoxes) must be equal to "
                       "3rd dimension of Input(Scores), which represents the "
                       "predicted bboxes.");
 
-    // Here the box_dims[0] is not the real dimension of output.
+    // Here the box_dims[1] is not the real dimension of output.
     // It will be rewritten in the computing kernel.
-    ctx->SetOutputDim("Out", {box_dims[0], 6});
+    ctx->SetOutputDim("Out", {box_dims[1], 6});
   }
 
  protected:
@@ -260,15 +260,20 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     int64_t batch_size = score_dims[0];
     int64_t class_num = score_dims[1];
     int64_t predict_dim = score_dims[2];
+    int64_t box_dim = boxes->dims()[2];
 
     std::vector<std::map<int, std::vector<int>>> all_indices;
     std::vector<size_t> batch_starts = {0};
     for (int64_t i = 0; i < batch_size; ++i) {
       Tensor ins_score = scores->Slice(i, i + 1);
       ins_score.Resize({class_num, predict_dim});
+
+      Tensor ins_boxes = boxes->Slice(i, i + 1);
+      ins_boxes.Resize({predict_dim, box_dim});
+
       std::map<int, std::vector<int>> indices;
       int num_nmsed_out = 0;
-      MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out);
+      MultiClassNMS(ctx, ins_score, ins_boxes, indices, num_nmsed_out);
       all_indices.push_back(indices);
       batch_starts.push_back(batch_starts.back() + num_nmsed_out);
     }
@@ -282,11 +287,15 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
       for (int64_t i = 0; i < batch_size; ++i) {
         Tensor ins_score = scores->Slice(i, i + 1);
         ins_score.Resize({class_num, predict_dim});
+
+        Tensor ins_boxes = boxes->Slice(i, i + 1);
+        ins_boxes.Resize({predict_dim, box_dim});
+
         int64_t s = batch_starts[i];
         int64_t e = batch_starts[i + 1];
         if (e > s) {
           Tensor out = outs->Slice(s, e);
-          MultiClassOutput(ins_score, *boxes, all_indices[i], &out);
+          MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out);
         }
       }
     }
@@ -303,9 +312,9 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
   MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("BBoxes",
-             "(Tensor) A 2-D Tensor with shape [M, 4] represents the "
-             "predicted locations of M bounding bboxes. Each bounding box "
-             "has four coordinate values and the layout is "
+             "(Tensor) A 3-D Tensor with shape [N, M, 4] represents the "
+             "predicted locations of M bounding bboxes, N is the batch size. "
+             "Each bounding box has four coordinate values and the layout is "
              "[xmin, ymin, xmax, ymax].");
     AddInput("Scores",
              "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
diff --git a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
index 3b80d2359b083d30f9a5a7b8cc18aaf1ca5146c1..529223cf40dac646d295941a53455d892274dca3 100644
--- a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
+++ b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
@@ -137,7 +137,7 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
     det_outs = []
     lod = [0]
     for n in range(batch_size):
-        nmsed_outs, nmsed_num = multiclass_nms(boxes, scores[n], background,
+        nmsed_outs, nmsed_num = multiclass_nms(boxes[n], scores[n], background,
                                                score_threshold, nms_threshold,
                                                nms_top_k, keep_top_k)
         lod.append(lod[-1] + nmsed_num)
@@ -145,7 +145,7 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
 
         for c, indices in nmsed_outs.iteritems():
             for idx in indices:
-                xmin, ymin, xmax, ymax = boxes[idx][:]
+                xmin, ymin, xmax, ymax = boxes[n][idx][:]
                 det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
 
     return det_outs, lod
@@ -179,9 +179,9 @@ class TestMulticlassNMSOp(OpTest):
         scores = np.reshape(scores, (N, M, C))
         scores = np.transpose(scores, (0, 2, 1))
 
-        boxes = np.random.random((M, BOX_SIZE)).astype('float32')
-        boxes[:, 0:2] = boxes[:, 0:2] * 0.5
-        boxes[:, 2:4] = boxes[:, 2:4] * 0.5 + 0.5
+        boxes = np.random.random((N, M, BOX_SIZE)).astype('float32')
+        boxes[:, :, 0:2] = boxes[:, :, 0:2] * 0.5
+        boxes[:, :, 2:4] = boxes[:, :, 2:4] * 0.5 + 0.5
 
         nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background,
                                                  score_threshold, nms_threshold,