diff --git a/lite/kernels/bm/bridges/box_coder_op.cc b/lite/kernels/bm/bridges/box_coder_op.cc
index 9ef1824a6460ac6fd2bdfdceea531ce1a9d806f0..999ea4dca2dfc51426994b8ec96c237bec8f777f 100644
--- a/lite/kernels/bm/bridges/box_coder_op.cc
+++ b/lite/kernels/bm/bridges/box_coder_op.cc
@@ -73,10 +73,16 @@ int BoxCoderConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (op_info->HasAttr("variance")) {
     variance = op_info->GetAttr<std::vector<float>>("variance");
   }
+  int variance_len = variance.size();
   user_cpu_param_t bm_param;
   bm_param.op_type = USER_PADDLE_BOX_CODER;
   bm_param.u.box_coder_param.axis = axis;
-  bm_param.u.box_coder_param.variance = &variance[0];
+  CHECK_LE(variance_len, 2000);  // must fit the 2000-float buffer
+  memset(bm_param.u.box_coder_param.variance, 0, 2000 * sizeof(float));
+  for (int i = 0; i < variance_len; i++) {  // safe when variance is empty
+    bm_param.u.box_coder_param.variance[i] = variance[i];
+  }
+  bm_param.u.box_coder_param.variance_len = variance_len;
   bm_param.u.box_coder_param.code_type =
       (code_type == "encode_center_size") ? 0 : 1;
   bm_param.u.box_coder_param.normalized = box_normalized;
diff --git a/lite/kernels/bm/bridges/cast_op.cc b/lite/kernels/bm/bridges/cast_op.cc
index 42c0751b9278b1fcecb4e4c6032e046c1fad5461..45cc90c2016f901536110d32322eeb62eced537b 100644
--- a/lite/kernels/bm/bridges/cast_op.cc
+++ b/lite/kernels/bm/bridges/cast_op.cc
@@ -32,7 +32,8 @@ bool CvtDtype(int dtype, int* ptype) {
       *ptype = DTYPE_INT16;
       break;
     case 2:
-      *ptype = DTYPE_FP32;
+    case 3:  // case 2 above falls through: both map to DTYPE_INT32
+      *ptype = DTYPE_INT32;
       break;
     case 5:
       *ptype = DTYPE_FP32;
diff --git a/lite/kernels/bm/bridges/elementwise_ops.cc b/lite/kernels/bm/bridges/elementwise_ops.cc
index 715874d418871076a0070a8333ea4348de881aff..9124821b6edb6b7263743a004e2c923ce48994d8 100644
--- a/lite/kernels/bm/bridges/elementwise_ops.cc
+++ b/lite/kernels/bm/bridges/elementwise_ops.cc
@@ -127,7 +127,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   const float* y_data = const_cast<const float*>(y->mutable_data<float>());
   const float* x_data = const_cast<const float*>(x->mutable_data<float>());
   auto unique_op_name = lite::subgraph::bm::UniqueName("expand_ndims");
-  std::vector<int32_t> i_expand_shape_data(3);
+  std::vector<int32_t> i_expand_shape_data;
   if (x_is_const && y_is_const) {
     float* cpu_data = compute_elementwise_both_const(op);
     bm_add_const_tensor(graph->GetCompilerHandle(),
@@ -157,12 +157,31 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
             static_cast<const char*>(unique_op_name.c_str()));
         name[1] = static_cast<const char*>(unique_op_name.c_str());
         dim[1] = 3;
-        i_expand_shape_data[0] = i_y_shape_data[0];
-        i_expand_shape_data[1] = 1;
-        i_expand_shape_data[2] = 1;
+        i_expand_shape_data.push_back(i_y_shape_data[0]);
+        i_expand_shape_data.push_back(1);
+        i_expand_shape_data.push_back(1);
         shape[1] = &i_expand_shape_data[0];
         y_data = nullptr;
       }
+    } else {
+      if (dim[1] < dim[0]) {  // y has fewer dims than x: pad y's shape
+        for (size_t i = 0; i < dim[1]; i++) {  // keep y's own extents first
+          i_expand_shape_data.push_back(i_y_shape_data[i]);
+        }
+        for (size_t i = dim[1]; i < dim[0]; i++) {  // then pad with 1s
+          i_expand_shape_data.push_back(1);
+        }
+        add_reshape_layer_v2(graph->GetCompilerHandle(),
+                             name[1],
+                             shape[1],
+                             dim[1],
+                             static_cast<const char*>(unique_op_name.c_str()),
+                             const_cast<const int*>(&i_expand_shape_data[0]),
+                             i_expand_shape_data.size());
+        dim[1] = dim[0];  // from here on, input 1 is the reshape output
+        shape[1] = &i_expand_shape_data[0];
+        name[1] = static_cast<const char*>(unique_op_name.c_str());
+      }
     }
     add_binary_layer_v2(graph->GetCompilerHandle(),
                         name[0],
diff --git a/lite/kernels/bm/bridges/multiclass_nms_op.cc b/lite/kernels/bm/bridges/multiclass_nms_op.cc
index fb7d656dd26408a2fe37a9a3ddba1521e3a0bc0d..6270dc9a3072b61895cf587db7d96e5feb329a9e 100644
--- a/lite/kernels/bm/bridges/multiclass_nms_op.cc
+++ b/lite/kernels/bm/bridges/multiclass_nms_op.cc
@@ -51,7 +51,7 @@ int MultiClassNMSConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto score_threshold = op_info->GetAttr<float>("score_threshold");
   auto nms_threshold = op_info->GetAttr<float>("nms_threshold");
   auto nms_eta = op_info->GetAttr<float>("nms_eta");
-  bool normalized;
+  bool normalized = true;  // matches the multiclass_nms op's default
   if (op_info->HasAttr("normalized")) {
     normalized = op_info->GetAttr<bool>("normalized");
   }
@@ -97,12 +97,39 @@ int MultiClassNMSConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   in_dim[1] = score_dims.size();
   in_name[0] = static_cast<const char*>(boxes_var_name.c_str());
   in_name[1] = static_cast<const char*>(score_var_name.c_str());
-  int32_t* out_shape[1];
-  int32_t out_dim[1];
-  const char* out_name[1];
+  int32_t* out_shape[2];
+  int32_t out_dim[2];
+  const char* out_name[2];
   out_shape[0] = &i_out_shape_data[0];
   out_dim[0] = out_dims.size();
   out_name[0] = static_cast<const char*>(out_var_name.c_str());
+
+  std::vector<int64_t> vec_index_dim(score_dims.size());
+  std::vector<int32_t> i_out_index_shape_data(score_dims.size());
+  std::string out_index_name = "";
+  if (op_type == "multiclass_nms2") {  // nms2 has a second output, "Index"
+    output_num = 2;
+    out_index_name = op_info->Output("Index").front();
+    auto out_index = scope->FindVar(out_index_name)->GetMutable<lite::Tensor>();
+    if (3 == score_dims.size()) {  // -> [score_dims[0], keep_top_k, 1]
+      vec_index_dim[0] = score_dims[0];
+      vec_index_dim[1] = keep_top_k;
+      vec_index_dim[2] = 1;
+    } else {  // assumes 2-D scores -> [keep_top_k, 1]; TODO confirm
+      vec_index_dim[0] = keep_top_k;
+      vec_index_dim[1] = 1;
+    }
+    DDimLite index_dims(vec_index_dim);
+    out_index->Resize(index_dims);
+    out_index->mutable_data<float>();  // NOTE(review): confirm Index is float
+    for (size_t i = 0; i < index_dims.size(); i++) {  // narrow dims to int32
+      i_out_index_shape_data[i] = static_cast<int32_t>(index_dims[i]);
+    }
+    out_shape[1] = &i_out_index_shape_data[0];
+    out_dim[1] = index_dims.size();
+    out_name[1] = static_cast<const char*>(out_index_name.c_str());
+  }
+
   add_user_cpu_layer(graph->GetCompilerHandle(),
                      input_num,
                      in_shape,
@@ -126,3 +153,6 @@ int MultiClassNMSConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 REGISTER_SUBGRAPH_BRIDGE(multiclass_nms,
                          kBM,
                          paddle::lite::subgraph::bm::MultiClassNMSConverter);
+REGISTER_SUBGRAPH_BRIDGE(multiclass_nms2,
+                         kBM,
+                         paddle::lite::subgraph::bm::MultiClassNMSConverter);
diff --git a/lite/kernels/bm/bridges/paddle_use_bridges.h b/lite/kernels/bm/bridges/paddle_use_bridges.h
index b9b575c6dfb884e3962696dad15f994a9cb8d2e2..1891e13e432688ef5acd3b0b3aa3b174ddc01f46 100644
--- a/lite/kernels/bm/bridges/paddle_use_bridges.h
+++ b/lite/kernels/bm/bridges/paddle_use_bridges.h
@@ -39,6 +39,7 @@ USE_SUBGRAPH_BRIDGE(norm, kBM);
 USE_SUBGRAPH_BRIDGE(prior_box, kBM);
 USE_SUBGRAPH_BRIDGE(box_coder, kBM);
 USE_SUBGRAPH_BRIDGE(multiclass_nms, kBM);
+USE_SUBGRAPH_BRIDGE(multiclass_nms2, kBM);
 USE_SUBGRAPH_BRIDGE(nearest_interp, kBM);
 USE_SUBGRAPH_BRIDGE(bilinear_interp, kBM);
 USE_SUBGRAPH_BRIDGE(yolo_box, kBM);
diff --git a/lite/kernels/bm/bridges/yolo_box_op.cc b/lite/kernels/bm/bridges/yolo_box_op.cc
index a5ea07f5fdece51d623f26a87cc7f7d4b727d58e..c1f8fa100f65d3665479747f04a841c7ab642d3e 100644
--- a/lite/kernels/bm/bridges/yolo_box_op.cc
+++ b/lite/kernels/bm/bridges/yolo_box_op.cc
@@ -67,17 +67,17 @@ int YoloBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto downsample_ratio = op_info->GetAttr<int>("downsample_ratio");
   auto conf_thresh = op_info->GetAttr<float>("conf_thresh");
   auto anchors = op_info->GetAttr<std::vector<int>>("anchors");
-  int* anchors_buffer = static_cast<int*>(malloc(sizeof(int) * anchors.size()));
-  CHECK(anchors_buffer != nullptr);
-  memcpy(anchors_buffer, &anchors[0], sizeof(int) * anchors.size());
+  CHECK_LE(anchors.size(), 2000);
   user_cpu_param_t bm_param;
   bm_param.op_type = USER_PADDLE_YOLO_BOX;
   bm_param.u.yolo_box_param.class_num = class_num;
   bm_param.u.yolo_box_param.downsample_ratio = downsample_ratio;
   bm_param.u.yolo_box_param.conf_thresh = conf_thresh;
-  bm_param.u.yolo_box_param.anchors = anchors_buffer;
+  memset(bm_param.u.yolo_box_param.anchors, 0, 2000 * sizeof(int));
+  for (size_t i = 0; i < anchors.size(); i++) {  // safe when empty
+    bm_param.u.yolo_box_param.anchors[i] = anchors[i];
+  }
   bm_param.u.yolo_box_param.anchors_size = anchors.size();
-  memcpy(anchors_buffer, &anchors[0], sizeof(int) * anchors.size());
   int32_t input_num = 2;
   int32_t output_num = 2;
   int32_t* in_shape[2];