未验证 提交 328d2da4 编写于 作者: Z zhaoying9105 提交者: GitHub

three ops: density prior box & box coder & multiclass nms (#116)

* (feat): add cnplugin cmake

* (feat): add cnplugin deps to backends/CMakeLists.txt and utility.h

* (feat): add box_coder converter

* (feat): add density_prior_box op

* (feat) add multiclass nms converter, while workspace as static tensor

* (ref): change nms test input from file to random, add iou

* (ref): add density_prior_box & box_coder to paddle_use_bridges.h
上级 53544680
...@@ -36,6 +36,12 @@ if(NOT CNRT_INC) ...@@ -36,6 +36,12 @@ if(NOT CNRT_INC)
message(FATAL_ERROR "Can not find cnrt.h in ${NEUWARE_HOME}/include") message(FATAL_ERROR "Can not find cnrt.h in ${NEUWARE_HOME}/include")
endif() endif()
find_path(CNPLUGIN_INC NAMES cnplugin.h
PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH)
if(NOT CNPLUGIN_INC)
message(FATAL_ERROR "Can not find cnplugin.h in ${NEUWARE_HOME}/include")
endif()
include_directories("${NEUWARE_HOME}/include") include_directories("${NEUWARE_HOME}/include")
find_library(CNML_LIB_FILE NAMES cnml find_library(CNML_LIB_FILE NAMES cnml
...@@ -59,3 +65,15 @@ else() ...@@ -59,3 +65,15 @@ else()
add_library(cnrt_lib SHARED IMPORTED GLOBAL) add_library(cnrt_lib SHARED IMPORTED GLOBAL)
set_property(TARGET cnrt_lib PROPERTY IMPORTED_LOCATION ${CNRT_LIB_FILE}) set_property(TARGET cnrt_lib PROPERTY IMPORTED_LOCATION ${CNRT_LIB_FILE})
endif() endif()
find_library(CNPLUGIN_LIB_FILE NAMES cnplugin
PATHS ${NEUWARE_HOME}/lib64)
if(NOT CNPLUGIN_LIB_FILE)
message(FATAL_ERROR "Can not find CNPLUGIN Library in ${NEUWARE_HOME}/lib64")
else()
message(STATUS "Found CNPLUGIN Library: ${CNPLUGIN_LIB_FILE}")
add_library(cnplugin_lib SHARED IMPORTED GLOBAL)
set_property(TARGET cnplugin_lib PROPERTY IMPORTED_LOCATION ${CNPLUGIN_LIB_FILE})
endif()
\ No newline at end of file
...@@ -4,4 +4,4 @@ endif() ...@@ -4,4 +4,4 @@ endif()
message (STATUS "Lite with mlu backend") message (STATUS "Lite with mlu backend")
lite_cc_library(target_wrapper_mlu SRCS target_wrapper.cc DEPS cnml_lib cnrt_lib) lite_cc_library(target_wrapper_mlu SRCS target_wrapper.cc DEPS cnml_lib cnrt_lib cnplugin_lib)
...@@ -28,6 +28,9 @@ lite_cc_library(subgraph_bridge_argmax_op_mlu SRCS argmax_op.cc DEPS ${subgraph_ ...@@ -28,6 +28,9 @@ lite_cc_library(subgraph_bridge_argmax_op_mlu SRCS argmax_op.cc DEPS ${subgraph_
lite_cc_library(subgraph_bridge_squeeze_op_mlu SRCS squeeze_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_squeeze_op_mlu SRCS squeeze_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_reshape_op_mlu SRCS reshape_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_reshape_op_mlu SRCS reshape_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_flatten_op_mlu SRCS flatten_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_flatten_op_mlu SRCS flatten_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_box_coder_op_mlu SRCS box_coder_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_density_prior_box_op_mlu SRCS density_prior_box_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_multiclass_nms_mlu SRCS multiclass_nms.cc multiclass_nms_api.cc multiclass_nms_impl.o DEPS ${subgraph_bridge_deps_mlu})
set(mlu_subgraph_bridges set(mlu_subgraph_bridges
subgraph_bridge_registry subgraph_bridge_registry
subgraph_bridge_utility_mlu subgraph_bridge_utility_mlu
...@@ -52,6 +55,9 @@ set(mlu_subgraph_bridges ...@@ -52,6 +55,9 @@ set(mlu_subgraph_bridges
subgraph_bridge_squeeze_op_mlu subgraph_bridge_squeeze_op_mlu
subgraph_bridge_reshape_op_mlu subgraph_bridge_reshape_op_mlu
subgraph_bridge_flatten_op_mlu subgraph_bridge_flatten_op_mlu
subgraph_bridge_box_coder_op_mlu
subgraph_bridge_density_prior_box_op_mlu
subgraph_bridge_multiclass_nms_mlu
CACHE INTERNAL "mlu_subgraph_bridges") CACHE INTERNAL "mlu_subgraph_bridges")
...@@ -88,6 +94,9 @@ lite_cc_test(test_argmax_converter_mlu SRCS argmax_op_test.cc DEPS scope optimiz ...@@ -88,6 +94,9 @@ lite_cc_test(test_argmax_converter_mlu SRCS argmax_op_test.cc DEPS scope optimiz
lite_cc_test(test_squeeze_converter_mlu SRCS squeeze_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_squeeze_converter_mlu SRCS squeeze_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_reshape_converter_mlu SRCS reshape_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_reshape_converter_mlu SRCS reshape_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_flatten_converter_mlu SRCS flatten_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_flatten_converter_mlu SRCS flatten_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_box_coder_mlu SRCS box_coder_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_density_prior_box_mlu SRCS density_prior_box_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_multiclass_nms_op_converter_mlu SRCS multiclass_nms_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
if (LITE_BUILD_EXTRA) if (LITE_BUILD_EXTRA)
lite_cc_test(test_norm_converter_mlu SRCS norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_norm_converter_mlu SRCS norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_lrn_converter_mlu SRCS lrn_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_lrn_converter_mlu SRCS lrn_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
// Maps the paddle `code_type` attribute string to the CNML enum.
// "encode_center_size" selects Encode; every other string (including
// "decode_center_size") is treated as Decode.
inline cnmlBoxCodeType_t GetBoxCodeType(const std::string& type) {
  return (type == "encode_center_size") ? cnmlBoxCodeType_t::Encode
                                        : cnmlBoxCodeType_t::Decode;
}
// Converts a paddle `box_coder` op into a CNML plugin BoxCoder op and fuses
// it into the MLU subgraph held by `ctx`.
// Hard requirements (aborts via LOG(FATAL) otherwise): the optional
// PriorBoxVar input must be present, and the `axis` attribute must be set.
// `kernel` is unused by this converter.
int BoxCoderConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[MLU] Converting " + op_type + "...";
  // Variable names of the op's inputs/outputs as recorded in the op desc.
  auto Prior_box_name = op_info->Input("PriorBox").front();
  auto Target_box_name = op_info->Input("TargetBox").front();
  auto Output_box_name = op_info->Output("OutputBox").front();
  // The plugin consumes variances as a tensor input, so the (normally
  // optional) PriorBoxVar argument is mandatory for this bridge.
  std::vector<std::string> input_arg_names = op_info->InputArgumentNames();
  if (std::find(input_arg_names.begin(),
                input_arg_names.end(),
                "PriorBoxVar") == input_arg_names.end()) {
    LOG(FATAL) << "box coder mlu kernel expect PriorBoxVar input" << std::endl;
  }
  auto box_var_name = op_info->Input("PriorBoxVar").front();
  auto* prior_box = scope->FindVar(Prior_box_name)->GetMutable<Tensor>();
  auto* target_box = scope->FindVar(Target_box_name)->GetMutable<Tensor>();
  auto* proposals = scope->FindVar(Output_box_name)->GetMutable<Tensor>();
  auto* box_var = scope->FindVar(box_var_name)->GetMutable<Tensor>();
  auto code_type_str = op_info->GetAttr<std::string>("code_type");
  auto box_normalized = op_info->GetAttr<bool>("box_normalized");
  int axis = -1;
  if (op_info->HasAttr("axis")) {
    axis = op_info->GetAttr<int>("axis");
  } else {
    LOG(FATAL) << "box coder mlu kernel expect axis" << std::endl;
  }
  // The shared `variance` attribute is not supported by this bridge (the
  // tensor input above is used instead); only warn and dump it for debugging.
  if (op_info->HasAttr("variance")) {
    LOG(WARNING) << "box coder mlu kernel expect not have variance attr"
                 << std::endl;
    VLOG(6) << "variance: ";
    auto variance_vec = op_info->GetAttr<std::vector<float>>("variance");
    for (size_t i = 0; i < variance_vec.size(); i++) {
      VLOG(6) << variance_vec[i];
    }
  }
  cnmlBoxCodeType_t code_type = GetBoxCodeType(code_type_str);
  // Derive the (row, col, len) geometry the plugin expects; the meaning of
  // each input's dims depends on the code type (see shape comments below).
  int row = -1;
  int len = -1;
  int col = -1;
  if (code_type == cnmlBoxCodeType_t::Encode) {
    // target_box_shape = {row, len};
    // prior_box_shape = {col, len};
    // output_shape = {row, col, len};
    row = target_box->dims()[0];
    len = target_box->dims()[1];
    col = prior_box->dims()[0];
  } else if (code_type == cnmlBoxCodeType_t::Decode) {
    // target_box_shape = {row,col,len};
    // prior_box_shape = {col, len} if axis == 0, or {row, len};
    // output_shape = {row, col, len};
    row = target_box->dims()[0];
    col = target_box->dims()[1];
    len = target_box->dims()[2];
    if (axis == 0) {
      CHECK(prior_box->dims()[0] == col);
    } else {
      CHECK(prior_box->dims()[0] == row);
    }
  }
  // Tell the plugin whether the graph runs in fp32 (vs. fp16).
  bool float32_precision = false;
  if (graph->FPType() == CNML_DATA_FLOAT32) {
    float32_precision = true;
  }
  // =================== DEBUG ======================
  VLOG(6) << "prior_box->dims(): " << prior_box->dims();
  VLOG(6) << "target_box->dims(): " << target_box->dims();
  VLOG(6) << "box_var->dims(): " << box_var->dims();
  VLOG(6) << "proposals->dims(): " << proposals->dims();
  VLOG(6) << "code_type_str: " << code_type_str;
  VLOG(6) << "col: " << col;
  VLOG(6) << "row: " << row;
  VLOG(6) << "len: " << len;
  VLOG(6) << "axis: " << axis;
  VLOG(6) << "box_normalized :" << box_normalized;
  VLOG(6) << "float32_precision: " << float32_precision;
  VLOG(6) << "Prior_box_name: " << Prior_box_name;
  VLOG(6) << "Target_box_name: " << Target_box_name;
  VLOG(6) << "Output_box_name: " << Output_box_name;
  VLOG(6) << "box_var_name: " << box_var_name;
  // =================== DEBUG END ======================
  // Inputs must already exist as graph nodes; only the output is created here.
  auto target_box_tensor = graph->GetNode(Target_box_name);
  auto prior_box_tensor = graph->GetNode(Prior_box_name);
  auto box_var_tensor = graph->GetNode(box_var_name);
  auto proposals_tensor = graph->AddNode(Output_box_name,
                                         proposals->dims().Vectorize(),
                                         CNML_TENSOR,
                                         CNML_NCHW,
                                         graph->FPType());
  cnmlPluginBoxCoderOpParam_t param;
  CNML_CALL(
      cnmlCreatePluginBoxCoderOpParam(&param,
                                      row,
                                      col,
                                      len,
                                      axis,
                                      box_normalized,
                                      float32_precision,
                                      code_type,
                                      TargetWrapperMlu::MLUCoreVersion()));
  cnmlBaseOp_t box_coder_op;
  // Plugin input order: target boxes, prior boxes, prior-box variances.
  cnmlTensor_t input_tensors[3];
  input_tensors[0] = target_box_tensor->mlu_tensor();
  input_tensors[1] = prior_box_tensor->mlu_tensor();
  input_tensors[2] = box_var_tensor->mlu_tensor();
  cnmlTensor_t output_tensors[1];
  output_tensors[0] = proposals_tensor->mlu_tensor();
  CNML_CALL(cnmlCreatePluginBoxCoderOp(
      &box_coder_op, param, input_tensors, output_tensors));
  // CNML_CALL(cnmlSetOperationComputingLayout(box_coder_op, CNML_NCHW)); //
  // important
  graph->FuseOp(box_coder_op);
  // The param struct is only needed during op creation; release it now.
  cnmlDestroyPluginBoxCoderOpParam(&param);
  return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(box_coder,
kMLU,
paddle::lite::subgraph::mlu::BoxCoderConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/box_coder_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
// Debug helper: dumps every element of `tensor` to `file_name`,
// one value per line, overwriting any existing file.
void ToFile(Tensor *tensor, std::string file_name) {
  std::ostringstream buffer;
  const int total = tensor->dims().production();
  auto values = tensor->mutable_data<float>();
  for (int i = 0; i < total; ++i) {
    buffer << values[i] << std::endl;
  }
  std::ofstream of;
  of.open(file_name, std::ios::out);
  of << buffer.str();
  of.close();
}
// Maps a cnmlBoxCodeType_t back to the paddle `code_type` attribute string.
// Aborts (CHECK failure) on any other enum value.
inline std::string BoxCodeTypeToStr(cnmlBoxCodeType_t code_type) {
  if (code_type == cnmlBoxCodeType_t::Encode) {
    return "encode_center_size";
  } else if (code_type == cnmlBoxCodeType_t::Decode) {
    return "decode_center_size";
  } else {
    CHECK(false);
    // Unreachable after CHECK aborts; this return fixes the original's
    // fall-off-the-end of a non-void function (undefined behavior).
    return "";
  }
}
// Maps the paddle `code_type` attribute string to the CNML enum.
// Aborts (CHECK failure) on an unrecognized string.
inline cnmlBoxCodeType_t GetBoxCodeType(const std::string &type) {
  if (type == "encode_center_size") {
    return cnmlBoxCodeType_t::Encode;
  } else if (type == "decode_center_size") {
    return cnmlBoxCodeType_t::Decode;
  } else {
    CHECK(false);
    // Unreachable after CHECK aborts; this return fixes the original's
    // fall-off-the-end of a non-void function (undefined behavior).
    return cnmlBoxCodeType_t::Decode;
  }
}
// CPU reference for box_coder "encode_center_size".
//   target_box_data : {row, len} ground-truth boxes (x1, y1, x2, y2)
//   prior_box_data  : {col, len} prior boxes
//   output          : {row, col, len} encoded offsets
// When `normalized` is false, widths/heights get a +1 pixel adjustment.
// After encoding, each coordinate is divided by the per-prior-box variance
// (if prior_box_var_data is non-null) or by the shared `variance` vector
// (if non-empty); otherwise it is left unscaled.
// prior_box_var_shape is accepted for signature symmetry but not read.
void EncodeCenterSize(float *target_box_data,
                      float *prior_box_data,
                      float *prior_box_var_data,
                      std::vector<int64_t> target_box_shape,
                      std::vector<int64_t> prior_box_shape,
                      std::vector<int64_t> prior_box_var_shape,
                      const bool normalized,
                      const std::vector<float> variance,
                      float *output) {
  const int64_t row = target_box_shape[0];
  const int64_t col = prior_box_shape[0];
  const int64_t len = prior_box_shape[1];
  const float norm_adj = normalized ? 0.f : 1.f;
  for (int64_t i = 0; i < row; ++i) {
    for (int64_t j = 0; j < col; ++j) {
      const size_t out_idx = i * col * len + j * len;
      const float *pb = prior_box_data + j * len;
      const float *tb = target_box_data + i * len;
      // Prior box in center/size form.
      const float pw = pb[2] - pb[0] + norm_adj;
      const float ph = pb[3] - pb[1] + norm_adj;
      const float pcx = pb[0] + pw / 2;
      const float pcy = pb[1] + ph / 2;
      // Target box in center/size form.
      const float tcx = (tb[2] + tb[0]) / 2;
      const float tcy = (tb[3] + tb[1]) / 2;
      const float tw = tb[2] - tb[0] + norm_adj;
      const float th = tb[3] - tb[1] + norm_adj;
      // Encoded offsets: centers normalized by prior size, log size ratios.
      output[out_idx] = (tcx - pcx) / pw;
      output[out_idx + 1] = (tcy - pcy) / ph;
      output[out_idx + 2] = std::log(std::fabs(tw / pw));
      output[out_idx + 3] = std::log(std::fabs(th / ph));
    }
  }
  if (prior_box_var_data) {
    // Per-prior-box variances: row j of the variance tensor scales every
    // output row that uses prior box j.
    for (int64_t i = 0; i < row; ++i) {
      for (int64_t j = 0; j < col; ++j) {
        const size_t out_idx = i * col * len + j * len;
        const float *pv = prior_box_var_data + j * len;
        for (int k = 0; k < 4; ++k) {
          output[out_idx + k] /= pv[k];
        }
      }
    }
  } else if (!variance.empty()) {
    // Shared 4-element variance attribute.
    for (int64_t i = 0; i < row; ++i) {
      for (int64_t j = 0; j < col; ++j) {
        const size_t out_idx = i * col * len + j * len;
        for (int k = 0; k < 4; ++k) {
          output[out_idx + k] /= static_cast<float>(variance[k]);
        }
      }
    }
  }
}
// CPU reference for box_coder "decode_center_size".
//   target_box_data : {row, col, len} encoded offsets
//   prior_box_data  : {col, len} when axis == 0, {row, len} when axis == 1
//   output          : {row, col, len} decoded boxes (x1, y1, x2, y2)
// var_size selects the variance source:
//   2 -> per-prior-box variances read from prior_box_var_data,
//   1 -> the shared `variance` attribute vector,
//   0 -> no variances (all 1.0).
// When `normalized` is false, sizes get the +1 pixel adjustment and the
// max corner gets a matching -1.
// target_box_shape drives the loop bounds; the other shape arguments are
// accepted for signature symmetry but not read.
template <int axis, int var_size>
void DecodeCenterSize(float *target_box_data,
                      float *prior_box_data,
                      float *prior_box_var_data,
                      std::vector<int64_t> target_box_shape,
                      std::vector<int64_t> prior_box_shape,
                      std::vector<int64_t> prior_box_var_shape,
                      const bool normalized,
                      std::vector<float> variance,
                      float *output) {
  const int64_t row = target_box_shape[0];
  const int64_t col = target_box_shape[1];
  const int64_t len = target_box_shape[2];
  const float norm_adj = normalized ? 0.f : 1.f;
  for (int64_t i = 0; i < row; ++i) {
    for (int64_t j = 0; j < col; ++j) {
      const size_t offset = i * col * len + j * len;
      // Prior boxes are indexed by column when axis == 0, by row otherwise.
      const int prior_offset = (axis == 0) ? j * len : i * len;
      const float *pb = prior_box_data + prior_offset;
      const float pw = pb[2] - pb[0] + norm_adj;
      const float ph = pb[3] - pb[1] + norm_adj;
      const float pcx = pb[0] + pw / 2;
      const float pcy = pb[1] + ph / 2;
      // Default variances of 1.0 (var_size == 0 keeps these).
      float var_buf[4] = {1.f, 1.f, 1.f, 1.f};
      const float *vp = var_buf;
      if (var_size == 2) {
        std::memcpy(var_buf, prior_box_var_data + prior_offset,
                    4 * sizeof(float));
      } else if (var_size == 1) {
        vp = variance.data();
      }
      // Decode: variance-scaled offsets around the prior center/size.
      const float tcx = vp[0] * target_box_data[offset] * pw + pcx;
      const float tcy = vp[1] * target_box_data[offset + 1] * ph + pcy;
      const float tw = std::exp(vp[2] * target_box_data[offset + 2]) * pw;
      const float th = std::exp(vp[3] * target_box_data[offset + 3]) * ph;
      output[offset] = tcx - tw / 2;
      output[offset + 1] = tcy - th / 2;
      output[offset + 2] = tcx + tw / 2 - norm_adj;
      output[offset + 3] = tcy + th / 2 - norm_adj;
    }
  }
}
// CPU reference implementation of the box_coder op.
// Dispatches to EncodeCenterSize, or to the DecodeCenterSize<axis, var_size>
// specialization selected by `axis` and by which variance source is present:
//   var_size == 2 -> per-prior-box variances (box_var tensor data non-null),
//   var_size == 1 -> shared `variance` attribute vector,
//   var_size == 0 -> no variances.
// The result is written into `output_box`, which must already be sized.
void Compute(cnmlBoxCodeType_t code_type,
             lite::Tensor *prior_box,
             lite::Tensor *target_box,
             lite::Tensor *box_var,
             lite::Tensor *output_box,
             std::vector<float> variance,
             bool normalized,
             int axis) {
  auto *prior_box_data = prior_box->mutable_data<float>();
  auto *prior_box_var_data = box_var->mutable_data<float>();
  auto *target_box_data = target_box->mutable_data<float>();
  auto *output_data = output_box->mutable_data<float>();
  auto target_box_shape = target_box->dims().Vectorize();
  auto prior_box_shape = prior_box->dims().Vectorize();
  auto prior_box_var_shape = box_var->dims().Vectorize();
  if (code_type == cnmlBoxCodeType_t::Encode) {
    EncodeCenterSize(target_box_data,
                     prior_box_data,
                     prior_box_var_data,
                     target_box_shape,
                     prior_box_shape,
                     prior_box_var_shape,
                     normalized,
                     variance,
                     output_data);
  } else if (code_type == cnmlBoxCodeType_t::Decode) {
    if (prior_box_var_data) {
      LOG(INFO) << "prior_box_var_data not null" << std::endl;
      if (axis == 0) {
        // Fixed log text: this path instantiates DecodeCenterSize<0, 2>
        // (the original log wrongly claimed "<1, 2>" here).
        LOG(INFO) << "use DecodeCenterSize<0, 2> axis == 0" << std::endl;
        DecodeCenterSize<0, 2>(target_box_data,
                               prior_box_data,
                               prior_box_var_data,
                               target_box_shape,
                               prior_box_shape,
                               prior_box_var_shape,
                               normalized,
                               variance,
                               output_data);
      } else {
        LOG(INFO) << "use DecodeCenterSize<1, 2> axis == 1" << std::endl;
        DecodeCenterSize<1, 2>(target_box_data,
                               prior_box_data,
                               prior_box_var_data,
                               target_box_shape,
                               prior_box_shape,
                               prior_box_var_shape,
                               normalized,
                               variance,
                               output_data);
      }
    } else if (!(variance.empty())) {
      LOG(INFO) << "prior_box_var_data null" << std::endl;
      if (axis == 0) {
        DecodeCenterSize<0, 1>(target_box_data,
                               prior_box_data,
                               prior_box_var_data,
                               target_box_shape,
                               prior_box_shape,
                               prior_box_var_shape,
                               normalized,
                               variance,
                               output_data);
      } else {
        DecodeCenterSize<1, 1>(target_box_data,
                               prior_box_data,
                               prior_box_var_data,
                               target_box_shape,
                               prior_box_shape,
                               prior_box_var_shape,
                               normalized,
                               variance,
                               output_data);
      }
    } else {
      // No variance information at all: decode with implicit 1.0 variances.
      if (axis == 0) {
        DecodeCenterSize<0, 0>(target_box_data,
                               prior_box_data,
                               prior_box_var_data,
                               target_box_shape,
                               prior_box_shape,
                               prior_box_var_shape,
                               normalized,
                               variance,
                               output_data);
      } else {
        DecodeCenterSize<1, 0>(target_box_data,
                               prior_box_data,
                               prior_box_var_data,
                               target_box_shape,
                               prior_box_shape,
                               prior_box_var_shape,
                               normalized,
                               variance,
                               output_data);
      }
    }
  }
}
// Runs the CPU reference implementation for the box_coder op described by
// `op`: resolves the input/output tensors and attributes from the op's scope
// and op_info, then delegates to Compute(), which writes the result into the
// OutputBox tensor.
void box_coder_ref(const std::shared_ptr<operators::BoxCoderOpLite> op) {
  Scope *scope = op->scope();
  const OpInfo *op_info = op->op_info();
  // Resolve input/output tensors from the scope by their argument names.
  auto prior_box =
      scope->FindVar(op_info->Input("PriorBox").front())->GetMutable<Tensor>();
  auto target_box =
      scope->FindVar(op_info->Input("TargetBox").front())->GetMutable<Tensor>();
  auto box_var = scope->FindVar(op_info->Input("PriorBoxVar").front())
                     ->GetMutable<Tensor>();
  auto output_box = scope->FindVar(op_info->Output("OutputBox").front())
                        ->GetMutable<Tensor>();
  auto code_type_str = op_info->GetAttr<std::string>("code_type");
  auto box_normalized = op_info->GetAttr<bool>("box_normalized");
  auto axis = op_info->GetAttr<int>("axis");
  auto code_type = GetBoxCodeType(code_type_str);
  // The variance attribute is optional; an empty vector means "not set".
  std::vector<float> variance;
  if (op_info->HasAttr("variance")) {
    variance = op_info->GetAttr<std::vector<float>>("variance");
  }
  Compute(code_type,
          prior_box,
          target_box,
          box_var,
          output_box,
          variance,
          box_normalized,
          axis);
}
// End-to-end check of the MLU box_coder bridge against the CPU reference:
// builds the op with deterministic input data, runs box_coder_ref for the
// expected output, launches the op on the MLU, then compares element-wise
// (after transposing the MLU layout back to NCHW).
void test_box_coder(int row,
                    int col,
                    int len,
                    int axis,
                    cnmlBoxCodeType_t code_type,
                    bool box_normalized) {
  // prepare input&output variables
  Scope scope;
  std::string prior_box_var_name("PriorBox");
  std::string taget_box_var_name("TargetBox");
  std::string output_box_var_name("OutputBox");
  std::string box_var_var_name("PriorBoxVar");
  std::string output_ref_var_name("OutputBox_ref");
  auto *prior_box = scope.Var(prior_box_var_name)->GetMutable<Tensor>();
  auto *target_box = scope.Var(taget_box_var_name)->GetMutable<Tensor>();
  auto *box_var = scope.Var(box_var_var_name)->GetMutable<Tensor>();
  auto *output_box = scope.Var(output_box_var_name)->GetMutable<Tensor>();
  auto *output_box_ref = scope.Var(output_ref_var_name)->GetMutable<Tensor>();
  // Shapes depend on the code type (same conventions as the converter).
  if (code_type == cnmlBoxCodeType_t::Encode) {
    // target_box_shape = {row, len};
    // prior_box_shape = {col, len};
    // output_shape = {row, col, len};
    target_box->Resize({row, len});
    prior_box->Resize({col, len});
    box_var->Resize({col, len});
  } else if (code_type == cnmlBoxCodeType_t::Decode) {
    // target_box_shape = {row,col,len};
    // prior_box_shape = {col, len} if axis == 0, or {row, len};
    // output_shape = {row, col, len};
    target_box->Resize({row, col, len});
    if (axis == 0) {
      prior_box->Resize({col, len});
      box_var->Resize({col, len});
    } else if (axis == 1) {
      prior_box->Resize({row, len});
      box_var->Resize({row, len});
    } else {
      LOG(FATAL) << "axis should in {0,1} ,but got " << axis << std::endl;
    }
  }
  // initialize input&output data
  // FillTensor<float>(prior_box);
  // FillTensor<float>(target_box);
  // FillTensor<float, int>(box_var); // ??????
  // Deterministic patterns: boxes cycle 1..8, variances repeat
  // {0.1, 0.1, 0.2, 0.2} per box.
  for (int i = 0; i < prior_box->dims().production(); i++) {
    prior_box->mutable_data<float>()[i] = static_cast<float>((i % 8) + 1);
  }
  for (int i = 0; i < target_box->dims().production(); i++) {
    target_box->mutable_data<float>()[i] = static_cast<float>((i % 8) + 1);
  }
  for (int i = 0; i < box_var->dims().production() / 4; i++) {
    box_var->mutable_data<float>()[i * 4 + 0] = 0.1;
    box_var->mutable_data<float>()[i * 4 + 1] = 0.1;
    box_var->mutable_data<float>()[i * 4 + 2] = 0.2;
    box_var->mutable_data<float>()[i * 4 + 3] = 0.2;
  }
  LOG(INFO) << "prior_box count : " << prior_box->dims().production();
  LOG(INFO) << "target_box count : " << target_box->dims().production();
  LOG(INFO) << "box_var count : " << box_var->dims().production();
  // ToFile(*prior_box, "prior_box.txt");
  // ToFile(*box_var, "box_var.txt");
  // ToFile(*target_box, "target_box.txt");
  // initialize op desc
  cpp::OpDesc opdesc;
  opdesc.SetType("box_coder");
  opdesc.SetInput("PriorBox", {prior_box_var_name});
  opdesc.SetInput("TargetBox", {taget_box_var_name});
  opdesc.SetInput("PriorBoxVar", {box_var_var_name});
  opdesc.SetOutput("OutputBox", {output_box_var_name});
  opdesc.SetAttr("axis", axis);
  opdesc.SetAttr("box_normalized", box_normalized);
  opdesc.SetAttr("code_type", BoxCodeTypeToStr(code_type));
  // trans inputs
  Tensor prior_box_trans;
  Tensor box_var_trans;
  Tensor target_box_trans;
  prior_box_trans.Resize(prior_box->dims());
  box_var_trans.Resize(box_var->dims());
  target_box_trans.Resize(target_box->dims());
  auto op = CreateOp<paddle::lite::operators::BoxCoderOpLite>(opdesc, &scope);
  // Run the CPU reference first (it writes into output_box) and stash the
  // result before LaunchOp overwrites output_box with the MLU result.
  box_coder_ref(op);
  output_box_ref->CopyDataFrom(*output_box);
  // transpose(prior_box->mutable_data<float>(),
  //           prior_box_trans.mutable_data<float>(),
  //           {static_cast<int>(prior_box->dims()[0]),
  //            static_cast<int>(prior_box->dims()[1]),
  //            1,
  //            1},
  //           {0, 2, 3, 1});
  // row col len 1 --> row len 1 col
  // Only the target box is transposed to the MLU-side layout; prior box and
  // variances are fed as-is (see the commented-out transposes above/below).
  transpose(target_box->mutable_data<float>(),
            target_box_trans.mutable_data<float>(),
            {
                static_cast<int>(target_box->dims()[0]),
                static_cast<int>(target_box->dims()[1]),
                static_cast<int>(target_box->dims()[2]),
                1,
            },
            {0, 2, 3, 1});
  // transpose(box_var->mutable_data<float>(),
  //           box_var_trans.mutable_data<float>(),
  //           {static_cast<int>(box_var->dims()[0]),
  //            static_cast<int>(box_var->dims()[0]),
  //            1,
  //            1},
  //           {0, 2, 3, 1});
  target_box->CopyDataFrom(target_box_trans);
  LaunchOp(op,
           {prior_box_var_name, taget_box_var_name, box_var_var_name},
           {output_box_var_name});
  // execute reference implementation and save to output tensor('out')
  // compare results
  auto *output_data = output_box->mutable_data<float>();
  auto *output_ref_data = output_box_ref->mutable_data<float>();
  Tensor output_trans;
  output_trans.Resize(output_box->dims());
  // row * len * 1 * col -> row * col * len * 1
  transpose(output_data,
            output_trans.mutable_data<float>(),
            {static_cast<int>(output_box->dims()[0]),
             static_cast<int>(output_box->dims()[2]),
             1,
             static_cast<int>(output_box->dims()[1])},
            {0, 3, 1, 2});
  output_data = output_trans.mutable_data<float>();
  // ToFile(*output_box, "output_mlu_before_trans.txt");
  // ToFile(&output_trans, "output_mlu.txt");
  // ToFile(output_box_ref, "output_cpu.txt");
  // Element-wise comparison with a loose tolerance (fp16 on the device).
  for (int i = 0; i < output_box->dims().production(); i++) {
    VLOG(6) << i;
    EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-2);
  }
}
// NOTE(review): despite the "prior_density_box" test name, this exercises the
// box_coder bridge (Decode mode, normalized boxes, axis 0).
TEST(MLUBridges, prior_density_box) {
  const int row = 1;
  const int col = 20560;
  const int len = 4;
  const int axis = 0;
  const cnmlBoxCodeType_t code_type = cnmlBoxCodeType_t::Decode;
  const bool box_normalized = true;
  test_box_coder(row, col, len, axis, code_type, box_normalized);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(box_coder, kMLU);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
// Computes the density_prior_box output shapes from the feature map:
// boxes and variances are both {feat_w, feat_h, num_priors, 4}, where
// num_priors = sum over each density d of fixed_ratios.size() * d^2.
void inferShape(Tensor* input,
                Tensor* boxes,
                Tensor* variances,
                std::vector<float> fixed_ratios,
                std::vector<int> densities) {
  auto feat_height = input->dims()[2];
  auto feat_width = input->dims()[3];
  int num_priors = 0;
  for (auto density : densities) {
    num_priors += fixed_ratios.size() * pow(density, 2);
  }
  std::vector<int64_t> boxes_shape = {feat_width, feat_height, num_priors, 4};
  boxes->Resize(boxes_shape);
  variances->Resize(boxes_shape);
}
int DensityPriorBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
auto input_name = op_info->Input("Input").front();
auto image_name = op_info->Input("Image").front();
auto boxes_name = op_info->Output("Boxes").front();
auto variances_name = op_info->Output("Variances").front();
auto input_var = scope->FindVar(input_name)->GetMutable<Tensor>();
auto image_var = scope->FindVar(image_name)->GetMutable<Tensor>();
auto boxes_var = scope->FindVar(boxes_name)->GetMutable<Tensor>();
auto variances_var = scope->FindVar(variances_name)->GetMutable<Tensor>();
auto clip = op_info->GetAttr<bool>("clip");
auto fixed_sizes = op_info->GetAttr<std::vector<float>>("fixed_sizes");
auto fixed_ratios = op_info->GetAttr<std::vector<float>>("fixed_ratios");
auto variances_ = op_info->GetAttr<std::vector<float>>("variances");
auto densities = op_info->GetAttr<std::vector<int>>("densities");
auto offset = op_info->GetAttr<float>("offset");
auto step_w = op_info->GetAttr<float>("step_w");
auto step_h = op_info->GetAttr<float>("step_h");
inferShape(input_var, boxes_var, variances_var, fixed_ratios, densities);
auto input_dims = input_var->dims();
auto image_dims = image_var->dims();
auto boxes_dims = boxes_var->dims();
auto variances_dims = variances_var->dims();
auto feat_tensor = graph->GetNode(input_name);
auto image_tensor = graph->GetNode(image_name);
auto boxes_tensor_trans = graph->AddNode(boxes_name + ".trans.boxes",
boxes_dims.Vectorize(),
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
auto variances_tensor_trans = graph->AddNode(variances_name + ".trans.vars",
variances_dims.Vectorize(),
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
bool float32_precision = false;
if (graph->FPType() == CNML_DATA_FLOAT32) {
float32_precision = true;
}
// ==================== DEBUG ==================
VLOG(6) << "input_name: " << input_name;
VLOG(6) << "image_name: " << image_name;
VLOG(6) << "boxes_name: " << boxes_name;
VLOG(6) << "variances_name: " << variances_name;
VLOG(6) << "input_dims : " << input_dims;
VLOG(6) << "image_dims : " << image_dims;
VLOG(6) << "boxes_dims : " << boxes_dims;
VLOG(6) << "variances_dims : " << variances_dims;
VLOG(6) << "clip : " << clip;
VLOG(6) << "fixed_sizes : ";
for (auto tmp : fixed_sizes) {
VLOG(6) << tmp;
}
VLOG(6) << "fixed_ratios : ";
for (auto tmp : fixed_ratios) {
VLOG(6) << tmp;
}
VLOG(6) << "variances_ : ";
for (auto tmp : variances_) {
VLOG(6) << tmp;
}
VLOG(6) << "densities : ";
for (auto tmp : densities) {
VLOG(6) << tmp;
}
VLOG(6) << "offset : " << offset;
VLOG(6) << "clip : " << clip;
int cnml_boxes_shape[4];
CNML_CALL(
cnmlGetTensorShape(boxes_tensor_trans->mlu_tensor(), cnml_boxes_shape));
VLOG(6) << "cnml_boxes_shape";
for (size_t i = 0; i < 4; i++) {
VLOG(6) << cnml_boxes_shape[i];
}
int cnml_vars_shape[4];
VLOG(6) << "cnml_vars_shape";
CNML_CALL(cnmlGetTensorShape(variances_tensor_trans->mlu_tensor(),
cnml_vars_shape));
for (size_t i = 0; i < 4; i++) {
VLOG(6) << cnml_vars_shape[i];
}
int feat_width = input_dims[3];
int feat_height = input_dims[2];
int image_width = image_dims[3];
int image_height = image_dims[2];
// ==================== DEBUG END ==================
cnmlPluginDensityPriorBoxOpParam_t op_param;
cnmlCreatePluginDensityPriorBoxOpParam(&op_param,
feat_width,
feat_height,
image_width,
image_height,
variances_.data(),
variances_.size(),
densities.data(),
densities.size(),
fixed_sizes.data(),
fixed_sizes.size(),
fixed_ratios.data(),
fixed_ratios.size(),
clip,
step_w,
step_h,
offset,
float32_precision,
TargetWrapperMlu::MLUCoreVersion());
cnmlTensor_t input_tensors[2];
input_tensors[0] = feat_tensor->mlu_tensor();
input_tensors[1] = image_tensor->mlu_tensor();
cnmlTensor_t output_tensors[2];
output_tensors[0] = boxes_tensor_trans->mlu_tensor();
output_tensors[1] = variances_tensor_trans->mlu_tensor();
cnmlBaseOp_t density_prior_box_op;
CNML_CALL(cnmlCreatePluginDensityPriorBoxOp(
&density_prior_box_op, op_param, input_tensors, output_tensors));
std::vector<int> nchw_to_nhwc_axis = {0, 2, 3, 1};
// ============== Boxes Trans =======================
auto boxes_tensor = graph->AddNode(boxes_name,
boxes_dims.Vectorize(),
CNML_TENSOR,
CNML_NCHW,
graph->FPType());
cnmlBaseOp_t trans_boxes_op{nullptr};
cnmlNdTransposeOpParam_t trans_boxes_param{nullptr};
CNML_CALL(cnmlCreateNdTransposeOpParam(
&trans_boxes_param, nchw_to_nhwc_axis.data(), nchw_to_nhwc_axis.size()));
CNML_CALL(cnmlCreateNdTransposeProOp(&trans_boxes_op,
boxes_tensor_trans->mlu_tensor(),
boxes_tensor->mlu_tensor(),
trans_boxes_param));
// ============== Boxes Trans End ===================
// ============== Vars Trans =======================
auto variances_tensor = graph->AddNode(variances_name,
variances_dims.Vectorize(),
CNML_TENSOR,
CNML_NCHW,
graph->FPType());
cnmlBaseOp_t trans_vars_op{nullptr};
cnmlNdTransposeOpParam_t trans_vars_param{nullptr};
CNML_CALL(cnmlCreateNdTransposeOpParam(
&trans_vars_param, nchw_to_nhwc_axis.data(), nchw_to_nhwc_axis.size()));
CNML_CALL(cnmlCreateNdTransposeProOp(&trans_vars_op,
variances_tensor_trans->mlu_tensor(),
variances_tensor->mlu_tensor(),
trans_vars_param));
// ============== Vars Trans End ===================
// cnmlSetOperationComputingLayout(density_prior_box_op,CNML_NCHW);
// cnmlSetTensorComputingLayoutInOperation(
// density_prior_box_op, boxes_tensor->mlu_tensor(), CNML_NCHW);
// cnmlSetTensorComputingLayoutInOperation(
// density_prior_box_op, variances_tensor->mlu_tensor(), CNML_NCHW);
graph->FuseOp(trans_boxes_op);
graph->FuseOp(density_prior_box_op);
graph->FuseOp(trans_vars_op);
// cnmlDestroyPluginDensityPriorBoxOpParam(&op_param);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(density_prior_box,
kMLU,
paddle::lite::subgraph::mlu::DensityPriorBoxConverter);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/density_prior_box_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
// Computes and applies the expected output shapes for density_prior_box:
// both Boxes and Variances get shape {feat_w, feat_h, num_priors, 4}, where
// num_priors = sum_i(density_i^2) * |fixed_ratios|.
// NOTE(review): the first two dims are ordered {W, H, ...} while the CPU
// reference iterates H-major; current tests only use square feature maps,
// so verify the order before using rectangular inputs.
void inferShape_(Tensor* input,
                 Tensor* boxes,
                 Tensor* variances,
                 std::vector<float> fixed_ratios,
                 std::vector<int> densities) {
  const auto feat_height = input->dims()[2];
  const auto feat_width = input->dims()[3];
  // Each density d contributes d*d anchor centers, one box per fixed ratio.
  int num_priors = 0;
  for (const auto density : densities) {
    num_priors += fixed_ratios.size() * density * density;
  }
  const std::vector<int64_t> out_shape = {
      feat_width, feat_height, num_priors, 4};
  boxes->Resize(out_shape);
  variances->Resize(out_shape);
}
// CPU reference implementation of density_prior_box, used to validate the
// MLU kernel. Reads inputs and attributes through the op's scope and writes
// the expected boxes/variances directly into the op's OUTPUT tensors (the
// caller is expected to copy them aside before launching the MLU op).
// Box layout written here is [h][w][prior][4] (xmin, ymin, xmax, ymax),
// normalized by the image size.
void prior_density_box_ref(
    const std::shared_ptr<operators::DensityPriorBoxOpLite> op) {
  Scope* scope = op->scope();
  const OpInfo* op_info = op->op_info();
  auto input =
      scope->FindVar(op_info->Input("Input").front())->GetMutable<Tensor>();
  auto image =
      scope->FindVar(op_info->Input("Image").front())->GetMutable<Tensor>();
  auto boxes_tensor =
      scope->FindVar(op_info->Output("Boxes").front())->GetMutable<Tensor>();
  auto variances = scope->FindVar(op_info->Output("Variances").front())
                       ->GetMutable<Tensor>();
  auto clip = op_info->GetAttr<bool>("clip");
  auto fixed_sizes = op_info->GetAttr<std::vector<float>>("fixed_sizes");
  auto fixed_ratios = op_info->GetAttr<std::vector<float>>("fixed_ratios");
  auto variances_ = op_info->GetAttr<std::vector<float>>("variances");
  auto densities = op_info->GetAttr<std::vector<int>>("densities");
  auto offset = op_info->GetAttr<float>("offset");
  auto step_w = op_info->GetAttr<float>("step_w");
  auto step_h = op_info->GetAttr<float>("step_h");
  // NOTE(review): these two locals are never read — leftover debris.
  std::vector<int> input_shape = {128, 128};
  std::vector<int> image_shape = {256, 256};
  // Total priors per spatial location: sum_i(density_i^2) * |fixed_ratios|.
  int num_priors = 0;
  for (size_t i = 0; i < densities.size(); ++i) {
    num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
  }
  int boxes_count = boxes_tensor->dims().production();
  float* boxes = boxes_tensor->mutable_data<float>();
  float* vars = variances->mutable_data<float>();
  auto img_width = image->dims()[3];
  auto img_height = image->dims()[2];
  auto feature_width = input->dims()[3];
  auto feature_height = input->dims()[2];
  // A zero step means "derive the stride from the image/feature-map ratio".
  float step_width, step_height;
  if (step_w == 0 || step_h == 0) {
    step_width = static_cast<float>(img_width) / feature_width;
    step_height = static_cast<float>(img_height) / feature_height;
  } else {
    step_width = step_w;
    step_height = step_h;
  }
  int step_average = static_cast<int>((step_width + step_height) * 0.5);
  // Pre-compute sqrt(ratio); width scales by sqrt(r), height by 1/sqrt(r),
  // so width/height == r while the area stays fixed_size^2.
  std::vector<float> sqrt_fixed_ratios;
  for (size_t i = 0; i < fixed_ratios.size(); i++) {
    sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
  }
  for (int h = 0; h < feature_height; ++h) {
    for (int w = 0; w < feature_width; ++w) {
      // Anchor center of this feature-map cell in image coordinates.
      float center_x = (w + offset) * step_width;
      float center_y = (h + offset) * step_height;
      int idx = 0;
      // Generate density prior boxes with fixed sizes.
      for (size_t s = 0; s < fixed_sizes.size(); ++s) {
        auto fixed_size = fixed_sizes[s];
        int density = densities[s];
        // density*density sub-centers, spaced `shift` apart within the cell.
        int shift = step_average / density;
        // Generate density prior boxes with fixed ratios.
        for (size_t r = 0; r < fixed_ratios.size(); ++r) {
          float box_width_ratio = fixed_size * sqrt_fixed_ratios[r];
          float box_height_ratio = fixed_size / sqrt_fixed_ratios[r];
          // Top-left sub-center of the density grid.
          float density_center_x = center_x - step_average / 2. + shift / 2.;
          float density_center_y = center_y - step_average / 2. + shift / 2.;
          for (int di = 0; di < density; ++di) {
            for (int dj = 0; dj < density; ++dj) {
              float center_x_temp = density_center_x + dj * shift;
              float center_y_temp = density_center_y + di * shift;
              // Corners are normalized by image size and clamped to [0, 1].
              boxes[h * feature_width * num_priors * 4 + w * num_priors * 4 +
                    idx * 4 + 0] =
                  std::max((center_x_temp - box_width_ratio / 2.) / img_width,
                           0.);
              boxes[h * feature_width * num_priors * 4 + w * num_priors * 4 +
                    idx * 4 + 1] =
                  std::max((center_y_temp - box_height_ratio / 2.) / img_height,
                           0.);
              boxes[h * feature_width * num_priors * 4 + w * num_priors * 4 +
                    idx * 4 + 2] =
                  std::min((center_x_temp + box_width_ratio / 2.) / img_width,
                           1.);
              boxes[h * feature_width * num_priors * 4 + w * num_priors * 4 +
                    idx * 4 + 3] =
                  std::min((center_y_temp + box_height_ratio / 2.) / img_height,
                           1.);
              idx++;
            }
          }
        }
      }
    }
  }
  // Optional global clip to [0, 1] (corners are already clamped above; this
  // mirrors the paddle op's `clip` attribute).
  if (clip) {
    std::transform(boxes, boxes + boxes_count, boxes, [](float v) -> float {
      return std::min<float>(std::max<float>(v, 0.), 1.);
    });
  }
  // Variances are broadcast per box: every box gets the same 4 values.
  int box_num = feature_height * feature_width * num_priors;
  for (int i = 0; i < box_num; ++i) {
    for (size_t j = 0; j < variances_.size(); ++j) {
      vars[i * variances_.size() + j] = variances_[j];
    }
  }
}
// End-to-end test driver: builds a density_prior_box op, computes the CPU
// reference, runs the MLU bridge, transposes the MLU output back from NHWC
// to NCHW and compares against the reference element-wise.
// NOTE(review): the trailing parameters are (step_w, step_h, offset) — keep
// call sites in that exact order.
void test_prior_density_box(int feat_h,
                            int feat_w,
                            int img_h,
                            int img_w,
                            bool clip,
                            std::vector<float> fixed_sizes,
                            std::vector<float> fixed_ratios,
                            std::vector<float> variances_,
                            std::vector<int> densities,
                            float step_w,
                            float step_h,
                            float offset) {
  // prepare input&output variables
  Scope scope;
  std::string input_var_name("Input");
  std::string image_var_name("Image");
  std::string boxes_var_name("Boxes");
  std::string variances_var_name("Variances");
  std::string boxes_ref_var_name("Boxes_ref");
  std::string variances_ref_var_name("Variances_ref");
  auto* input = scope.Var(input_var_name)->GetMutable<Tensor>();
  auto* image = scope.Var(image_var_name)->GetMutable<Tensor>();
  auto* boxes = scope.Var(boxes_var_name)->GetMutable<Tensor>();
  auto* variances = scope.Var(variances_var_name)->GetMutable<Tensor>();
  auto* boxes_ref = scope.Var(boxes_ref_var_name)->GetMutable<Tensor>();
  auto* variances_ref = scope.Var(variances_ref_var_name)->GetMutable<Tensor>();
  // Only the spatial dims matter for prior boxes; batch/channel are 1.
  input->Resize({1, 1, feat_h, feat_w});
  image->Resize({1, 1, img_h, img_w});
  // initialize input&output data (values are irrelevant — only shapes are
  // consumed by the prior-box computation)
  FillTensor<float>(input);
  FillTensor<float, int>(image);
  // initialize op desc
  cpp::OpDesc opdesc;
  opdesc.SetType("density_prior_box");
  opdesc.SetInput("Input", {input_var_name});
  opdesc.SetInput("Image", {image_var_name});
  opdesc.SetOutput("Boxes", {boxes_var_name});
  opdesc.SetOutput("Variances", {variances_var_name});
  opdesc.SetAttr("fixed_sizes", fixed_sizes);
  opdesc.SetAttr("fixed_ratios", fixed_ratios);
  opdesc.SetAttr("variances", variances_);
  opdesc.SetAttr("densities", densities);
  opdesc.SetAttr("offset", offset);
  opdesc.SetAttr("clip", clip);
  opdesc.SetAttr("step_w", step_w);
  opdesc.SetAttr("step_h", step_h);
  inferShape_(input, boxes, variances, fixed_ratios, densities);
  inferShape_(input, boxes_ref, variances_ref, fixed_ratios, densities);
  auto op = CreateOp<operators::DensityPriorBoxOpLite>(opdesc, &scope);
  // The reference writes into the op's OUTPUT tensors (boxes/variances);
  // stash those results into *_ref before LaunchOp overwrites them.
  prior_density_box_ref(op);
  boxes_ref->CopyDataFrom(*boxes);
  variances_ref->CopyDataFrom(*variances);
  LaunchOp(op,
           {input_var_name, image_var_name},
           {boxes_var_name, variances_var_name});
  // execute reference implementation and save to output tensor('out')
  // ===================== Trans From NHWC to NCHW ====================
  // The MLU path produces NHWC-ordered data; rearrange to NCHW so it is
  // directly comparable with the CPU reference.
  Tensor boxes_trans;
  boxes_trans.Resize(boxes->dims().Vectorize());
  transpose(boxes->mutable_data<float>(),
            boxes_trans.mutable_data<float>(),
            {static_cast<int>(boxes->dims()[0]),
             static_cast<int>(boxes->dims()[2]),
             static_cast<int>(boxes->dims()[3]),
             static_cast<int>(boxes->dims()[1])},
            {0, 3, 1, 2});
  boxes->CopyDataFrom(boxes_trans);
  Tensor vars_trans;
  vars_trans.Resize(variances->dims().Vectorize());
  transpose(variances->mutable_data<float>(),
            vars_trans.mutable_data<float>(),
            {static_cast<int>(variances->dims()[0]),
             static_cast<int>(variances->dims()[2]),
             static_cast<int>(variances->dims()[3]),
             static_cast<int>(variances->dims()[1])},
            {0, 3, 1, 2});
  variances->CopyDataFrom(vars_trans);
  // compare results
  auto* boxes_data = boxes->mutable_data<float>();
  auto* boxes_ref_data = boxes_ref->mutable_data<float>();
  auto* variances_data = variances->mutable_data<float>();
  auto* variances_ref_data = variances_ref->mutable_data<float>();
  for (int i = 0; i < variances->dims().production(); i++) {
    VLOG(6) << i;
    EXPECT_NEAR(variances_data[i], variances_ref_data[i], 1e-5);
  }
  for (int i = 0; i < boxes->dims().production(); i++) {
    VLOG(6) << i;
    EXPECT_NEAR(boxes_data[i], boxes_ref_data[i], 1e-5);
  }
}
// Smoke test for the MLU density_prior_box bridge on a square 16x16 feature
// map over a 32x32 image, one prior per ratio (all densities == 1).
TEST(MLUBridges, prior_density_box) {
  std::vector<int> input_shape = {16, 16};
  std::vector<int> image_shape = {32, 32};
  std::vector<float> fixed_sizes = {8, 16, 32};
  std::vector<float> fixed_ratios = {0.5, 1, 2};
  std::vector<int> densities = {1, 1, 1};
  std::vector<float> variances = {0.1, 0.1, 0.2, 0.2};
  bool clip = true;
  float offset = 0.5;
  float step_h = 0;
  float step_w = 0;
  // BUGFIX: the trailing arguments were previously passed as
  // (offset, step_h, step_w), binding offset=0.5 to step_w and 0 to offset.
  // The signature of test_prior_density_box ends (step_w, step_h, offset).
  test_prior_density_box(input_shape[1],
                         input_shape[0],
                         image_shape[1],
                         image_shape[0],
                         clip,
                         fixed_sizes,
                         fixed_ratios,
                         variances,
                         densities,
                         step_w,
                         step_h,
                         offset);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(density_prior_box, kMLU);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/multiclass_nms_api.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/operators/multiclass_nms_op.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
// Converts paddle's multiclass_nms op into a CNML plugin kernel invocation.
// The plugin kernel computes in the CPU element order while graph tensors
// flow in MLU (NHWC-style) order, so NdTranspose ops are inserted around the
// plugin for both inputs and the boxed output. A const "workspace" tensor is
// bound to give the kernel scratch memory.
int MulticlassNmsConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[MLU] Converting " + op_type + "...";
  auto bboxes_name = op_info->Input("BBoxes").front();
  auto scores_name = op_info->Input("Scores").front();
  auto out_name = op_info->Output("Out").front();
  auto* bboxes = scope->FindTensor(bboxes_name);
  auto* scores = scope->FindTensor(scores_name);
  auto* out = scope->FindTensor(out_name);
  auto background_label = op_info->GetAttr<int>("background_label");
  auto keep_top_k = op_info->GetAttr<int>("keep_top_k");
  auto nms_top_k = op_info->GetAttr<int>("nms_top_k");
  auto score_threshold = op_info->GetAttr<float>("score_threshold");
  auto nms_threshold = op_info->GetAttr<float>("nms_threshold");
  auto nms_eta = op_info->GetAttr<float>("nms_eta");
  // "normalized" is optional in older models; default false when absent.
  bool normalized = false;
  if (op_info->HasAttr("normalized")) {
    normalized = op_info->GetAttr<bool>("normalized");
  }
  auto bboxes_dims = bboxes->dims();
  auto scores_dims = scores->dims();
  auto batch_size = bboxes->dims()[0];
  auto num_boxes = bboxes->dims()[1];
  auto class_num = scores->dims()[1];
  // keep_top_k == -1 means "keep everything" -> bound by input box count.
  keep_top_k = keep_top_k == -1 ? num_boxes : keep_top_k;
  // NOTE(review): box_size is hard-coded to 4 corner coordinates; each
  // output row is [label, score, xmin, ymin, xmax, ymax] (box_size + 2).
  int box_size = 4;
  std::vector<int64_t> outs_shape = {batch_size, keep_top_k, box_size + 2};
  // Out must be resized here since its shape depends on the resolved
  // keep_top_k; the scope hands back a const pointer, hence the const_cast.
  const_cast<Tensor*>(out)->Resize(outs_shape);
  auto out_dims = out->dims();
  int core_num = TargetWrapperMlu::MLUCoreNumber();
  // expect {batch_size, num_boxes, box_size} in compute
  // while {batch_size, box_size,num_boxes} on mlu
  // while {batch_size, num_boxes, box_size} on cpu
  // so mlu data_flow and mlu compute layout mismatch, should set bboxes_tensor
  // as NCHW
  auto bboxes_tensor = graph->GetNode(bboxes_name);
  // expect {batch_size, class_num, num_boxes} in compute
  // while {batch_size, num_boxes,class_num } on mlu
  // while {batch_size, class_num, num_boxes} on cpu
  // so mlu data_flow and mlu compute layout mismatch, should set scores_tensor
  // as NCHW
  auto scores_tensor = graph->GetNode(scores_name);
  // expect batch_size, keep_top_k, box_size + 2 in compute
  // while batch_size, box_size + 2, keep_top_k on mlu
  // while batch_size, keep_top_k, box_size + 2 on cpu
  // so mlu data_flow and mlu compute layout mismatch, should set out_tensor as
  auto out_tensor = graph->AddNode(
      out_name, out_dims.Vectorize(), CNML_TENSOR, CNML_NCHW, graph->FPType());
  // trans bboxes {batch_size, num_boxes, box_size}
  auto bboxes_trans_tensor = graph->AddNode(bboxes_name + ".trans.bboxes",
                                            bboxes_dims.Vectorize(),
                                            CNML_TENSOR,
                                            CNML_NCHW,
                                            graph->FPType(),
                                            CNML_NCHW);
  // trans scores {batch_size, class_num, num_boxes}
  auto scores_trans_tensor = graph->AddNode(bboxes_name + ".trans.scores",
                                            scores_dims.Vectorize(),
                                            CNML_TENSOR,
                                            CNML_NCHW,
                                            graph->FPType(),
                                            CNML_NCHW);
  // trans out {batch_size, keep_top_k, box_size + 2}
  auto out_trans_tensor = graph->AddNode(out_name + ".trans.out",
                                         out_dims.Vectorize(),
                                         CNML_TENSOR,
                                         CNML_NCHW,
                                         graph->FPType(),
                                         CNML_NCHW);
  // Second kernel output: number of detections kept per batch item.
  std::string out_num_name = "nms_out_num";
  auto* out_num = scope->NewTensor(out_num_name);
  std::vector<int64_t> out_num_shape = {batch_size, 1};
  out_num->Resize(out_num_shape);
  auto num_outs_tensor = graph->AddNode(
      out_num_name, out_num_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
  bool float_precision = false;
  if (graph->FPType() == CNML_DATA_FLOAT32) {
    float_precision = true;
  }
  // Workspace sizing: bytes for min(batch, cores) parallel lanes, then
  // converted to an element count for the const tensor below.
  // NOTE(review): the 14*num_boxes + 8*class_num*num_boxes factors come from
  // the plugin kernel's internal buffers — confirm against the kernel source
  // when upgrading cnplugin.
  int64_t workspace_mem_size =
      4 * std::min(static_cast<int>(batch_size), core_num) *
      (14 * num_boxes + 8 * class_num * num_boxes);
  int64_t workspace_fp_size = workspace_mem_size / 4;
  if (!float_precision) {
    // when run as fp16, mlu size will be half of cpu size, so workspace_fp_size
    // should be double
    workspace_fp_size = workspace_mem_size / 2;
  }
  std::vector<int64_t> workspace_shape = {workspace_fp_size};
  std::string nms_workspace_name =
      "nms_workspace";  // expect only one nms in same model
  auto workspace_tensor = graph->AddNode(nms_workspace_name,
                                         workspace_shape,
                                         CNML_CONST,
                                         CNML_NCHW,
                                         graph->FPType());
  // Host-side zero-fill backing the const workspace tensor.
  std::vector<float> workspace_cpu(workspace_shape[0]);
  VLOG(6) << "workspace_shape :" << workspace_shape[0];
  // =================== Bboxes Trans ============================
  // {batch, box_size, num_boxes} -> {batch, num_boxes, box_size}
  std::vector<int> bboxes_axis = {0, 2, 1};
  cnmlBaseOp_t bboxes_trans_op{nullptr};
  cnmlNdTransposeOpParam_t bboxes_trans_param{nullptr};
  CNML_CALL(cnmlCreateNdTransposeOpParam(
      &bboxes_trans_param, bboxes_axis.data(), bboxes_axis.size()));
  CNML_CALL(cnmlCreateNdTransposeProOp(&bboxes_trans_op,
                                       bboxes_tensor->mlu_tensor(),
                                       bboxes_trans_tensor->mlu_tensor(),
                                       bboxes_trans_param));
  // =================== Bboxes Trans END ========================
  // =================== Scores Trans ============================
  // {batch, num_boxes, class_num} -> {batch, class_num, num_boxes}
  std::vector<int> scores_axis = {0, 2, 1};
  cnmlBaseOp_t scores_trans_op{nullptr};
  cnmlNdTransposeOpParam_t scores_trans_param{nullptr};
  CNML_CALL(cnmlCreateNdTransposeOpParam(
      &scores_trans_param, scores_axis.data(), scores_axis.size()));
  CNML_CALL(cnmlCreateNdTransposeProOp(&scores_trans_op,
                                       scores_tensor->mlu_tensor(),
                                       scores_trans_tensor->mlu_tensor(),
                                       scores_trans_param));
  // =================== Scores Trans END ========================
  multiclass_nms_param_t params_;
  create_multiclass_nms_param(&params_,
                              score_threshold,
                              nms_top_k,
                              keep_top_k,
                              nms_threshold,
                              normalized,
                              nms_eta,
                              background_label,
                              batch_size,
                              class_num,
                              num_boxes,
                              box_size);
  cnmlBaseOp_t multiclass_nms_op;
  create_multiclass_nms_op(&multiclass_nms_op,
                           params_,
                           bboxes_trans_tensor->mlu_tensor(),
                           scores_trans_tensor->mlu_tensor(),
                           out_trans_tensor->mlu_tensor(),
                           num_outs_tensor->mlu_tensor(),
                           workspace_tensor->mlu_tensor(),
                           float_precision);
  graph->BindConstRawData(
      nms_workspace_name, workspace_cpu.data(), workspace_cpu.size(), true);
  // =================== Out Trans ============================
  // {batch, box_size + 2, keep_top_k} -> {batch, keep_top_k, box_size + 2}
  std::vector<int> out_axis = {0, 2, 1};
  cnmlBaseOp_t out_trans_op{nullptr};
  cnmlNdTransposeOpParam_t out_trans_param{nullptr};
  CNML_CALL(cnmlCreateNdTransposeOpParam(
      &out_trans_param, out_axis.data(), out_axis.size()));
  CNML_CALL(cnmlCreateNdTransposeProOp(&out_trans_op,
                                       out_trans_tensor->mlu_tensor(),
                                       out_tensor->mlu_tensor(),
                                       out_trans_param));
  // =================== Out Trans END ========================
  // =================== DEBUG ====================
  VLOG(6) << "bboxes_name: " << bboxes_name;
  VLOG(6) << "scores_name: " << scores_name;
  VLOG(6) << "out_name: " << out_name;
  VLOG(6) << "background_label: " << background_label;
  VLOG(6) << "keep_top_k: " << keep_top_k;
  VLOG(6) << "nms_top_k: " << nms_top_k;
  VLOG(6) << "score_threshold: " << score_threshold;
  VLOG(6) << "nms_threshold: " << nms_threshold;
  VLOG(6) << "nms_eta: " << nms_eta;
  VLOG(6) << "normalized: " << normalized;
  VLOG(6) << "bboxes_dims: " << bboxes_dims;
  VLOG(6) << "scores_dims: " << scores_dims;
  VLOG(6) << "out_dims: " << out_dims;
  VLOG(6) << "out_dims: " << out->dims();
  VLOG(6) << "batch_size: " << batch_size;
  VLOG(6) << "num_boxes : " << num_boxes;
  VLOG(6) << "class_num: " << class_num;
  // =================== DEBUG END ================
  graph->FuseOp(bboxes_trans_op);
  graph->FuseOp(scores_trans_op);
  graph->FuseOp(multiclass_nms_op);
  graph->FuseOp(out_trans_op);
  return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(multiclass_nms,
kMLU,
paddle::lite::subgraph::mlu::MulticlassNmsConverter);
// Copyright (c) 2020 smarsu. All Rights Reserved.
#include "lite/kernels/mlu/bridges/multiclass_nms_api.h"
#include <cnml.h>
#include <cnrt.h>
#include <stdlib.h>
#include <fstream>
#include <iostream>
#include <vector>
// Entry point of the multiclass_nms BANG plugin kernel, implemented outside
// this translation unit (C linkage). It is passed to cnmlCreatePluginOp in
// create_multiclass_nms_op below.
// NOTE(review): this parameter order must stay in sync with the order in
// which create_multiclass_nms_op marks/adds entries to the
// cnrtKernelParamsBuffer — verify both sides when changing either.
extern "C" {
void multiclass_nms_paddle_entry(void *bboxes,
                                 void *scores,
                                 void *outs,
                                 void *num_outs,
                                 float score_threshold,
                                 int nms_top_k,
                                 int keep_top_k,
                                 float nms_threshold,
                                 bool normalized,
                                 float nms_eta,
                                 int background_label,
                                 int batch_size,
                                 int class_num,
                                 int num_boxes,
                                 int box_size,
                                 void *work_space,
                                 DataType data_type);
}  // extern "C"
// Heap-allocates a multiclass_nms_param and copies every NMS attribute into
// it. Ownership passes to the caller via *params_ptr; release with
// destory_multiclass_nms_param().
void create_multiclass_nms_param(multiclass_nms_param_t *params_ptr,
                                 float score_threshold,
                                 int nms_top_k,
                                 int keep_top_k,
                                 float nms_threshold,
                                 bool normalized,
                                 float nms_eta,
                                 int background_label,
                                 int batch_size,
                                 int class_num,
                                 int num_boxes,
                                 int box_size) {
  auto p = reinterpret_cast<multiclass_nms_param_t>(
      malloc(sizeof(struct multiclass_nms_param)));
  p->score_threshold = score_threshold;
  p->nms_top_k = nms_top_k;
  p->keep_top_k = keep_top_k;
  p->nms_threshold = nms_threshold;
  p->normalized = normalized;
  p->nms_eta = nms_eta;
  p->background_label = background_label;
  p->batch_size = batch_size;
  p->class_num = class_num;
  p->num_boxes = num_boxes;
  p->box_size = box_size;
  *params_ptr = p;
}
// Releases a param struct created by create_multiclass_nms_param.
// BUGFIX: also nulls the caller's handle so the pointer cannot dangle and a
// repeated call becomes a safe no-op (previously it double-freed).
void destory_multiclass_nms_param(multiclass_nms_param_t *params) {
  if (*params != NULL) {
    free(*params);
    *params = NULL;
  }
}
// Builds the CNML plugin op wrapping multiclass_nms_paddle_entry.
// Marks 2 inputs (bboxes, scores), 2 outputs (outs, num_outs) and 1 static
// tensor (workspace), then appends the scalar kernel arguments. The order of
// marks/adds here must mirror the parameter order of
// multiclass_nms_paddle_entry exactly.
// NOTE(review): the cnrt/cnml return codes are not checked here, unlike the
// CNML_CALL-wrapped calls elsewhere in this file — consider wrapping them.
int create_multiclass_nms_op(cnmlBaseOp_t *op_ptr,
                             multiclass_nms_param_t nms_param,
                             cnmlTensor_t bboxes,
                             cnmlTensor_t scores,
                             cnmlTensor_t outs,
                             cnmlTensor_t num_outs,
                             cnmlTensor_t workspace_tensor,
                             bool float_precision) {
  DataType data_type = kFloat16;
  if (float_precision) {
    data_type = kFloat32;
  }
  // Mutates the shared param struct: resolve keep_top_k == -1 ("keep all")
  // to the total number of boxes.
  if (nms_param->keep_top_k == -1) {
    nms_param->keep_top_k = nms_param->num_boxes;
  }
  cnrtKernelParamsBuffer_t params;
  cnrtGetKernelParamsBuffer(&params);
  // Two input tensors, two output tensors (positional placeholders).
  cnrtKernelParamsBufferMarkInput(params);
  cnrtKernelParamsBufferMarkInput(params);
  cnrtKernelParamsBufferMarkOutput(params);
  cnrtKernelParamsBufferMarkOutput(params);
  // Scalar arguments, in kernel-signature order.
  cnrtKernelParamsBufferAddParam(
      params, &nms_param->score_threshold, sizeof(float));
  cnrtKernelParamsBufferAddParam(params, &nms_param->nms_top_k, sizeof(int));
  cnrtKernelParamsBufferAddParam(params, &nms_param->keep_top_k, sizeof(int));
  cnrtKernelParamsBufferAddParam(
      params, &nms_param->nms_threshold, sizeof(float));
  cnrtKernelParamsBufferAddParam(params, &nms_param->normalized, sizeof(bool));
  cnrtKernelParamsBufferAddParam(params, &nms_param->nms_eta, sizeof(float));
  cnrtKernelParamsBufferAddParam(
      params, &nms_param->background_label, sizeof(int));
  cnrtKernelParamsBufferAddParam(params, &nms_param->batch_size, sizeof(int));
  cnrtKernelParamsBufferAddParam(params, &nms_param->class_num, sizeof(int));
  cnrtKernelParamsBufferAddParam(params, &nms_param->num_boxes, sizeof(int));
  cnrtKernelParamsBufferAddParam(params, &nms_param->box_size, sizeof(int));
  // The workspace is passed as a static tensor rather than a raw pointer.
  cnrtKernelParamsBufferMarkStatic(params);
  cnrtKernelParamsBufferAddParam(params, &data_type, sizeof(DataType));
  cnmlTensor_t input_tensors[2];
  input_tensors[0] = bboxes;
  input_tensors[1] = scores;
  cnmlTensor_t output_tensors[2];
  output_tensors[0] = outs;
  output_tensors[1] = num_outs;
  cnmlTensor_t static_tensors[1];
  static_tensors[0] = workspace_tensor;
  cnmlCreatePluginOp(op_ptr,
                     "multiclass_nms_paddle",
                     reinterpret_cast<void *>(multiclass_nms_paddle_entry),
                     params,
                     input_tensors,
                     2,
                     output_tensors,
                     2,
                     static_tensors,
                     1);
  // The params buffer is copied by cnmlCreatePluginOp; safe to destroy here.
  cnrtDestroyKernelParamsBuffer(params);
  return 0;
}
// Copyright (c) 2020 smarsu. All Rights Reserved.
#ifndef LITE_KERNELS_MLU_BRIDGES_MULTICLASS_NMS_API_H_
#define LITE_KERNELS_MLU_BRIDGES_MULTICLASS_NMS_API_H_
// #define ALIGN_UP(a, b) (((a) + (b) - 1) / (b) * (b))
// #define ALIGN_DN(a, b) ((a) / (b) * (b))
// #define DIV_UP(a, b) (((a) + (b) - 1) / (b))
// #define DIV_DN(a, b) ((a) / (b))
// #define MAX(a, b) ((a) >= (b) ? (a) : (b))
// #define MIN(a, b) ((a) <= (b) ? (a) : (b))
// #define ABS(a) (((a) > 0) ? (a) : (-(a)))
// #define INIFITE 0x7F800000
#include <cnml.h>
#include <cnrt.h>
// Element type tag passed through the kernel params buffer to the plugin
// kernel (only kFloat32/kFloat16 are produced by this bridge).
enum DataType {
  kInvalid,
  kFloat32,
  kFloat16,
  kUint8,
  kInt8,
  kInt16,
  kInt32,
};
// Strategy hint for splitting top-k work.
// NOTE(review): not referenced by the code visible in this file — presumably
// consumed by other cnplugin wrappers; confirm before removing.
enum TopkSplitStrategy {
  kAuto,
  kSplitN,
  kSplitC,
};
// Image channel layouts.
// NOTE(review): also unreferenced in this file — likely shared plugin API.
enum ColorType {
  kGray,
  kRGB,
  kBGR,
  kRGBA,
};
// Attribute bundle for the multiclass_nms plugin kernel; filled from the
// paddle op's attributes by create_multiclass_nms_param and consumed by
// create_multiclass_nms_op when packing the kernel params buffer.
struct multiclass_nms_param {
  float score_threshold;  // drop detections scored at or below this
  int nms_top_k;          // pre-NMS candidate cap per class
  int keep_top_k;         // post-NMS cap (-1 resolved to num_boxes)
  float nms_threshold;    // IoU threshold for suppression
  bool normalized;        // whether box coords are in [0, 1]
  float nms_eta;          // adaptive-threshold decay factor
  int background_label;   // class index to skip
  int batch_size;
  int class_num;
  int num_boxes;
  int box_size;  // coordinates per box (4 in this bridge)
};
// Heap-allocated handle type used across the C-style API below.
typedef struct multiclass_nms_param *multiclass_nms_param_t;
// Allocates a multiclass_nms_param on the heap, fills it with the given NMS
// attributes and returns it via *params_ptr (caller owns it).
void create_multiclass_nms_param(multiclass_nms_param_t *params_ptr,
                                 float score_threshold,
                                 int nms_top_k,
                                 int keep_top_k,
                                 float nms_threshold,
                                 bool normalized,
                                 float nms_eta,
                                 int background_label,
                                 int batch_size,
                                 int class_num,
                                 int num_boxes,
                                 int box_size);
// Frees a param struct created by create_multiclass_nms_param.
// (Name keeps the historical "destory" spelling for ABI/source compat.)
void destory_multiclass_nms_param(multiclass_nms_param_t *params);
// Creates the CNML plugin op wrapping the multiclass_nms BANG kernel.
// `workspace_tensor` is bound as a static (const) scratch tensor; returns 0.
int create_multiclass_nms_op(cnmlBaseOp_t *op_ptr,
                             multiclass_nms_param_t nms_param,
                             cnmlTensor_t bboxes,
                             cnmlTensor_t scores,
                             cnmlTensor_t outs,
                             cnmlTensor_t num_outs,
                             cnmlTensor_t workspace_tensor,
                             bool float_precision);
#endif // LITE_KERNELS_MLU_BRIDGES_MULTICLASS_NMS_API_H_
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/multiclass_nms_op.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
// Generates `box_num` pseudo-random boxes as a flat [xmin, ymin, xmax, ymax]
// stream, clamped to the image rectangle. The seed is fixed, so the output
// is fully deterministic across calls.
std::vector<float> gen_random_boxes(int box_num, int img_w, int img_h) {
  unsigned int seed = 1;  // fixed seed -> reproducible test data
  auto next_uniform = [&seed](int scale) {
    return rand_r(&seed) / static_cast<double>(RAND_MAX) * scale;
  };
  std::vector<float> boxes;
  boxes.reserve(static_cast<size_t>(box_num) * 4);
  for (int i = 0; i < box_num; ++i) {
    // Sampling order (x, w, y, h) is significant for reproducibility.
    const float x = next_uniform(img_w);
    const float w = next_uniform(img_w);
    const float y = next_uniform(img_h);
    const float h = next_uniform(img_h);
    // Corners are (center±extent)/2, clamped into the image.
    boxes.push_back(std::max(0.0f, (x - w) / 2));
    boxes.push_back(std::max(0.0f, (y - h) / 2));
    boxes.push_back(std::min(static_cast<float>(img_w), (x + w) / 2));
    boxes.push_back(std::min(static_cast<float>(img_h), (y + h) / 2));
  }
  return boxes;
}
// Generates box_num * class_num pseudo-random scores in [0, 1], row-major by
// box. Deterministic (fixed seed).
std::vector<float> gen_random_scores(int box_num, int class_num) {
  std::vector<float> scores;
  scores.reserve(static_cast<size_t>(box_num) * class_num);
  unsigned int seed = 1;
  for (int i = 0; i < box_num; i++) {
    // BUGFIX: the inner loop previously shadowed the outer index `i` (and
    // compared unsigned size_t against signed int); use a distinct index.
    for (int j = 0; j < class_num; j++) {
      scores.push_back(rand_r(&seed) / static_cast<double>(RAND_MAX));
    }
  }
  return scores;
}
// Area of an [xmin, ymin, xmax, ymax] box. Aborts (CHECK) on degenerate or
// inverted boxes, so only strictly positive areas are returned.
float Area(float box[4]) {
  const float xmin = box[0];
  const float ymin = box[1];
  const float xmax = box[2];
  const float ymax = box[3];
  CHECK(xmax > xmin) << "xmax: " << xmax << " xmin: " << xmin;
  CHECK(ymax > ymin) << "ymax: " << ymax << " ymin: " << ymin;
  return (xmax - xmin) * (ymax - ymin);
}
// 1-D overlap of intervals [min1, max1] and [min2, max2]. Negative when the
// intervals are disjoint (the result is then minus the gap size).
float overlap(float min1, float max1, float min2, float max2) {
  const float total_span = std::max(max2, max1) - std::min(min1, min2);
  return ((max1 - min1) + (max2 - min2)) - total_span;
}
// Intersection area of two [xmin, ymin, xmax, ymax] boxes; 0 when disjoint.
// BUGFIX: each 1-D overlap is clamped at 0 BEFORE multiplying. Previously,
// boxes disjoint in both axes produced two negative overlaps whose product
// was a spurious positive "intersection" that survived the final max().
// (The 1-D overlap formula is inlined from overlap() above.)
float IntersectionArea(float box1[4], float box2[4]) {
  const float x_overlap =
      std::max(0.0f,
               (box1[2] - box1[0]) + (box2[2] - box2[0]) -
                   (std::max(box2[2], box1[2]) - std::min(box1[0], box2[0])));
  const float y_overlap =
      std::max(0.0f,
               (box1[3] - box1[1]) + (box2[3] - box2[1]) -
                   (std::max(box2[3], box1[3]) - std::min(box1[1], box2[1])));
  return x_overlap * y_overlap;
}
float IOU(float box1[4], float box2[4]) {
float area1 = Area(box1);
float area2 = Area(box2);
float intersection_area = IntersectionArea(box1, box2);
float union_area = area1 + area2 - intersection_area;
return intersection_area / union_area;
}
// Writes each element of `vec` to `filename`, one per line. Fatal if the
// file cannot be opened for writing.
template <typename T>
void VecToFile(const std::vector<T>& vec, std::string filename) {
  std::ofstream f(filename, std::ios::out);
  if (!f) {
    // BUGFIX: message previously read "<file>not exist!" — an ofstream open
    // failure means the file could not be created/opened for writing.
    LOG(FATAL) << "can not open " << filename << " to write" << std::endl;
  }
  for (size_t i = 0; i < vec.size(); i++) {
    f << vec[i] << std::endl;
  }
  f.close();
}
// Writes `size` elements of `data` to `filename`, one per line. Fatal if the
// file cannot be opened for writing.
template <typename T>
void ArrayToFile(const T* data, int size, std::string filename) {
  std::ofstream f(filename, std::ios::out);
  if (!f) {
    // BUGFIX: message previously read "<file>not exist!" — an ofstream open
    // failure means the file could not be created/opened for writing.
    LOG(FATAL) << "can not open " << filename << " to write" << std::endl;
  }
  // BUGFIX: loop index was size_t compared against a signed int `size`.
  for (int i = 0; i < size; i++) {
    f << data[i] << std::endl;
  }
  f.close();
}
// Dumps a tensor's contents to `file_name`, one value per line (buffered in
// memory first, then written in one shot).
void ToFile(Tensor* tensor, std::string file_name) {
  const int count = tensor->dims().production();
  const auto* data = tensor->mutable_data<float>();
  std::ostringstream buffer;
  for (int i = 0; i < count; ++i) {
    buffer << data[i] << std::endl;
  }
  std::ofstream of(file_name, std::ios::out);
  of << buffer.str();
  of.close();
}
// Fills `tensor` with whitespace-separated floats read from `file_name`.
// The tensor must already be resized; fatal if the file cannot be opened.
void FromFile(Tensor* tensor, std::string file_name) {
  LOG(INFO) << " from file:" << file_name << std::endl;
  std::ifstream f;
  f.open(file_name, std::ios::in);
  if (f.good()) {
    // BUGFIX: index was size_t compared against the signed production().
    for (int64_t i = 0; i < tensor->dims().production(); i++) {
      f >> tensor->mutable_data<float>()[i];
    }
  } else {
    // BUGFIX: message previously rendered as "...open <file>to read" —
    // missing the separating space.
    LOG(FATAL) << "can not open " << file_name << " to read" << std::endl;
  }
  f.close();
}
// Ordering predicate for (score, payload) pairs: true when `lhs` should come
// before `rhs`, i.e. sorts by strictly descending score.
template <typename dtype>
static bool sort_score_pair_descend(const std::pair<float, dtype>& lhs,
                                    const std::pair<float, dtype>& rhs) {
  return lhs.first > rhs.first;
}
// Collects (score, index) pairs for all scores strictly above `threshold`,
// stably sorted by descending score, truncated to at most `top_k` entries
// (top_k < 0 keeps everything).
template <typename dtype>
void get_max_score_index(const dtype* scores,
                         int num,
                         float threshold,
                         int top_k,
                         std::vector<std::pair<dtype, int>>* score_index_vec) {
  //! Generate index score pairs.
  for (int i = 0; i < num; ++i) {
    if (scores[i] > threshold) {
      score_index_vec->push_back(std::make_pair(scores[i], i));
    }
  }
  //! Sort the score pair according to the scores in descending order.
  // BUGFIX: previously sorted with sort_score_pair_descend<int>, whose
  // signature is pair<float, dtype> — it only type-checks against this
  // vector's pair<dtype, int> when dtype == float. Compare generically.
  std::stable_sort(score_index_vec->begin(),
                   score_index_vec->end(),
                   [](const std::pair<dtype, int>& a,
                      const std::pair<dtype, int>& b) {
                     return a.first > b.first;
                   });
  //! Keep top_k scores if needed (cast avoids signed/unsigned comparison;
  //! top_k is known non-negative inside this branch).
  if (top_k > -1 && static_cast<size_t>(top_k) < score_index_vec->size()) {
    score_index_vec->resize(top_k);
  }
}
// Area of a box stored as [xmin, ymin, xmax, ymax].
// Returns 0 for degenerate boxes (xmax < xmin or ymax < ymin). When
// `normalized` is false the box is in pixel coordinates, so each side
// spans one extra unit: (w + 1) * (h + 1).
template <typename dtype>
dtype bbox_size(const dtype* bbox, bool normalized = true) {
  // Degenerate box (e.g. xmax < xmin or ymax < ymin) -> zero area.
  if (bbox[2] < bbox[0] || bbox[3] < bbox[1]) {
    return dtype(0.);
  }
  const dtype w = bbox[2] - bbox[0];
  const dtype h = bbox[3] - bbox[1];
  return normalized ? w * h : (w + 1) * (h + 1);
}
// Jaccard (IoU) overlap of two boxes stored as [xmin, ymin, xmax, ymax]:
// intersection area divided by union area. Returns 0 when the boxes do
// not intersect. Areas come from bbox_size() with its default
// normalized=true behavior.
template <typename dtype>
dtype jaccard_overlap(const dtype* bbox1, const dtype* bbox2) {
  const bool disjoint = bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] ||
                        bbox2[1] > bbox1[3] || bbox2[3] < bbox1[1];
  if (disjoint) {
    return dtype(0.);
  }
  const dtype ix_min = std::max(bbox1[0], bbox2[0]);
  const dtype iy_min = std::max(bbox1[1], bbox2[1]);
  const dtype ix_max = std::min(bbox1[2], bbox2[2]);
  const dtype iy_max = std::min(bbox1[3], bbox2[3]);
  const dtype inter_area = (ix_max - ix_min) * (iy_max - iy_min);
  const dtype union_area =
      bbox_size(bbox1) + bbox_size(bbox2) - inter_area;
  return inter_area / union_area;
}
// Greedy non-maximum suppression over one class of boxes.
// Boxes are stored 4 floats each at bboxes[idx * 4]. Candidates are the
// (at most top_k) indices whose score exceeds score_threshold, visited in
// descending-score order; a candidate is kept unless it overlaps an
// already-kept box by more than the (possibly shrinking) threshold.
// When eta < 1, the threshold is multiplied by eta after each kept box
// while it stays above 0.5 (adaptive NMS).
template <typename dtype>
void apply_nms_fast(const dtype* bboxes,
                    const dtype* scores,
                    int num,
                    float score_threshold,
                    float nms_threshold,
                    float eta,
                    int top_k,
                    std::vector<int>* indices) {
  // Candidate (score, index) pairs, best score first.
  std::vector<std::pair<dtype, int>> candidates;
  get_max_score_index(scores, num, score_threshold, top_k, &candidates);
  indices->clear();
  float adaptive_threshold = nms_threshold;
  while (!candidates.empty()) {
    const int idx = candidates.front().second;
    bool suppressed = false;
    for (size_t k = 0; k < indices->size(); ++k) {
      const int kept_idx = (*indices)[k];
      const float overlap =
          jaccard_overlap(bboxes + idx * 4, bboxes + kept_idx * 4);
      if (overlap > adaptive_threshold) {
        suppressed = true;
        break;
      }
    }
    if (!suppressed) {
      indices->push_back(idx);
    }
    candidates.erase(candidates.begin());
    // Adaptive-NMS: shrink the threshold after each kept box.
    if (!suppressed && eta < 1 && adaptive_threshold > 0.5) {
      adaptive_threshold *= eta;
    }
  }
}
// CPU reference implementation of multiclass NMS, used to validate the MLU
// bridge output in test_multiclass_nms.
//
// param:          op attributes and input tensors (bboxes, scores,
//                 thresholds, top-k limits).
// class_num:      number of classes in the score tensor.
// priors:         number of prior boxes for each image in the batch
//                 (one entry per image).
// share_location: true when all classes share one set of box predictions;
//                 false when each class has its own 4 coordinates per prior.
// result:         output; 6 floats per kept detection:
//                 [label, score, xmin, ymin, xmax, ymax]. When nothing is
//                 kept, result holds the single value -1.
template <typename dtype>
void multiclass_nms_compute_ref(const operators::MulticlassNmsParam& param,
                                int class_num,
                                const std::vector<int>& priors,
                                bool share_location,
                                std::vector<float>* result) {
  int background_id = param.background_label;
  int keep_topk = param.keep_top_k;
  int nms_topk = param.nms_top_k;
  float conf_thresh = param.score_threshold;
  float nms_thresh = param.nms_threshold;
  float nms_eta = param.nms_eta;
  const dtype* bbox_data = param.bboxes->data<const dtype>();
  const dtype* conf_data = param.scores->data<const dtype>();
  (*result).clear();
  // Pass 1: run per-class NMS for every image and record the kept prior
  // indices per class, then optionally trim to keep_topk per image.
  int num_kept = 0;
  std::vector<std::map<int, std::vector<int>>> all_indices;
  // Running offsets (in priors) into the score and box buffers.
  int64_t conf_offset = 0;
  int64_t bbox_offset = 0;
  for (int i = 0; i < priors.size(); ++i) {
    // kept indices for this image, keyed by class label
    std::map<int, std::vector<int>> indices;
    int num_det = 0;
    int num_priors = priors[i];
    // Scores are laid out per image as class_num * num_priors values;
    // boxes as num_priors * 4 (shared) or num_priors * 4 * class_num.
    int conf_idx = class_num * conf_offset;
    int bbox_idx =
        share_location ? bbox_offset * 4 : bbox_offset * 4 * class_num;
    for (int c = 0; c < class_num; ++c) {
      if (c == background_id) {
        // Ignore background class
        continue;
      }
      const dtype* cur_conf_data = conf_data + conf_idx + c * num_priors;
      const dtype* cur_bbox_data = bbox_data + bbox_idx;
      if (!share_location) {
        // Per-class boxes: advance to this class's coordinate block.
        cur_bbox_data += c * num_priors * 4;
      }
      apply_nms_fast(cur_bbox_data,
                     cur_conf_data,
                     num_priors,
                     conf_thresh,
                     nms_thresh,
                     nms_eta,
                     nms_topk,
                     &(indices[c]));
      num_det += indices[c].size();
    }
    if (keep_topk > -1 && num_det > keep_topk) {
      // More detections than keep_topk across all classes: keep only the
      // keep_topk highest-scoring (label, prior index) pairs for this image.
      std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
      for (auto it = indices.begin(); it != indices.end(); ++it) {
        int label = it->first;
        const std::vector<int>& label_indices = it->second;
        for (int j = 0; j < label_indices.size(); ++j) {
          int idx = label_indices[j];
          float score = conf_data[conf_idx + label * num_priors + idx];
          score_index_pairs.push_back(
              std::make_pair(score, std::make_pair(label, idx)));
        }
      }
      // Keep top k results per image.
      std::stable_sort(score_index_pairs.begin(),
                       score_index_pairs.end(),
                       sort_score_pair_descend<std::pair<int, int>>);
      score_index_pairs.resize(keep_topk);
      // Store the new indices.
      std::map<int, std::vector<int>> new_indices;
      for (int j = 0; j < score_index_pairs.size(); ++j) {
        int label = score_index_pairs[j].second.first;
        int idx = score_index_pairs[j].second.second;
        new_indices[label].push_back(idx);
      }
      all_indices.push_back(new_indices);
      num_kept += keep_topk;
    } else {
      all_indices.push_back(indices);
      num_kept += num_det;
    }
    conf_offset += num_priors;
    bbox_offset += num_priors;
  }
  if (num_kept == 0) {
    // No detections at all: signal with a single -1 entry.
    (*result).clear();
    (*result).resize(1);
    (*result)[0] = -1;
    return;
  } else {
    (*result).resize(num_kept * 6);
  }
  // Pass 2: materialize the kept detections into the flat result buffer,
  // 6 floats per detection: [label, score, xmin, ymin, xmax, ymax].
  int count = 0;
  conf_offset = 0;
  bbox_offset = 0;
  for (int i = 0; i < priors.size(); ++i) {
    int num_priors = priors[i];
    int conf_idx = class_num * conf_offset;
    int bbox_idx =
        share_location ? bbox_offset * 4 : bbox_offset * 4 * class_num;
    for (auto it = all_indices[i].begin(); it != all_indices[i].end(); ++it) {
      int label = it->first;
      std::vector<int>& indices = it->second;
      const dtype* cur_conf_data = conf_data + conf_idx + label * num_priors;
      const dtype* cur_bbox_data = bbox_data + bbox_idx;
      if (!share_location) {
        cur_bbox_data += label * num_priors * 4;
      }
      for (int j = 0; j < indices.size(); ++j) {
        int idx = indices[j];
        (*result)[count * 6] = label;
        (*result)[count * 6 + 1] = cur_conf_data[idx];
        for (int k = 0; k < 4; ++k) {
          (*result)[count * 6 + 2 + k] = cur_bbox_data[idx * 4 + k];
        }
        ++count;
      }
    }
    conf_offset += num_priors;
    bbox_offset += num_priors;
  }
}
void test_multiclass_nms(float score_threshold,
int nms_top_k,
int keep_top_k,
float nms_threshold,
bool normalized,
float nms_eta,
int background_label,
int batch_size,
int class_num,
int num_boxes,
int box_size,
int core_num) {
// prepare input&output variables
Scope scope;
std::string bboxes_var_name = "BBoxes";
std::string scores_var_name = "Scores";
std::string out_var_name = "Out";
std::string out_num_var_name =
"nms_out_num"; // must be this name,corespond with
// lite/operators/multiclass_nms_op.cc
auto* bboxes = scope.Var(bboxes_var_name)->GetMutable<Tensor>();
auto* scores = scope.Var(scores_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_num = scope.Var(out_num_var_name)->GetMutable<Tensor>();
std::vector<int64_t> bboxes_shape = {batch_size, num_boxes, box_size};
std::vector<int64_t> scores_shape = {batch_size, class_num, num_boxes};
std::vector<int64_t> out_num_shape = {batch_size};
bboxes->Resize(bboxes_shape);
scores->Resize(scores_shape);
out_num->Resize(out_num_shape);
std::vector<float> bboxes_vec = gen_random_boxes(num_boxes, 1024, 1024);
std::vector<float> scores_vec = gen_random_scores(num_boxes, class_num);
for (size_t i = 1; i < bboxes_vec.size(); i++) {
bboxes->mutable_data<float>()[i] = bboxes_vec[i];
}
for (size_t i = 1; i < scores_vec.size(); i++) {
scores->mutable_data<float>()[i] = scores_vec[i];
}
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("multiclass_nms");
opdesc.SetInput("BBoxes", {bboxes_var_name});
opdesc.SetInput("Scores", {scores_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("background_label", background_label);
opdesc.SetAttr("keep_top_k", keep_top_k);
opdesc.SetAttr("nms_top_k", nms_top_k);
opdesc.SetAttr("score_threshold", score_threshold);
opdesc.SetAttr("nms_threshold", nms_threshold);
opdesc.SetAttr("nms_eta", nms_eta);
opdesc.SetAttr("normalized", normalized);
auto op = CreateOp<operators::MulticlassNmsOpLite>(opdesc, &scope);
// out_ref->CopyDataFrom(*out);
operators::MulticlassNmsParam param;
auto bboxes_name = opdesc.Input("BBoxes").front();
auto scores_name = opdesc.Input("Scores").front();
auto out_name = opdesc.Output("Out").front();
std::vector<std::string> output_arg_names = opdesc.OutputArgumentNames();
param.bboxes = bboxes;
param.scores = scores;
param.out = out;
param.background_label = opdesc.GetAttr<int>("background_label");
param.keep_top_k = opdesc.GetAttr<int>("keep_top_k");
param.nms_top_k = opdesc.GetAttr<int>("nms_top_k");
param.score_threshold = opdesc.GetAttr<float>("score_threshold");
param.nms_threshold = opdesc.GetAttr<float>("nms_threshold");
param.nms_eta = opdesc.GetAttr<float>("nms_eta");
if (opdesc.HasAttr("normalized")) {
param.normalized = opdesc.GetAttr<bool>("normalized");
}
const std::vector<int>& priors = {num_boxes}; // batch_size
std::vector<float> result;
multiclass_nms_compute_ref<float>(param, class_num, priors, true, &result);
// trans
Tensor bboxes_trans;
bboxes_trans.Resize({bboxes->dims()});
transpose(bboxes->mutable_data<float>(),
bboxes_trans.mutable_data<float>(),
{static_cast<int>(bboxes->dims()[0]),
static_cast<int>(bboxes->dims()[1]),
static_cast<int>(bboxes->dims()[2])},
{0, 2, 1});
bboxes->CopyDataFrom(bboxes_trans);
Tensor scores_trans;
scores_trans.Resize({scores->dims()});
transpose(scores->mutable_data<float>(),
scores_trans.mutable_data<float>(),
{static_cast<int>(scores->dims()[0]),
static_cast<int>(scores->dims()[1]),
static_cast<int>(scores->dims()[2])},
{0, 2, 1});
scores->CopyDataFrom(scores_trans);
LaunchOp(
op, {bboxes_var_name, scores_var_name}, {out_var_name, out_num_var_name});
// ToFile(out, "nms_out_mlu_before_trans.txt");
// out trans
Tensor out_trans;
out_trans.Resize({out->dims()});
transpose(out->mutable_data<float>(),
out_trans.mutable_data<float>(),
{static_cast<int>(out->dims()[0]),
static_cast<int>(out->dims()[2]),
static_cast<int>(out->dims()[1])}, // 0 2 1 on mlu
{0, 2, 1});
out->CopyDataFrom(out_trans);
// ToFile(out, "nms_out_mlu.txt");
// ToFile(out_num, "nms_out_num_mlu.txt");
// VecToFile(result, "nms_out_cpu.txt");
// auto out_data = out->mutable_data<float>();
int num_box = out->dims()[1];
int match_count = 0;
std::vector<int> matched_cpu_index;
for (int i = 0; i < num_box; i++) {
float mlu_box[4];
mlu_box[0] = out->mutable_data<float>()[i * 6 + 2];
mlu_box[1] = out->mutable_data<float>()[i * 6 + 3];
mlu_box[2] = out->mutable_data<float>()[i * 6 + 4];
mlu_box[3] = out->mutable_data<float>()[i * 6 + 5];
bool match = false;
for (size_t j = 0; j < num_box; j++) {
// if j th cpu box has matched some mlu box, do not use if to match other
// mlu box
if (std::find(std::begin(matched_cpu_index),
std::end(matched_cpu_index),
j) != std::end(matched_cpu_index)) {
continue;
}
float cpu_box[4];
cpu_box[0] = result[j * 6 + 2];
cpu_box[1] = result[j * 6 + 3];
cpu_box[2] = result[j * 6 + 4];
cpu_box[3] = result[j * 6 + 5];
if (IOU(mlu_box, cpu_box) >= 0.9) {
match = true;
matched_cpu_index.push_back(j);
break;
}
}
if (match) {
match_count += 1;
}
}
EXPECT_NEAR(match_count, num_box, 0);
}
// End-to-end MLU bridge test for multiclass_nms with SSD-like settings
// (80 classes, 22743 priors, keep top 100).
TEST(MLUBridges, multiclass_nms) {
  int background_label = -1;
  int keep_top_k = 100;
  int nms_top_k = 1000;
  float score_threshold = 0.01;
  float nms_threshold = 0.45;
  // Fix: these were declared `int nms_eta = 1` and `bool normalized = 0`;
  // test_multiclass_nms takes a float and a bool, so use the proper types
  // and literals instead of relying on implicit conversions.
  float nms_eta = 1.0f;
  bool normalized = false;
  int batch_size = 1;
  int num_boxes = 22743;
  int class_num = 80;
  int core_num = 4;
  int box_size = 4;
  test_multiclass_nms(score_threshold,
                      nms_top_k,
                      keep_top_k,
                      nms_threshold,
                      normalized,
                      nms_eta,
                      background_label,
                      batch_size,
                      class_num,
                      num_boxes,
                      box_size,
                      core_num);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(multiclass_nms, kMLU)
...@@ -43,6 +43,9 @@ USE_SUBGRAPH_BRIDGE(flatten, kMLU); ...@@ -43,6 +43,9 @@ USE_SUBGRAPH_BRIDGE(flatten, kMLU);
USE_SUBGRAPH_BRIDGE(flatten2, kMLU); USE_SUBGRAPH_BRIDGE(flatten2, kMLU);
USE_SUBGRAPH_BRIDGE(reshape, kMLU); USE_SUBGRAPH_BRIDGE(reshape, kMLU);
USE_SUBGRAPH_BRIDGE(reshape2, kMLU); USE_SUBGRAPH_BRIDGE(reshape2, kMLU);
USE_SUBGRAPH_BRIDGE(multiclass_nms, kMLU);
USE_SUBGRAPH_BRIDGE(density_prior_box, kMLU);
USE_SUBGRAPH_BRIDGE(box_coder, kMLU);
#ifdef LITE_BUILD_EXTRA #ifdef LITE_BUILD_EXTRA
USE_SUBGRAPH_BRIDGE(gather, kMLU); USE_SUBGRAPH_BRIDGE(gather, kMLU);
USE_SUBGRAPH_BRIDGE(lrn, kMLU) USE_SUBGRAPH_BRIDGE(lrn, kMLU)
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <cnml.h> #include <cnml.h>
#include <cnplugin.h>
#include <cnrt.h> #include <cnrt.h>
#include <memory> #include <memory>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册