Commit 92d4172a authored by Santa An, committed by GitHub

[LITE][BM] support hd all models,test=develop (#3540)

- fix reshape infer-shape issue
- support adaptive pooling (adaptive pool2); optimize global pooling
- multi-threading OK
- support the faceboxes and behavior-image models
- implement BM device info
- multi-device OK; support multiple cards
- support EfficientNet
Parent 4f983d86
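A minimal usage sketch of the per-card device selection this commit introduces; the class and method names come from the diff below, while the header paths, logging macros, and the caller function are assumptions:

// Hypothetical caller, not part of this commit.
#include "lite/backends/bm/target_wrapper.h"  // TargetWrapperBM (path assumed)
#include "lite/utils/cp_logging.h"            // CHECK / CHECK_EQ (path assumed)

void BindToBmCard(int card_id) {
  using paddle::lite::TargetWrapperBM;
  // num_devices() wraps bm_dev_getcount(); SetDevice() validates the id and
  // caches a bm_handle_t for the selected card.
  CHECK(card_id >= 0 &&
        static_cast<size_t>(card_id) < TargetWrapperBM::num_devices())
      << "invalid BM card id " << card_id;
  TargetWrapperBM::SetDevice(card_id);
  CHECK_EQ(TargetWrapperBM::GetDevice(), card_id);
}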
@@ -24,16 +24,17 @@ std::map<int, void*> TargetWrapperBM::bm_hds_;
size_t TargetWrapperBM::num_devices() {
int count = 0;
bm_dev_getcount(&count);
bm_status_t ret = bm_dev_getcount(&count);
CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: "
<< static_cast<int>(ret);
return count;
}
int TargetWrapperBM::GetDevice() { return device_id_; }
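// Validate the requested card index against bm_dev_getcount() before caching a
// handle for it; an out-of-range id is fatal.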
void TargetWrapperBM::SetDevice(int id) {
/*
if (id < 0 || (size_t)id >= num_devices()) {
LOG(FATAL) << "Failed with invalid device id " << id;
}
*/
if (id < 0 || (size_t)id >= num_devices()) {
LOG(FATAL) << "Failed with invalid device id " << id;
}
device_id_ = id;
if (bm_hds_.find(id) == bm_hds_.end()) {
bm_handle_t bm_handle;
......
@@ -31,6 +31,7 @@ class TargetWrapper<TARGET(kBM)> {
static size_t maximum_stream() { return 0; }
static void SetDevice(int id);
static int GetDevice();
static void CreateStream(stream_t* stream) {}
static void DestroyStream(const stream_t& stream) {}
......
@@ -110,9 +110,7 @@ class Context<TargetType::kBM> {
Context() {}
explicit Context(const BMContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() { Init(0); }
void Init(int dev_id) { TargetWrapperBM::SetDevice(dev_id); }
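// InitOnce() now re-applies the card previously chosen through
// TargetWrapperBM::SetDevice() instead of always initializing card 0, so a
// multi-card binding survives context construction.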
void InitOnce() { TargetWrapperBM::SetDevice(TargetWrapperBM::GetDevice()); }
void CopySharedTo(BMContext* ctx) {}
void* GetHandle() { return TargetWrapperBM::GetHandle(); }
......
@@ -1240,6 +1240,19 @@ void Device<TARGET(kMLU)>::CreateQueue() {
}
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_BM
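// Binding a Device<kBM> to an index also selects that card through the target
// wrapper; core_num() reports how many BM devices are visible.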
void Device<TARGET(kBM)>::SetId(int device_id) {
LOG(INFO) << "Set bm device " << device_id;
TargetWrapper<TARGET(kBM)>::SetDevice(device_id);
idx_ = device_id;
}
void Device<TARGET(kBM)>::Init() { SetId(idx_); }
int Device<TARGET(kBM)>::core_num() {
return TargetWrapper<TARGET(kBM)>::num_devices();
}
#endif // LITE_WITH_BM
#ifdef LITE_WITH_CUDA
void Device<TARGET(kCUDA)>::Init() {
......
@@ -221,6 +221,49 @@ class Device<TARGET(kMLU)> {
template class Env<TARGET(kMLU)>;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_BM
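// Minimal device description for Bitmain boards: a single stream, no
// fp16/int8/hmma/imma capabilities reported, and core_num() implemented above
// as the number of visible BM cards.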
template <>
class Device<TARGET(kBM)> {
public:
Device(int dev_id, int max_stream = 1)
: idx_(dev_id), max_stream_(max_stream) {}
void Init();
int id() { return idx_; }
int max_stream() { return 1; }
std::string name() { return "BM"; }
float max_memory() { return 16; }
int core_num();
void SetId(int idx);
int sm_version() { return 0; }
bool has_fp16() { return false; }
bool has_int8() { return false; }
bool has_hmma() { return false; }
bool has_imma() { return false; }
int runtime_version() { return 0; }
private:
void CreateQueue() {}
void GetInfo() {}
private:
int idx_{0};
int max_stream_{1};
std::string device_name_;
float max_memory_;
int sm_version_;
bool has_fp16_;
bool has_int8_;
bool has_hmma_;
bool has_imma_;
int runtime_version_;
};
template class Env<TARGET(kBM)>;
#endif
#ifdef LITE_WITH_CUDA
template <>
class Device<TARGET(kCUDA)> {
......
@@ -35,7 +35,8 @@ lite_cc_library(subgraph_bridge_assign_value_op_bm SRCS assign_value_op.cc DEPS
lite_cc_library(subgraph_bridge_shape_op_bm SRCS shape_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_split_op_bm SRCS split_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_matmul_op_bm SRCS matmul_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_density_prior_box_op_bm SRCS density_prior_box_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_swish_op_bm SRCS swish_op.cc DEPS ${bm_subgraph_bridge_deps})
set(bm_subgraph_bridges
subgraph_bridge_registry
@@ -69,4 +70,6 @@ set(bm_subgraph_bridges
subgraph_bridge_shape_op_bm
subgraph_bridge_split_op_bm
subgraph_bridge_matmul_op_bm
subgraph_bridge_density_prior_box_op_bm
subgraph_bridge_swish_op_bm
CACHE INTERNAL "bm_subgraph_bridges")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
typedef struct __tag_st_priorbox_param {
std::vector<float> fixed_sizes;
std::vector<float> fixed_ratios;
std::vector<int> densities;
std::vector<float> variances;
float step_w;
float step_h;
float offset;
int prior_num;
bool clip;
bool flatten_to_2d;
} st_priorbox_param;
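// Computes density prior boxes and their variances on the host and returns a
// malloc'd buffer of 2 * boxes->data_size() floats: box coordinates first,
// followed by the per-box variances. The buffer is fed to add_priorbox_layer
// as constant data.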
float* compute_density_priorbox_kernel(OpLite* op, st_priorbox_param* param) {
auto op_info = op->op_info();
auto scope = op->scope();
// inputs
auto in_var_name = op_info->Input("Input").front();
auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
auto in_dims = in->dims();
auto img_var_name = op_info->Input("Image").front();
auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
auto img_dims = img->dims();
// outputs
auto boxes_var_name = op_info->Output("Boxes").front();
auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
auto var_var_name = op_info->Output("Variances").front();
auto var = scope->FindVar(var_var_name)->GetMutable<lite::Tensor>();
auto img_width = img_dims[3];
auto img_height = img_dims[2];
auto feature_width = in_dims[3];
auto feature_height = in_dims[2];
float step_width, step_height;
if (param->step_w == 0.f || param->step_h == 0.f) {
step_width = static_cast<float>(img_width) / feature_width;
step_height = static_cast<float>(img_height) / feature_height;
} else {
step_width = param->step_w;
step_height = param->step_h;
}
int num_priors = 0;
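// Priors per location: fixed_ratios.size() * densities[i]^2, summed over all
// densities. For example, densities = {4, 2} with a single fixed ratio gives
// 16 + 4 = 20 priors.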
for (size_t i = 0; i < param->densities.size(); ++i) {
num_priors += (param->fixed_ratios.size()) * (pow(param->densities[i], 2));
}
param->prior_num = num_priors;
DDim shape_out({feature_height, feature_width, num_priors, 4});
int32_t channel_size = feature_height * feature_width * num_priors * 4;
boxes->Resize(shape_out);
var->Resize(shape_out);
int step_average = static_cast<int>((step_width + step_height) * 0.5);
std::vector<float> sqrt_fixed_ratios;
for (size_t i = 0; i < param->fixed_ratios.size(); i++) {
sqrt_fixed_ratios.push_back(sqrt(param->fixed_ratios[i]));
}
float* cpu_data =
static_cast<float*>(malloc(sizeof(float) * boxes->data_size() * 2));
CHECK(cpu_data != nullptr);
float* b_t = cpu_data;
for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) {
float center_x = (w + param->offset) * step_width;
float center_y = (h + param->offset) * step_height;
for (size_t s = 0; s < param->fixed_sizes.size(); ++s) {
auto fixed_size = param->fixed_sizes[s];
int density = param->densities[s];
int shift = step_average / density;
// Generate density prior boxes with fixed ratios.
for (size_t r = 0; r < param->fixed_ratios.size(); ++r) {
float box_width_ratio = fixed_size * sqrt_fixed_ratios[r];
float box_height_ratio = fixed_size / sqrt_fixed_ratios[r];
float density_center_x = center_x - step_average / 2. + shift / 2.;
float density_center_y = center_y - step_average / 2. + shift / 2.;
for (int di = 0; di < density; ++di) {
for (int dj = 0; dj < density; ++dj) {
float center_x_temp = density_center_x + dj * shift;
float center_y_temp = density_center_y + di * shift;
b_t[0] = std::max(
(center_x_temp - box_width_ratio / 2.) / img_width, 0.);
b_t[1] = std::max(
(center_y_temp - box_height_ratio / 2.) / img_height, 0.);
b_t[2] = std::min(
(center_x_temp + box_width_ratio / 2.) / img_width, 1.);
b_t[3] = std::min(
(center_y_temp + box_height_ratio / 2.) / img_height, 1.);
b_t += 4;
}
}
}
}
}
}
if (param->clip) {
for (int32_t d = 0; d < channel_size; ++d) {
cpu_data[d] = std::min(std::max(cpu_data[d], 0.f), 1.f);
}
}
float* ptr = cpu_data + channel_size;
int count = 0;
for (int32_t h = 0; h < feature_height; ++h) {
for (int32_t w = 0; w < feature_width; ++w) {
for (int32_t i = 0; i < param->prior_num; ++i) {
for (int j = 0; j < 4; ++j) {
ptr[count] = param->variances[j];
++count;
}
}
}
}
return cpu_data;
}
int DensityPriorBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// inputs
auto in_var_name = op_info->Input("Input").front();
auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
auto in_dims = in->dims();
auto img_var_name = op_info->Input("Image").front();
auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
auto img_dims = img->dims();
std::vector<int32_t> i_input_shape_data(in_dims.size());
for (size_t i = 0; i < in_dims.size(); i++) {
i_input_shape_data[i] = static_cast<int32_t>(in_dims[i]);
}
// outputs
auto boxes_var_name = op_info->Output("Boxes").front();
auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
auto var_var_name = op_info->Output("Variances").front();
// param
st_priorbox_param param;
param.clip = op_info->GetAttr<bool>("clip");
param.flatten_to_2d = op_info->GetAttr<bool>("flatten_to_2d");
param.fixed_sizes = op_info->GetAttr<std::vector<float>>("fixed_sizes");
param.fixed_ratios = op_info->GetAttr<std::vector<float>>("fixed_ratios");
param.variances = op_info->GetAttr<std::vector<float>>("variances");
param.densities = op_info->GetAttr<std::vector<int>>("densities");
param.offset = op_info->GetAttr<float>("offset");
if (op_info->HasAttr("step_w")) {
param.step_w = op_info->GetAttr<float>("step_w");
}
if (op_info->HasAttr("step_h")) {
param.step_h = op_info->GetAttr<float>("step_h");
}
float* cpu_data = compute_density_priorbox_kernel(op, &param);
auto boxes_dims = boxes->dims();
std::vector<int32_t> i_pri_out_shape_data(3);
i_pri_out_shape_data[0] = 1;
i_pri_out_shape_data[1] = 2;
i_pri_out_shape_data[2] = boxes->data_size();
auto bm_priorbox_name = lite::subgraph::bm::UniqueName("bm_priorbox");
add_priorbox_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_input_shape_data[0]),
in_dims.size(),
static_cast<const char*>(in_var_name.c_str()),
const_cast<const int*>(&i_pri_out_shape_data[0]),
3,
static_cast<const char*>(bm_priorbox_name.c_str()),
static_cast<const float*>(cpu_data),
0,
nullptr,
0,
nullptr,
0,
nullptr,
0,
0,
0,
nullptr,
0,
0,
0.f,
0.f,
0.f);
int32_t* shape[2];
int32_t dim[2];
const char* name[2];
int32_t dim_size = 3;
dim[0] = dim_size;
dim[1] = dim_size;
std::vector<int32_t> i_split_shape_data(dim_size);
for (size_t i = 0; i < dim_size; i++) {
i_split_shape_data[i] = i_pri_out_shape_data[i];
}
i_split_shape_data[1] /= 2;
shape[0] = &i_split_shape_data[0];
shape[1] = &i_split_shape_data[0];
// Keep the generated names alive: calling c_str() on a temporary string would
// leave name[0]/name[1] dangling.
auto bm_boxes_name = lite::subgraph::bm::UniqueName("bm_boxes");
auto bm_boxes_var_name = lite::subgraph::bm::UniqueName("bm_boxes_var");
name[0] = bm_boxes_name.c_str();
name[1] = bm_boxes_var_name.c_str();
int split_size[2];
split_size[0] = shape[0][1];
split_size[1] = shape[1][1];
add_tf_split_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_pri_out_shape_data[0]),
3,
static_cast<const char*>(bm_priorbox_name.c_str()),
2,
shape,
dim,
name,
3,
1,
split_size,
2);
// final output
std::vector<int32_t> i_output_shape_data(boxes_dims.size());
for (size_t i = 0; i < boxes_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int32_t>(boxes_dims[i]);
}
add_reshape_layer_v2(graph->GetCompilerHandle(),
name[0],
shape[0],
3,
static_cast<const char*>(boxes_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
boxes_dims.size());
add_reshape_layer_v2(graph->GetCompilerHandle(),
name[1],
shape[1],
3,
static_cast<const char*>(var_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
boxes_dims.size());
graph->AddNode(boxes_var_name);
graph->AddNode(var_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(density_prior_box,
kBM,
paddle::lite::subgraph::bm::DensityPriorBoxConverter);
@@ -51,15 +51,23 @@ int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto dropout_prob = op_info->GetAttr<float>("dropout_prob");
auto dropout_implementation =
op_info->GetAttr<std::string>("dropout_implementation");
CHECK_EQ(dropout_implementation, "downgrade_in_infer");
add_const_binary_layer(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
1.f - dropout_prob,
static_cast<const char*>(output_var_name.c_str()),
BINARY_MUL,
0);
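// "downgrade_in_infer" scales the activations by (1 - dropout_prob) at
// inference time; other implementations (e.g. "upscale_in_train") already
// rescaled during training, so inference reduces to an identity copy.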
if (dropout_implementation == "downgrade_in_infer") {
add_const_binary_layer(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
1.f - dropout_prob,
static_cast<const char*>(output_var_name.c_str()),
BINARY_MUL,
0);
} else {
add_identity_layer(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
static_cast<const char*>(output_var_name.c_str()));
}
graph->AddNode(output_var_name);
return SUCCESS;
......
@@ -24,6 +24,48 @@ namespace lite {
namespace subgraph {
namespace bm {
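// When both inputs of an elementwise op are constant tensors, fold the result
// on the host (mul/add/sub/div) and return a malloc'd buffer that the caller
// registers with bm_add_const_tensor.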
float* compute_elementwise_both_const(OpLite* op) {
auto op_info = op->op_info();
auto scope = op->scope();
auto op_type = op_info->Type();
// input
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
auto y_var_name = op_info->Input("Y").front();
auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
auto y_dims = y->dims();
// output
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
float* cpu_data =
static_cast<float*>(malloc(sizeof(float) * output->data_size()));
CHECK(cpu_data != nullptr);
CHECK_EQ(x_dims.size(), y_dims.size());
const float* y_data = const_cast<const float*>(y->mutable_data<float>());
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
if (op_type == "elementwise_mul") {
for (size_t i = 0; i < output->data_size(); i++) {
cpu_data[i] = x_data[i] * y_data[i];
}
} else if (op_type == "elementwise_add") {
for (size_t i = 0; i < output->data_size(); i++) {
cpu_data[i] = x_data[i] + y_data[i];
}
} else if (op_type == "elementwise_sub") {
for (size_t i = 0; i < output->data_size(); i++) {
cpu_data[i] = x_data[i] - y_data[i];
}
} else if (op_type == "elementwise_div") {
for (size_t i = 0; i < output->data_size(); i++) {
cpu_data[i] = x_data[i] / y_data[i];
}
}
return cpu_data;
}
int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
@@ -41,21 +83,20 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto x_dims = x->dims();
name[0] = static_cast<const char*>(x_var_name.c_str());
dim[0] = x_dims.size();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
i_x_shape_data[i] = static_cast<int>(x_dims[i]);
}
shape[0] = &i_x_shape_data[0];
bool x_is_const = !graph->HasNode(x_var_name);
auto y_var_name = op_info->Input("Y").front();
auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
auto y_dims = y->dims();
name[1] = static_cast<const char*>(y_var_name.c_str());
dim[1] = y_dims.size();
const int64_t* y_shape_data = const_cast<const int64_t*>(&y_dims.data()[0]);
std::vector<int32_t> i_y_shape_data(y_dims.size());
for (size_t i = 0; i < y_dims.size(); i++) {
i_y_shape_data[i] = static_cast<int>(y_shape_data[i]);
i_y_shape_data[i] = static_cast<int>(y_dims[i]);
}
shape[1] = &i_y_shape_data[0];
bool y_is_const = !graph->HasNode(y_var_name);
@@ -86,46 +127,56 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
auto unique_op_name = lite::subgraph::bm::UniqueName("expand_ndims");
std::vector<int32_t> i_expand_shape_data(3);
if (y_is_const) {
if (dim[0] == dim[1] || 2 == dim[0]) {
bm_add_const_tensor(graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(y_data));
} else if (1 == dim[1] && 1 == axis) {
add_expand_ndims_layer(graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<const float*>(y_data),
-1,
2,
static_cast<const char*>(unique_op_name.c_str()));
name[1] = static_cast<const char*>(unique_op_name.c_str());
dim[1] = 3;
i_expand_shape_data[0] = i_y_shape_data[0];
i_expand_shape_data[1] = 1;
i_expand_shape_data[2] = 1;
shape[1] = &i_expand_shape_data[0];
y_data = nullptr;
if (x_is_const && y_is_const) {
float* cpu_data = compute_elementwise_both_const(op);
bm_add_const_tensor(graph->GetCompilerHandle(),
static_cast<const char*>(output_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(cpu_data));
} else {
if (y_is_const) {
if (dim[0] == dim[1] || 2 == dim[0]) {
bm_add_const_tensor(graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(y_data));
} else if (1 == dim[1] && 1 == axis) {
add_expand_ndims_layer(
graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<const float*>(y_data),
-1,
2,
static_cast<const char*>(unique_op_name.c_str()));
name[1] = static_cast<const char*>(unique_op_name.c_str());
dim[1] = 3;
i_expand_shape_data[0] = i_y_shape_data[0];
i_expand_shape_data[1] = 1;
i_expand_shape_data[2] = 1;
shape[1] = &i_expand_shape_data[0];
y_data = nullptr;
}
}
add_binary_layer_v2(graph->GetCompilerHandle(),
name[0],
shape[0],
dim[0],
0,
static_cast<const float*>(x_data),
name[1],
shape[1],
dim[1],
0,
static_cast<const float*>(y_data),
static_cast<const char*>(output_var_name.c_str()),
op_code);
}
add_binary_layer_v2(graph->GetCompilerHandle(),
name[0],
shape[0],
dim[0],
0,
static_cast<const float*>(x_data),
name[1],
shape[1],
dim[1],
0,
static_cast<const float*>(y_data),
static_cast<const char*>(output_var_name.c_str()),
op_code);
delete[] shape;
delete[] name;
delete[] dim;
......
@@ -36,46 +36,46 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
i_x_shape_data[i] = static_cast<int>(x_dims[i]);
}
auto y_var_name = op_info->Input("Y").front();
auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
auto y_dims = y->dims();
const int64_t* y_shape_data = const_cast<const int64_t*>(&y_dims.data()[0]);
std::vector<int32_t> i_y_shape_data(y_dims.size());
for (size_t i = 0; i < y_dims.size(); i++) {
i_y_shape_data[i] = static_cast<int>(y_shape_data[i]);
i_y_shape_data[i] = static_cast<int>(y_dims[i]);
}
// output
auto output_var_name = op_info->Output("Out").front();
auto out = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto out_dims = out->dims();
std::vector<int32_t> i_out_shape_data(out_dims.size());
for (size_t i = 0; i < out_dims.size(); i++) {
i_out_shape_data[i] = static_cast<int>(out_dims[i]);
}
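// Only the plain case is bridged: alpha == 1 and neither input transposed.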
bool transpose_x = op_info->GetAttr<bool>("transpose_X");
bool transpose_y = op_info->GetAttr<bool>("transpose_Y");
float alpha = op_info->GetAttr<float>("alpha");
CHECK_EQ(alpha, 1.f);
CHECK_EQ(transpose_x, 0);
CHECK_EQ(transpose_y, 0);
LOG(INFO) << x_dims << " " << y_dims << " " << alpha << " " << transpose_x
<< " " << transpose_y;
#if 0
add_const_binary_layer(graph->GetCompilerHandle(),
const float* y_data = const_cast<const float*>(y->mutable_data<float>());
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
add_batch_matmul_layer(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
scale,
static_cast<const char*>(unique_op_scale_name.c_str()),
BINARY_MUL,
0);
add_const_binary_layer(graph->GetCompilerHandle(),
static_cast<const char*>(unique_op_scale_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
bias,
static_cast<const char*>(output_var_name.c_str()),
BINARY_ADD,
0);
#endif
0,
x_data,
static_cast<const char*>(y_var_name.c_str()),
const_cast<const int*>(&i_y_shape_data[0]),
y_dims.size(),
0,
y_data,
static_cast<const char*>(output_var_name.c_str()));
graph->AddNode(output_var_name);
return SUCCESS;
}
......
@@ -60,3 +60,5 @@ USE_SUBGRAPH_BRIDGE(split, kBM);
USE_SUBGRAPH_BRIDGE(matmul, kBM);
USE_SUBGRAPH_BRIDGE(max_pool2d_with_index, kBM);
USE_SUBGRAPH_BRIDGE(sigmoid, kBM);
USE_SUBGRAPH_BRIDGE(density_prior_box, kBM);
USE_SUBGRAPH_BRIDGE(swish, kBM);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
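// swish(x) = x * sigmoid(beta * x); only beta == 1 is supported here. The op is
// lowered as a sigmoid activation whose result is combined with the original
// input through add_batch_matmul_layer.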
int SwishConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// input
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
std::vector<int> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
// output
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = output_dims[i];
}
auto unique_sigmoid_name =
lite::subgraph::bm::UniqueName(op_type + "_sigmoid");
auto beta = op_info->GetAttr<float>("beta");
CHECK_EQ(beta, 1.f);
add_active_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(unique_sigmoid_name.c_str()),
ACTIVE_SIGMOID);
add_batch_matmul_layer(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
0,
nullptr,
static_cast<const char*>(unique_sigmoid_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
0,
nullptr,
static_cast<const char*>(output_var_name.c_str()));
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(swish,
kBM,
paddle::lite::subgraph::bm::SwishConverter);