* [lite][bm] support inception&vgg&darknet&mobilenet,test=develop (#2867)

* [LITE][BM] support VGG,Inception,Mobilenet,Darknet, test=develop

* [lite][bm] support inception&vgg&darknet&mobilenet,test=develop (#2867)
* [LITE][BM] support VGG,Inception,Mobilenet,Darknet, test=develop
1fe37df3 · Santa An · GitHub · 25dfe6b1 · 1fe37df3 · 1fe37df3
15 changed files
--- a/lite/kernels/bm/bridges/CMakeLists.txt
+++ b/lite/kernels/bm/bridges/CMakeLists.txt
@@ -15,7 +15,12 @@ lite_cc_library(subgraph_bridge_softmax_op_bm SRCS softmax_op.cc DEPS ${subgraph
 lite_cc_library(subgraph_bridge_mul_op_bm SRCS mul_op.cc DEPS ${bm_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_batch_norm_op_bm SRCS batch_norm_op.cc DEPS ${bm_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_scale_op_bm SRCS scale_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_concat_op_bm SRCS concat_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_dropout_op_bm SRCS dropout_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_transpose_op_bm SRCS transpose_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_reshape_op_bm SRCS reshape_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_norm_op_bm SRCS norm_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_prior_box_op_bm SRCS prior_box_op.cc DEPS ${bm_subgraph_bridge_deps})
 set(bm_subgraph_bridges
        subgraph_bridge_registry
        subgraph_bridge_engine
@@ -28,4 +33,10 @@ set(bm_subgraph_bridges
        subgraph_bridge_mul_op_bm
        subgraph_bridge_batch_norm_op_bm
        subgraph_bridge_scale_op_bm
+        subgraph_bridge_concat_op_bm
+        subgraph_bridge_dropout_op_bm
+        subgraph_bridge_transpose_op_bm
+        subgraph_bridge_reshape_op_bm
+        subgraph_bridge_norm_op_bm
+        subgraph_bridge_prior_box_op_bm
        CACHE INTERNAL "bm_subgraph_bridges")
--- a/lite/kernels/bm/bridges/act_op.cc
+++ b/lite/kernels/bm/bridges/act_op.cc
@@ -45,7 +45,14 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  for (size_t i = 0; i < output_dims.size(); i++) {
    i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
  }
-  CHECK_EQ(op_type, "relu");
+  float alpha = 0.f;
+  if (op_type == "relu") {
+  } else if (op_type == "leaky_relu") {
+    alpha = op_info->GetAttr<float>("alpha");
+  } else {
+    LOG(FATAL) << "[BM] unsupport act type";
+    return FAILED;
+  }
  add_relu_layer(graph->GetCompilerHandle(),
                 const_cast<const int*>(&i_x_shape_data[0]),
                 x_dims.size(),
@@ -53,7 +60,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                 const_cast<const int*>(&i_output_shape_data[0]),
                 output_dims.size(),
                 static_cast<const char*>(output_var_name.c_str()),
-                 0.f,
+                 alpha,
                 -1.f);
  graph->AddNode(output_var_name);
  return SUCCESS;
@@ -65,3 +72,6 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 }  // namespace paddle
 REGISTER_SUBGRAPH_BRIDGE(relu, kBM, paddle::lite::subgraph::bm::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
+                         kBM,
+                         paddle::lite::subgraph::bm::ActConverter);
--- a/lite/kernels/bm/bridges/concat_op.cc
+++ b/lite/kernels/bm/bridges/concat_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+// Bridge that lowers a Paddle `concat` op into a BM compiler concat layer.
+//
+// ctx    - the BM subgraph `Graph` the layer is added to (must not be null).
+// op     - the concat op; its "X" inputs, "Out" output and "axis" attribute
+//          are read (must not be null).
+// kernel - used only to check that input "X" is declared with NCHW layout.
+//
+// Returns SUCCESS after registering the output variable with the graph.
+int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  // input
+  auto x_names = op_info->Input("X");
+  auto x_type = kernel->GetInputDeclType("X");
+  CHECK(x_type->layout() == DATALAYOUT(kNCHW));
+  // output
+  auto output_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  std::vector<int32_t> i_output_shape_data(output_dims.size());
+  for (size_t i = 0; i < output_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int32_t>(output_dims[i]);
+  }
+  // Per-input shape / rank / name tables handed to the compiler. They are
+  // held in vectors so they are released on every exit path; `shape` only
+  // stores pointers into `shape_storage`, and `name` only stores pointers
+  // into `x_names`, both of which outlive the add_concat_layer call.
+  const int32_t input_num = x_names.size();
+  std::vector<std::vector<int32_t>> shape_storage(x_names.size());
+  std::vector<int32_t*> shape(x_names.size());
+  std::vector<int32_t> dim(x_names.size());
+  std::vector<const char*> name(x_names.size());
+  for (size_t i = 0; i < x_names.size(); i++) {
+    auto x = scope->FindMutableTensor(x_names[i]);
+    auto x_dims = x->dims();
+    name[i] = x_names[i].c_str();
+    dim[i] = x_dims.size();
+    shape_storage[i].resize(x_dims.size());
+    for (size_t j = 0; j < x_dims.size(); j++) {
+      shape_storage[i][j] = static_cast<int32_t>(x_dims[j]);
+    }
+    shape[i] = shape_storage[i].data();
+  }
+  auto axis = op_info->GetAttr<int>("axis");
+  add_concat_layer(graph->GetCompilerHandle(),
+                   input_num,
+                   shape.data(),
+                   dim.data(),
+                   name.data(),
+                   const_cast<const int*>(i_output_shape_data.data()),
+                   output_dims.size(),
+                   static_cast<const char*>(output_var_name.c_str()),
+                   axis);
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+REGISTER_SUBGRAPH_BRIDGE(concat,
+                         kBM,
+                         paddle::lite::subgraph::bm::ConcatConverter);
--- a/lite/kernels/bm/bridges/conv_op.cc
+++ b/lite/kernels/bm/bridges/conv_op.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "lite/operators/conv_op.h"
 #include <bmcompiler_if.h>
 #include "lite/kernels/bm/bridges/graph.h"
 #include "lite/kernels/bm/bridges/utility.h"
@@ -58,10 +57,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  std::vector<int32_t> i_output_shape_data(output_dims.size());
  for (size_t i = 0; i < input_dims.size(); i++) {
-    i_input_shape_data[i] = static_cast<int>(input_shape_data[i]);
+    i_input_shape_data[i] = static_cast<int32_t>(input_shape_data[i]);
  }
  for (size_t i = 0; i < output_dims.size(); i++) {
-    i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
+    i_output_shape_data[i] = static_cast<int32_t>(output_shape_data[i]);
  }
  const float* filter_data =
      const_cast<const float*>(filter->mutable_data<float>());
@@ -69,7 +68,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
  auto strides = op_info->GetAttr<std::vector<int>>("strides");
  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
  add_conv_layer(graph->GetCompilerHandle(),
                 const_cast<const int*>(&i_input_shape_data[0]),
                 input_dims.size(),
@@ -104,3 +102,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 REGISTER_SUBGRAPH_BRIDGE(conv2d,
                         kBM,
                         paddle::lite::subgraph::bm::ConvConverter);
+REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
+                         kBM,
+                         paddle::lite::subgraph::bm::ConvConverter);
--- a/lite/kernels/bm/bridges/dropout_op.cc
+++ b/lite/kernels/bm/bridges/dropout_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_if.h>
+#include <bmcompiler_op_code.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+// Bridge that lowers a Paddle `dropout` op into a BM constant-binary layer:
+// the input "X" is multiplied by the constant (1 - dropout_prob) and written
+// to "Out". Only dropout_implementation == "downgrade_in_infer" is accepted.
+//
+// ctx    - the BM subgraph `Graph` the layer is added to (must not be null).
+// op     - the dropout op whose inputs, outputs and attributes are read.
+// kernel - not used by this converter.
+//
+// Returns SUCCESS after registering the output variable with the graph.
+int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto bm_graph = static_cast<Graph*>(ctx);
+  auto var_scope = op->scope();
+  auto info = op->op_info();
+  auto type_name = info->Type();
+
+  // Input tensor: narrow its 64-bit dims to the 32-bit ints the compiler takes.
+  auto in_name = info->Input("X").front();
+  auto in_tensor = var_scope->FindVar(in_name)->GetMutable<lite::Tensor>();
+  auto in_dims = in_tensor->dims();
+  std::vector<int32_t> in_shape(in_dims.size());
+  for (size_t d = 0; d < in_dims.size(); ++d) {
+    in_shape[d] = static_cast<int>(in_dims[d]);
+  }
+
+  // Output tensor: its shape is converted the same way (not passed to the
+  // layer call below).
+  auto out_name = info->Output("Out").front();
+  auto out_tensor = var_scope->FindVar(out_name)->GetMutable<lite::Tensor>();
+  auto out_dims = out_tensor->dims();
+  std::vector<int32_t> out_shape(out_dims.size());
+  for (size_t d = 0; d < out_dims.size(); ++d) {
+    out_shape[d] = static_cast<int>(out_dims[d]);
+  }
+
+  auto drop_ratio = info->GetAttr<float>("dropout_prob");
+  auto impl_mode = info->GetAttr<std::string>("dropout_implementation");
+  CHECK_EQ(impl_mode, "downgrade_in_infer");
+
+  const float keep_scale = 1.f - drop_ratio;
+  // NOTE(review): the meaning of the trailing 0 argument is not visible
+  // here -- confirm against bmcompiler_if.h.
+  add_const_binary_layer(bm_graph->GetCompilerHandle(),
+                         static_cast<const char*>(in_name.c_str()),
+                         const_cast<const int*>(&in_shape[0]),
+                         in_dims.size(),
+                         keep_scale,
+                         static_cast<const char*>(out_name.c_str()),
+                         BINARY_MUL,
+                         0);
+  bm_graph->AddNode(out_name);
+  return SUCCESS;
+}
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+REGISTER_SUBGRAPH_BRIDGE(dropout,
+                         kBM,
+                         paddle::lite::subgraph::bm::DropoutConverter);
--- a/lite/kernels/bm/bridges/elementwise_ops.cc
+++ b/lite/kernels/bm/bridges/elementwise_ops.cc
@@ -14,6 +14,7 @@
 #include <bmcompiler_defs.h>
 #include <bmcompiler_if.h>
 #include <bmcompiler_if_lite.h>
+#include <bmcompiler_op_code.h>
 #include "lite/kernels/bm/bridges/graph.h"
 #include "lite/kernels/bm/bridges/utility.h"
 #include "lite/kernels/npu/bridges/registry.h"
@@ -68,42 +69,52 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  for (size_t i = 0; i < output_dims.size(); i++) {
    i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
  }
-  if (y_is_const) {
+  auto axis = op_info->GetAttr<int>("axis");
-    CHECK_EQ(op_type, "elementwise_add");
-  }
  int op_code{-1};
+  int eltwise_if_code{-1};
  float coeff[2] = {1.f, 1.f};
  if (op_type == "elementwise_mul") {
-    op_code = 0;
+    op_code = BINARY_MUL;
+    eltwise_if_code = 0;
  } else if (op_type == "elementwise_add") {
-    op_code = 1;
+    op_code = BINARY_ADD;
+    eltwise_if_code = 1;
  } else if (op_type == "elementwise_sub") {
-    op_code = 1;
+    op_code = BINARY_SUB;
+    eltwise_if_code = 1;
    coeff[1] = -1.f;
  } else {
    LOG(FATAL) << "UNSUPPORTED ELTWISE OPERATION: " << op_type;
  }
-  if (!y_is_const) {
+  const float* y_data = const_cast<const float*>(y->mutable_data<float>());
-    add_eltwise_layer(graph->GetCompilerHandle(),
+  const float* x_data = const_cast<const float*>(x->mutable_data<float>());
-                      input_num,
+  auto unique_op_name = lite::subgraph::bm::UniqueName("expand_ndims");
-                      shape,
+  std::vector<int32_t> i_expand_shape_data(3);
-                      dim,
+  if (y_is_const) {
-                      name,
+    if (dim[0] == dim[1] || 2 == dim[0]) {
-                      const_cast<const int*>(&i_output_shape_data[0]),
+      bm_add_const_tensor(graph->GetCompilerHandle(),
-                      output_dims.size(),
+                          name[1],
-                      static_cast<const char*>(output_var_name.c_str()),
+                          shape[1],
-                      op_code,
+                          dim[1],
-                      coeff);
+                          static_cast<bm_data_type_t>(DTYPE_FP32),
-  } else {
+                          static_cast<const void*>(y_data));
-    const float* y_data = const_cast<const float*>(y->mutable_data<float>());
+    } else if (1 == dim[1] && 1 == axis) {
-    const float* x_data = const_cast<const float*>(x->mutable_data<float>());
+      add_expand_ndims_layer(graph->GetCompilerHandle(),
-    bm_add_const_tensor(graph->GetCompilerHandle(),
+                             name[1],
-                        name[1],
+                             shape[1],
-                        shape[0],
+                             dim[1],
-                        dim[0],
+                             static_cast<const float*>(y_data),
-                        static_cast<bm_data_type_t>(DTYPE_FP32),
+                             -1,
-                        static_cast<const void*>(y_data));
+                             2,
+                             static_cast<const char*>(unique_op_name.c_str()));
+      name[1] = static_cast<const char*>(unique_op_name.c_str());
+      dim[1] = 3;
+      i_expand_shape_data[0] = i_y_shape_data[0];
+      i_expand_shape_data[1] = 1;
+      i_expand_shape_data[2] = 1;
+      shape[1] = &i_expand_shape_data[0];
+      y_data = nullptr;
+    }
    add_binary_layer_v2(graph->GetCompilerHandle(),
                        name[0],
                        shape[0],
@@ -111,12 +122,23 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                        0,
                        static_cast<const float*>(x_data),
                        name[1],
-                        shape[0],
+                        shape[1],
-                        dim[0],
+                        dim[1],
                        0,
                        static_cast<const float*>(y_data),
                        static_cast<const char*>(output_var_name.c_str()),
-                        0);
+                        op_code);
+  } else {
+    add_eltwise_layer(graph->GetCompilerHandle(),
+                      input_num,
+                      shape,
+                      dim,
+                      name,
+                      const_cast<const int*>(&i_output_shape_data[0]),
+                      output_dims.size(),
+                      static_cast<const char*>(output_var_name.c_str()),
+                      eltwise_if_code,
+                      coeff);
  }
  delete[] shape;
  delete[] name;
@@ -133,3 +155,9 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
                         kBM,
                         paddle::lite::subgraph::bm::ElementwiseConverter);
+REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
+                         kBM,
+                         paddle::lite::subgraph::bm::ElementwiseConverter);
+REGISTER_SUBGRAPH_BRIDGE(elementwise_sub,
+                         kBM,
+                         paddle::lite::subgraph::bm::ElementwiseConverter);
--- a/lite/kernels/bm/bridges/mul_op.cc
+++ b/lite/kernels/bm/bridges/mul_op.cc
@@ -41,8 +41,10 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  }
  // add reshape layer
  int i_x_reshape_shape_data[2];
-  for (size_t i = 0; i < 2; i++) {
+  i_x_reshape_shape_data[0] = static_cast<int>(x_shape_data[0]);
-    i_x_reshape_shape_data[i] = static_cast<int>(x_shape_data[i]);
+  i_x_reshape_shape_data[1] = 1;
+  for (size_t i = 1; i < x_dims.size(); i++) {
+    i_x_reshape_shape_data[1] *= static_cast<int>(x_shape_data[i]);
  }
  int reshape_param[] = {0, -1};
  auto unique_op_reshape_name =

--- a/lite/kernels/bm/bridges/norm_op.cc
+++ b/lite/kernels/bm/bridges/norm_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+// Bridge that lowers a Paddle `norm` op into a BM normalize layer, forwarding
+// the op's "epsilon" attribute and a single scale value of 1.0.
+//
+// ctx    - the BM subgraph `Graph` the layer is added to (must not be null).
+// op     - the norm op whose "X" input, "Out" output and attributes are read.
+// kernel - not used by this converter.
+//
+// Returns SUCCESS after registering the output variable with the graph.
+int NormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto bm_graph = static_cast<Graph*>(ctx);
+  auto var_scope = op->scope();
+  auto info = op->op_info();
+  auto type_name = info->Type();
+  auto layer_name = lite::subgraph::bm::UniqueName(type_name);
+
+  auto in_name = info->Input("X").front();
+  auto in_tensor = var_scope->FindVar(in_name)->GetMutable<lite::Tensor>();
+  auto in_dims = in_tensor->dims();
+  auto out_name = info->Output("Out").front();
+  auto out_tensor = var_scope->FindVar(out_name)->GetMutable<lite::Tensor>();
+  auto out_dims = out_tensor->dims();
+
+  // Narrow the 64-bit tensor dims to the 32-bit ints the compiler API takes.
+  std::vector<int32_t> in_shape(in_dims.size());
+  std::vector<int32_t> out_shape(out_dims.size());
+  for (size_t d = 0; d < in_dims.size(); ++d) {
+    in_shape[d] = static_cast<int>(in_dims[d]);
+  }
+  for (size_t d = 0; d < out_dims.size(); ++d) {
+    out_shape[d] = static_cast<int>(out_dims[d]);
+  }
+
+  float unit_scale = 1.f;
+  auto eps = info->GetAttr<float>("epsilon");
+  // NOTE(review): the literal 0 and 1 below are passed through unchanged;
+  // their meaning (presumably across-spatial / channel-shared flags) is not
+  // visible here -- confirm against bmcompiler_if.h.
+  add_normalize_layer(bm_graph->GetCompilerHandle(),
+                      const_cast<const int*>(&in_shape[0]),
+                      in_dims.size(),
+                      static_cast<const char*>(in_name.c_str()),
+                      const_cast<const int*>(&out_shape[0]),
+                      out_dims.size(),
+                      static_cast<const char*>(out_name.c_str()),
+                      static_cast<const char*>(layer_name.c_str()),
+                      0,
+                      1,
+                      &unit_scale,
+                      eps);
+  bm_graph->AddNode(out_name);
+  return SUCCESS;
+}
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+REGISTER_SUBGRAPH_BRIDGE(norm, kBM, paddle::lite::subgraph::bm::NormConverter);
--- a/lite/kernels/bm/bridges/paddle_use_bridges.h
+++ b/lite/kernels/bm/bridges/paddle_use_bridges.h
@@ -15,10 +15,24 @@
 #pragma once
 USE_SUBGRAPH_BRIDGE(relu, kBM);
+USE_SUBGRAPH_BRIDGE(leaky_relu, kBM);
 USE_SUBGRAPH_BRIDGE(conv2d, kBM);
+USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kBM);
 USE_SUBGRAPH_BRIDGE(elementwise_add, kBM);
+USE_SUBGRAPH_BRIDGE(elementwise_mul, kBM);
+USE_SUBGRAPH_BRIDGE(elementwise_sub, kBM);
 USE_SUBGRAPH_BRIDGE(pool2d, kBM);
 USE_SUBGRAPH_BRIDGE(softmax, kBM);
 USE_SUBGRAPH_BRIDGE(mul, kBM);
 USE_SUBGRAPH_BRIDGE(batch_norm, kBM);
 USE_SUBGRAPH_BRIDGE(scale, kBM);
+USE_SUBGRAPH_BRIDGE(concat, kBM);
+USE_SUBGRAPH_BRIDGE(dropout, kBM);
+USE_SUBGRAPH_BRIDGE(transpose, kBM);
+USE_SUBGRAPH_BRIDGE(transpose2, kBM);
+USE_SUBGRAPH_BRIDGE(reshape, kBM);
+USE_SUBGRAPH_BRIDGE(reshape2, kBM);
+USE_SUBGRAPH_BRIDGE(flatten, kBM);
+USE_SUBGRAPH_BRIDGE(flatten2, kBM);
+USE_SUBGRAPH_BRIDGE(norm, kBM);
+USE_SUBGRAPH_BRIDGE(prior_box, kBM);
--- a/lite/kernels/bm/bridges/pool_op.cc
+++ b/lite/kernels/bm/bridges/pool_op.cc
@@ -65,6 +65,12 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  if (pooling_type == "avg") {
    average_exclusive = op_info->GetAttr<bool>("exclusive");
  }
+  if (global_pooling) {
+    paddings[0] = 0;
+    paddings[1] = 0;
+    ksize[0] = i_x_shape_data[2];
+    ksize[1] = i_x_shape_data[3];
+  }
  add_pooling_layer(
      graph->GetCompilerHandle(),
      const_cast<const int*>(&i_x_shape_data[0]),

--- a/lite/kernels/bm/bridges/prior_box_op.cc
+++ b/lite/kernels/bm/bridges/prior_box_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+// Host-side copy of a prior_box op's attributes, filled by PriorBoxConverter
+// and consumed by compute_priorbox_kernel.
+//
+// Every scalar member has a default so that attributes the op does not carry
+// are never read uninitialized. A value of 0 for img_w/img_h and step_w/step_h
+// is what compute_priorbox_kernel treats as "derive from the tensor dims".
+// NOTE(review): `false` for flip/clip is a zero default chosen here, not a
+// documented framework default -- confirm against the prior_box op definition.
+typedef struct __tag_st_priorbox_param {
+  std::vector<float> min_sizes;
+  std::vector<float> max_sizes;
+  std::vector<float> aspect_ratios;
+  std::vector<float> variances;
+  float step_w = 0.f;
+  float step_h = 0.f;
+  float offset = 0.f;
+  int32_t img_w = 0;
+  int32_t img_h = 0;
+  int32_t prior_num = 0;
+  bool min_max_aspect_ratios_order = false;
+  bool clip = false;
+  bool flip = false;
+} st_priorbox_param;
+// Builds the expanded aspect-ratio list used for prior-box generation.
+//
+// The output is cleared and always starts with 1.0. Each input ratio is then
+// appended unless it is within 1e-6 of a ratio already in the output; when
+// `flip` is true, the reciprocal of every appended ratio is appended right
+// after it.
+inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
+                               bool flip,
+                               std::vector<float>* output_aspect_ratior) {
+  constexpr float epsilon = 1e-6;
+  std::vector<float>& expanded = *output_aspect_ratior;
+  expanded.clear();
+  expanded.push_back(1.0f);
+  for (const float ratio : input_aspect_ratior) {
+    bool duplicate = false;
+    for (const float known : expanded) {
+      if (fabs(ratio - known) < epsilon) {
+        duplicate = true;
+        break;
+      }
+    }
+    if (duplicate) {
+      continue;
+    }
+    expanded.push_back(ratio);
+    if (flip) {
+      expanded.push_back(1.0f / ratio);
+    }
+  }
+}
+// Computes the prior boxes and their variances for a prior_box op on the host.
+//
+// Reads the dims of the op's "Input" and "Image" tensors, resizes the "Boxes"
+// and "Variances" output tensors to {H, W, prior_num, 4}, and fills a freshly
+// malloc'ed float buffer of 2 * H * W * prior_num * 4 elements: the first
+// half holds the box corners (xmin, ymin, xmax, ymax, each divided by the
+// image width/height), the second half holds the four variances repeated for
+// every prior.
+//
+// Side effects on `param`: aspect_ratios is replaced by its expanded form
+// (see ExpandAspectRatios) and prior_num is recomputed from the sizes.
+//
+// Returns the malloc'ed buffer. NOTE(review): nothing in this function frees
+// it, so the caller presumably owns it and must free() it -- confirm.
+float* compute_priorbox_kernel(OpLite* op, st_priorbox_param* param) {
+  auto op_info = op->op_info();
+  auto scope = op->scope();
+  // inputs
+  auto in_var_name = op_info->Input("Input").front();
+  auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
+  auto in_dims = in->dims();
+  auto img_var_name = op_info->Input("Image").front();
+  auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
+  auto img_dims = img->dims();
+  // outputs
+  auto boxes_var_name = op_info->Output("Boxes").front();
+  auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
+  auto var_var_name = op_info->Output("Variances").front();
+  auto var = scope->FindVar(var_var_name)->GetMutable<lite::Tensor>();
+  // The loops below index max_sizes with the min_sizes index and read four
+  // variances per prior. Reject attribute lists that would be read out of
+  // bounds or would leave part of the output buffer unwritten.
+  if (!param->max_sizes.empty()) {
+    CHECK(param->max_sizes.size() == param->min_sizes.size());
+  }
+  CHECK(param->variances.size() >= 4);
+  std::vector<float> expand_aspect_ratios;
+  ExpandAspectRatios(param->aspect_ratios, param->flip, &expand_aspect_ratios);
+  param->aspect_ratios = expand_aspect_ratios;
+  param->prior_num = param->aspect_ratios.size() * param->min_sizes.size();
+  if (param->max_sizes.size() > 0) {
+    param->prior_num += param->max_sizes.size();
+  }
+  const int32_t width = in_dims[3];
+  const int32_t height = in_dims[2];
+  DDim shape_out({height, width, param->prior_num, 4});
+  boxes->Resize(shape_out);
+  var->Resize(shape_out);
+  // One allocation holds the boxes followed by the variances.
+  float* cpu_data =
+      static_cast<float*>(malloc(sizeof(float) * boxes->data_size() * 2));
+  CHECK(cpu_data != nullptr);
+  // Image size and step fall back to values derived from the tensor dims
+  // when the attributes are 0.
+  int32_t img_width = param->img_w;
+  int32_t img_height = param->img_h;
+  if (img_width == 0 || img_height == 0) {
+    img_width = img_dims[3];
+    img_height = img_dims[2];
+  }
+  float step_w = param->step_w;
+  float step_h = param->step_h;
+  if (step_w == 0.f || step_h == 0.f) {
+    step_w = static_cast<float>(img_width) / width;
+    step_h = static_cast<float>(img_height) / height;
+  }
+  float offset = param->offset;
+  int32_t channel_size = height * width * param->prior_num * 4;
+  int32_t idx = 0;
+  // Scratch space reused for every cell: one box for the min-size prior, one
+  // for the max-size prior, and up to one box per expanded aspect ratio.
+  float min_buf[4];
+  float max_buf[4];
+  std::vector<float> com_buf(expand_aspect_ratios.size() * 4);
+  for (int32_t h = 0; h < height; ++h) {
+    for (int32_t w = 0; w < width; ++w) {
+      float center_x = (w + offset) * step_w;
+      float center_y = (h + offset) * step_h;
+      float box_width = 0.f;
+      float box_height = 0.f;
+      for (size_t s = 0; s < param->min_sizes.size(); ++s) {
+        int32_t min_idx = 0;
+        int32_t max_idx = 0;
+        int32_t com_idx = 0;
+        // The size is truncated to an integer, as in the original code.
+        int32_t min_size = param->min_sizes[s];
+        //! first prior: aspect_ratio = 1, size = min_size
+        box_width = box_height = min_size;
+        //! xmin
+        min_buf[min_idx++] = (center_x - box_width / 2.f) / img_width;
+        //! ymin
+        min_buf[min_idx++] = (center_y - box_height / 2.f) / img_height;
+        //! xmax
+        min_buf[min_idx++] = (center_x + box_width / 2.f) / img_width;
+        //! ymax
+        min_buf[min_idx++] = (center_y + box_height / 2.f) / img_height;
+        if (param->max_sizes.size() > 0) {
+          int max_size = param->max_sizes[s];
+          //! second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
+          box_width = box_height = sqrtf(min_size * max_size);
+          //! xmin
+          max_buf[max_idx++] = (center_x - box_width / 2.f) / img_width;
+          //! ymin
+          max_buf[max_idx++] = (center_y - box_height / 2.f) / img_height;
+          //! xmax
+          max_buf[max_idx++] = (center_x + box_width / 2.f) / img_width;
+          //! ymax
+          max_buf[max_idx++] = (center_y + box_height / 2.f) / img_height;
+        }
+        //! rest of priors (aspect ratio 1 is already covered by min_buf)
+        for (size_t r = 0; r < expand_aspect_ratios.size(); ++r) {
+          float ar = expand_aspect_ratios[r];
+          if (fabs(ar - 1.) < 1e-6) {
+            continue;
+          }
+          box_width = min_size * sqrt(ar);
+          box_height = min_size / sqrt(ar);
+          //! xmin
+          com_buf[com_idx++] = (center_x - box_width / 2.f) / img_width;
+          //! ymin
+          com_buf[com_idx++] = (center_y - box_height / 2.f) / img_height;
+          //! xmax
+          com_buf[com_idx++] = (center_x + box_width / 2.f) / img_width;
+          //! ymax
+          com_buf[com_idx++] = (center_y + box_height / 2.f) / img_height;
+        }
+        // Emit the priors of this min_size. Both orders must write the
+        // min-size box: prior_num counts it, so leaving it out would drop the
+        // aspect-ratio-1 prior and leave the tail of the buffer unwritten.
+        if (param->min_max_aspect_ratios_order) {
+          // order: min, max, remaining aspect ratios
+          memcpy(cpu_data + idx, min_buf, sizeof(float) * min_idx);
+          idx += min_idx;
+          memcpy(cpu_data + idx, max_buf, sizeof(float) * max_idx);
+          idx += max_idx;
+          memcpy(cpu_data + idx, com_buf.data(), sizeof(float) * com_idx);
+          idx += com_idx;
+        } else {
+          // order: min (aspect ratio 1), remaining aspect ratios, max
+          memcpy(cpu_data + idx, min_buf, sizeof(float) * min_idx);
+          idx += min_idx;
+          memcpy(cpu_data + idx, com_buf.data(), sizeof(float) * com_idx);
+          idx += com_idx;
+          memcpy(cpu_data + idx, max_buf, sizeof(float) * max_idx);
+          idx += max_idx;
+        }
+      }
+    }
+  }
+  //! clip the prior's coordinate such that it is within [0, 1]
+  if (param->clip) {
+    for (int32_t d = 0; d < channel_size; ++d) {
+      cpu_data[d] = std::min(std::max(cpu_data[d], 0.f), 1.f);
+    }
+  }
+  //! set the variance: the second half of the buffer repeats the four
+  //! variances once per prior.
+  float* ptr = cpu_data + channel_size;
+  int count = 0;
+  for (int32_t h = 0; h < height; ++h) {
+    for (int32_t w = 0; w < width; ++w) {
+      for (int32_t i = 0; i < param->prior_num; ++i) {
+        for (int j = 0; j < 4; ++j) {
+          ptr[count] = param->variances[j];
+          ++count;
+        }
+      }
+    }
+  }
+  return cpu_data;
+}
+// Converts a `prior_box` op into BM compiler layers.
+//
+// The op attributes are gathered into an st_priorbox_param, which is passed to
+// compute_priorbox_kernel() to obtain a float buffer. That buffer is handed to
+// add_priorbox_layer() together with an output shape whose first dimension is
+// twice that of the "Boxes" tensor. add_tf_split_layer() then splits this
+// intermediate tensor into the "Boxes" and "Variances" outputs, both of which
+// are registered as graph nodes.
+//
+// ctx    - must be a non-null Graph* (checked).
+// op     - the op being converted; must be non-null (checked).
+// kernel - not used by this converter.
+// Returns SUCCESS.
+int PriorBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  // inputs
+  auto in_var_name = op_info->Input("Input").front();
+  auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
+  auto in_dims = in->dims();
+  // NOTE(review): the "Image" tensor is looked up but not used further in this
+  // function; the lookup is kept unchanged from the original.
+  auto img_var_name = op_info->Input("Image").front();
+  auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
+  auto img_dims = img->dims();
+  std::vector<int32_t> i_input_shape_data(in_dims.size());
+  for (size_t i = 0; i < in_dims.size(); i++) {
+    i_input_shape_data[i] = static_cast<int32_t>(in_dims[i]);
+  }
+  // outputs
+  auto boxes_var_name = op_info->Output("Boxes").front();
+  auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
+  auto var_var_name = op_info->Output("Variances").front();
+  auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
+  // param
+  // Value-initialize so that fields whose attribute is absent (flip, img_w,
+  // img_h, step_w, step_h, ...) hold a defined value when they are read below.
+  st_priorbox_param param{};
+  param.clip = op_info->GetAttr<bool>("clip");
+  param.min_sizes = op_info->GetAttr<std::vector<float>>("min_sizes");
+  param.max_sizes = op_info->GetAttr<std::vector<float>>("max_sizes");
+  param.aspect_ratios = op_info->GetAttr<std::vector<float>>("aspect_ratios");
+  param.variances = op_info->GetAttr<std::vector<float>>("variances");
+  param.offset = op_info->GetAttr<float>("offset");
+  if (op_info->HasAttr("flip")) {
+    param.flip = op_info->GetAttr<bool>("flip");
+  }
+  if (op_info->HasAttr("img_w")) {
+    param.img_w = op_info->GetAttr<int32_t>("img_w");
+  }
+  if (op_info->HasAttr("img_h")) {
+    param.img_h = op_info->GetAttr<int32_t>("img_h");
+  }
+  if (op_info->HasAttr("step_w")) {
+    param.step_w = op_info->GetAttr<float>("step_w");
+  }
+  if (op_info->HasAttr("step_h")) {
+    param.step_h = op_info->GetAttr<float>("step_h");
+  }
+  if (op_info->HasAttr("prior_num")) {
+    param.prior_num = op_info->GetAttr<int32_t>("prior_num");
+  }
+  if (op_info->HasAttr("min_max_aspect_ratios_order")) {
+    param.min_max_aspect_ratios_order =
+        op_info->GetAttr<bool>("min_max_aspect_ratios_order");
+  }
+  // Call the helper exactly once and keep its result. The helper takes the
+  // parameter struct by pointer.
+  // NOTE(review): who owns cpu_data after add_priorbox_layer() is not visible
+  // here - confirm the buffer is not leaked.
+  float* cpu_data = compute_priorbox_kernel(op, &param);
+  // Shape of each final output ("Boxes" / "Variances").
+  auto boxes_dims = boxes->dims();
+  std::vector<int32_t> i_output_shape_data(boxes_dims.size());
+  for (size_t i = 0; i < boxes_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int32_t>(boxes_dims[i]);
+  }
+  // Shape of the intermediate prior-box tensor: same as "Boxes" except that
+  // the first dimension is doubled.
+  std::vector<int32_t> i_pri_out_shape_data(i_output_shape_data);
+  i_pri_out_shape_data[0] *= 2;
+  // .data() is used instead of &v[0] because attribute vectors such as
+  // max_sizes may be empty, and indexing an empty vector is undefined.
+  add_priorbox_layer(graph->GetCompilerHandle(),
+                     i_input_shape_data.data(),
+                     in_dims.size(),
+                     in_var_name.c_str(),
+                     i_pri_out_shape_data.data(),
+                     boxes_dims.size(),
+                     unique_op_name.c_str(),
+                     cpu_data,
+                     param.min_sizes.size(),
+                     param.min_sizes.data(),
+                     param.max_sizes.size(),
+                     param.max_sizes.data(),
+                     param.aspect_ratios.size(),
+                     param.aspect_ratios.data(),
+                     static_cast<int>(param.flip),
+                     static_cast<int>(param.clip),
+                     param.variances.size(),
+                     param.variances.data(),
+                     param.img_h,
+                     param.img_w,
+                     param.step_h,
+                     param.step_w,
+                     param.offset);
+  // Both split outputs share the same shape and rank.
+  int32_t* shape[2];
+  int dim[2];
+  const char* name[2];
+  dim[0] = boxes_dims.size();
+  dim[1] = boxes_dims.size();
+  name[0] = boxes_var_name.c_str();
+  name[1] = var_var_name.c_str();
+  shape[0] = i_output_shape_data.data();
+  shape[1] = i_output_shape_data.data();
+  int split_size = 2;
+  // NOTE(review): the meaning of the trailing integer arguments is defined by
+  // the bmcompiler API and is not visible here; they are passed unchanged.
+  add_tf_split_layer(graph->GetCompilerHandle(),
+                     i_pri_out_shape_data.data(),
+                     boxes_dims.size(),
+                     unique_op_name.c_str(),
+                     2,
+                     shape,
+                     dim,
+                     name,
+                     boxes_dims.size(),
+                     0,
+                     &split_size,
+                     0);
+  graph->AddNode(boxes_var_name);
+  graph->AddNode(var_var_name);
+  return SUCCESS;
+}
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+// Register PriorBoxConverter with REGISTER_SUBGRAPH_BRIDGE for the
+// `prior_box` op type on the kBM target.
+REGISTER_SUBGRAPH_BRIDGE(prior_box,
+                         kBM,
+                         paddle::lite::subgraph::bm::PriorBoxConverter);
--- a/lite/kernels/bm/bridges/reshape_op.cc
+++ b/lite/kernels/bm/bridges/reshape_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/npu/bridges/registry.h"
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+// Emits a BM reshape layer for an op with input "X" and output "Out".
+//
+// The source and destination shapes are both read from the tensors found in
+// the op's scope, narrowed to int32, and passed to add_reshape_layer_v2().
+// The output variable is then added to the graph as a node.
+//
+// ctx    - must be a non-null Graph* (checked).
+// op     - the op being converted; must be non-null (checked).
+// kernel - not used by this converter.
+// Returns SUCCESS.
+int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto bm_graph = static_cast<Graph*>(ctx);
+  auto op_scope = op->scope();
+  auto info = op->op_info();
+  auto op_type = info->Type();
+
+  // Input tensor and its shape as 32-bit integers.
+  auto src_name = info->Input("X").front();
+  auto src_tensor = op_scope->FindVar(src_name)->GetMutable<lite::Tensor>();
+  auto src_dims = src_tensor->dims();
+  const size_t src_rank = src_dims.size();
+  std::vector<int32_t> src_shape;
+  src_shape.reserve(src_rank);
+  for (size_t d = 0; d < src_rank; ++d) {
+    src_shape.push_back(static_cast<int32_t>(src_dims[d]));
+  }
+
+  // Output tensor and its shape as 32-bit integers.
+  auto dst_name = info->Output("Out").front();
+  auto dst_tensor = op_scope->FindVar(dst_name)->GetMutable<lite::Tensor>();
+  auto dst_dims = dst_tensor->dims();
+  const size_t dst_rank = dst_dims.size();
+  std::vector<int32_t> dst_shape;
+  dst_shape.reserve(dst_rank);
+  for (size_t d = 0; d < dst_rank; ++d) {
+    dst_shape.push_back(static_cast<int32_t>(dst_dims[d]));
+  }
+
+  const char* src_cname = src_name.c_str();
+  const char* dst_cname = dst_name.c_str();
+  const int* src_shape_ptr = src_shape.data();
+  const int* dst_shape_ptr = dst_shape.data();
+  add_reshape_layer_v2(bm_graph->GetCompilerHandle(),
+                       src_cname,
+                       src_shape_ptr,
+                       src_dims.size(),
+                       dst_cname,
+                       dst_shape_ptr,
+                       dst_dims.size());
+  bm_graph->AddNode(dst_name);
+  return SUCCESS;
+}
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+// Register ReshapeConverter with REGISTER_SUBGRAPH_BRIDGE on the kBM target
+// for four op types: reshape, reshape2, flatten and flatten2. All four share
+// the same converter, which takes the target shape from the "Out" tensor.
+REGISTER_SUBGRAPH_BRIDGE(reshape,
+                         kBM,
+                         paddle::lite::subgraph::bm::ReshapeConverter);
+REGISTER_SUBGRAPH_BRIDGE(reshape2,
+                         kBM,
+                         paddle::lite::subgraph::bm::ReshapeConverter);
+REGISTER_SUBGRAPH_BRIDGE(flatten,
+                         kBM,
+                         paddle::lite::subgraph::bm::ReshapeConverter);
+REGISTER_SUBGRAPH_BRIDGE(flatten2,
+                         kBM,
+                         paddle::lite::subgraph::bm::ReshapeConverter);
--- a/lite/kernels/bm/bridges/softmax_op.cc
+++ b/lite/kernels/bm/bridges/softmax_op.cc
@@ -48,7 +48,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  for (size_t i = 0; i < length; i++) {
    i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
  }
-  auto axis = op_info->GetAttr<int>("axis");
+  int32_t axis = -1;
+  if (op_info->HasAttr("axis")) {
+    axis = op_info->GetAttr<int>("axis");
+  }
  if (axis < 0) {
    axis += x_dims.size();
  }

--- a/lite/kernels/bm/bridges/transpose_op.cc
+++ b/lite/kernels/bm/bridges/transpose_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_defs.h>
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/npu/bridges/registry.h"
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+// Emits a BM transpose layer for an op with input "X", output "Out" and an
+// integer-vector attribute "axis".
+//
+// The input shape is narrowed to int32 and passed, together with the "axis"
+// attribute, to add_transpose_layer_v2() with DTYPE_FP32. The number of
+// entries in "axis" must equal the rank of "X" (checked). The output variable
+// is then added to the graph as a node.
+//
+// ctx    - must be a non-null Graph* (checked).
+// op     - the op being converted; must be non-null (checked).
+// kernel - not used by this converter.
+// Returns SUCCESS.
+int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto bm_graph = static_cast<Graph*>(ctx);
+  auto op_scope = op->scope();
+  auto info = op->op_info();
+  auto op_type = info->Type();
+
+  auto src_name = info->Input("X").front();
+  auto src_tensor = op_scope->FindVar(src_name)->GetMutable<lite::Tensor>();
+  auto src_dims = src_tensor->dims();
+  auto dst_name = info->Output("Out").front();
+  auto dst_tensor = op_scope->FindVar(dst_name)->GetMutable<lite::Tensor>();
+  auto dst_dims = dst_tensor->dims();
+
+  // Narrow the 64-bit dims to the int32 shapes the compiler interface takes.
+  const size_t src_rank = src_dims.size();
+  std::vector<int32_t> src_shape(src_rank);
+  for (size_t d = 0; d < src_rank; ++d) {
+    src_shape[d] = static_cast<int>(src_dims[d]);
+  }
+  // NOTE(review): the output shape is converted but never passed on; a null
+  // pointer is given to add_transpose_layer_v2() in its place below.
+  const size_t dst_rank = dst_dims.size();
+  std::vector<int32_t> dst_shape(dst_rank);
+  for (size_t d = 0; d < dst_rank; ++d) {
+    dst_shape[d] = static_cast<int>(dst_dims[d]);
+  }
+
+  auto perm = info->GetAttr<std::vector<int>>("axis");
+  CHECK_EQ(perm.size(), src_dims.size());
+
+  const char* src_cname = src_name.c_str();
+  const char* dst_cname = dst_name.c_str();
+  const int* src_shape_ptr = src_shape.data();
+  const int* perm_ptr = perm.data();
+  add_transpose_layer_v2(bm_graph->GetCompilerHandle(),
+                         src_cname,
+                         src_shape_ptr,
+                         src_dims.size(),
+                         DTYPE_FP32,
+                         dst_cname,
+                         NULL,
+                         perm_ptr);
+  bm_graph->AddNode(dst_name);
+  return SUCCESS;
+}
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+// Register TransposeConverter with REGISTER_SUBGRAPH_BRIDGE on the kBM target
+// for both the transpose and transpose2 op types.
+REGISTER_SUBGRAPH_BRIDGE(transpose,
+                         kBM,
+                         paddle::lite::subgraph::bm::TransposeConverter);
+REGISTER_SUBGRAPH_BRIDGE(transpose2,
+                         kBM,
+                         paddle::lite::subgraph::bm::TransposeConverter);
--- a/lite/kernels/bm/subgraph_compute.cc
+++ b/lite/kernels/bm/subgraph_compute.cc
@@ -54,7 +54,7 @@ int SubgraphEngine::BuildDeviceProgram() {
  }
  std::string net_name = "paddle_bitmain";
  __bmcompile_opt(
-      graph.GetCompilerHandle(), const_cast<char*>(net_name.c_str()), 2);
+      graph.GetCompilerHandle(), const_cast<char*>(net_name.c_str()), 1);
  void* bmodel_data = nullptr;
  unsigned int data_size = 0;
  bm_hd_ = static_cast<bm_handle_t>(ctx.GetHandle());
@@ -109,7 +109,6 @@ int SubgraphEngine::BuildDeviceProgram() {
                            net_info_->output_dtypes[i],
                            stage.output_shapes[i]);
  }
  return status;
 }