Unverified commit e55542dc authored by Santa An, committed by GitHub

[LITE][BM] fix reshape infer shape issue, test=develop (#3384)

* [LITE][BM] fix reshape infer shape issue, test=develop

* [LITE][BM] with testing=on, test=develop
Parent b0b60f4f
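To illustrate the reshape infer-shape issue this commit targets, below is a minimal, self-contained sketch (assumption: it does not use the real Paddle-Lite classes; `FakeTensor` and `InferFinalShape` are hypothetical names) of why `ReshapeOp::InferShapeImpl` should only read the `ShapeTensor` input when its buffer is actually allocated, and otherwise fall back to the `shape` attribute — the guard added in `lite/operators/reshape_op.cc` further down in this diff.

```cpp
// Sketch only: models the null-data guard, not the Paddle-Lite API.
#include <cstdio>
#include <vector>

struct FakeTensor {
  std::vector<int> buf;  // empty => data() yields nullptr (tensor exists, unfilled)
  const int* data() const { return buf.empty() ? nullptr : buf.data(); }
  int numel() const { return static_cast<int>(buf.size()); }
};

// Mirrors the decision order: prefer the shape tensor, but only if its
// buffer is allocated; otherwise fall back to the attribute shape.
std::vector<int> InferFinalShape(const FakeTensor* shape_tensor,
                                 const std::vector<int>& shape_attr) {
  if (shape_tensor != nullptr && shape_tensor->data() != nullptr) {
    const int* d = shape_tensor->data();
    return std::vector<int>(d, d + shape_tensor->numel());
  }
  return shape_attr;  // safe fallback when the tensor has no data yet
}

int main() {
  FakeTensor unfilled;  // present in the scope but not yet written to
  std::vector<int> attr_shape = {1, 3, 224, 224};
  for (int v : InferFinalShape(&unfilled, attr_shape)) std::printf("%d ", v);
  std::printf("\n");  // prints the attribute shape instead of dereferencing null
  return 0;
}
```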
......@@ -190,7 +190,11 @@ if(WITH_TESTING)
lite_cc_test(test_classify_lite_bm SRCS test_classify_lite_bm.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${bm_kernels} ${bm_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
ARGS --model_dir=${LITE_MODEL_DIR}/classify)
lite_cc_test(test_yolov3_lite_bm SRCS test_yolov3_lite_bm.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${bm_kernels} ${bm_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/yolov3)
endif()
endif()
endif()
......
......@@ -63,6 +63,7 @@ USE_LITE_OP(swish)
USE_LITE_OP(log)
USE_LITE_OP(exp)
USE_LITE_OP(conv2d_transpose)
USE_LITE_OP(depthwise_conv2d_transpose)
USE_LITE_OP(negative)
USE_LITE_OP(pad2d)
USE_LITE_OP(power)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_img_txt_path,
"",
"if set input_img_txt_path, read the img filename as input.");
namespace paddle {
namespace lite {
void TestModel(const std::vector<Place>& valid_places) {
lite::Predictor predictor;
std::vector<std::string> passes;
predictor.Build(FLAGS_model_dir,
FLAGS_model_dir + "/model",
FLAGS_model_dir + "/params",
valid_places,
passes);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(
std::vector<DDim::value_type>({1, 3, FLAGS_im_height, FLAGS_im_width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
if (FLAGS_input_img_txt_path.empty()) {
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
} else {
std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
if (!fs.is_open()) {
LOG(FATAL) << "open input_img_txt error.";
}
for (int i = 0; i < item_size; i++) {
fs >> data[i];
}
}
auto* image_tensor = predictor.GetInput(1);
image_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 2})));
data = image_tensor->mutable_data<float>();
data[0] = FLAGS_im_height;
data[1] = FLAGS_im_width;
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto out = predictor.GetOutputs();
FILE* fp = fopen("result.txt", "wb");
for (int i = 0; i < out.size(); i++) {
auto* out_data = out[i]->data<float>();
for (int j = 0; j < out[i]->numel(); j++) {
fprintf(fp, "%f\n", out_data[j]);
}
}
fclose(fp);
}
TEST(Yolov3, test_bm) {
std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
TestModel(valid_places);
}
} // namespace lite
} // namespace paddle
......@@ -32,6 +32,9 @@ lite_cc_library(subgraph_bridge_squeeze_op_bm SRCS squeeze_op.cc DEPS ${bm_subgr
lite_cc_library(subgraph_bridge_cast_op_bm SRCS cast_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fill_constant_op_bm SRCS fill_constant_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_assign_value_op_bm SRCS assign_value_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_shape_op_bm SRCS shape_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_split_op_bm SRCS split_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_matmul_op_bm SRCS matmul_op.cc DEPS ${bm_subgraph_bridge_deps})
set(bm_subgraph_bridges
subgraph_bridge_registry
......@@ -62,4 +65,7 @@ set(bm_subgraph_bridges
subgraph_bridge_cast_op_bm
subgraph_bridge_fill_constant_op_bm
subgraph_bridge_assign_value_op_bm
subgraph_bridge_shape_op_bm
subgraph_bridge_split_op_bm
subgraph_bridge_matmul_op_bm
CACHE INTERNAL "bm_subgraph_bridges")
......@@ -40,17 +40,31 @@ int AssignValueConverter(void* ctx, OpLite* op, KernelBase* kernel) {
i_output_shape_data[i] = static_cast<int>(output_dims[i]);
buffer_size *= i_output_shape_data[i];
}
auto fp32_values = op_info->GetAttr<std::vector<float>>("fp32_values");
std::vector<float> fp32_values;
std::vector<int> int32_values;
float* assign_data =
reinterpret_cast<float*>(malloc(buffer_size * sizeof(float)));
CHECK(assign_data != nullptr);
CHECK_EQ(buffer_size, fp32_values.size());
bm_data_type_t data_type = static_cast<bm_data_type_t>(DTYPE_FP32);
fp32_values = op_info->GetAttr<std::vector<float>>("fp32_values");
if (0 != fp32_values.size()) {
for (int i = 0; i < fp32_values.size(); i++) {
assign_data[i] = fp32_values[i];
}
} else {
int32_values = op_info->GetAttr<std::vector<int>>("int32_values");
data_type = static_cast<bm_data_type_t>(DTYPE_INT32);
CHECK_EQ(buffer_size, int32_values.size());
for (int i = 0; i < int32_values.size(); i++) {
assign_data[i] = int32_values[i];
}
}
bm_add_const_tensor(graph->GetCompilerHandle(),
static_cast<const char*>(output_var_name.c_str()),
const_cast<const int*>(i_output_shape_data.data()),
output_dims.size(),
static_cast<bm_data_type_t>(DTYPE_FP32),
data_type,
reinterpret_cast<const void*>(assign_data));
graph->AddNode(output_var_name);
return SUCCESS;
......
......@@ -91,7 +91,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
dilations[1],
static_cast<int>(has_bias));
graph->AddNode(output_var_name);
LOG(INFO) << output_var_name << input_dims << " " << output_dims;
return SUCCESS;
}
......
......@@ -108,3 +108,6 @@ int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
REGISTER_SUBGRAPH_BRIDGE(conv2d_transpose,
kBM,
paddle::lite::subgraph::bm::ConvTransposeConverter);
REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d_transpose,
kBM,
paddle::lite::subgraph::bm::ConvTransposeConverter);
......@@ -65,7 +65,6 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto output_dims = output->dims();
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
LOG(INFO) << x_dims << " " << output_dims;
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
......
......@@ -54,6 +54,7 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
} else {
type = 0;
}
is_int = false;
if (type == 2 && is_int) {
add_upsample_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
// input
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
auto y_var_name = op_info->Input("Y").front();
auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
auto y_dims = y->dims();
const int64_t* y_shape_data = const_cast<const int64_t*>(&y_dims.data()[0]);
std::vector<int32_t> i_y_shape_data(y_dims.size());
for (size_t i = 0; i < y_dims.size(); i++) {
i_y_shape_data[i] = static_cast<int>(y_shape_data[i]);
}
// output
auto output_var_name = op_info->Output("Out").front();
bool transpose_x = op_info->GetAttr<bool>("transpose_X");
bool transpose_y = op_info->GetAttr<bool>("transpose_Y");
float alpha = op_info->GetAttr<float>("alpha");
LOG(INFO) << x_dims << " " << y_dims << " " << alpha << " " << transpose_x
<< " " << transpose_y;
#if 0
add_const_binary_layer(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
scale,
static_cast<const char*>(unique_op_scale_name.c_str()),
BINARY_MUL,
0);
add_const_binary_layer(graph->GetCompilerHandle(),
static_cast<const char*>(unique_op_scale_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
bias,
static_cast<const char*>(output_var_name.c_str()),
BINARY_ADD,
0);
#endif
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(matmul,
kBM,
paddle::lite::subgraph::bm::MatMulConverter);
......@@ -45,14 +45,6 @@ int MultiClassNMSConverter(void* ctx, OpLite* op, KernelBase* kernel) {
i_score_shape_data[i] = static_cast<int32_t>(score_dims[i]);
}
auto out_var_name = op_info->Output("Out").front();
auto out = scope->FindVar(out_var_name)->GetMutable<lite::Tensor>();
auto out_dims = out->dims();
std::vector<int32_t> i_out_shape_data(out_dims.size());
for (size_t i = 0; i < out_dims.size(); i++) {
i_out_shape_data[i] = static_cast<int32_t>(out_dims[i]);
}
auto background_label = op_info->GetAttr<int>("background_label");
auto keep_top_k = op_info->GetAttr<int>("keep_top_k");
auto nms_top_k = op_info->GetAttr<int>("nms_top_k");
......@@ -64,6 +56,26 @@ int MultiClassNMSConverter(void* ctx, OpLite* op, KernelBase* kernel) {
normalized = op_info->GetAttr<bool>("normalized");
}
auto out_var_name = op_info->Output("Out").front();
auto out = scope->FindVar(out_var_name)->GetMutable<lite::Tensor>();
std::vector<int64_t> vec_out_dim(score_dims.size());
if (3 == score_dims.size()) {
vec_out_dim[0] = score_dims[0]; // batch_size
vec_out_dim[1] = keep_top_k;
vec_out_dim[2] = 6;
} else {
vec_out_dim[0] = keep_top_k;
vec_out_dim[1] = 6;
}
DDimLite out_dims(vec_out_dim);
out->Resize(out_dims);
out->mutable_data<float>();
std::vector<int32_t> i_out_shape_data(out_dims.size());
for (size_t i = 0; i < out_dims.size(); i++) {
i_out_shape_data[i] = static_cast<int32_t>(out_dims[i]);
}
user_cpu_param_t bm_param;
bm_param.op_type = USER_PADDLE_MULTICLASS_NMS;
bm_param.u.multiclass_nms_param.background_label = background_label;
......@@ -88,12 +100,9 @@ int MultiClassNMSConverter(void* ctx, OpLite* op, KernelBase* kernel) {
int32_t* out_shape[1];
int32_t out_dim[1];
const char* out_name[1];
i_out_shape_data[0] = keep_top_k;
i_out_shape_data[1] = 6;
out_shape[0] = &i_out_shape_data[0];
out_dim[0] = 2;
out_dim[0] = out_dims.size();
out_name[0] = static_cast<const char*>(out_var_name.c_str());
add_user_cpu_layer(graph->GetCompilerHandle(),
input_num,
in_shape,
......
......@@ -48,8 +48,13 @@ USE_SUBGRAPH_BRIDGE(slice, kBM);
USE_SUBGRAPH_BRIDGE(conv2d_transpose, kBM);
USE_SUBGRAPH_BRIDGE(reduce_sum, kBM);
USE_SUBGRAPH_BRIDGE(reduce_mean, kBM);
USE_SUBGRAPH_BRIDGE(reduce_max, kBM);
USE_SUBGRAPH_BRIDGE(squeeze, kBM);
USE_SUBGRAPH_BRIDGE(squeeze2, kBM);
USE_SUBGRAPH_BRIDGE(cast, kBM);
USE_SUBGRAPH_BRIDGE(fill_constant, kBM);
USE_SUBGRAPH_BRIDGE(assign_value, kBM);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d_transpose, kBM);
USE_SUBGRAPH_BRIDGE(shape, kBM);
USE_SUBGRAPH_BRIDGE(split, kBM);
USE_SUBGRAPH_BRIDGE(matmul, kBM);
......@@ -49,6 +49,8 @@ int ReduceFullConverter(void* ctx, OpLite* op, KernelBase* kernel) {
op_code = REDUCE_SUM;
} else if (op_type == "reduce_mean") {
op_code = REDUCE_MEAN;
} else if (op_type == "reduce_max") {
op_code = REDUCE_MAX;
}
add_reduce_full_layer(graph->GetCompilerHandle(),
......@@ -75,3 +77,6 @@ REGISTER_SUBGRAPH_BRIDGE(reduce_sum,
REGISTER_SUBGRAPH_BRIDGE(reduce_mean,
kBM,
paddle::lite::subgraph::bm::ReduceFullConverter);
REGISTER_SUBGRAPH_BRIDGE(reduce_max,
kBM,
paddle::lite::subgraph::bm::ReduceFullConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int ShapeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// input
auto x_var_name = op_info->Input("Input").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
// output
auto output_var_name = op_info->Output("Out").front();
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int32_t>(x_dims[i]);
}
add_shape_ref_layer(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(i_x_shape_data.data()),
x_dims.size(),
static_cast<const char*>(output_var_name.c_str()));
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(shape,
kBM,
paddle::lite::subgraph::bm::ShapeConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// input
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
// output
auto output_names = op_info->Output("Out");
auto axis = op_info->GetAttr<int>("axis");
auto num = op_info->GetAttr<int>("num");
auto sections = op_info->GetAttr<std::vector<int>>("sections");
if (0 == num) {
num = sections.size();
}
if (0 == sections.size()) {
for (size_t i = 0; i < num; i++) {
sections.push_back(x_dims[axis] / num);
}
}
int** shape = new int*[num];
int* dim = new int[num];
const char** name = new const char*[num];
for (size_t i = 0; i < num; i++) {
auto out = scope->FindVar(output_names[i])->GetMutable<lite::Tensor>();
name[i] = static_cast<const char*>(output_names[i].c_str());
auto out_dims = out->dims();
shape[i] = new int[out_dims.size()];
for (size_t j = 0; j < out_dims.size(); j++) {
shape[i][j] = out_dims[j];
}
dim[i] = out_dims.size();
}
add_tf_split_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
static_cast<const char*>(x_var_name.c_str()),
num,
shape,
dim,
name,
x_dims.size(),
axis,
const_cast<const int*>(&sections[0]),
num);
for (size_t i = 0; i < num; i++) {
graph->AddNode(output_names[i]);
delete[] shape[i];
}
delete[] shape;
delete[] name;
delete[] dim;
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(split,
kBM,
paddle::lite::subgraph::bm::SplitConverter);
......@@ -15,6 +15,7 @@
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
......@@ -39,11 +40,20 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
std::vector<int32_t> i_output_shape_data(output_dims.size());
std::vector<int32_t> i_output_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
auto out_name = output_var_name;
if (x_dims.size() > output_dims.size()) {
for (size_t i = 0; i < (x_dims.size() - output_dims.size()); i++) {
i_output_shape_data[i] = 1;
}
out_name = lite::subgraph::bm::UniqueName(op_type);
}
for (size_t i = (x_dims.size() - output_dims.size()); i < output_dims.size();
i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
auto axis = op_info->GetAttr<std::vector<int>>("axis");
......@@ -53,9 +63,22 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
DTYPE_FP32,
static_cast<const char*>(output_var_name.c_str()),
static_cast<const char*>(out_name.c_str()),
NULL,
const_cast<const int*>(&axis[0]));
if (x_dims.size() > output_dims.size()) {
std::vector<int32_t> i_real_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_real_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
add_reshape_layer_v2(graph->GetCompilerHandle(),
static_cast<const char*>(out_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
i_output_shape_data.size(),
static_cast<const char*>(output_var_name.c_str()),
const_cast<const int*>(&i_real_output_shape_data[0]),
output_dims.size());
}
graph->AddNode(output_var_name);
return SUCCESS;
}
......
......@@ -88,18 +88,27 @@ int SubgraphEngine::BuildDeviceProgram() {
// output
origin_odims_.resize(output_names_.size());
origin_otensors_.resize(output_names_.size());
device_outputs_.resize(output_names_.size());
for (size_t i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(net_info_->output_names[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
origin_otensors_[i]->mutable_data<float>();
device_outputs_.resize(net_info_->output_num);
int out_index = 0;
for (int i = 0; i < output_names_.size(); i++) {
outname_map_.insert(std::pair<std::string, int>(output_names_[i], i));
}
for (int i = 0; i < net_info_->output_num; i++) {
Tensor* t_cur = scope_->FindMutableTensor(net_info_->output_names[i]);
CHECK(t_cur != nullptr);
bm_device_mem_t* p_mem =
static_cast<bm_device_mem_t*>(malloc(sizeof(bm_device_mem_t)));
CHECK(p_mem != nullptr);
CHECK_EQ(bm_malloc_device_byte(
bm_hd_, p_mem, origin_otensors_[i]->memory_size()),
BM_SUCCESS);
if (outname_map_.find(net_info_->output_names[i]) != outname_map_.end()) {
origin_otensors_[out_index] = t_cur;
origin_odims_[out_index] = origin_otensors_[out_index]->dims();
origin_otensors_[out_index]->mutable_data<float>();
out_index += 1;
}
CHECK_EQ(
bm_malloc_device_byte(bm_hd_, p_mem, net_info_->max_output_bytes[i]),
BM_SUCCESS);
bmrt_tensor_with_device(&device_outputs_[i],
*p_mem,
net_info_->output_dtypes[i],
......@@ -123,10 +132,14 @@ int SubgraphEngine::LaunchDeviceProgram() {
true,
false);
bm_thread_sync(bm_hd_);
int out_index = 0;
for (size_t i = 0; i < device_outputs_.size(); i++) {
bm_memcpy_d2s(bm_hd_,
const_cast<void*>(origin_otensors_[i]->raw_data()),
device_outputs_[i].device_mem);
if (outname_map_.find(net_info_->output_names[i]) != outname_map_.end()) {
bm_memcpy_d2s(bm_hd_,
const_cast<void*>(origin_otensors_[out_index]->raw_data()),
device_outputs_[i].device_mem);
out_index++;
}
}
return 0;
}
......
......@@ -51,6 +51,7 @@ class SubgraphEngine : public subgraph::Engine {
void *bmrt_hd_;
std::vector<bm_tensor_t> device_inputs_;
std::vector<bm_tensor_t> device_outputs_;
std::map<std::string, int> outname_map_;
const char **net_names_;
const bm_net_info_t *net_info_;
bm_handle_t bm_hd_;
......
......@@ -157,3 +157,5 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc& op_desc,
REGISTER_LITE_OP(conv2d_transpose,
paddle::lite::operators::ConvTransposeOpLite);
REGISTER_LITE_OP(depthwise_conv2d_transpose,
paddle::lite::operators::ConvTransposeOpLite);
......@@ -37,7 +37,7 @@ bool ReshapeOp::InferShapeImpl() const {
for (size_t i = 0; i < shape_tensor_vct.size(); i++) {
final_shape[i] = shape_tensor_vct[i]->data<int>()[0];
}
} else if (shape_tensor != nullptr) {
} else if (shape_tensor != nullptr && shape_tensor->data<int>() != nullptr) {
auto *shape_tensor_data = shape_tensor->data<int>();
final_shape = std::vector<int>(shape_tensor_data,
shape_tensor_data + shape_tensor->numel());
......
......@@ -5,7 +5,7 @@ set -ex
BM_SDK_ROOT="$(pwd)/third-party/bmlibs/bm_sc3_libs" # BM SDK
TARGET_NAME="BM1682" # default target
BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
WITH_TESTING=OFF # ON/OFF
WITH_TESTING=ON # ON/OFF
function print_usage {
echo -e "\nUSAGE:"
......