[LITE][BM] adaptive pool,test=develop (#3425)

* [LITE][BM] fix reshape infer shape issue, optimize global pool,adaptive pool,multi thread

[LITE][BM] adaptive pool,test=develop (#3425)
* [LITE][BM] fix reshape infer shape issue, optimize global pool,adaptive pool,multi thread
499b389e · Santa An · GitHub · 43777438 · 499b389e · 499b389e
12 changed file
--- a/lite/api/_paddle_use_ops.h
+++ b/lite/api/_paddle_use_ops.h
@@ -48,6 +48,7 @@ USE_LITE_OP(concat)
 USE_LITE_OP(conv2d)
 USE_LITE_OP(depthwise_conv2d)
 USE_LITE_OP(pool2d)
+USE_LITE_OP(max_pool2d_with_index)
 USE_LITE_OP(batch_norm)
 USE_LITE_OP(fusion_elementwise_sub_activation)
 USE_LITE_OP(transpose)

--- a/lite/api/test_classify_lite_bm.cc
+++ b/lite/api/test_classify_lite_bm.cc
@@ -15,6 +15,7 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include <fstream>
+#include <thread>  //NOLINT
 #include <vector>
 #include "lite/api/cxx_api.h"
 #include "lite/api/paddle_use_kernels.h"
@@ -30,14 +31,18 @@ DEFINE_string(input_img_txt_path,
 namespace paddle {
 namespace lite {

-void TestModel(const std::vector<Place>& valid_places) {
+const int g_batch_size = 1;
+const int g_thread_num = 1;
+
+void instance_run() {
  lite::Predictor predictor;
  std::vector<std::string> passes;
+  std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
+                                   Place{TARGET(kX86), PRECISION(kFloat)}});
  predictor.Build(FLAGS_model_dir, "", "", valid_places, passes);
-
  auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(
-      std::vector<DDim::value_type>({1, 3, FLAGS_im_height, FLAGS_im_width})));
+  input_tensor->Resize(DDim(std::vector<DDim::value_type>(
+      {g_batch_size, 3, FLAGS_im_height, FLAGS_im_width})));
  auto* data = input_tensor->mutable_data<float>();
  auto item_size = input_tensor->dims().production();
  if (FLAGS_input_img_txt_path.empty()) {
@@ -45,12 +50,15 @@ void TestModel(const std::vector<Place>& valid_places) {
      data[i] = 1;
    }
  } else {
-    std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
-    if (!fs.is_open()) {
-      LOG(FATAL) << "open input_img_txt error.";
-    }
-    for (int i = 0; i < item_size; i++) {
-      fs >> data[i];
+    for (int j = 0; j < g_batch_size; j++) {
+      std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
+      if (!fs.is_open()) {
+        LOG(FATAL) << "open input_img_txt error.";
+      }
+      for (int i = 0; i < item_size / g_batch_size; i++) {
+        fs >> data[i];
+      }
+      data += j * item_size / g_batch_size;
    }
  }
  for (int i = 0; i < FLAGS_warmup; ++i) {
@@ -72,6 +80,7 @@ void TestModel(const std::vector<Place>& valid_places) {
  FILE* fp = fopen("result.txt", "wb");
  for (int i = 0; i < out.size(); i++) {
    auto* out_data = out[i]->data<float>();
+    LOG(INFO) << out[i]->numel();
    for (int j = 0; j < out[i]->numel(); j++) {
      fprintf(fp, "%f\n", out_data[j]);
    }
@@ -79,6 +88,16 @@ void TestModel(const std::vector<Place>& valid_places) {
  fclose(fp);
 }

+void TestModel(const std::vector<Place>& valid_places) {
+  std::vector<std::unique_ptr<std::thread>> instances_vec;
+  for (int i = 0; i < g_thread_num; ++i) {
+    instances_vec.emplace_back(new std::thread(&instance_run));
+  }
+  for (int i = 0; i < g_thread_num; ++i) {
+    instances_vec[i]->join();
+  }
+}
+
 TEST(Classify, test_bm) {
  std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
                                   Place{TARGET(kX86), PRECISION(kFloat)}});

--- a/lite/kernels/bm/bridges/CMakeLists.txt
+++ b/lite/kernels/bm/bridges/CMakeLists.txt
@@ -36,6 +36,7 @@ lite_cc_library(subgraph_bridge_shape_op_bm SRCS shape_op.cc DEPS ${bm_subgraph_
 lite_cc_library(subgraph_bridge_split_op_bm SRCS split_op.cc DEPS ${bm_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_matmul_op_bm SRCS matmul_op.cc DEPS ${bm_subgraph_bridge_deps})

+
 set(bm_subgraph_bridges
        subgraph_bridge_registry
        subgraph_bridge_engine

--- a/lite/kernels/bm/bridges/act_op.cc
+++ b/lite/kernels/bm/bridges/act_op.cc
@@ -54,6 +54,8 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    active_type_id = ACTIVE_SQRT;
  } else if (op_type == "square") {
    active_type_id = ACTIVE_SQUARE;
+  } else if (op_type == "sigmoid") {
+    active_type_id = ACTIVE_SIGMOID;
  } else {
    LOG(FATAL) << "[BM] unsupport act type";
    return FAILED;
@@ -102,3 +104,6 @@ REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
                         paddle::lite::subgraph::bm::ActConverter);
 REGISTER_SUBGRAPH_BRIDGE(sqrt, kBM, paddle::lite::subgraph::bm::ActConverter);
 REGISTER_SUBGRAPH_BRIDGE(square, kBM, paddle::lite::subgraph::bm::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(sigmoid,
+                         kBM,
+                         paddle::lite::subgraph::bm::ActConverter);
--- a/lite/kernels/bm/bridges/graph.cc
+++ b/lite/kernels/bm/bridges/graph.cc
@@ -20,11 +20,14 @@ namespace lite {
 namespace subgraph {
 namespace bm {

+pthread_mutex_t Graph::mutex_compiler_ = PTHREAD_MUTEX_INITIALIZER;
+
 void Graph::AddNode(const std::string& name) {
  nodes_.insert(std::make_pair(name, name));
 }

 void Graph::CreateCompilerHandle() {
+  pthread_mutex_lock(&mutex_compiler_);
 #ifdef BM1682
  compiler_handle_ = create_bmcompiler("BM1682");
 #else
@@ -33,6 +36,8 @@ void Graph::CreateCompilerHandle() {
  CHECK(compiler_handle_ != nullptr);
 }

+void Graph::UnlockCompilerMutex() { pthread_mutex_unlock(&mutex_compiler_); }
+
 }  // namespace bm
 }  // namespace subgraph
 }  // namespace lite

--- a/lite/kernels/bm/bridges/graph.h
+++ b/lite/kernels/bm/bridges/graph.h
@@ -14,6 +14,7 @@

 #pragma once

+#include <pthread.h>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -36,10 +37,12 @@ class Graph {
  }
  void CreateCompilerHandle();
  void* GetCompilerHandle() { return compiler_handle_; }
+  void UnlockCompilerMutex();

 private:
  std::unordered_map<std::string, std::string> nodes_;
  void* compiler_handle_;
+  static pthread_mutex_t mutex_compiler_;
 };

 }  // namespace bm

--- a/lite/kernels/bm/bridges/paddle_use_bridges.h
+++ b/lite/kernels/bm/bridges/paddle_use_bridges.h
@@ -58,3 +58,5 @@ USE_SUBGRAPH_BRIDGE(depthwise_conv2d_transpose, kBM);
 USE_SUBGRAPH_BRIDGE(shape, kBM);
 USE_SUBGRAPH_BRIDGE(split, kBM);
 USE_SUBGRAPH_BRIDGE(matmul, kBM);
+USE_SUBGRAPH_BRIDGE(max_pool2d_with_index, kBM);
+USE_SUBGRAPH_BRIDGE(sigmoid, kBM);
--- a/lite/kernels/bm/bridges/pool_op.cc
+++ b/lite/kernels/bm/bridges/pool_op.cc
@@ -11,7 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include <bmcompiler_defs.h>
 #include <bmcompiler_if.h>
+#include <bmcompiler_if_lite.h>
+#include <user_bmcpu_common.h>
 #include "lite/kernels/bm/bridges/graph.h"
 #include "lite/kernels/bm/bridges/utility.h"
 #include "lite/kernels/npu/bridges/registry.h"
@@ -54,46 +57,84 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  shape[0] = &i_output_shape_data[0];
  name[0] = static_cast<const char*>(output_var_name.c_str());
  dim[0] = output_dims.size();
-  auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
+  std::string pooling_type;
+  if (op_info->HasAttr("pooling_type")) {
+    pooling_type = op_info->GetAttr<std::string>("pooling_type");
+  } else if (op_type == "max_pool2d_with_index") {
+    pooling_type = "max";
+  }
  CHECK(pooling_type == "max" || pooling_type == "avg");
  auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
  auto strides = op_info->GetAttr<std::vector<int>>("strides");
  auto global_pooling = op_info->GetAttr<bool>("global_pooling");
-  auto ceil_mode = op_info->GetAttr<bool>("ceil_mode");
+  bool ceil_mode = false;
+  if (op_info->HasAttr("ceil_mode")) {
+    ceil_mode = op_info->GetAttr<bool>("ceil_mode");
+  }
+  bool adaptive = false;
+  if (op_info->HasAttr("adaptive")) {
+    adaptive = op_info->GetAttr<bool>("adaptive");
+  }
  bool average_exclusive = false;
  if (pooling_type == "avg") {
    average_exclusive = op_info->GetAttr<bool>("exclusive");
  }
+  if (output_dims[2] == 1 && output_dims[3] == 1) {
+    global_pooling = true;
+  }
  if (global_pooling) {
    paddings[0] = 0;
    paddings[1] = 0;
    ksize[0] = i_x_shape_data[2];
    ksize[1] = i_x_shape_data[3];
  }
-  add_pooling_layer(
-      graph->GetCompilerHandle(),
-      const_cast<const int*>(&i_x_shape_data[0]),
-      x_dims.size(),
-      static_cast<const char*>(x_var_name.c_str()),
-      1,
-      shape,
-      dim,
-      name,
-      ksize[0],
-      ksize[1],
-      paddings[0],
-      paddings[0],
-      paddings[1],
-      paddings[1],
-      strides[0],
-      strides[1],
-      (ksize[0] > 1 && ksize[1] > 1) && pooling_type == "max" ? 0 : 1,
-      static_cast<int>(average_exclusive),
-      static_cast<int>(global_pooling),
-      static_cast<int>(ceil_mode),
-      static_cast<const char*>(unique_op_name.c_str()),
-      nullptr);
+  bool is_max = (pooling_type == "max");
+  if (adaptive && !global_pooling) {
+    user_cpu_param_t bm_param;
+    bm_param.op_type = USER_PADDLE_ADAPTIVE_POOL;
+    bm_param.u.adaptive_pool_parm.is_avg = !is_max;
+    int32_t* in_shape[1];
+    int32_t in_dim[1];
+    const char* in_name[1];
+    in_shape[0] = &i_x_shape_data[0];
+    in_name[0] = static_cast<const char*>(x_var_name.c_str());
+    in_dim[0] = x_dims.size();
+    add_user_cpu_layer(graph->GetCompilerHandle(),
+                       1,
+                       in_shape,
+                       in_dim,
+                       in_name,
+                       1,
+                       shape,
+                       dim,
+                       name,
+                       &bm_param,
+                       static_cast<int>(sizeof(bm_param)));
+  } else {
+    add_pooling_layer(graph->GetCompilerHandle(),
+                      const_cast<const int*>(&i_x_shape_data[0]),
+                      x_dims.size(),
+                      static_cast<const char*>(x_var_name.c_str()),
+                      1,
+                      shape,
+                      dim,
+                      name,
+                      ksize[0],
+                      ksize[1],
+                      paddings[0],
+                      paddings[0],
+                      paddings[1],
+                      paddings[1],
+                      strides[0],
+                      strides[1],
+                      is_max ? 0 : 1,
+                      static_cast<int>(average_exclusive),
+                      static_cast<int>(global_pooling),
+                      static_cast<int>(ceil_mode),
+                      static_cast<const char*>(unique_op_name.c_str()),
+                      nullptr);
+  }
  graph->AddNode(output_var_name);
  return SUCCESS;
 }
@@ -105,3 +146,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 REGISTER_SUBGRAPH_BRIDGE(pool2d,
                         kBM,
                         paddle::lite::subgraph::bm::PoolConverter);
+REGISTER_SUBGRAPH_BRIDGE(max_pool2d_with_index,
+                         kBM,
+                         paddle::lite::subgraph::bm::PoolConverter);
--- a/lite/kernels/bm/subgraph_compute.cc
+++ b/lite/kernels/bm/subgraph_compute.cc
@@ -40,6 +40,7 @@ int SubgraphEngine::BuildDeviceProgram() {
    op->CheckShape();
    op->InferShape();
    std::string op_type = op->op_info()->Type();
+    LOG(INFO) << op_type;
    if (!bridges.Exists(op_type, TARGET(kBM))) {
      return subgraph::FAILED;
    }
@@ -59,6 +60,7 @@ int SubgraphEngine::BuildDeviceProgram() {
  unsigned int data_size = 0;
  bm_hd_ = static_cast<bm_handle_t>(ctx.GetHandle());
  finish_bmcompiler_data(graph.GetCompilerHandle(), &bmodel_data, &data_size);
+  graph.UnlockCompilerMutex();
  bmrt_hd_ = bmrt_create(bm_hd_);
  if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) {
    return subgraph::FAILED;

--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
@@ -108,6 +108,7 @@ add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.c
 add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS ${op_DEPS})
 add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS})
 add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS})
+add_operator(max_pool_with_index_op extra SRCS max_pool_with_index_op.cc DEPS ${op_DEPS})

 # for OCR specific
 add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})

--- a/lite/operators/max_pool_with_index_op.cc
+++ b/lite/operators/max_pool_with_index_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/max_pool_with_index_op.h"
+#include <algorithm>
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool MaxPoolWithIndexOpLite::CheckShape() const {
+  CHECK_OR_FALSE(param_.x);
+  CHECK_OR_FALSE(param_.output);
+
+  const auto& x_dims = param_.x->dims();
+  const auto& strides = param_.strides;
+  const auto& ksize = param_.ksize;
+  const auto& paddings = *param_.paddings;
+  // "Pooling intput should be 4-D or 5-D tensor."
+  CHECK_OR_FALSE(x_dims.size() == 4 || x_dims.size() == 5);
+  // Input size and pooling size should be consistent.
+  CHECK_OR_FALSE(x_dims.size() - ksize.size() == 2U);
+  // Strides size and pooling size should be the same.
+  CHECK_OR_FALSE(ksize.size() == strides.size());
+  // Paddings size must be 4.
+  CHECK_OR_FALSE(paddings.size() == 4L);
+
+  return true;
+}
+
+inline int MaxPoolOutputSize(int input_size,
+                             int filter_size,
+                             int padding,
+                             int stride) {
+  int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+  return output_size;
+}
+
+bool MaxPoolWithIndexOpLite::InferShapeImpl() const {
+  const auto x_dims = param_.x->dims();
+  const auto ksize = param_.ksize;
+  std::vector<int64_t> output_shape({x_dims[0], x_dims[1]});
+  const auto& strides = param_.strides;
+  const auto& paddings = *param_.paddings;
+  const auto adaptive = param_.adaptive;
+
+  if (adaptive) {
+    output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
+  } else {
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(
+          MaxPoolOutputSize(x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+    }
+  }
+  param_.output->Resize(lite::DDim(output_shape));
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(max_pool2d_with_index,
+                 paddle::lite::operators::MaxPoolWithIndexOpLite);
--- a/lite/operators/max_pool_with_index_op.h
+++ b/lite/operators/max_pool_with_index_op.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/core/tensor.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class MaxPoolWithIndexOpLite : public OpLite {
+ public:
+  MaxPoolWithIndexOpLite() {}
+
+  explicit MaxPoolWithIndexOpLite(const std::string &type) : OpLite(type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  // TODO(Superjomn) replace framework::OpDesc with a lite one.
+  bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
+    auto x = op_desc.Input("X").front();
+    auto out = op_desc.Output("Out").front();
+    auto mask = op_desc.Output("Mask").front();
+
+    CHECK(scope->FindVar(x));
+    CHECK(scope->FindVar(out));
+    CHECK(scope->FindVar(mask));
+    param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
+    param_.output = scope->FindVar(out)->GetMutable<lite::Tensor>();
+
+    param_.ksize = op_desc.GetAttr<std::vector<int>>("ksize");
+    param_.global_pooling = op_desc.GetAttr<bool>("global_pooling");
+    param_.strides = op_desc.GetAttr<std::vector<int>>("strides");
+    auto paddings = op_desc.GetAttr<std::vector<int>>("paddings");
+    if (op_desc.HasAttr("adaptive")) {
+      param_.adaptive = op_desc.GetAttr<bool>("adaptive");
+    }
+    // 2-pad to 4-pad
+    if (paddings.size() == 2L) {
+      for (size_t i = 0; i < 2L; ++i) {
+        int copy_pad = *(paddings.begin() + 2 * i);
+        paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
+      }
+    } else {
+      if (paddings.size() != 4L) {
+        LOG(FATAL)
+            << "Paddings size should be the same or twice as the inputs size.";
+      }
+    }
+    param_.paddings = std::make_shared<std::vector<int>>(paddings);
+    return true;
+  }
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "max_pool2d_with_index"; }
+
+ private:
+  mutable PoolParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle