From 499b389e484cc0238b7648584948a7cc9f30c2d5 Mon Sep 17 00:00:00 2001 From: Santa An <49897975+AnBaolei1984@users.noreply.github.com> Date: Sun, 26 Apr 2020 10:46:20 +0800 Subject: [PATCH] [LITE][BM] adaptive pool,test=develop (#3425) * [LITE][BM] fix reshape infer shape issue, optimize global pool,adaptive pool,multi thread --- lite/api/_paddle_use_ops.h | 1 + lite/api/test_classify_lite_bm.cc | 39 +++++--- lite/kernels/bm/bridges/CMakeLists.txt | 1 + lite/kernels/bm/bridges/act_op.cc | 5 ++ lite/kernels/bm/bridges/graph.cc | 5 ++ lite/kernels/bm/bridges/graph.h | 3 + lite/kernels/bm/bridges/paddle_use_bridges.h | 2 + lite/kernels/bm/bridges/pool_op.cc | 94 ++++++++++++++------ lite/kernels/bm/subgraph_compute.cc | 2 + lite/operators/CMakeLists.txt | 1 + lite/operators/max_pool_with_index_op.cc | 76 ++++++++++++++++ lite/operators/max_pool_with_index_op.h | 87 ++++++++++++++++++ 12 files changed, 281 insertions(+), 35 deletions(-) create mode 100644 lite/operators/max_pool_with_index_op.cc create mode 100644 lite/operators/max_pool_with_index_op.h diff --git a/lite/api/_paddle_use_ops.h b/lite/api/_paddle_use_ops.h index 778b4dc7a8..63d5938cf5 100644 --- a/lite/api/_paddle_use_ops.h +++ b/lite/api/_paddle_use_ops.h @@ -48,6 +48,7 @@ USE_LITE_OP(concat) USE_LITE_OP(conv2d) USE_LITE_OP(depthwise_conv2d) USE_LITE_OP(pool2d) +USE_LITE_OP(max_pool2d_with_index) USE_LITE_OP(batch_norm) USE_LITE_OP(fusion_elementwise_sub_activation) USE_LITE_OP(transpose) diff --git a/lite/api/test_classify_lite_bm.cc b/lite/api/test_classify_lite_bm.cc index b2507e28ad..e7ebc80ade 100644 --- a/lite/api/test_classify_lite_bm.cc +++ b/lite/api/test_classify_lite_bm.cc @@ -15,6 +15,7 @@ #include #include #include +#include //NOLINT #include #include "lite/api/cxx_api.h" #include "lite/api/paddle_use_kernels.h" @@ -30,14 +31,18 @@ DEFINE_string(input_img_txt_path, namespace paddle { namespace lite { -void TestModel(const std::vector& valid_places) { +const int g_batch_size = 1; +const int 
g_thread_num = 1; + +void instance_run() { lite::Predictor predictor; std::vector passes; + std::vector valid_places({Place{TARGET(kBM), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); predictor.Build(FLAGS_model_dir, "", "", valid_places, passes); - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim( - std::vector({1, 3, FLAGS_im_height, FLAGS_im_width}))); + input_tensor->Resize(DDim(std::vector( + {g_batch_size, 3, FLAGS_im_height, FLAGS_im_width}))); auto* data = input_tensor->mutable_data(); auto item_size = input_tensor->dims().production(); if (FLAGS_input_img_txt_path.empty()) { @@ -45,12 +50,15 @@ void TestModel(const std::vector& valid_places) { data[i] = 1; } } else { - std::fstream fs(FLAGS_input_img_txt_path, std::ios::in); - if (!fs.is_open()) { - LOG(FATAL) << "open input_img_txt error."; - } - for (int i = 0; i < item_size; i++) { - fs >> data[i]; + for (int j = 0; j < g_batch_size; j++) { + std::fstream fs(FLAGS_input_img_txt_path, std::ios::in); + if (!fs.is_open()) { + LOG(FATAL) << "open input_img_txt error."; + } + for (int i = 0; i < item_size / g_batch_size; i++) { + fs >> data[i]; + } + data += j * item_size / g_batch_size; } } for (int i = 0; i < FLAGS_warmup; ++i) { @@ -72,6 +80,7 @@ void TestModel(const std::vector& valid_places) { FILE* fp = fopen("result.txt", "wb"); for (int i = 0; i < out.size(); i++) { auto* out_data = out[i]->data(); + LOG(INFO) << out[i]->numel(); for (int j = 0; j < out[i]->numel(); j++) { fprintf(fp, "%f\n", out_data[j]); } @@ -79,6 +88,16 @@ void TestModel(const std::vector& valid_places) { fclose(fp); } +void TestModel(const std::vector& valid_places) { + std::vector> instances_vec; + for (int i = 0; i < g_thread_num; ++i) { + instances_vec.emplace_back(new std::thread(&instance_run)); + } + for (int i = 0; i < g_thread_num; ++i) { + instances_vec[i]->join(); + } +} + TEST(Classify, test_bm) { std::vector valid_places({Place{TARGET(kBM), PRECISION(kFloat)}, 
Place{TARGET(kX86), PRECISION(kFloat)}}); diff --git a/lite/kernels/bm/bridges/CMakeLists.txt b/lite/kernels/bm/bridges/CMakeLists.txt index 1985e76cde..57a89696c4 100644 --- a/lite/kernels/bm/bridges/CMakeLists.txt +++ b/lite/kernels/bm/bridges/CMakeLists.txt @@ -36,6 +36,7 @@ lite_cc_library(subgraph_bridge_shape_op_bm SRCS shape_op.cc DEPS ${bm_subgraph_ lite_cc_library(subgraph_bridge_split_op_bm SRCS split_op.cc DEPS ${bm_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_matmul_op_bm SRCS matmul_op.cc DEPS ${bm_subgraph_bridge_deps}) + set(bm_subgraph_bridges subgraph_bridge_registry subgraph_bridge_engine diff --git a/lite/kernels/bm/bridges/act_op.cc b/lite/kernels/bm/bridges/act_op.cc index 1739dd4185..c85e2c5e1e 100644 --- a/lite/kernels/bm/bridges/act_op.cc +++ b/lite/kernels/bm/bridges/act_op.cc @@ -54,6 +54,8 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { active_type_id = ACTIVE_SQRT; } else if (op_type == "square") { active_type_id = ACTIVE_SQUARE; + } else if (op_type == "sigmoid") { + active_type_id = ACTIVE_SIGMOID; } else { LOG(FATAL) << "[BM] unsupport act type"; return FAILED; @@ -102,3 +104,6 @@ REGISTER_SUBGRAPH_BRIDGE(leaky_relu, paddle::lite::subgraph::bm::ActConverter); REGISTER_SUBGRAPH_BRIDGE(sqrt, kBM, paddle::lite::subgraph::bm::ActConverter); REGISTER_SUBGRAPH_BRIDGE(square, kBM, paddle::lite::subgraph::bm::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(sigmoid, + kBM, + paddle::lite::subgraph::bm::ActConverter); diff --git a/lite/kernels/bm/bridges/graph.cc b/lite/kernels/bm/bridges/graph.cc index 32b10f5020..aeb810f028 100644 --- a/lite/kernels/bm/bridges/graph.cc +++ b/lite/kernels/bm/bridges/graph.cc @@ -20,11 +20,14 @@ namespace lite { namespace subgraph { namespace bm { +pthread_mutex_t Graph::mutex_compiler_ = PTHREAD_MUTEX_INITIALIZER; + void Graph::AddNode(const std::string& name) { nodes_.insert(std::make_pair(name, name)); } void Graph::CreateCompilerHandle() { + pthread_mutex_lock(&mutex_compiler_); #ifdef 
BM1682 compiler_handle_ = create_bmcompiler("BM1682"); #else @@ -33,6 +36,8 @@ void Graph::CreateCompilerHandle() { CHECK(compiler_handle_ != nullptr); } +void Graph::UnlockCompilerMutex() { pthread_mutex_unlock(&mutex_compiler_); } + } // namespace bm } // namespace subgraph } // namespace lite diff --git a/lite/kernels/bm/bridges/graph.h b/lite/kernels/bm/bridges/graph.h index 40dadcc92d..c54f4d7ad0 100644 --- a/lite/kernels/bm/bridges/graph.h +++ b/lite/kernels/bm/bridges/graph.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -36,10 +37,12 @@ class Graph { } void CreateCompilerHandle(); void* GetCompilerHandle() { return compiler_handle_; } + void UnlockCompilerMutex(); private: std::unordered_map nodes_; void* compiler_handle_; + static pthread_mutex_t mutex_compiler_; }; } // namespace bm diff --git a/lite/kernels/bm/bridges/paddle_use_bridges.h b/lite/kernels/bm/bridges/paddle_use_bridges.h index e644fe8b06..bb6003026d 100644 --- a/lite/kernels/bm/bridges/paddle_use_bridges.h +++ b/lite/kernels/bm/bridges/paddle_use_bridges.h @@ -58,3 +58,5 @@ USE_SUBGRAPH_BRIDGE(depthwise_conv2d_transpose, kBM); USE_SUBGRAPH_BRIDGE(shape, kBM); USE_SUBGRAPH_BRIDGE(split, kBM); USE_SUBGRAPH_BRIDGE(matmul, kBM); +USE_SUBGRAPH_BRIDGE(max_pool2d_with_index, kBM); +USE_SUBGRAPH_BRIDGE(sigmoid, kBM); diff --git a/lite/kernels/bm/bridges/pool_op.cc b/lite/kernels/bm/bridges/pool_op.cc index cd48db5b72..01760b7b77 100644 --- a/lite/kernels/bm/bridges/pool_op.cc +++ b/lite/kernels/bm/bridges/pool_op.cc @@ -11,7 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include +#include +#include #include "lite/kernels/bm/bridges/graph.h" #include "lite/kernels/bm/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" @@ -54,46 +57,84 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { shape[0] = &i_output_shape_data[0]; name[0] = static_cast(output_var_name.c_str()); dim[0] = output_dims.size(); - auto pooling_type = op_info->GetAttr("pooling_type"); + std::string pooling_type; + if (op_info->HasAttr("pooling_type")) { + pooling_type = op_info->GetAttr("pooling_type"); + } else if (op_type == "max_pool2d_with_index") { + pooling_type = "max"; + } CHECK(pooling_type == "max" || pooling_type == "avg"); auto ksize = op_info->GetAttr>("ksize"); auto paddings = op_info->GetAttr>("paddings"); auto strides = op_info->GetAttr>("strides"); auto global_pooling = op_info->GetAttr("global_pooling"); - auto ceil_mode = op_info->GetAttr("ceil_mode"); + bool ceil_mode = false; + if (op_info->HasAttr("ceil_mode")) { + ceil_mode = op_info->GetAttr("ceil_mode"); + } + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } bool average_exclusive = false; if (pooling_type == "avg") { average_exclusive = op_info->GetAttr("exclusive"); } + if (output_dims[2] == 1 && output_dims[3] == 1) { + global_pooling = true; + } if (global_pooling) { paddings[0] = 0; paddings[1] = 0; ksize[0] = i_x_shape_data[2]; ksize[1] = i_x_shape_data[3]; } - add_pooling_layer( - graph->GetCompilerHandle(), - const_cast(&i_x_shape_data[0]), - x_dims.size(), - static_cast(x_var_name.c_str()), - 1, - shape, - dim, - name, - ksize[0], - ksize[1], - paddings[0], - paddings[0], - paddings[1], - paddings[1], - strides[0], - strides[1], - (ksize[0] > 1 && ksize[1] > 1) && pooling_type == "max" ? 
0 : 1, - static_cast(average_exclusive), - static_cast(global_pooling), - static_cast(ceil_mode), - static_cast(unique_op_name.c_str()), - nullptr); + bool is_max = (pooling_type == "max"); + if (adaptive && !global_pooling) { + user_cpu_param_t bm_param; + bm_param.op_type = USER_PADDLE_ADAPTIVE_POOL; + bm_param.u.adaptive_pool_parm.is_avg = !is_max; + int32_t* in_shape[1]; + int32_t in_dim[1]; + const char* in_name[1]; + in_shape[0] = &i_x_shape_data[0]; + in_name[0] = static_cast(x_var_name.c_str()); + in_dim[0] = x_dims.size(); + add_user_cpu_layer(graph->GetCompilerHandle(), + 1, + in_shape, + in_dim, + in_name, + 1, + shape, + dim, + name, + &bm_param, + static_cast(sizeof(bm_param))); + } else { + add_pooling_layer(graph->GetCompilerHandle(), + const_cast(&i_x_shape_data[0]), + x_dims.size(), + static_cast(x_var_name.c_str()), + 1, + shape, + dim, + name, + ksize[0], + ksize[1], + paddings[0], + paddings[0], + paddings[1], + paddings[1], + strides[0], + strides[1], + is_max ? 0 : 1, + static_cast(average_exclusive), + static_cast(global_pooling), + static_cast(ceil_mode), + static_cast(unique_op_name.c_str()), + nullptr); + } graph->AddNode(output_var_name); return SUCCESS; } @@ -105,3 +146,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { REGISTER_SUBGRAPH_BRIDGE(pool2d, kBM, paddle::lite::subgraph::bm::PoolConverter); +REGISTER_SUBGRAPH_BRIDGE(max_pool2d_with_index, + kBM, + paddle::lite::subgraph::bm::PoolConverter); diff --git a/lite/kernels/bm/subgraph_compute.cc b/lite/kernels/bm/subgraph_compute.cc index c6059461d1..d7640e1ac7 100644 --- a/lite/kernels/bm/subgraph_compute.cc +++ b/lite/kernels/bm/subgraph_compute.cc @@ -40,6 +40,7 @@ int SubgraphEngine::BuildDeviceProgram() { op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); + LOG(INFO) << op_type; if (!bridges.Exists(op_type, TARGET(kBM))) { return subgraph::FAILED; } @@ -59,6 +60,7 @@ int SubgraphEngine::BuildDeviceProgram() { unsigned int data_size = 
0; bm_hd_ = static_cast(ctx.GetHandle()); finish_bmcompiler_data(graph.GetCompilerHandle(), &bmodel_data, &data_size); + graph.UnlockCompilerMutex(); bmrt_hd_ = bmrt_create(bm_hd_); if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) { return subgraph::FAILED; diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 87f74f9fe7..332d97ed7c 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -108,6 +108,7 @@ add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.c add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS ${op_DEPS}) add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS}) add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS}) +add_operator(max_pool_with_index_op extra SRCS max_pool_with_index_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) diff --git a/lite/operators/max_pool_with_index_op.cc b/lite/operators/max_pool_with_index_op.cc new file mode 100644 index 0000000000..b62cb26e31 --- /dev/null +++ b/lite/operators/max_pool_with_index_op.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/operators/max_pool_with_index_op.h"
+#include <algorithm>
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+// Verifies that the tensors are attached and the attribute ranks agree.
+bool MaxPoolWithIndexOpLite::CheckShape() const {
+  CHECK_OR_FALSE(param_.x);
+  CHECK_OR_FALSE(param_.output);
+  const auto& x_dims = param_.x->dims();
+  const auto& strides = param_.strides;
+  const auto& ksize = param_.ksize;
+  const auto& paddings = *param_.paddings;
+  // Pooling input should be a 4-D or 5-D tensor.
+  CHECK_OR_FALSE(x_dims.size() == 4 || x_dims.size() == 5);
+  // Input size and pooling size should be consistent.
+  CHECK_OR_FALSE(x_dims.size() - ksize.size() == 2U);
+  // Strides size and pooling size should be the same.
+  CHECK_OR_FALSE(ksize.size() == strides.size());
+  // Paddings size must be 4.
+  CHECK_OR_FALSE(paddings.size() == 4L);
+  return true;
+}
+
+// Output extent along one axis; `padding` is counted on both sides of it.
+inline int MaxPoolOutputSize(int input_size,
+                             int filter_size,
+                             int padding,
+                             int stride) {
+  return (input_size - filter_size + 2 * padding) / stride + 1;
+}
+
+// Output = first two input dims + pooled dims (ksize itself when adaptive).
+bool MaxPoolWithIndexOpLite::InferShapeImpl() const {
+  const auto x_dims = param_.x->dims();
+  const auto& ksize = param_.ksize;
+  const auto& strides = param_.strides;
+  const auto& paddings = *param_.paddings;
+  std::vector<int64_t> output_shape({x_dims[0], x_dims[1]});
+  if (param_.adaptive) {
+    output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
+  } else {
+    // paddings holds a {begin, end} pair per axis: axis i starts at 2 * i.
+    CHECK_OR_FALSE(paddings.size() == 2 * ksize.size());
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(MaxPoolOutputSize(
+          x_dims[i + 2], ksize[i], paddings[2 * i], strides[i]));
+    }
+  }
+  param_.output->Resize(lite::DDim(output_shape));
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(max_pool2d_with_index,
+                 paddle::lite::operators::MaxPoolWithIndexOpLite);
diff --git a/lite/operators/max_pool_with_index_op.h b/lite/operators/max_pool_with_index_op.h
new file mode 100644
index 0000000000..bd82743c27
--- 
/dev/null
+++ b/lite/operators/max_pool_with_index_op.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/core/tensor.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+// Lite operator for `max_pool2d_with_index`. It fills a PoolParam from the op
+// description; CheckShape() and InferShapeImpl() are defined out of line.
+class MaxPoolWithIndexOpLite : public OpLite {
+ public:
+  MaxPoolWithIndexOpLite() {}
+  explicit MaxPoolWithIndexOpLite(const std::string &type) : OpLite(type) {}
+
+  bool CheckShape() const override;
+  bool InferShapeImpl() const override;
+
+  // Binds input "X" and output "Out" and copies the pooling attributes into
+  // param_. "Mask" must exist in the scope but is not attached to param_.
+  bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
+    auto x = op_desc.Input("X").front();
+    auto out = op_desc.Output("Out").front();
+    auto mask = op_desc.Output("Mask").front();
+
+    CHECK(scope->FindVar(x));
+    CHECK(scope->FindVar(out));
+    CHECK(scope->FindVar(mask));
+    param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
+    param_.output = scope->FindVar(out)->GetMutable<lite::Tensor>();
+
+    param_.ksize = op_desc.GetAttr<std::vector<int>>("ksize");
+    param_.global_pooling = op_desc.GetAttr<bool>("global_pooling");
+    param_.strides = op_desc.GetAttr<std::vector<int>>("strides");
+    auto paddings = op_desc.GetAttr<std::vector<int>>("paddings");
+    if (op_desc.HasAttr("adaptive")) {
+      param_.adaptive = op_desc.GetAttr<bool>("adaptive");
+    }
+    // 2-pad to 4-pad: {p0, p1} becomes {p0, p0, p1, p1}.
+    if (paddings.size() == 2L) {
+      for (size_t i = 0; i < 2L; ++i) {
+        int copy_pad = *(paddings.begin() + 2 * i);
+        paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
+      }
+    } else if (paddings.size() != 4L) {
+      LOG(FATAL)
+          << "Paddings size should be the same or twice as the inputs size.";
+    }
+    param_.paddings = std::make_shared<std::vector<int>>(paddings);
+    return true;
+  }
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "max_pool2d_with_index"; }
+
+ private:
+  // Pooling parameters; handed to the kernel by AttachKernel().
+  mutable PoolParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
-- 
GitLab