From 23d2f921ed2e14c211fad5c2752124b59d0b13fb Mon Sep 17 00:00:00 2001 From: Santa An <49897975+AnBaolei1984@users.noreply.github.com> Date: Sun, 27 Sep 2020 10:39:09 +0800 Subject: [PATCH] [LITE][BM] support multiclass_nms2 and fix some issues, test=develop (#4422) * [LITE][BM] fix input shape order changed issue,test=develop (#4407) * [LITE][BM] support multiclass_nms2 and fix some issues, test=develop (#4379) --- lite/backends/bm/target_wrapper.cc | 2 +- lite/kernels/bm/bridges/box_coder_op.cc | 8 ++++- lite/kernels/bm/bridges/cast_op.cc | 3 +- lite/kernels/bm/bridges/elementwise_ops.cc | 27 +++++++++++--- lite/kernels/bm/bridges/multiclass_nms_op.cc | 38 +++++++++++++++++--- lite/kernels/bm/bridges/paddle_use_bridges.h | 1 + lite/kernels/bm/bridges/yolo_box_op.cc | 10 +++--- lite/kernels/bm/subgraph_compute.cc | 16 +++++---- 8 files changed, 82 insertions(+), 23 deletions(-) diff --git a/lite/backends/bm/target_wrapper.cc b/lite/backends/bm/target_wrapper.cc index 6dab2a574d..83aa4dc8c1 100644 --- a/lite/backends/bm/target_wrapper.cc +++ b/lite/backends/bm/target_wrapper.cc @@ -23,7 +23,7 @@ int TargetWrapperBM::device_id_ = 0; std::map TargetWrapperBM::bm_hds_; size_t TargetWrapperBM::num_devices() { - int count = 0; + int count = 1; bm_status_t ret = bm_dev_getcount(&count); CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: " << static_cast(ret); diff --git a/lite/kernels/bm/bridges/box_coder_op.cc b/lite/kernels/bm/bridges/box_coder_op.cc index 9ef1824a64..999ea4dca2 100644 --- a/lite/kernels/bm/bridges/box_coder_op.cc +++ b/lite/kernels/bm/bridges/box_coder_op.cc @@ -73,10 +73,16 @@ int BoxCoderConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (op_info->HasAttr("variance")) { variance = op_info->GetAttr>("variance"); } + int variance_len = variance.size(); user_cpu_param_t bm_param; bm_param.op_type = USER_PADDLE_BOX_CODER; bm_param.u.box_coder_param.axis = axis; - bm_param.u.box_coder_param.variance = &variance[0]; + CHECK_LE(variance_len, 2000); + memset(bm_param.u.box_coder_param.variance, 0, 2000 * sizeof(float)); + memcpy(bm_param.u.box_coder_param.variance, + &variance[0], + variance_len * sizeof(float)); + bm_param.u.box_coder_param.variance_len = variance_len; bm_param.u.box_coder_param.code_type = (code_type == "encode_center_size") ? 0 : 1; bm_param.u.box_coder_param.normalized = box_normalized; diff --git a/lite/kernels/bm/bridges/cast_op.cc b/lite/kernels/bm/bridges/cast_op.cc index 42c0751b92..45cc90c201 100644 --- a/lite/kernels/bm/bridges/cast_op.cc +++ b/lite/kernels/bm/bridges/cast_op.cc @@ -32,7 +32,8 @@ bool CvtDtype(int dtype, int* ptype) { *ptype = DTYPE_INT16; break; case 2: - *ptype = DTYPE_FP32; + case 3: + *ptype = DTYPE_INT32; break; case 5: *ptype = DTYPE_FP32; diff --git a/lite/kernels/bm/bridges/elementwise_ops.cc b/lite/kernels/bm/bridges/elementwise_ops.cc index 715874d418..9124821b6e 100644 --- a/lite/kernels/bm/bridges/elementwise_ops.cc +++ b/lite/kernels/bm/bridges/elementwise_ops.cc @@ -127,7 +127,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { const float* y_data = const_cast(y->mutable_data()); const float* x_data = const_cast(x->mutable_data()); auto unique_op_name = lite::subgraph::bm::UniqueName("expand_ndims"); - std::vector i_expand_shape_data(3); + std::vector i_expand_shape_data; if (x_is_const && y_is_const) { float* cpu_data = compute_elementwise_both_const(op); bm_add_const_tensor(graph->GetCompilerHandle(), @@ -157,12 +157,31 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { static_cast(unique_op_name.c_str())); name[1] = static_cast(unique_op_name.c_str()); dim[1] = 3; - i_expand_shape_data[0] = i_y_shape_data[0]; - i_expand_shape_data[1] = 1; - i_expand_shape_data[2] = 1; + i_expand_shape_data.push_back(i_y_shape_data[0]); + i_expand_shape_data.push_back(1); + i_expand_shape_data.push_back(1); shape[1] = &i_expand_shape_data[0]; y_data = nullptr; } + } else { + if (dim[1] < dim[0]) { + for (size_t i = 0; i < dim[1]; i++) { + i_expand_shape_data.push_back(i_y_shape_data[i]); + } + for (size_t i = dim[1]; i < dim[0]; i++) { + i_expand_shape_data.push_back(1); + } + add_reshape_layer_v2(graph->GetCompilerHandle(), + name[1], + shape[1], + dim[1], + static_cast(unique_op_name.c_str()), + const_cast(&i_expand_shape_data[0]), + i_expand_shape_data.size()); + dim[1] = dim[0]; + shape[1] = &i_expand_shape_data[0]; + name[1] = static_cast(unique_op_name.c_str()); + } } add_binary_layer_v2(graph->GetCompilerHandle(), name[0], diff --git a/lite/kernels/bm/bridges/multiclass_nms_op.cc b/lite/kernels/bm/bridges/multiclass_nms_op.cc index fb7d656dd2..6270dc9a30 100644 --- a/lite/kernels/bm/bridges/multiclass_nms_op.cc +++ b/lite/kernels/bm/bridges/multiclass_nms_op.cc @@ -51,7 +51,7 @@ int MultiClassNMSConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto score_threshold = op_info->GetAttr("score_threshold"); auto nms_threshold = op_info->GetAttr("nms_threshold"); auto nms_eta = op_info->GetAttr("nms_eta"); - bool normalized; + bool normalized = false; if (op_info->HasAttr("normalized")) { normalized = op_info->GetAttr("normalized"); } @@ -97,12 +97,39 @@ int MultiClassNMSConverter(void* ctx, OpLite* op, KernelBase* kernel) { in_dim[1] = score_dims.size(); in_name[0] = static_cast(boxes_var_name.c_str()); in_name[1] = static_cast(score_var_name.c_str()); - int32_t* out_shape[1]; - int32_t out_dim[1]; - const char* out_name[1]; + int32_t* out_shape[2]; + int32_t out_dim[2]; + const char* out_name[2]; out_shape[0] = &i_out_shape_data[0]; out_dim[0] = out_dims.size(); out_name[0] = static_cast(out_var_name.c_str()); + + std::vector vec_index_dim(score_dims.size()); + std::vector i_out_index_shape_data(score_dims.size()); + std::string out_index_name = ""; + if (op_type == "multiclass_nms2") { + output_num = 2; + out_index_name = op_info->Output("Index").front(); + auto out_index = scope->FindVar(out_index_name)->GetMutable(); + if (3 == score_dims.size()) { + vec_index_dim[0] = score_dims[0]; + vec_index_dim[1] = keep_top_k; + vec_index_dim[2] = 1; + } else { + vec_index_dim[0] = keep_top_k; + vec_index_dim[1] = 1; + } + DDimLite index_dims(vec_index_dim); + out_index->Resize(index_dims); + out_index->mutable_data(); + for (size_t i = 0; i < index_dims.size(); i++) { + i_out_index_shape_data[i] = static_cast(index_dims[i]); + } + out_shape[1] = &i_out_index_shape_data[0]; + out_dim[1] = index_dims.size(); + out_name[1] = static_cast(out_index_name.c_str()); + } + add_user_cpu_layer(graph->GetCompilerHandle(), input_num, in_shape, @@ -126,3 +153,6 @@ int MultiClassNMSConverter(void* ctx, OpLite* op, KernelBase* kernel) { REGISTER_SUBGRAPH_BRIDGE(multiclass_nms, kBM, paddle::lite::subgraph::bm::MultiClassNMSConverter); +REGISTER_SUBGRAPH_BRIDGE(multiclass_nms2, + kBM, + paddle::lite::subgraph::bm::MultiClassNMSConverter); diff --git a/lite/kernels/bm/bridges/paddle_use_bridges.h b/lite/kernels/bm/bridges/paddle_use_bridges.h index b9b575c6df..1891e13e43 100644 --- a/lite/kernels/bm/bridges/paddle_use_bridges.h +++ b/lite/kernels/bm/bridges/paddle_use_bridges.h @@ -39,6 +39,7 @@ USE_SUBGRAPH_BRIDGE(norm, kBM); USE_SUBGRAPH_BRIDGE(prior_box, kBM); USE_SUBGRAPH_BRIDGE(box_coder, kBM); USE_SUBGRAPH_BRIDGE(multiclass_nms, kBM); +USE_SUBGRAPH_BRIDGE(multiclass_nms2, kBM); USE_SUBGRAPH_BRIDGE(nearest_interp, kBM); USE_SUBGRAPH_BRIDGE(bilinear_interp, kBM); USE_SUBGRAPH_BRIDGE(yolo_box, kBM); diff --git a/lite/kernels/bm/bridges/yolo_box_op.cc b/lite/kernels/bm/bridges/yolo_box_op.cc index a5ea07f5fd..c1f8fa100f 100644 --- a/lite/kernels/bm/bridges/yolo_box_op.cc +++ b/lite/kernels/bm/bridges/yolo_box_op.cc @@ -67,17 +67,17 @@ int YoloBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto downsample_ratio = op_info->GetAttr("downsample_ratio"); auto conf_thresh = op_info->GetAttr("conf_thresh"); auto anchors = op_info->GetAttr>("anchors"); - int* anchors_buffer = static_cast(malloc(sizeof(int) * anchors.size())); - CHECK(anchors_buffer != nullptr); - memcpy(anchors_buffer, &anchors[0], sizeof(int) * anchors.size()); + CHECK_LE(anchors.size(), 2000); user_cpu_param_t bm_param; bm_param.op_type = USER_PADDLE_YOLO_BOX; bm_param.u.yolo_box_param.class_num = class_num; bm_param.u.yolo_box_param.downsample_ratio = downsample_ratio; bm_param.u.yolo_box_param.conf_thresh = conf_thresh; - bm_param.u.yolo_box_param.anchors = anchors_buffer; + memset(bm_param.u.yolo_box_param.anchors, 0, 2000 * sizeof(int)); + memcpy(bm_param.u.yolo_box_param.anchors, + &anchors[0], + anchors.size() * sizeof(int)); bm_param.u.yolo_box_param.anchors_size = anchors.size(); - memcpy(anchors_buffer, &anchors[0], sizeof(int) * anchors.size()); int32_t input_num = 2; int32_t output_num = 2; int32_t* in_shape[2]; diff --git a/lite/kernels/bm/subgraph_compute.cc b/lite/kernels/bm/subgraph_compute.cc index efbb848313..eeb81ba9da 100644 --- a/lite/kernels/bm/subgraph_compute.cc +++ b/lite/kernels/bm/subgraph_compute.cc @@ -66,9 +66,9 @@ bool SubgraphEngine::BuildDeviceProgram() { graph.GetCompilerHandle(), const_cast(unique_net_name.c_str()), 1); void* bmodel_data = nullptr; unsigned int data_size = 0; - bm_hd_ = static_cast(ctx.GetHandle()); finish_bmcompiler_data(graph.GetCompilerHandle(), &bmodel_data, &data_size); graph.UnlockCompilerMutex(); + bm_hd_ = static_cast(ctx.GetHandle()); bmrt_hd_ = bmrt_create(bm_hd_); if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) { return false; @@ -79,15 +79,15 @@ bool SubgraphEngine::BuildDeviceProgram() { // input device_inputs_.resize(input_names_.size()); for (size_t i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = + auto origin_itensor = exec_scope_->FindMutableTensor(net_info_->input_names[i]); - CHECK(origin_itensors_[i]); + CHECK(origin_itensor); bm_device_mem_t* p_mem = static_cast(malloc(sizeof(bm_device_mem_t))); CHECK(p_mem != nullptr); - CHECK_EQ(bm_malloc_device_byte( - bm_hd_, p_mem, origin_itensors_[i]->memory_size()), - BM_SUCCESS); + CHECK_EQ( + bm_malloc_device_byte(bm_hd_, p_mem, origin_itensor->memory_size()), + BM_SUCCESS); bmrt_tensor_with_device(&device_inputs_[i], *p_mem, net_info_->input_dtypes[i], @@ -124,9 +124,11 @@ bool SubgraphEngine::BuildDeviceProgram() { bool SubgraphEngine::LaunchDeviceProgram() { for (size_t i = 0; i < device_inputs_.size(); i++) { + auto origin_itensor = + exec_scope_->FindMutableTensor(net_info_->input_names[i]); bm_memcpy_s2d(bm_hd_, device_inputs_[i].device_mem, - const_cast(origin_itensors_[i]->raw_data())); + const_cast(origin_itensor->raw_data())); } bmrt_launch_tensor_ex(bmrt_hd_, net_names_[0], -- GitLab