From 92787a39bea2e8e76ebb3aeb592cda90cac6591a Mon Sep 17 00:00:00 2001 From: chonwhite Date: Thu, 5 Dec 2019 20:04:16 +0800 Subject: [PATCH] added fpga multiclass_nms implementation --- lite/api/CMakeLists.txt | 5 + lite/api/cxx_api.cc | 14 ++ lite/api/paddle_use_kernels.h | 165 ++++++++++++++------ lite/api/paddle_use_ops.h | 142 ++++++++++++----- lite/api/test_ssd_fpga.cc | 154 ++++++++++++++++++ lite/backends/fpga/KD/debugger.hpp | 2 +- lite/backends/fpga/KD/pes/conv_pe.hpp | 2 +- lite/backends/fpga/KD/tensor.hpp | 2 +- lite/core/kernel.h | 4 +- lite/core/mir/type_target_cast_pass.cc | 2 + lite/core/program.cc | 5 +- lite/kernels/arm/prior_box_compute.cc | 6 +- lite/kernels/fpga/CMakeLists.txt | 6 +- lite/kernels/fpga/feed_compute.cc | 1 + lite/kernels/fpga/io_copy_compute.cc | 6 +- lite/kernels/fpga/multiclass_nms_compute.cc | 91 ++++++----- lite/kernels/host/CMakeLists.txt | 2 +- lite/kernels/host/multiclass_nms_compute.cc | 6 +- 18 files changed, 469 insertions(+), 146 deletions(-) mode change 100644 => 100755 lite/api/cxx_api.cc create mode 100755 lite/api/test_ssd_fpga.cc mode change 100644 => 100755 lite/core/kernel.h mode change 100644 => 100755 lite/core/mir/type_target_cast_pass.cc mode change 100644 => 100755 lite/core/program.cc mode change 100644 => 100755 lite/kernels/arm/prior_box_compute.cc mode change 100644 => 100755 lite/kernels/host/CMakeLists.txt mode change 100644 => 100755 lite/kernels/host/multiclass_nms_compute.cc diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 408a63e3f5..25ba812a2a 100755 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -194,6 +194,11 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels}) + lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc + DEPS ${lite_model_test_DEPS} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) + lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc DEPS ${lite_model_test_DEPS} CL_DEPS ${opencl_kernels} diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc old mode 100644 new mode 100755 index 4647f20bbe..463a0b8462 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -121,6 +121,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) { << kpf_path; } +#ifndef LITE_WITH_FPGA lite::Tensor *Predictor::GetInput(size_t offset) { CHECK(input_names_.size() > offset) << "The network has " << input_names_.size() << " inputs" @@ -130,6 +131,19 @@ lite::Tensor *Predictor::GetInput(size_t offset) { << " in exec_scope"; return in_var->GetMutable(); } +#else +lite::Tensor *Predictor::GetInput(size_t offset) { + auto *_feed_list = exec_scope_->FindVar("feed"); + CHECK(_feed_list) << "no feed variable in exec_scope"; + auto *feed_list = _feed_list->GetMutable>(); + if (offset >= feed_list->size()) { + feed_list->resize(offset + 1); + } + return &feed_list->at(offset); +} +#endif + + // get inputs names std::vector Predictor::GetInputNames() { return input_names_; } diff --git a/lite/api/paddle_use_kernels.h b/lite/api/paddle_use_kernels.h index 4fa21429c6..c4bc385ad7 100644 --- a/lite/api/paddle_use_kernels.h +++ b/lite/api/paddle_use_kernels.h @@ -1,21 +1,6 @@ #pragma once #include "paddle_lite_factory_helper.h" -USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out); -USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out); -USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out); -USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out); -USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fill_constant_batch_size_like, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(relu_clipped, kARM, kFloat, kNCHW, def); @@ -29,11 +14,87 @@ USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out); -USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out); -USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def); +USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(logical_xor, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(logical_and, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(logical_or, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(logical_not, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(roi_align, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(scale, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(generate_proposals, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(crop, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(axpy, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(beam_search, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(norm, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(power, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8); +USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32); +USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8); +USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32); +USE_LITE_KERNEL(negative, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(less_than, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(equal, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(not_equal, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(less_equal, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(greater_than, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(greater_equal, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(multiclass_nms, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(lod_reset, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(box_clip, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(density_prior_box, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(decode_bboxes, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(gru, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(increment, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16); +USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_arm_float); +USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16); +USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16); +USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16); +USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(elementwise_mul, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(is_empty, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(shape, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga); +USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga); +USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga); +USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga); +USE_LITE_KERNEL(sequence_expand, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(gather, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(im2sequence, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(reduce_max, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(density_prior_box, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(feed, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(lookup_table_v2, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def); +USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def); +USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def); +USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def); +USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(unsqueeze, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(unsqueeze2, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(layer_norm, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(assign, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(sequence_softmax, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(norm, kFPGA, kFP16, kNHWC, def); USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nchw2nhwc); @@ -44,7 +105,6 @@ USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nchw2nhwc); USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nhwc2nchw); USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nchw2nhwc); USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nhwc2nchw); -USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def); @@ -55,33 +115,44 @@ USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); -USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(fetch, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(fill_constant_batch_size_like, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(while, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, host_to_device); +USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, device_to_host); +USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, host_to_device_once); +USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, device_to_host_once); +USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(conv2d, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(beam_search_decode, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(assign_value, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(gru, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out); +USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out); +USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out); +USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out); +USE_LITE_KERNEL(write_to_array, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(anchor_generator, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(dropout, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out); +USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out); +USE_LITE_KERNEL(lrn, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(pool2d, kFPGA, kFP16, kNHWC, def); +USE_LITE_KERNEL(read_from_array, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); +USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(top_k, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(sequence_pool, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); -USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def); -USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def); -USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def); -USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def); -USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8); -USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32); -USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8); -USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32); -USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(unsqueeze, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(unsqueeze2, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def); \ No newline at end of file +USE_LITE_KERNEL(gru_unit, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(fc, kFPGA, kFP16, kNHWC, def); \ No newline at end of file diff --git a/lite/api/paddle_use_ops.h b/lite/api/paddle_use_ops.h index faa4131728..e1976b20f3 100644 --- a/lite/api/paddle_use_ops.h +++ b/lite/api/paddle_use_ops.h @@ -1,48 +1,87 @@ #pragma once #include "paddle_lite_factory_helper.h" +USE_LITE_OP(lookup_table_v2); USE_LITE_OP(feed); -USE_LITE_OP(mul); -USE_LITE_OP(search_fc); -USE_LITE_OP(conv2d); -USE_LITE_OP(depthwise_conv2d); +USE_LITE_OP(fake_channel_wise_dequantize_max_abs); +USE_LITE_OP(assign); USE_LITE_OP(layout); -USE_LITE_OP(split); +USE_LITE_OP(transpose); +USE_LITE_OP(transpose2); +USE_LITE_OP(pool2d); +USE_LITE_OP(batch_norm); +USE_LITE_OP(reshape); +USE_LITE_OP(reshape2); +USE_LITE_OP(search_fc); +USE_LITE_OP(sequence_reverse); +USE_LITE_OP(matmul); +USE_LITE_OP(slice); +USE_LITE_OP(lod_reset); +USE_LITE_OP(graph_op); +USE_LITE_OP(expand); +USE_LITE_OP(top_k); +USE_LITE_OP(fake_quantize_range_abs_max); +USE_LITE_OP(arg_max); +USE_LITE_OP(beam_search); +USE_LITE_OP(box_clip); +USE_LITE_OP(fake_quantize_dequantize_moving_average_abs_max); +USE_LITE_OP(box_coder); +USE_LITE_OP(search_seq_depadding); +USE_LITE_OP(write_to_array); +USE_LITE_OP(is_empty); +USE_LITE_OP(prior_box); +USE_LITE_OP(sequence_concat); +USE_LITE_OP(affine_channel); +USE_LITE_OP(shape); +USE_LITE_OP(axpy); +USE_LITE_OP(anchor_generator); +USE_LITE_OP(reduce_max); +USE_LITE_OP(gru); +USE_LITE_OP(uniform_random); +USE_LITE_OP(unsqueeze); +USE_LITE_OP(unsqueeze2); +USE_LITE_OP(layout_once); +USE_LITE_OP(search_group_padding); +USE_LITE_OP(assign_value); +USE_LITE_OP(norm); +USE_LITE_OP(concat); +USE_LITE_OP(fill_constant); +USE_LITE_OP(fill_constant_batch_size_like); +USE_LITE_OP(calib_once); +USE_LITE_OP(decode_bboxes); +USE_LITE_OP(negative); +USE_LITE_OP(layer_norm); +USE_LITE_OP(mean); +USE_LITE_OP(lrn); +USE_LITE_OP(fetch); +USE_LITE_OP(power); USE_LITE_OP(stack); -USE_LITE_OP(scale); -USE_LITE_OP(multiclass_nms); +USE_LITE_OP(gather); +USE_LITE_OP(fc); +USE_LITE_OP(gru_unit); +USE_LITE_OP(increment); +USE_LITE_OP(mul); +USE_LITE_OP(search_seq_softmax); +USE_LITE_OP(var_conv_2d); +USE_LITE_OP(dropout); USE_LITE_OP(fusion_elementwise_sub_activation); USE_LITE_OP(fusion_elementwise_add_activation); USE_LITE_OP(fusion_elementwise_mul_activation); USE_LITE_OP(fusion_elementwise_max_activation); USE_LITE_OP(fusion_elementwise_div_activation); -USE_LITE_OP(unsqueeze); -USE_LITE_OP(unsqueeze2); -USE_LITE_OP(sequence_topk_avg_pooling); -USE_LITE_OP(transpose); -USE_LITE_OP(transpose2); -USE_LITE_OP(dropout); -USE_LITE_OP(pool2d); USE_LITE_OP(elementwise_sub); USE_LITE_OP(elementwise_add); USE_LITE_OP(elementwise_mul); USE_LITE_OP(elementwise_max); USE_LITE_OP(elementwise_div); -USE_LITE_OP(io_copy_once); -USE_LITE_OP(batch_norm); -USE_LITE_OP(reshape); -USE_LITE_OP(reshape2); -USE_LITE_OP(expand); -USE_LITE_OP(matmul); -USE_LITE_OP(concat); -USE_LITE_OP(slice); -USE_LITE_OP(graph_op); USE_LITE_OP(pad2d); +USE_LITE_OP(crop); +USE_LITE_OP(sequence_expand); +USE_LITE_OP(search_aligned_mat_mul); +USE_LITE_OP(io_copy); USE_LITE_OP(squeeze); USE_LITE_OP(squeeze2); -USE_LITE_OP(arg_max); -USE_LITE_OP(fill_constant); -USE_LITE_OP(fill_constant_batch_size_like); +USE_LITE_OP(reduce_sum); USE_LITE_OP(square); USE_LITE_OP(relu); USE_LITE_OP(leaky_relu); @@ -59,22 +98,53 @@ USE_LITE_OP(hard_sigmoid); USE_LITE_OP(sqrt); USE_LITE_OP(rsqrt); USE_LITE_OP(softsign); +USE_LITE_OP(sequence_expand_as); +USE_LITE_OP(match_matrix_tensor); USE_LITE_OP(range); -USE_LITE_OP(box_coder); -USE_LITE_OP(calib); -USE_LITE_OP(mean); -USE_LITE_OP(conv2d_transpose); -USE_LITE_OP(fetch); -USE_LITE_OP(prior_box); +USE_LITE_OP(lookup_table); USE_LITE_OP(fake_quantize_moving_average_abs_max); -USE_LITE_OP(io_copy); +USE_LITE_OP(search_grnn); +USE_LITE_OP(conv2d_transpose); +USE_LITE_OP(flatten); +USE_LITE_OP(flatten2); USE_LITE_OP(nearest_interp); USE_LITE_OP(bilinear_interp); USE_LITE_OP(softmax); USE_LITE_OP(reduce_mean); -USE_LITE_OP(affine_channel); USE_LITE_OP(cast); USE_LITE_OP(fake_dequantize_max_abs); -USE_LITE_OP(shuffle_channel); +USE_LITE_OP(read_from_array); USE_LITE_OP(yolo_box); -USE_LITE_OP(fc); \ No newline at end of file +USE_LITE_OP(multiclass_nms); +USE_LITE_OP(while); +USE_LITE_OP(conv2d); +USE_LITE_OP(depthwise_conv2d); +USE_LITE_OP(split); +USE_LITE_OP(scale); +USE_LITE_OP(beam_search_decode); +USE_LITE_OP(im2sequence); +USE_LITE_OP(sequence_topk_avg_pooling); +USE_LITE_OP(io_copy_once); +USE_LITE_OP(roi_align); +USE_LITE_OP(sequence_reshape); +USE_LITE_OP(equal); +USE_LITE_OP(notequal); +USE_LITE_OP(less_than); +USE_LITE_OP(less_equal); +USE_LITE_OP(greater_than); +USE_LITE_OP(greater_equal); +USE_LITE_OP(calib); +USE_LITE_OP(sequence_pool); +USE_LITE_OP(attention_padding_mask); +USE_LITE_OP(search_attention_padding_mask); +USE_LITE_OP(density_prior_box); +USE_LITE_OP(search_seq_fc); +USE_LITE_OP(generate_proposals); +USE_LITE_OP(sequence_arithmetic); +USE_LITE_OP(search_seq_arithmetic); +USE_LITE_OP(shuffle_channel); +USE_LITE_OP(sequence_softmax); +USE_LITE_OP(logical_xor); +USE_LITE_OP(logical_and); +USE_LITE_OP(logical_or); +USE_LITE_OP(logical_not); \ No newline at end of file diff --git a/lite/api/test_ssd_fpga.cc b/lite/api/test_ssd_fpga.cc new file mode 100755 index 0000000000..15c2637507 --- /dev/null +++ b/lite/api/test_ssd_fpga.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "lite/api/cxx_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/core/op_registry.h" + + +DEFINE_string(input_file, "", "input_file"); + +namespace paddle { +namespace lite { + +std::vector GetDirectoryFiles(const std::string& dir) { + std::vector files; + std::shared_ptr directory_ptr(opendir(dir.c_str()), + [](DIR* dir) { dir&& closedir(dir); }); + struct dirent* dirent_ptr; + if (!directory_ptr) { + std::cout << "Error opening : " << std::strerror(errno) << dir << std::endl; + return files; + } + + while ((dirent_ptr = readdir(directory_ptr.get())) != nullptr) { + files.push_back(std::string(dirent_ptr->d_name)); + } + return files; +} + +void readFromFile(int num, std::string path, float* data) { + std::ifstream file_stream(path); + // file_stream.open(path); + if (!file_stream.good()) { + std::cout << "file: " << path << " dones not exist!\n"; + exit(-1); + return; + } + // float* data = mutableData(); + for (int i = 0; i < num; ++i) { + float value = 0; + file_stream >> value; + data[i] = value; + } + file_stream.close(); +} + +// #ifdef LITE_WITH_FPGA +TEST(ResNet50, test) { + lite::Predictor predictor; + std::vector valid_places({ + Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}, + Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + + predictor.Build(FLAGS_model_dir, + "", + "", + valid_places); + + + // predictor.Build(FLAGS_model_dir, + // FLAGS_model_dir + "/model", + // FLAGS_model_dir + "/params", + // Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}, + // valid_places); + + + auto* input_tensor = predictor.GetInput(0); + + int width = 416; + int height = 416; + + + std::ifstream file_stream(FLAGS_input_file); + // file_stream.open(path); + if (!file_stream.good()) { + std::cout << "file: " << FLAGS_input_file << " dones not exist!\n"; + exit(-1); + return; + } + + file_stream >> height; + file_stream >> width; + + input_tensor->Resize(DDim(std::vector({1, 3, height, width}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } + + // readFromFile(item_size, "car.data", data); + + int num = 3 * width * height; + + for (int i = 0; i < num; ++i) { + float value = 0; + file_stream >> value; + data[i] = value; + } + file_stream.close(); + + for (int i = 0; i < 2; ++i) { + predictor.Run(); + } + + auto* out = predictor.GetOutput(0); + for (int i = 0;i < out->dims().production();i++) { + std::cout << ":" << out->data()[i] << std::endl; + } + + // std::cout << "-------\n"; + // auto* out1 = predictor.GetOutput(1); + // for (int i = 0;i < out1->dims().production();i++) { + // std::cout << ":" << out1->data()[i] << std::endl; + // } + + // std::string file = "output/" + FLAGS_input_file.substr (6); + // std::cout << "file:::" << file << std::endl; + + // std::ofstream ofs; + // ofs.open(file); + // for (int i = 0; i < out->dims().production(); i++) { + // float value = out->data()[i]; + // ofs << value << std::endl; + // } + // ofs.close(); + + LOG(INFO) << "================== Speed Report ==================="; +} +// #endif + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index 70a92612f2..b549a6e5f3 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -79,7 +79,7 @@ inline void read_from_file(lite::Tensor* t,const std::string& path) { } inline void save_float(float* data, const std::string& name, int len) { - return; + // return; static int counter = 0; std::string old_string = std::to_string(counter); std::string new_string = std::string(3 - old_string.length(), '0') + old_string; diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp index 73f0b7f5cd..2b943bb3ab 100755 --- a/lite/backends/fpga/KD/pes/conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/conv_pe.hpp @@ -59,7 +59,7 @@ class ConvPE : public PE { if (param_.filter->shape().width() == 1 && param_.filter->shape().height() == 1) { - use_cpu_ = true; + // use_cpu_ = true; } if (!use_cpu_) { // param_.filter->releaseData(); diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index 48c97a54f3..fb17cf01af 100755 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -395,7 +395,7 @@ class Tensor { } void save_file_with_name(std::string path) { - return; + // return; invalidate(); // usleep(20000); // return; diff --git a/lite/core/kernel.h b/lite/core/kernel.h old mode 100644 new mode 100755 index 05d7a6b333..176f6c69ac --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -84,11 +84,9 @@ class KernelBase { #ifdef LITE_WITH_PROFILE if (profile_id_ >= 0) { profile::ProfileBlock x(profile_id_, "kernel"); - Run(); } -#else - Run(); #endif + Run(); } void SetContext(std::unique_ptr&& ctx) { diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc old mode 100644 new mode 100755 index b008faa687..d0e9cd4e2b --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -93,8 +93,10 @@ void TypeTargetTransformPass::AddIoCopyInst( // Set the place for io_copy_output_arg node, the target should be equal to // to.target() // The precision and layout should be equal to from.precision(), from.layout() + #ifndef LITE_WITH_FPGA io_copy_output_arg->AsArg().type = LiteType::GetTensorTy(to.target(), from.precision(), from.layout()); + #endif auto* io_copy_inst = graph->NewInstructNode(); bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; diff --git a/lite/core/program.cc b/lite/core/program.cc old mode 100644 new mode 100755 index b60f279c0f..686b373040 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -114,11 +114,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { void RuntimeProgram::Run() { for (auto& inst : instructions_) { std::string op_type = inst.op()->op_info()->Type(); + +#ifndef LITE_WITH_FPGA if (op_type == "feed" || op_type == "fetch") continue; +#endif inst.Run(); #ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE - LITE_PRECISION_PROFILE(inst) + // LITE_PRECISION_PROFILE(inst) #endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } diff --git a/lite/kernels/arm/prior_box_compute.cc b/lite/kernels/arm/prior_box_compute.cc old mode 100644 new mode 100755 index 48ae1e94dd..aa3c2b3758 --- a/lite/kernels/arm/prior_box_compute.cc +++ b/lite/kernels/arm/prior_box_compute.cc @@ -98,8 +98,10 @@ REGISTER_LITE_KERNEL(prior_box, kNCHW, paddle::lite::kernels::arm::PriorBoxCompute, def) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Image", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Input",{LiteType::GetTensorTy( + TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))}) + .BindInput("Image", {LiteType::GetTensorTy( + TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))}) .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt index f431d408bb..246290c73a 100755 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -7,15 +7,17 @@ set(fpga_deps fpga_target_wrapper kernel_fpga) # add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) # add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps}) -add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) +# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps}) add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps}) add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps}) add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps}) +# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) + add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps}) # add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps}) -# add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps}) +add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps}) add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps}) # add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps}) add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps}) diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc index 6f574c2ba0..5bd9bdaaf3 100755 --- a/lite/kernels/fpga/feed_compute.cc +++ b/lite/kernels/fpga/feed_compute.cc @@ -37,6 +37,7 @@ void FeedCompute::PrepareForRun() { } void FeedCompute::Run() { + std::cout << "================= FeedCompute ================= \n"; auto& param = this->Param(); Tensor& x = param.feed_list->at(param.col); zynqmp::InputParam& feed_param = pe_.param(); diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc index df6abfe118..fe7a7b737b 100755 --- a/lite/kernels/fpga/io_copy_compute.cc +++ b/lite/kernels/fpga/io_copy_compute.cc @@ -157,11 +157,11 @@ REGISTER_LITE_KERNEL(io_copy, device_to_host) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kAny), - DATALAYOUT(kAny))}) + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), - PRECISION(kAny), + PRECISION(kFloat), DATALAYOUT(kNHWC))}) .Finalize(); diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc index be71642488..89c652a38e 100755 --- a/lite/kernels/fpga/multiclass_nms_compute.cc +++ b/lite/kernels/fpga/multiclass_nms_compute.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "lite/kernels/fpga/multiclass_nms_compute.h" #include #include #include -#include "lite/kernels/fpga/multiclass_nms_compute.h" +#include "lite/backends/fpga/KD/debugger.hpp" namespace paddle { namespace lite { @@ -196,11 +197,15 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param, int num_det = 0; int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1]; - Tensor bbox_slice, score_slice; + + // scores.ZynqTensor()->saveToFile("nms_scores", true); + for (int64_t c = 0; c < class_num; ++c) { + Tensor bbox_slice, score_slice; if (c == background_label) continue; if (scores_size == 3) { - score_slice = scores.Slice(c, c + 1); + scores.Slice(score_slice, c, c + 1); + // score_slice.ZynqTensor()->saveToFile("nms_slice", true); bbox_slice = bboxes; } else { score_slice.Resize({scores.dims()[0], 1}); @@ -208,7 +213,7 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param, SliceOneClass(scores, c, &score_slice); SliceOneClass(bboxes, c, &bbox_slice); } - NMSFast(bbox_slice, + NMSFast(bboxes,// TODO bbox_slice score_slice, score_threshold, nms_threshold, @@ -225,6 +230,9 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param, *num_nmsed_out = num_det; const T* scores_data = scores.data(); if (keep_top_k > -1 && num_det > keep_top_k) { + + Tensor score_slice; + const T* sdata; std::vector>> score_index_pairs; for (const auto& it : *indices) { @@ -315,48 +323,35 @@ void MultiClassOutput(const Tensor& scores, void MulticlassNmsCompute::Run() { auto& param = Param(); - auto* boxes = param.bboxes; auto* scores = param.scores; auto* outs = param.out; + outs->mutable_data(); - auto boxes_dims = boxes->dims(); - auto boxes_size = boxes_dims.size(); auto score_dims = scores->dims(); auto score_size = score_dims.size(); - Tensor bboxes_tensor; - bboxes_tensor.Resize(boxes_dims); - auto bboxes_data = bboxes_tensor.mutable_data(); - bboxes_tensor.ZynqTensor()->copyFrom(boxes->ZynqTensor()); - - Tensor score_tensor; - score_tensor.Resize(score_dims); - auto score_data = score_tensor.mutable_data(); - score_tensor.ZynqTensor()->copyFrom(scores->ZynqTensor()); - + auto box_dims = boxes->dims(); + int64_t box_dim = boxes->dims()[2]; std::vector>> all_indices; std::vector batch_starts = {0}; int64_t batch_size = score_dims[0]; - int64_t box_dim = boxes->dims()[2]; + int64_t out_dim = box_dim + 2; int num_nmsed_out = 0; Tensor boxes_slice, scores_slice; - boxes_slice.mutable_data(); - scores_slice.mutable_data(); - int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1; for (int i = 0; i < n; ++i) { if (score_size == 3) { - scores_slice = score_tensor.Slice(i, i + 1); + scores->Slice(scores_slice, i, i + 1); scores_slice.Resize({score_dims[1], score_dims[2]}); - boxes_slice = bboxes_tensor.Slice(i, i + 1); + boxes->Slice(boxes_slice, i, i + 1); boxes_slice.Resize({score_dims[2], box_dim}); } else { - auto boxes_lod = bboxes_tensor.lod().back(); - scores_slice = score_tensor.Slice(boxes_lod[i], boxes_lod[i + 1]); - boxes_slice = bboxes_tensor.Slice(boxes_lod[i], boxes_lod[i + 1]); + auto boxes_lod = boxes->lod().back(); + scores->Slice(scores_slice, boxes_lod[i], boxes_lod[i + 1]); + boxes->Slice(boxes_slice, boxes_lod[i], boxes_lod[i + 1]); } std::map> indices; MultiClassNMS( @@ -365,6 +360,8 @@ void MulticlassNmsCompute::Run() { batch_starts.push_back(batch_starts.back() + num_nmsed_out); } + + uint64_t num_kept = batch_starts.back(); if (num_kept == 0) { outs->Resize({1, 1}); @@ -375,39 +372,44 @@ void MulticlassNmsCompute::Run() { outs->Resize({static_cast(num_kept), out_dim}); for (int i = 0; i < n; ++i) { if (score_size == 3) { - scores_slice = score_tensor.Slice(i, i + 1); - boxes_slice = bboxes_tensor.Slice(i, i + 1); + scores->Slice(scores_slice, i, i + 1); + boxes->Slice(boxes_slice, i, i + 1); scores_slice.Resize({score_dims[1], score_dims[2]}); boxes_slice.Resize({score_dims[2], box_dim}); } else { auto boxes_lod = boxes->lod().back(); - scores_slice = score_tensor.Slice(boxes_lod[i], boxes_lod[i + 1]); - boxes_slice = bboxes_tensor.Slice(boxes_lod[i], boxes_lod[i + 1]); + scores->Slice(scores_slice, boxes_lod[i], boxes_lod[i + 1]); + boxes->Slice(boxes_slice, boxes_lod[i], boxes_lod[i + 1]); } int64_t s = static_cast(batch_starts[i]); int64_t e = static_cast(batch_starts[i + 1]); + if (e > s) { - Tensor out = outs->Slice(s, e); + Tensor out; + outs->Slice(out, s, e); + // scores_slice.ZynqTensor()->saveToFile("scores_slice", true); MultiClassOutput( scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out); + out.ZynqTensor()->saveToFile("out", true); + outs->ZynqTensor()->copyFrom(out.ZynqTensor()); } } } + + // save_tensor(param.scores, "_scores.txt", false); + // save_tensor(param.bboxes, "_bboxes.txt", false); + + boxes->ZynqTensor()->saveToFile("_boxes", true); + scores->ZynqTensor()->saveToFile("_scores", true); + outs->ZynqTensor()->saveToFile("_outs", true); + LoD lod; lod.emplace_back(batch_starts); outs->set_lod(lod); - - - // auto* boxes = param.bboxes; - // auto* scores = param.scores; - // auto* outs = param.out; - // boxes->ZynqTensor()->saveToFile("boxes", true); - // scores->ZynqTensor()->saveToFile("scores", true); - // param.out->ZynqTensor()->saveToFile("nms_", true); } -} // namespace fpga +} // namespace host } // namespace kernels } // namespace lite } // namespace paddle @@ -418,10 +420,7 @@ REGISTER_LITE_KERNEL(multiclass_nms, kNHWC, paddle::lite::kernels::fpga::MulticlassNmsCompute, def) - .BindInput("BBoxes", - {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Scores", - {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt old mode 100644 new mode 100755 index 428cc213ce..2751641f45 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -3,7 +3,7 @@ message(STATUS "compile with lite host kernels") add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_deps}) add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps}) add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op) -add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) +# add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) #lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any) #lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any) diff --git a/lite/kernels/host/multiclass_nms_compute.cc b/lite/kernels/host/multiclass_nms_compute.cc old mode 100644 new mode 100755 index 6f6079ef88..131b51b90a --- a/lite/kernels/host/multiclass_nms_compute.cc +++ b/lite/kernels/host/multiclass_nms_compute.cc @@ -392,7 +392,9 @@ REGISTER_LITE_KERNEL(multiclass_nms, kNCHW, paddle::lite::kernels::host::MulticlassNmsCompute, def) - .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("BBoxes", {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))}) + .BindInput("Scores", {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); -- GitLab