提交 92787a39 编写于 作者: C chonwhite

added fpga multiclass_nms implementation

上级 de865cc9
......@@ -194,6 +194,11 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
......
......@@ -121,6 +121,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
<< kpf_path;
}
#ifndef LITE_WITH_FPGA
lite::Tensor *Predictor::GetInput(size_t offset) {
CHECK(input_names_.size() > offset)
<< "The network has " << input_names_.size() << " inputs"
......@@ -130,6 +131,19 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
<< " in exec_scope";
return in_var->GetMutable<lite::Tensor>();
}
#else
// Returns the input tensor at position `offset`, backed by the "feed"
// variable's tensor list; the list is grown on demand so any slot index
// can be requested before the feed ops have run.
lite::Tensor *Predictor::GetInput(size_t offset) {
  auto *feed_var = exec_scope_->FindVar("feed");
  CHECK(feed_var) << "no feed variable in exec_scope";
  auto *feeds = feed_var->GetMutable<std::vector<lite::Tensor>>();
  // Grow lazily: asking for a slot past the end creates it.
  if (feeds->size() <= offset) {
    feeds->resize(offset + 1);
  }
  return &feeds->at(offset);
}
#endif
// get inputs names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
......
#pragma once
#include "paddle_lite_factory_helper.h"
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fill_constant_batch_size_like, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu_clipped, kARM, kFloat, kNCHW, def);
......@@ -29,11 +14,87 @@ USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out);
USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def);
USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_xor, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_and, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_or, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_not, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(roi_align, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(generate_proposals, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(crop, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(axpy, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(beam_search, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(power, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(negative, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(less_than, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(not_equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(less_equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(greater_than, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(greater_equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(multiclass_nms, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(lod_reset, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(box_clip, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(density_prior_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(decode_bboxes, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(gru, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(increment, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16);
USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_arm_float);
USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16);
USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16);
USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16);
USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(elementwise_mul, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(is_empty, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(shape, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga);
USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga);
USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga);
USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga);
USE_LITE_KERNEL(sequence_expand, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(gather, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(im2sequence, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reduce_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(density_prior_box, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(feed, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(lookup_table_v2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def);
USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(layer_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(assign, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(sequence_softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(norm, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nchw2nhwc);
......@@ -44,7 +105,6 @@ USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nchw2nhwc);
USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nhwc2nchw);
USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nchw2nhwc);
USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nhwc2nchw);
USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def);
......@@ -55,33 +115,44 @@ USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fetch, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fill_constant_batch_size_like, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(while, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, host_to_device);
USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, device_to_host);
USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, host_to_device_once);
USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, device_to_host_once);
USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(beam_search_decode, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(assign_value, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(gru, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(write_to_array, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(anchor_generator, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out);
USE_LITE_KERNEL(lrn, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(read_from_array, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(top_k, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(sequence_pool, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def);
USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def);
\ No newline at end of file
USE_LITE_KERNEL(gru_unit, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kFPGA, kFP16, kNHWC, def);
\ No newline at end of file
#pragma once
#include "paddle_lite_factory_helper.h"
USE_LITE_OP(lookup_table_v2);
USE_LITE_OP(feed);
USE_LITE_OP(mul);
USE_LITE_OP(search_fc);
USE_LITE_OP(conv2d);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(fake_channel_wise_dequantize_max_abs);
USE_LITE_OP(assign);
USE_LITE_OP(layout);
USE_LITE_OP(split);
USE_LITE_OP(transpose);
USE_LITE_OP(transpose2);
USE_LITE_OP(pool2d);
USE_LITE_OP(batch_norm);
USE_LITE_OP(reshape);
USE_LITE_OP(reshape2);
USE_LITE_OP(search_fc);
USE_LITE_OP(sequence_reverse);
USE_LITE_OP(matmul);
USE_LITE_OP(slice);
USE_LITE_OP(lod_reset);
USE_LITE_OP(graph_op);
USE_LITE_OP(expand);
USE_LITE_OP(top_k);
USE_LITE_OP(fake_quantize_range_abs_max);
USE_LITE_OP(arg_max);
USE_LITE_OP(beam_search);
USE_LITE_OP(box_clip);
USE_LITE_OP(fake_quantize_dequantize_moving_average_abs_max);
USE_LITE_OP(box_coder);
USE_LITE_OP(search_seq_depadding);
USE_LITE_OP(write_to_array);
USE_LITE_OP(is_empty);
USE_LITE_OP(prior_box);
USE_LITE_OP(sequence_concat);
USE_LITE_OP(affine_channel);
USE_LITE_OP(shape);
USE_LITE_OP(axpy);
USE_LITE_OP(anchor_generator);
USE_LITE_OP(reduce_max);
USE_LITE_OP(gru);
USE_LITE_OP(uniform_random);
USE_LITE_OP(unsqueeze);
USE_LITE_OP(unsqueeze2);
USE_LITE_OP(layout_once);
USE_LITE_OP(search_group_padding);
USE_LITE_OP(assign_value);
USE_LITE_OP(norm);
USE_LITE_OP(concat);
USE_LITE_OP(fill_constant);
USE_LITE_OP(fill_constant_batch_size_like);
USE_LITE_OP(calib_once);
USE_LITE_OP(decode_bboxes);
USE_LITE_OP(negative);
USE_LITE_OP(layer_norm);
USE_LITE_OP(mean);
USE_LITE_OP(lrn);
USE_LITE_OP(fetch);
USE_LITE_OP(power);
USE_LITE_OP(stack);
USE_LITE_OP(scale);
USE_LITE_OP(multiclass_nms);
USE_LITE_OP(gather);
USE_LITE_OP(fc);
USE_LITE_OP(gru_unit);
USE_LITE_OP(increment);
USE_LITE_OP(mul);
USE_LITE_OP(search_seq_softmax);
USE_LITE_OP(var_conv_2d);
USE_LITE_OP(dropout);
USE_LITE_OP(fusion_elementwise_sub_activation);
USE_LITE_OP(fusion_elementwise_add_activation);
USE_LITE_OP(fusion_elementwise_mul_activation);
USE_LITE_OP(fusion_elementwise_max_activation);
USE_LITE_OP(fusion_elementwise_div_activation);
USE_LITE_OP(unsqueeze);
USE_LITE_OP(unsqueeze2);
USE_LITE_OP(sequence_topk_avg_pooling);
USE_LITE_OP(transpose);
USE_LITE_OP(transpose2);
USE_LITE_OP(dropout);
USE_LITE_OP(pool2d);
USE_LITE_OP(elementwise_sub);
USE_LITE_OP(elementwise_add);
USE_LITE_OP(elementwise_mul);
USE_LITE_OP(elementwise_max);
USE_LITE_OP(elementwise_div);
USE_LITE_OP(io_copy_once);
USE_LITE_OP(batch_norm);
USE_LITE_OP(reshape);
USE_LITE_OP(reshape2);
USE_LITE_OP(expand);
USE_LITE_OP(matmul);
USE_LITE_OP(concat);
USE_LITE_OP(slice);
USE_LITE_OP(graph_op);
USE_LITE_OP(pad2d);
USE_LITE_OP(crop);
USE_LITE_OP(sequence_expand);
USE_LITE_OP(search_aligned_mat_mul);
USE_LITE_OP(io_copy);
USE_LITE_OP(squeeze);
USE_LITE_OP(squeeze2);
USE_LITE_OP(arg_max);
USE_LITE_OP(fill_constant);
USE_LITE_OP(fill_constant_batch_size_like);
USE_LITE_OP(reduce_sum);
USE_LITE_OP(square);
USE_LITE_OP(relu);
USE_LITE_OP(leaky_relu);
......@@ -59,22 +98,53 @@ USE_LITE_OP(hard_sigmoid);
USE_LITE_OP(sqrt);
USE_LITE_OP(rsqrt);
USE_LITE_OP(softsign);
USE_LITE_OP(sequence_expand_as);
USE_LITE_OP(match_matrix_tensor);
USE_LITE_OP(range);
USE_LITE_OP(box_coder);
USE_LITE_OP(calib);
USE_LITE_OP(mean);
USE_LITE_OP(conv2d_transpose);
USE_LITE_OP(fetch);
USE_LITE_OP(prior_box);
USE_LITE_OP(lookup_table);
USE_LITE_OP(fake_quantize_moving_average_abs_max);
USE_LITE_OP(io_copy);
USE_LITE_OP(search_grnn);
USE_LITE_OP(conv2d_transpose);
USE_LITE_OP(flatten);
USE_LITE_OP(flatten2);
USE_LITE_OP(nearest_interp);
USE_LITE_OP(bilinear_interp);
USE_LITE_OP(softmax);
USE_LITE_OP(reduce_mean);
USE_LITE_OP(affine_channel);
USE_LITE_OP(cast);
USE_LITE_OP(fake_dequantize_max_abs);
USE_LITE_OP(shuffle_channel);
USE_LITE_OP(read_from_array);
USE_LITE_OP(yolo_box);
USE_LITE_OP(fc);
\ No newline at end of file
USE_LITE_OP(multiclass_nms);
USE_LITE_OP(while);
USE_LITE_OP(conv2d);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(split);
USE_LITE_OP(scale);
USE_LITE_OP(beam_search_decode);
USE_LITE_OP(im2sequence);
USE_LITE_OP(sequence_topk_avg_pooling);
USE_LITE_OP(io_copy_once);
USE_LITE_OP(roi_align);
USE_LITE_OP(sequence_reshape);
USE_LITE_OP(equal);
USE_LITE_OP(notequal);
USE_LITE_OP(less_than);
USE_LITE_OP(less_equal);
USE_LITE_OP(greater_than);
USE_LITE_OP(greater_equal);
USE_LITE_OP(calib);
USE_LITE_OP(sequence_pool);
USE_LITE_OP(attention_padding_mask);
USE_LITE_OP(search_attention_padding_mask);
USE_LITE_OP(density_prior_box);
USE_LITE_OP(search_seq_fc);
USE_LITE_OP(generate_proposals);
USE_LITE_OP(sequence_arithmetic);
USE_LITE_OP(search_seq_arithmetic);
USE_LITE_OP(shuffle_channel);
USE_LITE_OP(sequence_softmax);
USE_LITE_OP(logical_xor);
USE_LITE_OP(logical_and);
USE_LITE_OP(logical_or);
USE_LITE_OP(logical_not);
\ No newline at end of file
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <dirent.h>

#include <cerrno>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <gtest/gtest.h>

#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_file, "", "input_file");
namespace paddle {
namespace lite {
// Lists the names of all entries in directory `dir`, including the
// "." and ".." pseudo-entries, in readdir order.
// Returns an empty vector (after logging the errno text) when the
// directory cannot be opened. The DIR handle is owned by a shared_ptr
// whose deleter closes it, so it is released on every exit path.
std::vector<std::string> GetDirectoryFiles(const std::string& dir) {
  std::vector<std::string> files;
  std::shared_ptr<DIR> directory_ptr(opendir(dir.c_str()), [](DIR* d) {
    if (d != nullptr) closedir(d);
  });
  if (!directory_ptr) {
    // Print the directory before the strerror text so the message is
    // unambiguous (the original interleaved them with no separator).
    std::cout << "Error opening " << dir << ": " << std::strerror(errno)
              << std::endl;
    return files;
  }
  struct dirent* dirent_ptr;
  while ((dirent_ptr = readdir(directory_ptr.get())) != nullptr) {
    files.push_back(std::string(dirent_ptr->d_name));
  }
  return files;
}
// Reads `num` whitespace-separated floats from the text file at `path`
// into the caller-provided buffer `data` (which must hold >= num floats).
// Terminates the process with exit(-1) when the file cannot be opened.
void readFromFile(int num, std::string path, float* data) {
  std::ifstream file_stream(path);
  if (!file_stream.good()) {
    // Fixed typo: "dones" -> "does".
    std::cout << "file: " << path << " does not exist!\n";
    exit(-1);
  }
  for (int i = 0; i < num; ++i) {
    // A failed extraction leaves value at 0, so short files pad with 0.
    float value = 0;
    file_stream >> value;
    data[i] = value;
  }
  file_stream.close();
}
// #ifdef LITE_WITH_FPGA
// Smoke test for an FPGA-built model: reads the input spatial size and
// pixel values from FLAGS_input_file, runs the predictor twice, and
// dumps every value of output 0 to stdout.
// Input file layout: "<height> <width>" followed by 3*h*w floats (CHW).
TEST(ResNet50, test) {
  lite::Predictor predictor;
  // FPGA first so FP16/NHWC kernels are preferred; Host/ARM are
  // fallbacks for ops with no FPGA implementation.
  std::vector<Place> valid_places({
      Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
      Place{TARGET(kHost), PRECISION(kFloat)},
      Place{TARGET(kARM), PRECISION(kFloat)},
  });
  predictor.Build(FLAGS_model_dir, "", "", valid_places);

  auto* input_tensor = predictor.GetInput(0);
  int width = 416;   // defaults, overwritten from the input file below
  int height = 416;

  std::ifstream file_stream(FLAGS_input_file);
  if (!file_stream.good()) {
    std::cout << "file: " << FLAGS_input_file << " does not exist!\n";
    exit(-1);
  }
  // The first two values in the file select the input spatial size.
  file_stream >> height;
  file_stream >> width;

  input_tensor->Resize(
      DDim(std::vector<DDim::value_type>({1, 3, height, width})));
  auto* data = input_tensor->mutable_data<float>();
  auto item_size = input_tensor->dims().production();
  // Pre-fill so any element not covered by the file has a known value.
  for (int i = 0; i < item_size; i++) {
    data[i] = 1;
  }
  int num = 3 * width * height;
  for (int i = 0; i < num; ++i) {
    float value = 0;
    file_stream >> value;
    data[i] = value;
  }
  file_stream.close();

  // Run twice: the first iteration absorbs one-time warm-up cost.
  for (int i = 0; i < 2; ++i) {
    predictor.Run();
  }

  auto* out = predictor.GetOutput(0);
  for (int i = 0; i < out->dims().production(); i++) {
    std::cout << ":" << out->data<float>()[i] << std::endl;
  }
  LOG(INFO) << "================== Speed Report ===================";
}
// #endif
} // namespace lite
} // namespace paddle
......@@ -79,7 +79,7 @@ inline void read_from_file(lite::Tensor* t,const std::string& path) {
}
inline void save_float(float* data, const std::string& name, int len) {
return;
// return;
static int counter = 0;
std::string old_string = std::to_string(counter);
std::string new_string = std::string(3 - old_string.length(), '0') + old_string;
......
......@@ -59,7 +59,7 @@ class ConvPE : public PE {
if (param_.filter->shape().width() == 1 &&
param_.filter->shape().height() == 1) {
use_cpu_ = true;
// use_cpu_ = true;
}
if (!use_cpu_) {
// param_.filter->releaseData();
......
......@@ -395,7 +395,7 @@ class Tensor {
}
void save_file_with_name(std::string path) {
return;
// return;
invalidate();
// usleep(20000);
// return;
......
......@@ -84,11 +84,9 @@ class KernelBase {
#ifdef LITE_WITH_PROFILE
if (profile_id_ >= 0) {
profile::ProfileBlock x(profile_id_, "kernel");
Run();
}
#else
Run();
#endif
Run();
}
void SetContext(std::unique_ptr<KernelContext>&& ctx) {
......
......@@ -93,8 +93,10 @@ void TypeTargetTransformPass::AddIoCopyInst(
// Set the place for io_copy_output_arg node, the target should be equal to
// to.target()
// The precision and layout should be equal to from.precision(), from.layout()
#ifndef LITE_WITH_FPGA
io_copy_output_arg->AsArg().type =
LiteType::GetTensorTy(to.target(), from.precision(), from.layout());
#endif
auto* io_copy_inst = graph->NewInstructNode();
bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
......
......@@ -114,11 +114,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
void RuntimeProgram::Run() {
for (auto& inst : instructions_) {
std::string op_type = inst.op()->op_info()->Type();
#ifndef LITE_WITH_FPGA
if (op_type == "feed" || op_type == "fetch") continue;
#endif
inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
LITE_PRECISION_PROFILE(inst)
// LITE_PRECISION_PROFILE(inst)
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
......
......@@ -98,8 +98,10 @@ REGISTER_LITE_KERNEL(prior_box,
kNCHW,
paddle::lite::kernels::arm::PriorBoxCompute,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Image", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Input",{LiteType::GetTensorTy(
TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))})
.BindInput("Image", {LiteType::GetTensorTy(
TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))})
.BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
......@@ -7,15 +7,17 @@ set(fpga_deps fpga_target_wrapper kernel_fpga)
# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
# add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
# add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
......
......@@ -37,6 +37,7 @@ void FeedCompute::PrepareForRun() {
}
void FeedCompute::Run() {
std::cout << "================= FeedCompute ================= \n";
auto& param = this->Param<param_t>();
Tensor& x = param.feed_list->at(param.col);
zynqmp::InputParam& feed_param = pe_.param();
......
......@@ -157,11 +157,11 @@ REGISTER_LITE_KERNEL(io_copy,
device_to_host)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kAny),
DATALAYOUT(kAny))})
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM),
PRECISION(kAny),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
......
......@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/multiclass_nms_compute.h"
#include <map>
#include <utility>
#include <vector>
#include "lite/kernels/fpga/multiclass_nms_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
......@@ -196,11 +197,15 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
int num_det = 0;
int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1];
Tensor bbox_slice, score_slice;
// scores.ZynqTensor()->saveToFile("nms_scores", true);
for (int64_t c = 0; c < class_num; ++c) {
Tensor bbox_slice, score_slice;
if (c == background_label) continue;
if (scores_size == 3) {
score_slice = scores.Slice<T>(c, c + 1);
scores.Slice<T>(score_slice, c, c + 1);
// score_slice.ZynqTensor()->saveToFile("nms_slice", true);
bbox_slice = bboxes;
} else {
score_slice.Resize({scores.dims()[0], 1});
......@@ -208,7 +213,7 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
SliceOneClass<T>(scores, c, &score_slice);
SliceOneClass<T>(bboxes, c, &bbox_slice);
}
NMSFast(bbox_slice,
NMSFast(bboxes,// TODO bbox_slice
score_slice,
score_threshold,
nms_threshold,
......@@ -225,6 +230,9 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
*num_nmsed_out = num_det;
const T* scores_data = scores.data<T>();
if (keep_top_k > -1 && num_det > keep_top_k) {
Tensor score_slice;
const T* sdata;
std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
for (const auto& it : *indices) {
......@@ -315,48 +323,35 @@ void MultiClassOutput(const Tensor& scores,
void MulticlassNmsCompute::Run() {
auto& param = Param<operators::MulticlassNmsParam>();
auto* boxes = param.bboxes;
auto* scores = param.scores;
auto* outs = param.out;
outs->mutable_data<float>();
auto boxes_dims = boxes->dims();
auto boxes_size = boxes_dims.size();
auto score_dims = scores->dims();
auto score_size = score_dims.size();
Tensor bboxes_tensor;
bboxes_tensor.Resize(boxes_dims);
auto bboxes_data = bboxes_tensor.mutable_data<float>();
bboxes_tensor.ZynqTensor()->copyFrom(boxes->ZynqTensor());
Tensor score_tensor;
score_tensor.Resize(score_dims);
auto score_data = score_tensor.mutable_data<float>();
score_tensor.ZynqTensor()->copyFrom(scores->ZynqTensor());
auto box_dims = boxes->dims();
int64_t box_dim = boxes->dims()[2];
std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<uint64_t> batch_starts = {0};
int64_t batch_size = score_dims[0];
int64_t box_dim = boxes->dims()[2];
int64_t out_dim = box_dim + 2;
int num_nmsed_out = 0;
Tensor boxes_slice, scores_slice;
boxes_slice.mutable_data<float>();
scores_slice.mutable_data<float>();
int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1;
for (int i = 0; i < n; ++i) {
if (score_size == 3) {
scores_slice = score_tensor.Slice<float>(i, i + 1);
scores->Slice<float>(scores_slice, i, i + 1);
scores_slice.Resize({score_dims[1], score_dims[2]});
boxes_slice = bboxes_tensor.Slice<float>(i, i + 1);
boxes->Slice<float>(boxes_slice, i, i + 1);
boxes_slice.Resize({score_dims[2], box_dim});
} else {
auto boxes_lod = bboxes_tensor.lod().back();
scores_slice = score_tensor.Slice<float>(boxes_lod[i], boxes_lod[i + 1]);
boxes_slice = bboxes_tensor.Slice<float>(boxes_lod[i], boxes_lod[i + 1]);
auto boxes_lod = boxes->lod().back();
scores->Slice<float>(scores_slice, boxes_lod[i], boxes_lod[i + 1]);
boxes->Slice<float>(boxes_slice, boxes_lod[i], boxes_lod[i + 1]);
}
std::map<int, std::vector<int>> indices;
MultiClassNMS<float>(
......@@ -365,6 +360,8 @@ void MulticlassNmsCompute::Run() {
batch_starts.push_back(batch_starts.back() + num_nmsed_out);
}
uint64_t num_kept = batch_starts.back();
if (num_kept == 0) {
outs->Resize({1, 1});
......@@ -375,39 +372,44 @@ void MulticlassNmsCompute::Run() {
outs->Resize({static_cast<int64_t>(num_kept), out_dim});
for (int i = 0; i < n; ++i) {
if (score_size == 3) {
scores_slice = score_tensor.Slice<float>(i, i + 1);
boxes_slice = bboxes_tensor.Slice<float>(i, i + 1);
scores->Slice<float>(scores_slice, i, i + 1);
boxes->Slice<float>(boxes_slice, i, i + 1);
scores_slice.Resize({score_dims[1], score_dims[2]});
boxes_slice.Resize({score_dims[2], box_dim});
} else {
auto boxes_lod = boxes->lod().back();
scores_slice = score_tensor.Slice<float>(boxes_lod[i], boxes_lod[i + 1]);
boxes_slice = bboxes_tensor.Slice<float>(boxes_lod[i], boxes_lod[i + 1]);
scores->Slice<float>(scores_slice, boxes_lod[i], boxes_lod[i + 1]);
boxes->Slice<float>(boxes_slice, boxes_lod[i], boxes_lod[i + 1]);
}
int64_t s = static_cast<int64_t>(batch_starts[i]);
int64_t e = static_cast<int64_t>(batch_starts[i + 1]);
if (e > s) {
Tensor out = outs->Slice<float>(s, e);
Tensor out;
outs->Slice<float>(out, s, e);
// scores_slice.ZynqTensor()->saveToFile("scores_slice", true);
MultiClassOutput<float>(
scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
out.ZynqTensor()->saveToFile("out", true);
outs->ZynqTensor()->copyFrom(out.ZynqTensor());
}
}
}
// save_tensor(param.scores, "_scores.txt", false);
// save_tensor(param.bboxes, "_bboxes.txt", false);
boxes->ZynqTensor()->saveToFile("_boxes", true);
scores->ZynqTensor()->saveToFile("_scores", true);
outs->ZynqTensor()->saveToFile("_outs", true);
LoD lod;
lod.emplace_back(batch_starts);
outs->set_lod(lod);
// auto* boxes = param.bboxes;
// auto* scores = param.scores;
// auto* outs = param.out;
// boxes->ZynqTensor()->saveToFile("boxes", true);
// scores->ZynqTensor()->saveToFile("scores", true);
// param.out->ZynqTensor()->saveToFile("nms_", true);
}
} // namespace fpga
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -418,10 +420,7 @@ REGISTER_LITE_KERNEL(multiclass_nms,
kNHWC,
paddle::lite::kernels::fpga::MulticlassNmsCompute,
def)
.BindInput("BBoxes",
{LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Scores",
{LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
......@@ -3,7 +3,7 @@ message(STATUS "compile with lite host kernels")
add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_deps})
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
# add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
......@@ -392,7 +392,9 @@ REGISTER_LITE_KERNEL(multiclass_nms,
kNCHW,
paddle::lite::kernels::host::MulticlassNmsCompute,
def)
.BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("BBoxes", {LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
.BindInput("Scores", {LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册