From 92787a39bea2e8e76ebb3aeb592cda90cac6591a Mon Sep 17 00:00:00 2001
From: chonwhite <arthurbrown@163.com>
Date: Thu, 5 Dec 2019 20:04:16 +0800
Subject: [PATCH] added fpga multiclass_nms implementation

---
 lite/api/CMakeLists.txt                     |   5 +
 lite/api/cxx_api.cc                         |  14 ++
 lite/api/paddle_use_kernels.h               | 165 ++++++++++++++------
 lite/api/paddle_use_ops.h                   | 142 ++++++++++++-----
 lite/api/test_ssd_fpga.cc                   | 154 ++++++++++++++++++
 lite/backends/fpga/KD/debugger.hpp          |   2 +-
 lite/backends/fpga/KD/pes/conv_pe.hpp       |   2 +-
 lite/backends/fpga/KD/tensor.hpp            |   2 +-
 lite/core/kernel.h                          |   4 +-
 lite/core/mir/type_target_cast_pass.cc      |   2 +
 lite/core/program.cc                        |   5 +-
 lite/kernels/arm/prior_box_compute.cc       |   6 +-
 lite/kernels/fpga/CMakeLists.txt            |   6 +-
 lite/kernels/fpga/feed_compute.cc           |   1 +
 lite/kernels/fpga/io_copy_compute.cc        |   6 +-
 lite/kernels/fpga/multiclass_nms_compute.cc |  91 ++++++-----
 lite/kernels/host/CMakeLists.txt            |   2 +-
 lite/kernels/host/multiclass_nms_compute.cc |   6 +-
 18 files changed, 469 insertions(+), 146 deletions(-)
 mode change 100644 => 100755 lite/api/cxx_api.cc
 create mode 100755 lite/api/test_ssd_fpga.cc
 mode change 100644 => 100755 lite/core/kernel.h
 mode change 100644 => 100755 lite/core/mir/type_target_cast_pass.cc
 mode change 100644 => 100755 lite/core/program.cc
 mode change 100644 => 100755 lite/kernels/arm/prior_box_compute.cc
 mode change 100644 => 100755 lite/kernels/host/CMakeLists.txt
 mode change 100644 => 100755 lite/kernels/host/multiclass_nms_compute.cc

diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index 408a63e3f5..25ba812a2a 100755
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -194,6 +194,11 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
        CL_DEPS ${opencl_kernels}
        FPGA_DEPS ${fpga_kernels})
 
+    lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc
+       DEPS ${lite_model_test_DEPS}
+       CL_DEPS ${opencl_kernels}
+       FPGA_DEPS ${fpga_kernels})
+
     lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc
        DEPS ${lite_model_test_DEPS}
        CL_DEPS ${opencl_kernels}
diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc
old mode 100644
new mode 100755
index 4647f20bbe..463a0b8462
--- a/lite/api/cxx_api.cc
+++ b/lite/api/cxx_api.cc
@@ -121,6 +121,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
             << kpf_path;
 }
 
+#ifndef LITE_WITH_FPGA
 lite::Tensor *Predictor::GetInput(size_t offset) {
   CHECK(input_names_.size() > offset)
       << "The network has " << input_names_.size() << " inputs"
@@ -130,6 +131,19 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
                 << " in exec_scope";
   return in_var->GetMutable<lite::Tensor>();
 }
+#else
+lite::Tensor *Predictor::GetInput(size_t offset) {
+  // FPGA build: inputs are slots of the "feed" tensor list held in exec_scope_;
+  // the list is grown on demand so that `offset` always names a valid slot.
+  auto *feed_var = exec_scope_->FindVar("feed");
+  CHECK(feed_var) << "no feed variable in exec_scope";
+  auto &inputs = *feed_var->GetMutable<std::vector<lite::Tensor>>();
+  if (inputs.size() <= offset) inputs.resize(offset + 1);
+  return &inputs.at(offset);
+}
+#endif
+
+
 
 // get inputs names
 std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
diff --git a/lite/api/paddle_use_kernels.h b/lite/api/paddle_use_kernels.h
index 4fa21429c6..c4bc385ad7 100644
--- a/lite/api/paddle_use_kernels.h
+++ b/lite/api/paddle_use_kernels.h
@@ -1,21 +1,6 @@
 #pragma once
 #include "paddle_lite_factory_helper.h"
 
-USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out);
-USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out);
-USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out);
-USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out);
-USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(fill_constant_batch_size_like, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(relu_clipped, kARM, kFloat, kNCHW, def);
@@ -29,11 +14,87 @@ USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out);
-USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out);
-USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def);
+USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(logical_xor, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(logical_and, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(logical_or, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(logical_not, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(roi_align, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(scale, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(generate_proposals, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(crop, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(axpy, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(beam_search, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(norm, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(power, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8);
+USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32);
+USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8);
+USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32);
+USE_LITE_KERNEL(negative, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(less_than, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(equal, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(not_equal, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(less_equal, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(greater_than, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(greater_equal, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(multiclass_nms, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(lod_reset, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(box_clip, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(density_prior_box, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(decode_bboxes, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(gru, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(increment, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16);
+USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_arm_float);
+USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16);
+USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16);
+USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16);
+USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(elementwise_mul, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(is_empty, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(shape, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga);
+USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga);
+USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga);
+USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga);
+USE_LITE_KERNEL(sequence_expand, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(gather, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(im2sequence, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(reduce_max, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(density_prior_box, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(feed, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(lookup_table_v2, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def);
+USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def);
+USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def);
+USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def);
+USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(unsqueeze, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(unsqueeze2, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(layer_norm, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(assign, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(sequence_softmax, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(norm, kFPGA, kFP16, kNHWC, def);
 USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nchw2nhwc);
@@ -44,7 +105,6 @@ USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nchw2nhwc);
 USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nhwc2nchw);
 USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nchw2nhwc);
 USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nhwc2nchw);
-USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def);
@@ -55,33 +115,44 @@ USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
-USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(fetch, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(fill_constant_batch_size_like, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(while, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, host_to_device);
+USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, device_to_host);
+USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, host_to_device_once);
+USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, device_to_host_once);
+USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(conv2d, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(beam_search_decode, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(assign_value, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(gru, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out);
+USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out);
+USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out);
+USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out);
+USE_LITE_KERNEL(write_to_array, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(anchor_generator, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(dropout, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out);
+USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out);
+USE_LITE_KERNEL(lrn, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(pool2d, kFPGA, kFP16, kNHWC, def);
+USE_LITE_KERNEL(read_from_array, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
+USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(top_k, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(sequence_pool, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
-USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def);
-USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def);
-USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def);
-USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def);
-USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8);
-USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32);
-USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8);
-USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32);
-USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(unsqueeze, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(unsqueeze2, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def);
\ No newline at end of file
+USE_LITE_KERNEL(gru_unit, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(fc, kFPGA, kFP16, kNHWC, def);
\ No newline at end of file
diff --git a/lite/api/paddle_use_ops.h b/lite/api/paddle_use_ops.h
index faa4131728..e1976b20f3 100644
--- a/lite/api/paddle_use_ops.h
+++ b/lite/api/paddle_use_ops.h
@@ -1,48 +1,87 @@
 #pragma once
 #include "paddle_lite_factory_helper.h"
 
+USE_LITE_OP(lookup_table_v2);
 USE_LITE_OP(feed);
-USE_LITE_OP(mul);
-USE_LITE_OP(search_fc);
-USE_LITE_OP(conv2d);
-USE_LITE_OP(depthwise_conv2d);
+USE_LITE_OP(fake_channel_wise_dequantize_max_abs);
+USE_LITE_OP(assign);
 USE_LITE_OP(layout);
-USE_LITE_OP(split);
+USE_LITE_OP(transpose);
+USE_LITE_OP(transpose2);
+USE_LITE_OP(pool2d);
+USE_LITE_OP(batch_norm);
+USE_LITE_OP(reshape);
+USE_LITE_OP(reshape2);
+USE_LITE_OP(search_fc);
+USE_LITE_OP(sequence_reverse);
+USE_LITE_OP(matmul);
+USE_LITE_OP(slice);
+USE_LITE_OP(lod_reset);
+USE_LITE_OP(graph_op);
+USE_LITE_OP(expand);
+USE_LITE_OP(top_k);
+USE_LITE_OP(fake_quantize_range_abs_max);
+USE_LITE_OP(arg_max);
+USE_LITE_OP(beam_search);
+USE_LITE_OP(box_clip);
+USE_LITE_OP(fake_quantize_dequantize_moving_average_abs_max);
+USE_LITE_OP(box_coder);
+USE_LITE_OP(search_seq_depadding);
+USE_LITE_OP(write_to_array);
+USE_LITE_OP(is_empty);
+USE_LITE_OP(prior_box);
+USE_LITE_OP(sequence_concat);
+USE_LITE_OP(affine_channel);
+USE_LITE_OP(shape);
+USE_LITE_OP(axpy);
+USE_LITE_OP(anchor_generator);
+USE_LITE_OP(reduce_max);
+USE_LITE_OP(gru);
+USE_LITE_OP(uniform_random);
+USE_LITE_OP(unsqueeze);
+USE_LITE_OP(unsqueeze2);
+USE_LITE_OP(layout_once);
+USE_LITE_OP(search_group_padding);
+USE_LITE_OP(assign_value);
+USE_LITE_OP(norm);
+USE_LITE_OP(concat);
+USE_LITE_OP(fill_constant);
+USE_LITE_OP(fill_constant_batch_size_like);
+USE_LITE_OP(calib_once);
+USE_LITE_OP(decode_bboxes);
+USE_LITE_OP(negative);
+USE_LITE_OP(layer_norm);
+USE_LITE_OP(mean);
+USE_LITE_OP(lrn);
+USE_LITE_OP(fetch);
+USE_LITE_OP(power);
 USE_LITE_OP(stack);
-USE_LITE_OP(scale);
-USE_LITE_OP(multiclass_nms);
+USE_LITE_OP(gather);
+USE_LITE_OP(fc);
+USE_LITE_OP(gru_unit);
+USE_LITE_OP(increment);
+USE_LITE_OP(mul);
+USE_LITE_OP(search_seq_softmax);
+USE_LITE_OP(var_conv_2d);
+USE_LITE_OP(dropout);
 USE_LITE_OP(fusion_elementwise_sub_activation);
 USE_LITE_OP(fusion_elementwise_add_activation);
 USE_LITE_OP(fusion_elementwise_mul_activation);
 USE_LITE_OP(fusion_elementwise_max_activation);
 USE_LITE_OP(fusion_elementwise_div_activation);
-USE_LITE_OP(unsqueeze);
-USE_LITE_OP(unsqueeze2);
-USE_LITE_OP(sequence_topk_avg_pooling);
-USE_LITE_OP(transpose);
-USE_LITE_OP(transpose2);
-USE_LITE_OP(dropout);
-USE_LITE_OP(pool2d);
 USE_LITE_OP(elementwise_sub);
 USE_LITE_OP(elementwise_add);
 USE_LITE_OP(elementwise_mul);
 USE_LITE_OP(elementwise_max);
 USE_LITE_OP(elementwise_div);
-USE_LITE_OP(io_copy_once);
-USE_LITE_OP(batch_norm);
-USE_LITE_OP(reshape);
-USE_LITE_OP(reshape2);
-USE_LITE_OP(expand);
-USE_LITE_OP(matmul);
-USE_LITE_OP(concat);
-USE_LITE_OP(slice);
-USE_LITE_OP(graph_op);
 USE_LITE_OP(pad2d);
+USE_LITE_OP(crop);
+USE_LITE_OP(sequence_expand);
+USE_LITE_OP(search_aligned_mat_mul);
+USE_LITE_OP(io_copy);
 USE_LITE_OP(squeeze);
 USE_LITE_OP(squeeze2);
-USE_LITE_OP(arg_max);
-USE_LITE_OP(fill_constant);
-USE_LITE_OP(fill_constant_batch_size_like);
+USE_LITE_OP(reduce_sum);
 USE_LITE_OP(square);
 USE_LITE_OP(relu);
 USE_LITE_OP(leaky_relu);
@@ -59,22 +98,53 @@ USE_LITE_OP(hard_sigmoid);
 USE_LITE_OP(sqrt);
 USE_LITE_OP(rsqrt);
 USE_LITE_OP(softsign);
+USE_LITE_OP(sequence_expand_as);
+USE_LITE_OP(match_matrix_tensor);
 USE_LITE_OP(range);
-USE_LITE_OP(box_coder);
-USE_LITE_OP(calib);
-USE_LITE_OP(mean);
-USE_LITE_OP(conv2d_transpose);
-USE_LITE_OP(fetch);
-USE_LITE_OP(prior_box);
+USE_LITE_OP(lookup_table);
 USE_LITE_OP(fake_quantize_moving_average_abs_max);
-USE_LITE_OP(io_copy);
+USE_LITE_OP(search_grnn);
+USE_LITE_OP(conv2d_transpose);
+USE_LITE_OP(flatten);
+USE_LITE_OP(flatten2);
 USE_LITE_OP(nearest_interp);
 USE_LITE_OP(bilinear_interp);
 USE_LITE_OP(softmax);
 USE_LITE_OP(reduce_mean);
-USE_LITE_OP(affine_channel);
 USE_LITE_OP(cast);
 USE_LITE_OP(fake_dequantize_max_abs);
-USE_LITE_OP(shuffle_channel);
+USE_LITE_OP(read_from_array);
 USE_LITE_OP(yolo_box);
-USE_LITE_OP(fc);
\ No newline at end of file
+USE_LITE_OP(multiclass_nms);
+USE_LITE_OP(while);
+USE_LITE_OP(conv2d);
+USE_LITE_OP(depthwise_conv2d);
+USE_LITE_OP(split);
+USE_LITE_OP(scale);
+USE_LITE_OP(beam_search_decode);
+USE_LITE_OP(im2sequence);
+USE_LITE_OP(sequence_topk_avg_pooling);
+USE_LITE_OP(io_copy_once);
+USE_LITE_OP(roi_align);
+USE_LITE_OP(sequence_reshape);
+USE_LITE_OP(equal);
+USE_LITE_OP(notequal);
+USE_LITE_OP(less_than);
+USE_LITE_OP(less_equal);
+USE_LITE_OP(greater_than);
+USE_LITE_OP(greater_equal);
+USE_LITE_OP(calib);
+USE_LITE_OP(sequence_pool);
+USE_LITE_OP(attention_padding_mask);
+USE_LITE_OP(search_attention_padding_mask);
+USE_LITE_OP(density_prior_box);
+USE_LITE_OP(search_seq_fc);
+USE_LITE_OP(generate_proposals);
+USE_LITE_OP(sequence_arithmetic);
+USE_LITE_OP(search_seq_arithmetic);
+USE_LITE_OP(shuffle_channel);
+USE_LITE_OP(sequence_softmax);
+USE_LITE_OP(logical_xor);
+USE_LITE_OP(logical_and);
+USE_LITE_OP(logical_or);
+USE_LITE_OP(logical_not);
\ No newline at end of file
diff --git a/lite/api/test_ssd_fpga.cc b/lite/api/test_ssd_fpga.cc
new file mode 100755
index 0000000000..15c2637507
--- /dev/null
+++ b/lite/api/test_ssd_fpga.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <dirent.h>
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+
+DEFINE_string(input_file, "", "input_file");
+
+namespace paddle {
+namespace lite {
+
+// Returns the entry names (not full paths) that readdir() reports for `dir`;
+// logs the error and returns an empty vector when `dir` cannot be opened.
+std::vector<std::string> GetDirectoryFiles(const std::string& dir) {
+  std::vector<std::string> files;
+  std::unique_ptr<DIR, int (*)(DIR*)> handle(opendir(dir.c_str()), &closedir);
+  if (!handle) {
+    std::cout << "Error opening " << dir << " : " << std::strerror(errno)
+              << std::endl;
+    return files;
+  }
+  while (struct dirent* entry = readdir(handle.get())) {
+    files.emplace_back(entry->d_name);
+  }
+  return files;
+}
+
+// Reads `num` whitespace-separated floats from the text file at `path` into
+// `data`, which must have room for `num` elements. Exits the process with
+// status -1 when the file cannot be opened; if the file holds fewer than
+// `num` values, the remaining elements are stored as 0.
+void readFromFile(int num, std::string path, float* data) {
+  std::ifstream file_stream(path);
+  if (!file_stream.good()) {
+    std::cout << "file: " << path << " does not exist!\n";
+    exit(-1);
+  }
+  for (int i = 0; i < num; ++i) {
+    float value = 0;
+    file_stream >> value;
+    data[i] = value;
+  }
+}
+
+// #ifdef LITE_WITH_FPGA
+// End-to-end smoke test: builds the model in FLAGS_model_dir with FPGA as the
+// first valid place (Host and ARM are listed after it), feeds it the tensor
+// stored in FLAGS_input_file, runs the predictor twice and prints output 0.
+//
+// FLAGS_input_file is whitespace-separated text: height, width, then the
+// 3 * height * width float values of the {1, 3, height, width} input tensor.
+//
+// NOTE(review): the suite is still called ResNet50 although this file is
+// test_ssd_fpga.cc; renaming it changes the gtest filter name, so it is kept.
+TEST(ResNet50, test) {
+  lite::Predictor predictor;
+  std::vector<Place> valid_places({
+      Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  });
+
+  predictor.Build(FLAGS_model_dir,
+                  "",
+                  "",
+                  valid_places);
+
+  // Disabled alternative kept from the original author; it passes explicit
+  // "/model" and "/params" paths plus an extra Place argument to Build():
+  // predictor.Build(FLAGS_model_dir,
+  //                 FLAGS_model_dir + "/model",
+  //                 FLAGS_model_dir + "/params",
+  //                 Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+  //                 valid_places);
+
+  // A bad input path fails the test through gtest rather than terminating
+  // the whole process with exit().
+  std::ifstream file_stream(FLAGS_input_file);
+  ASSERT_TRUE(file_stream.good())
+      << "file: " << FLAGS_input_file << " does not exist!";
+
+  // The tensor extent comes from the file header. Reject a missing or
+  // non-positive header instead of resizing the input to a bogus shape.
+  int height = 0;
+  int width = 0;
+  file_stream >> height;
+  file_stream >> width;
+  ASSERT_FALSE(file_stream.fail())
+      << "file: " << FLAGS_input_file << " has no height/width header";
+  ASSERT_GT(height, 0)
+      << "file: " << FLAGS_input_file << " has an invalid height";
+  ASSERT_GT(width, 0)
+      << "file: " << FLAGS_input_file << " has an invalid width";
+
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(
+      DDim(std::vector<DDim::value_type>({1, 3, height, width})));
+  auto* data = input_tensor->mutable_data<float>();
+  const auto item_size = input_tensor->dims().production();
+
+  // Fill the whole tensor from the file; every element is overwritten, so no
+  // separate pre-fill is needed. A value that cannot be read is stored as 0
+  // and makes the assertion after the loop fail.
+  for (DDim::value_type i = 0; i < item_size; ++i) {
+    float value = 0;
+    file_stream >> value;
+    data[i] = value;
+  }
+  ASSERT_FALSE(file_stream.fail())
+      << "file: " << FLAGS_input_file << " holds fewer than " << item_size
+      << " input values";
+  file_stream.close();
+
+  // NOTE(review): the model is run twice; presumably the first pass is a
+  // warm-up -- confirm.
+  for (int i = 0; i < 2; ++i) {
+    predictor.Run();
+  }
+
+  // Print every element of output 0, one ":<value>" line per element.
+  auto* out = predictor.GetOutput(0);
+  const auto out_size = out->dims().production();
+  for (DDim::value_type i = 0; i < out_size; ++i) {
+    std::cout << ":" << out->data<float>()[i] << std::endl;
+  }
+
+  // NOTE(review): no timing is measured in this test yet; the banner below
+  // is only a placeholder.
+  LOG(INFO) << "================== Speed Report ===================";
+}
+// #endif
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp
index 70a92612f2..b549a6e5f3 100755
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -79,7 +79,7 @@ inline void read_from_file(lite::Tensor* t,const std::string& path) {
 }
 
 inline void save_float(float* data, const std::string& name, int len) {
-    return;
+    // return;
   static int counter = 0;
   std::string old_string = std::to_string(counter);
   std::string new_string = std::string(3 - old_string.length(), '0') + old_string;
diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp
index 73f0b7f5cd..2b943bb3ab 100755
--- a/lite/backends/fpga/KD/pes/conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/conv_pe.hpp
@@ -59,7 +59,7 @@ class ConvPE : public PE {
 
     if (param_.filter->shape().width() == 1 &&
         param_.filter->shape().height() == 1) {
-        use_cpu_ = true;
+        // use_cpu_ = true;
     }
     if (!use_cpu_) {
       // param_.filter->releaseData();
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
index 48c97a54f3..fb17cf01af 100755
--- a/lite/backends/fpga/KD/tensor.hpp
+++ b/lite/backends/fpga/KD/tensor.hpp
@@ -395,7 +395,7 @@ class Tensor {
   }
 
   void save_file_with_name(std::string path) {
-    return;
+    // return;
     invalidate();
     // usleep(20000);
     // return;
diff --git a/lite/core/kernel.h b/lite/core/kernel.h
old mode 100644
new mode 100755
index 05d7a6b333..176f6c69ac
--- a/lite/core/kernel.h
+++ b/lite/core/kernel.h
@@ -84,11 +84,9 @@ class KernelBase {
 #ifdef LITE_WITH_PROFILE
     if (profile_id_ >= 0) {
       profile::ProfileBlock x(profile_id_, "kernel");
-      Run();
     }
-#else
-    Run();
 #endif
+    Run();
   }
 
   void SetContext(std::unique_ptr<KernelContext>&& ctx) {
diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc
old mode 100644
new mode 100755
index b008faa687..d0e9cd4e2b
--- a/lite/core/mir/type_target_cast_pass.cc
+++ b/lite/core/mir/type_target_cast_pass.cc
@@ -93,8 +93,10 @@ void TypeTargetTransformPass::AddIoCopyInst(
   // Set the place for io_copy_output_arg node, the target should be equal to
   // to.target()
   // The precision and layout should be equal to from.precision(), from.layout()
+  #ifndef LITE_WITH_FPGA
   io_copy_output_arg->AsArg().type =
       LiteType::GetTensorTy(to.target(), from.precision(), from.layout());
+  #endif
   auto* io_copy_inst = graph->NewInstructNode();
 
   bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
diff --git a/lite/core/program.cc b/lite/core/program.cc
old mode 100644
new mode 100755
index b60f279c0f..686b373040
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -114,11 +114,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
 void RuntimeProgram::Run() {
   for (auto& inst : instructions_) {
     std::string op_type = inst.op()->op_info()->Type();
+
+#ifndef LITE_WITH_FPGA
     if (op_type == "feed" || op_type == "fetch") continue;
+#endif
     inst.Run();
 #ifdef LITE_WITH_PROFILE
 #ifdef LITE_WITH_PRECISION_PROFILE
-    LITE_PRECISION_PROFILE(inst)
+    // LITE_PRECISION_PROFILE(inst)
 #endif  // LITE_WITH_PRECISION_PROFILE
 #endif  // LITE_WITH_PROFILE
   }
diff --git a/lite/kernels/arm/prior_box_compute.cc b/lite/kernels/arm/prior_box_compute.cc
old mode 100644
new mode 100755
index 48ae1e94dd..aa3c2b3758
--- a/lite/kernels/arm/prior_box_compute.cc
+++ b/lite/kernels/arm/prior_box_compute.cc
@@ -98,8 +98,10 @@ REGISTER_LITE_KERNEL(prior_box,
                      kNCHW,
                      paddle::lite::kernels::arm::PriorBoxCompute,
                      def)
-    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("Image", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Input",{LiteType::GetTensorTy(
+                   TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))})
+    .BindInput("Image", {LiteType::GetTensorTy(
+                   TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))})
     .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt
index f431d408bb..246290c73a 100755
--- a/lite/kernels/fpga/CMakeLists.txt
+++ b/lite/kernels/fpga/CMakeLists.txt
@@ -7,15 +7,17 @@ set(fpga_deps fpga_target_wrapper kernel_fpga)
 
 # add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
 # add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
-add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
+# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
 add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
 add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
 add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
 add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
+# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
+
 add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
 add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
 # add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
-# add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
+add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
 add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
 # add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
 add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc
index 6f574c2ba0..5bd9bdaaf3 100755
--- a/lite/kernels/fpga/feed_compute.cc
+++ b/lite/kernels/fpga/feed_compute.cc
@@ -37,6 +37,7 @@ void FeedCompute::PrepareForRun() {
 }
 
 void FeedCompute::Run() {
+  std::cout << "================= FeedCompute ================= \n";
   auto& param = this->Param<param_t>();
   Tensor& x = param.feed_list->at(param.col);
   zynqmp::InputParam& feed_param = pe_.param();
diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc
index df6abfe118..fe7a7b737b 100755
--- a/lite/kernels/fpga/io_copy_compute.cc
+++ b/lite/kernels/fpga/io_copy_compute.cc
@@ -157,11 +157,11 @@ REGISTER_LITE_KERNEL(io_copy,
                      device_to_host)
     .BindInput("Input",
                {LiteType::GetTensorTy(TARGET(kFPGA),
-                                      PRECISION(kAny),
-                                      DATALAYOUT(kAny))})
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
     .BindOutput("Out",
                 {LiteType::GetTensorTy(TARGET(kARM),
-                                       PRECISION(kAny),
+                                       PRECISION(kFloat),
                                        DATALAYOUT(kNHWC))})
     .Finalize();
 
diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc
index be71642488..89c652a38e 100755
--- a/lite/kernels/fpga/multiclass_nms_compute.cc
+++ b/lite/kernels/fpga/multiclass_nms_compute.cc
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "lite/kernels/fpga/multiclass_nms_compute.h"
 #include <map>
 #include <utility>
 #include <vector>
 
-#include "lite/kernels/fpga/multiclass_nms_compute.h"
+#include "lite/backends/fpga/KD/debugger.hpp"
 
 namespace paddle {
 namespace lite {
@@ -196,11 +197,15 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
   int num_det = 0;
 
   int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1];
-  Tensor bbox_slice, score_slice;
+  
+  // scores.ZynqTensor()->saveToFile("nms_scores", true);
+
   for (int64_t c = 0; c < class_num; ++c) {
+    Tensor bbox_slice, score_slice;
     if (c == background_label) continue;
     if (scores_size == 3) {
-      score_slice = scores.Slice<T>(c, c + 1);
+      scores.Slice<T>(score_slice, c, c + 1);
+      // score_slice.ZynqTensor()->saveToFile("nms_slice", true);
       bbox_slice = bboxes;
     } else {
       score_slice.Resize({scores.dims()[0], 1});
@@ -208,7 +213,7 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
       SliceOneClass<T>(scores, c, &score_slice);
       SliceOneClass<T>(bboxes, c, &bbox_slice);
     }
-    NMSFast(bbox_slice,
+    NMSFast(bboxes,  // TODO: restore bbox_slice; the unsliced boxes are passed for now
             score_slice,
             score_threshold,
             nms_threshold,
@@ -225,6 +230,9 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
   *num_nmsed_out = num_det;
   const T* scores_data = scores.data<T>();
   if (keep_top_k > -1 && num_det > keep_top_k) {
+
+    Tensor score_slice;
+
     const T* sdata;
     std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
     for (const auto& it : *indices) {
@@ -315,48 +323,35 @@ void MultiClassOutput(const Tensor& scores,
 
 void MulticlassNmsCompute::Run() {
   auto& param = Param<operators::MulticlassNmsParam>();
-
   auto* boxes = param.bboxes;
   auto* scores = param.scores;
   auto* outs = param.out;
+  outs->mutable_data<float>();
 
-  auto boxes_dims = boxes->dims();
-  auto boxes_size = boxes_dims.size();
   auto score_dims = scores->dims();
   auto score_size = score_dims.size();
 
-  Tensor bboxes_tensor;
-  bboxes_tensor.Resize(boxes_dims);
-  auto bboxes_data = bboxes_tensor.mutable_data<float>();
-  bboxes_tensor.ZynqTensor()->copyFrom(boxes->ZynqTensor());
-
-  Tensor score_tensor;
-  score_tensor.Resize(score_dims);
-  auto score_data = score_tensor.mutable_data<float>();
-  score_tensor.ZynqTensor()->copyFrom(scores->ZynqTensor());
-
+  auto box_dims = boxes->dims();
+  int64_t box_dim = boxes->dims()[2];
 
   std::vector<std::map<int, std::vector<int>>> all_indices;
   std::vector<uint64_t> batch_starts = {0};
   int64_t batch_size = score_dims[0];
-  int64_t box_dim = boxes->dims()[2];
+  
   int64_t out_dim = box_dim + 2;
   int num_nmsed_out = 0;
   Tensor boxes_slice, scores_slice;
-  boxes_slice.mutable_data<float>();
-  scores_slice.mutable_data<float>();
-
   int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1;
   for (int i = 0; i < n; ++i) {
     if (score_size == 3) {
-      scores_slice = score_tensor.Slice<float>(i, i + 1);
+      scores->Slice<float>(scores_slice, i, i + 1);
       scores_slice.Resize({score_dims[1], score_dims[2]});
-      boxes_slice = bboxes_tensor.Slice<float>(i, i + 1);
+      boxes->Slice<float>(boxes_slice, i, i + 1);
       boxes_slice.Resize({score_dims[2], box_dim});
     } else {
-      auto boxes_lod = bboxes_tensor.lod().back();
-      scores_slice = score_tensor.Slice<float>(boxes_lod[i], boxes_lod[i + 1]);
-      boxes_slice = bboxes_tensor.Slice<float>(boxes_lod[i], boxes_lod[i + 1]);
+      auto boxes_lod = boxes->lod().back();
+      scores->Slice<float>(scores_slice, boxes_lod[i], boxes_lod[i + 1]);
+      boxes->Slice<float>(boxes_slice, boxes_lod[i], boxes_lod[i + 1]);
     }
     std::map<int, std::vector<int>> indices;
     MultiClassNMS<float>(
@@ -365,6 +360,8 @@ void MulticlassNmsCompute::Run() {
     batch_starts.push_back(batch_starts.back() + num_nmsed_out);
   }
 
+
+
   uint64_t num_kept = batch_starts.back();
   if (num_kept == 0) {
     outs->Resize({1, 1});
@@ -375,39 +372,44 @@ void MulticlassNmsCompute::Run() {
     outs->Resize({static_cast<int64_t>(num_kept), out_dim});
     for (int i = 0; i < n; ++i) {
       if (score_size == 3) {
-        scores_slice = score_tensor.Slice<float>(i, i + 1);
-        boxes_slice = bboxes_tensor.Slice<float>(i, i + 1);
+        scores->Slice<float>(scores_slice, i, i + 1);
+        boxes->Slice<float>(boxes_slice, i, i + 1);
         scores_slice.Resize({score_dims[1], score_dims[2]});
         boxes_slice.Resize({score_dims[2], box_dim});
       } else {
         auto boxes_lod = boxes->lod().back();
-        scores_slice = score_tensor.Slice<float>(boxes_lod[i], boxes_lod[i + 1]);
-        boxes_slice = bboxes_tensor.Slice<float>(boxes_lod[i], boxes_lod[i + 1]);
+        scores->Slice<float>(scores_slice, boxes_lod[i], boxes_lod[i + 1]);
+        boxes->Slice<float>(boxes_slice, boxes_lod[i], boxes_lod[i + 1]);
       }
       int64_t s = static_cast<int64_t>(batch_starts[i]);
       int64_t e = static_cast<int64_t>(batch_starts[i + 1]);
+
       if (e > s) {
-        Tensor out = outs->Slice<float>(s, e);
+        Tensor out;
+        outs->Slice<float>(out, s, e);
+        // scores_slice.ZynqTensor()->saveToFile("scores_slice", true);
         MultiClassOutput<float>(
             scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
+        out.ZynqTensor()->saveToFile("out", true); 
+        outs->ZynqTensor()->copyFrom(out.ZynqTensor());
       }
     }
   }
 
+
+  // save_tensor(param.scores, "_scores.txt", false);
+  // save_tensor(param.bboxes, "_bboxes.txt", false);
+
+  boxes->ZynqTensor()->saveToFile("_boxes", true);
+  scores->ZynqTensor()->saveToFile("_scores", true);
+  outs->ZynqTensor()->saveToFile("_outs", true);
+
   LoD lod;
   lod.emplace_back(batch_starts);
 
   outs->set_lod(lod);
-
-
-  //   auto* boxes = param.bboxes;
-  // auto* scores = param.scores;
-  // auto* outs = param.out;
-  // boxes->ZynqTensor()->saveToFile("boxes", true);
-  //   scores->ZynqTensor()->saveToFile("scores", true);
-  // param.out->ZynqTensor()->saveToFile("nms_", true);
 }
-}  // namespace fpga
+}  // namespace fpga
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
@@ -418,10 +420,7 @@ REGISTER_LITE_KERNEL(multiclass_nms,
                      kNHWC,
                      paddle::lite::kernels::fpga::MulticlassNmsCompute,
                      def)
-    .BindInput("BBoxes",
-               {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("Scores",
-               {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("Out",
-                {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt
old mode 100644
new mode 100755
index 428cc213ce..2751641f45
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
@@ -3,7 +3,7 @@ message(STATUS "compile with lite host kernels")
 add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
-add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
+# add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
 
 #lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
 #lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
diff --git a/lite/kernels/host/multiclass_nms_compute.cc b/lite/kernels/host/multiclass_nms_compute.cc
old mode 100644
new mode 100755
index 6f6079ef88..131b51b90a
--- a/lite/kernels/host/multiclass_nms_compute.cc
+++ b/lite/kernels/host/multiclass_nms_compute.cc
@@ -392,7 +392,9 @@ REGISTER_LITE_KERNEL(multiclass_nms,
                      kNCHW,
                      paddle::lite::kernels::host::MulticlassNmsCompute,
                      def)
-    .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("BBoxes", {LiteType::GetTensorTy(
+                   TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
+    .BindInput("Scores", {LiteType::GetTensorTy(
+                   TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
-- 
GitLab