Commit d3d793c7 authored by chonwhite

YOLOV3 works

Parent a59d6fab
@@ -18,19 +18,14 @@ namespace paddle {
namespace zynqmp {

class Action {
 public:
  void readScale(float* scale) {}
  void writeScale(float* scale) {}

 private:
  int id_ = -1;
  int scaleIndex_ = -1;
};

}  // namespace zynqmp
}  // namespace paddle
\ No newline at end of file
@@ -21,20 +21,16 @@ namespace paddle {
namespace zynqmp {

class Transaction {
 public:
  void appendAction(Action* action) { actions_.push_back(action); }
  void startTraction() {}

 private:
  std::vector<Action*> actions_;
  int id_ = -1;
};

}  // namespace zynqmp
}  // namespace paddle
\ No newline at end of file
@@ -20,7 +20,7 @@ namespace paddle {
namespace zynqmp {

class TransactionManager {
 public:
  static TransactionManager& get_instance() {
    static TransactionManager s_instance;
    return s_instance;
@@ -34,14 +34,11 @@ public:
    return currentTransaction_;
  }
  void endTransaction() { currentTransaction_ = nullptr; }

 private:
  Transaction* currentTransaction_ = nullptr;
  std::vector<Transaction*> transactions_;
};

}  // namespace zynqmp
}  // namespace paddle
\ No newline at end of file
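Review note: a minimal usage sketch of the Action/Transaction/TransactionManager trio added above. Only get_instance(), appendAction(), startTraction(), and endTransaction() appear in this commit; beginTransaction() below is an assumed counterpart that allocates and stores currentTransaction_.

  // Hypothetical wiring, not part of this commit.
  void run_actions(paddle::zynqmp::Action* action) {
    auto& manager = paddle::zynqmp::TransactionManager::get_instance();
    paddle::zynqmp::Transaction* txn = manager.beginTransaction();  // assumed API
    txn->appendAction(action);  // queue work on the current transaction
    txn->startTraction();       // kick it off (method name as committed)
    manager.endTransaction();   // drop the current-transaction pointer
  }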
@@ -15,8 +15,5 @@ limitations under the License. */
#include "io.hpp"

namespace paddle {
namespace zynqmp {}  // namespace zynqmp
}  // namespace paddle
@@ -70,6 +70,10 @@ class ConvPE : public PE {
        param_.input->shape().channel() >= 2048) {
      use_cpu_ = true;
    }
    if (param_.filter->shape().width() == 1 &&
        param_.filter->shape().num() % 16 != 0) {
      use_cpu_ = true;
    }
    if (!use_cpu_) {
      // param_.filter->releaseData();
    }
@@ -93,34 +97,38 @@ class ConvPE : public PE {
    float* filter_data = param_.filter->data<float>();
    float* mi = new float[in_channel];
    int wh = input->shape().width() * input->shape().height();
    float max = 0;

    for (int i = 0; i < out_channel; i++) {
      float* filter_ptr = filter_data + i * in_channel;
      // #pragma omp parallel for
      for (int k = 0; k < wh; k++) {
        float* image = image_addr;
        float* out_ptr = mi;
        for (int j = 0; j < in_channel; j++) {
          float value = image_addr[k * in_channel + j] * filter_ptr[j];
          mi[j] = value;
        }

        float sum = 0;
        for (int j = 0; j < in_channel; j++) {
          sum += mi[j];
        }
        sum *= param_.scale()->data<float>()[i];
        sum += param_.bias()->data<float>()[i];
        out[i * wh + k] = sum;
        max = std::max(max, std::abs(sum));
      }
    }
    delete[] mi;
    float_output.flush();
    output->copyFrom(&float_output);
    output->scale()[0] = max / 127.0;
    output->scale()[1] = 127.0 / max;
    // output->saveToFile("cpu", true);
  }

  bool dispatch() {
@@ -206,7 +214,6 @@ class ConvPE : public PE {
        // std::cout << "\n ================== EW ================== \n";
        // }
    }
    return ret == 0;
  }
......
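Review note: the CPU fallback above now tags its output with a dynamic int8 quantization scale (scale()[0] = max/127, scale()[1] = 127/max). A self-contained sketch of that convention, with illustrative names only:

  #include <algorithm>
  #include <cmath>
  #include <cstdint>

  // scale0 = max_abs / 127 maps int8 back to float;
  // scale1 = 127 / max_abs maps float to int8.
  int8_t quantize(float v, float max_abs) {
    float q = v * (127.0f / max_abs);
    q = std::max(-127.0f, std::min(127.0f, q));  // clamp to the int8 range
    return static_cast<int8_t>(std::lround(q));
  }

  float dequantize(int8_t q, float max_abs) { return q * (max_abs / 127.0f); }

Note that an all-zero output (max == 0) would make 127.0 / max divide by zero; the kernel does not guard against that case.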
@@ -262,7 +262,7 @@ bool PriorBoxPE::dispatch() {
    param_.outputBoxes->copyFrom(this->cachedBoxes_);
    param_.outputVariances->copyFrom(this->cachedVariances_);
    param_.outputBoxes->flush();
    // param_.outputBoxes->syncToCPU();
    param_.outputVariances->flush();
......
@@ -84,28 +84,26 @@ class ResizePE : public PE {
    param_.input->syncToCPU();
    for (int h = 0; h < in_height; h++) {
      for (int w = 0; w < in_width; w++) {
        int src_index = in_width * channel * h + w * channel;
        float16* src = param_.input->data<float16>() + src_index;
        // std::cout << "src_index:" << src_index << std::endl;
        for (int v = 0; v < factor; v++) {
          for (int i = 0; i < factor; i++) {
            int dst_index = out_width * channel * h * factor +
                            out_width * channel * v + w * channel * factor +
                            channel * i;
            float16* dst = param_.output->data<float16>() + dst_index;
            memcpy(dst, src, channel * sizeof(float16));
            // std::cout << "dst_index:" << dst_index << std::endl;
          }
        }
      }
    }
    param_.output->flush();
    param_.output->copyScaleFrom(param_.input);
  }

  bool dispatch() {
    cpu_compute();
......
@@ -158,8 +158,9 @@ class ScalePE : public PE {
        int index = i * input->shape().channel() + c;
        float x = image_addr[index];
        float y = half_to_float(scale_data[c]);
        float value = x * y;
        // std::cout << " x = " << std::to_string(x) << " y = " <<
        // std::to_string(y) << " v = " << std::to_string(value) << std::endl;
        // float value = half_to_float(in_data[index]) * 19.3598f;
        data_out[index] = float_to_half(value);
@@ -188,9 +189,9 @@ class ScalePE : public PE {
    // dw_param.quantizedFilter()->flush();
    // }
    // param_.input->syncToDevice();
    return dw_pe_.dispatch();
    // cpu_compute();
    // return true;
  }
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>

#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"

namespace paddle {
namespace zynqmp {

inline float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }
inline void GetYoloBox(float* box, const float* x, const int* anchors, int w,
                       int h, int an_idx, int grid_size, int input_size,
                       int index, int img_height, int img_width) {
  box[0] = (w + sigmoid(x[index])) * img_width * 1.0f / grid_size;
  box[1] = (h + sigmoid(x[index + 1])) * img_height * 1.0f / grid_size;
  box[2] = std::exp(x[index + 2]) * anchors[2 * an_idx] * img_width * 1.0f /
           input_size;
  box[3] = std::exp(x[index + 3]) * anchors[2 * an_idx + 1] * img_height *
           1.0f / input_size;
}
inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
int an_num, int an_stride, int stride,
int entry) {
return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
}
inline void CalcDetectionBox(float* boxes, float* box, const int box_idx,
const int img_height,
const int img_width) {
boxes[box_idx] = box[0] - box[2] / 2;
boxes[box_idx + 1] = box[1] - box[3] / 2;
boxes[box_idx + 2] = box[0] + box[2] / 2;
boxes[box_idx + 3] = box[1] + box[3] / 2;
boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : 0;
  boxes[box_idx + 1] = boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : 0;
  boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 ? boxes[box_idx + 2]
                                                          : (img_width - 1);
  boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
                           ? boxes[box_idx + 3]
                           : (img_height - 1);
}
inline void CalcLabelScore(float* scores, const float* input,
const int label_idx, const int score_idx,
const int class_num, const float conf) {
for (int i = 0; i < class_num; i++) {
scores[score_idx + i] = conf * sigmoid(input[label_idx + i]);
// std::cout << scores[score_idx + i] << " ";
}
// std::cout << std::endl;
}
class YoloBoxPE : public PE {
public:
bool init() {
param_.outputBoxes->setAligned(false);
param_.outputScores->setAligned(false);
param_.outputBoxes->setDataLocation(CPU);
param_.outputScores->setDataLocation(CPU);
return true;
}
bool dispatch() {
auto* input = param_.input;
auto* imgsize = param_.imgSize;
auto* boxes = param_.outputBoxes;
auto* scores = param_.outputScores;
auto anchors = param_.anchors;
int class_num = param_.classNum;
float conf_thresh = param_.confThresh;
int downsample_ratio = param_.downsampleRatio;
const int num = input->shape().num();
const int height = input->shape().height();
const int width = input->shape().width();
const int box_num = boxes->shape().channel();
const int an_num = anchors.size() / 2;
int input_size = downsample_ratio * height;
const int stride = height * width;
const int an_stride = (class_num + 5) * stride;
Tensor anchors_;
Shape anchors_shape(N, {an_num * 2});
auto anchors_data = anchors_.mutableData<int32_t>(INT32, anchors_shape);
std::copy(anchors.begin(), anchors.end(), anchors_data);
input->syncToCPU();
input->unalignImage();
// input->setAligned(false);
Tensor input_float;
input_float.setDataLocation(CPU);
float* input_data = input_float.mutableData<float>(FP32, input->shape());
input_float.copyFrom(input);
// input_float.saveToFile("input_yolobox_half", "true");
// input_float.setAligned(input->aligned());
// input_float.unalignImage();
// std::cout << "-------------unalignImage-----------------" << std::endl;
// for (int i = 0; i < input_float.shape().numel(); ++i)
// {
// std::cout << input_data[i] << " ";
// }
// std::cout << "-" << std::endl;
// std::cout << "-------------unalignImage-----------------" << std::endl;
// input_float.setAligned(false);
// input_float.saveToFile("input_yolobox_float", "true");
// input_float.syncToCPU();
// input_float.invalidate();
imgsize->saveToFile("img_size", true);
const int32_t* imgsize_data = imgsize->data<int32_t>();
  Tensor boxes_float;
  Tensor scores_float;
  boxes_float.setDataLocation(CPU);
  float* boxes_float_data =
      boxes_float.mutableData<float>(FP32, boxes->shape());
  memset(boxes_float_data, 0, boxes->shape().numel() * sizeof(float));
  scores_float.setDataLocation(CPU);
  float* scores_float_data =
      scores_float.mutableData<float>(FP32, scores->shape());
  memset(scores_float_data, 0, scores->shape().numel() * sizeof(float));
// float* boxes_data = boxes->mutableData<float>();
// memset(boxes_data, 0, boxes->shape().numel() * sizeof(float));
// float* scores_data = scores->mutableData<float>();
// memset(scores_data, 0, scores->shape().numel() * sizeof(float));
float box[4];
// for (int n = 0; n < num; n++) {
// int img_height = imgsize_data[2 * i];
// int img_width = imgsize_data[2 * i + 1];
int img_height = imgsize_data[0];
int img_width = imgsize_data[1];
std::cout << "YoloBoxPE imgsize:" << img_height << "," << img_width << std::endl;
int channel = input_float.shape().channel();
int count = 0;
  for (int h = 0; h < height; h++) {
    for (int w = 0; w < width; w++) {
      for (int n = 0; n < an_num; n++) {
        int obj_idx =
            channel * width * h + channel * w + n * (5 + class_num) + 4;
        // std::cout << obj_idx << " ";
        float conf = sigmoid(input_data[obj_idx]);
        if (conf < conf_thresh) {
          count++;
          continue;
        }
        int box_idx =
            channel * width * h + channel * w + n * (5 + class_num) + 0;
        GetYoloBox(box, input_data, anchors_data, w, h, n, height, input_size,
                   box_idx, img_height, img_width);
        box_idx = h * an_num * 4 * width + an_num * 4 * w + n * 4;
        CalcDetectionBox(boxes_float_data, box, box_idx, img_height,
                         img_width);
        int label_idx =
            channel * width * h + channel * w + n * (5 + class_num) + 5;
        int score_idx = h * an_num * class_num * width +
                        an_num * class_num * w + n * class_num;
        CalcLabelScore(scores_float_data, input_data, label_idx, score_idx,
                       class_num, conf);
      }
    }
  }
  boxes->copyFrom(&boxes_float);
  scores->copyFrom(&scores_float);
  input->setAligned(true);
  return true;
  }

  void apply() {}
YoloBoxParam& param() { return param_; }
private:
YoloBoxParam param_;
};
} // namespace zynqmp
} // namespace paddle
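Review note: GetEntryIndex is defined but unused; dispatch() indexes the NHWC input inline instead. A small self-contained check of that offset math, with made-up sizes:

  #include <cassert>

  void nhwc_index_example() {
    // For cell (h, w) and anchor n, the objectness logit sits at channel
    // offset n * (5 + class_num) + 4 within that pixel.
    int class_num = 80, an_num = 3, width = 13;
    int channel = an_num * (5 + class_num);
    int h = 2, w = 5, n = 1;
    int obj_idx = channel * width * h + channel * w + n * (5 + class_num) + 4;
    // Same thing grouped per pixel: (row-major pixel index) * channel + offset.
    assert(obj_idx == (h * width + w) * channel + n * (5 + class_num) + 4);
  }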
@@ -266,23 +266,21 @@ class Tensor {
      return;
    }
    BypassArgs args;
    args.input_data_type =
        src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
    args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
    args.input_layout_type = LAYOUT_HWC;
    args.output_layout_type = LAYOUT_HWC;
    args.image = {.address = src->data<void>(),
                  .scale_address = src->scale(),
                  .channels = (uint32_t)src->shape().numel(),
                  .width = 1,
                  .height = 1,
                  .pad_width = 0U,
                  .pad_height = 0U};

    ImageOutputArgs output = {
        .address = data<void>(), .scale_address = scale(),
    };
    args.output = output;
@@ -385,10 +383,11 @@ class Tensor {
  void save_file_with_name(std::string path) {
    // std::cout << "saving file: " << path << std::endl;
    void* add = (void*)this;
    // printf("tensor @: %p data: %p \n", (void*)add, (void*)data<void>());
    // return;
    std::ofstream ofs;
    ofs.open(path);

    ofs << "data type: " << dataType() << std::endl;
    ofs << scale()[0] << " / " << scale()[1] << std::endl;
    for (int i = 0; i < shape_->numel(); i++) {
@@ -406,13 +405,14 @@ class Tensor {
      if (dataType_ == INT32) {
        value = data<int32_t>()[i];
      }
      if (i < 10) {
        std::cout << value << ",";
      }
      // if (i > 1000) {
      //   break;
      // }
      ofs << value << std::endl;
    }
    usleep(30000);
    ofs.close();
@@ -465,7 +465,6 @@ class Tensor {
      value = half_to_float(tensor.data<float16>()[i]);
    }
    os << value << " ";
  }
  os << "\n";
  return os;
......
@@ -166,6 +166,9 @@ class TensorLite {
  void clear() {
    // zynq_tensor_->releaseData();
    if (zynq_tensor_) {
      memset(zynq_tensor_->data<void>(), 0, zynq_tensor_->memorySize());
    }
  }

  template <typename T>
......
@@ -105,7 +105,6 @@ class ChannelWiseDequantOpFuser : public FuseBase {
 */
class DeleteQuantDequantOpFuser : public FuseBase {
 public:
  void BuildPattern() override;
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
......
@@ -84,13 +84,13 @@ class KernelPlaceCorrectPass : public DebugPass {
      std::string node_name = out->AsArg().name;
      std::string arg_name = get_argname(node_name, inst.op_info()->outputs());
      auto op_type = inst.op_type();

      if (op_type == "reshape" || op_type == "reshape2") {
        for (auto* x_in : x->inlinks) {
          std::string in_name =
              get_argname(x_in->AsArg().name, inst.op_info()->inputs());
          // std::cout << "name: " << x_in->AsArg().name << std::endl;
          // std::cout << "in_name: " << in_name << std::endl;
          if (in_name == "X") {
@@ -101,9 +101,11 @@ class KernelPlaceCorrectPass : public DebugPass {
          }
          p = in->AsArg().type->precision();
          if (p != PrecisionType::kFP16) {
            // std::cout << "found an arm ............... : " <<
            // inst.kernels().size() << std::endl;
            // std::cout << "tt:" << TargetRepr(inst.kernels()[0]->target()) <<
            // std::endl;
            UpdateTarget(inst, TargetType::kHost);
            UpdateTensor(inst, in, out, TargetType::kHost);
          }
@@ -113,8 +115,9 @@ class KernelPlaceCorrectPass : public DebugPass {
        UpdateTarget(inst, TargetType::kFPGA);
      }

      if (inst.op_type() == "split" || inst.op_type() == "transpose" ||
          inst.op_type() == "transpose2") {
        if (p != PrecisionType::kFP16) {
          UpdateTarget(inst, TargetType::kARM);
          for (auto* x_out : x->outlinks) {
            UpdateTensor(inst, in, x_out, TargetType::kARM);
@@ -123,9 +126,12 @@ class KernelPlaceCorrectPass : public DebugPass {
        }
      }

      if (inst.op_type() == "concat") {
        std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target())
                  << std::endl;
        std::cout << "concat p:"
                  << PrecisionToStr(inst.kernels()[0]->precision())
                  << std::endl;
        if (p != PrecisionType::kFP16) {
          UpdateTarget(inst, TargetType::kARM);
          UpdateTensor(inst, in, out, TargetType::kARM);
        }
@@ -134,8 +140,9 @@ class KernelPlaceCorrectPass : public DebugPass {
      // if (inst.op_type() == "elementwise_mul") {
      //   for (auto* x_in : x->inlinks) {
      //     std::string in_name = get_argname(x_in->AsArg().name,
      //     inst.op_info()->inputs());
      //     std::cout << "name: " << x_in->AsArg().name << std::endl;
      //     std::cout << "in_name: " << in_name << std::endl;
      //     if (in_name == "Y") {
@@ -150,7 +157,6 @@ class KernelPlaceCorrectPass : public DebugPass {
      //       UpdateTensor(inst, in, out, TargetType::kARM);
      //     }
      //   }

      std::vector<TargetType> in_types;
      std::vector<TargetType> out_types;
@@ -164,11 +170,13 @@ class KernelPlaceCorrectPass : public DebugPass {
        auto type = inst.picked_kernel().GetInputDeclType(arg_name);
        // std::cout << arg_name << " is weight:: "
        //           << std::to_string(x_in->AsArg().is_weight)
        //           << " is persist: "
        //           << std::to_string(x_in->AsArg().is_persist) << std::endl;
        // std::cout << " type: " << inst.op_type() << std::endl;
        if (!x_in->AsArg().is_weight) {
          auto p = x_in->AsArg().type->precision();
          auto t = x_in->AsArg().type->target();
@@ -224,10 +232,10 @@ class KernelPlaceCorrectPass : public DebugPass {
    }
  }

  // Update me's kUnk fields by other's fields.
  void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) {  // NOLINT
    // std::cout << "1 kernels: " << std::to_string(inst.kernels().size())
    //           << std::endl;
    auto new_place = inst.place();
    new_place.target = new_target;
@@ -244,25 +252,30 @@ class KernelPlaceCorrectPass : public DebugPass {
    std::vector<Place> places;
    places.push_back(new_place);
    inst.ResetKernels(places);
    // std::cout << "2 kernels: " << std::to_string(inst.kernels().size())
    //           << std::endl;
  }

  void UpdateTensor(mir::Node::Stmt& inst,
                    Node* in,
                    Node* out,
                    TargetType new_target = TargetType::kUnk) {
    auto get_argname = [&](
        const std::string& node_name,
        const std::map<std::string, std::vector<std::string>>& argname_map)
        -> std::string {
      for (auto& ele : argname_map) {
        auto it = std::find(ele.second.begin(), ele.second.end(), node_name);
        if (it != ele.second.end()) return ele.first;
      }
      return "";
    };

    std::string arg_name =
        get_argname(out->AsArg().name, inst.op_info()->outputs());
    std::string in_name =
        get_argname(in->AsArg().name, inst.op_info()->inputs());

    auto type = inst.picked_kernel().GetInputDeclType(in_name);
    auto tmp_ptype = in->AsArg().type->precision();
@@ -281,7 +294,8 @@ class KernelPlaceCorrectPass : public DebugPass {
      tmp_layout = DataLayoutType::kNCHW;
    }
    out->AsArg().type =
        LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout);
  }
};
......
@@ -80,6 +80,8 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
    std::sort(scored.begin(), scored.end(), KernelScoreCmp);
    instruct.kernels().clear();

    VLOG(2) << "picking kernel " << scored.front().second->name() << "\n\n";

    if (!instruct.op_info()->HasAttr("enable_int8")) {
      // Move kernel back
      // Just keep a single best kernel.
......
@@ -157,7 +157,21 @@ class StaticKernelPickPass : public mir::StmtPass {
          }
        }
        if (in_match) {
          final_score += 1000;
        }
bool out_match = true;
for (size_t i = 0; i < out_names.size(); ++i) {
std::string tmp;
CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp));
if (out_types.count(out_names[i]) &&
out_types.at(out_names[i]) !=
kernel.GetOutputDeclType(tmp)->precision()) {
out_match = false;
}
}
if (out_match) {
final_score += 1000;
        }
      }
......
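Review note: the scoring change above replaces the old "in_match forces final_score = 5000" rule with additive bonuses, so a kernel whose inputs and outputs both match the declared precisions now outranks one that matches only inputs. A toy sketch of the effect (illustrative numbers only):

  int score_kernel(int base_score, bool in_match, bool out_match) {
    int final_score = base_score;
    if (in_match) final_score += 1000;   // input precisions match declarations
    if (out_match) final_score += 1000;  // output precisions match declarations
    return final_score;
  }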
@@ -53,7 +53,6 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
      ComplementInputs(graph.get(), node, in, &copied_nodes);
    }
  }
}

void TypeTargetTransformPass::ComplementInputs(
@@ -74,6 +73,7 @@ void TypeTargetTransformPass::ComplementInputs(
  auto in_arg_name = in->AsArg().name;
  std::string tmp;
  CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp));
  VLOG(4) << "in_arg_name: " << in_arg_name << " tmp:" << tmp;
  auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp);
  CHECK(in->AsArg().type);
  if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) {
......
@@ -141,6 +141,7 @@ class VariablePlaceInferencePass : public DebugPass {
          x_in->AsArg().type = type;
        } else {
          PrecisionType tmp_ptype = x_in->AsArg().type->precision();
          VLOG(4) << "tmp_ptype:" << PrecisionToStr(tmp_ptype);
          x_in->AsArg().type = LiteType::GetTensorTy(
              type->target(), tmp_ptype, type->layout());
        }
@@ -172,6 +173,9 @@ class VariablePlaceInferencePass : public DebugPass {
          x_out->AsArg().type = type;
        } else {
          PrecisionType tmp_ptype = x_out->AsArg().type->precision();
          tmp_ptype = type->precision();
          // inst.picked_kernel().precision();
          VLOG(4) << "tmp_ptype:" << PrecisionToStr(tmp_ptype);
          x_out->AsArg().type = LiteType::GetTensorTy(
              type->target(), tmp_ptype, type->layout());
        }
......
@@ -134,7 +134,6 @@ class Optimizer {
         "mlu_postprocess_pass"}};

    if (passes.size() == 1) {
      // multi_stream_analysis_pass must be in the front of
      // runtime_context_assign_pass
......
@@ -42,6 +42,8 @@ add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_deps})
add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps})
add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps})
add_kernel(yolo_box_compute_fpga FPGA basic SRCS yolo_box_compute.cc DEPS ${fpga_deps})

# add_kernel(while_compute_fpga FPGA extra SRCS while_compute.cc DEPS ${fpga_deps})
# add_kernel(write_to_array_compute_fpga FPGA extra SRCS write_to_array_compute.cc DEPS ${fpga_deps})
......
@@ -35,6 +35,28 @@ void ReluCompute::PrepareForRun() {
void ReluCompute::Run() { pe_.dispatch(); }
void SigmoidCompute::Run() {
// TODO(chonwhite) use fpga and arm implementation;
auto& param = this->Param<param_t>();
auto output_data = param.Out->mutable_data<float16>();
int numel = param.Out->numel();
float16* in_data = param.X->ZynqTensor()->data<float16>();
float16* out_data = param.Out->ZynqTensor()->data<float16>();
param.X->ZynqTensor()->syncToCPU();
float max = 0.0f;
  for (int i = 0; i < numel; i++) {
    float value = zynqmp::half_to_float(in_data[i]);
    value = 1.0f / (1.0f + std::exp(-value));
    out_data[i] = zynqmp::float_to_half(value);
    max = std::max(std::abs(value), max);
  }
param.Out->ZynqTensor()->scale()[0] = max / 127.0;
param.Out->ZynqTensor()->scale()[1] = 127.0 / max;
param.Out->ZynqTensor()->flush();
}
}  // namespace fpga
}  // namespace kernels
}  // namespace lite
@@ -51,3 +73,19 @@ REGISTER_LITE_KERNEL(
        PRECISION(kFP16),
        DATALAYOUT(kNHWC))})
    .Finalize();
REGISTER_LITE_KERNEL(sigmoid,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::SigmoidCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
@@ -49,6 +49,16 @@ class ReluCompute
  zynqmp::Tensor output_;
};
class SigmoidCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~SigmoidCompute() = default;
};
}  // namespace fpga
}  // namespace kernels
}  // namespace lite
......
@@ -49,7 +49,7 @@ void CalibComputeFloat2Int::Run() {
  const auto* din = param.input->data<float>();
  auto* dout = param.output->mutable_data<int>();
  // param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
  // TODO
  auto out_lod = param.output->mutable_lod();
  *out_lod = param.input->lod();
  return;
......
@@ -45,11 +45,11 @@ void ConcatCompute::PrepareForRun() {
void ConcatCompute::Run() {
  pe_.dispatch();

  // #ifdef FPGA_PRINT_TENSOR
  zynqmp::ConcatParam& concat_param = pe_.param();
  concat_param.output->flush();
  Debugger::get_instance().registerOutput("concat", concat_param.output);
  // #endif
}

}  // namespace fpga
......
@@ -53,7 +53,8 @@ void ConvCompute::PrepareForRun() {
    if (param.activation_param.Leaky_relu_alpha > 0.001) {
      conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
      conv_param.activeParam.leaky_relu_factor =
          param.activation_param.Leaky_relu_alpha;
    }

    dw_conv_pe_.init();
@@ -79,13 +80,15 @@ void ConvCompute::PrepareForRun() {
    if (param.activation_param.Leaky_relu_alpha > 0.001) {
      conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
      conv_param.activeParam.leaky_relu_factor =
          param.activation_param.Leaky_relu_alpha;
    }

    conv_pe_.init();
    conv_pe_.apply();
  }
  // std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha
  //           << std::endl;
}

void ConvCompute::Run() {
......
@@ -96,11 +96,12 @@ void ElementwiseMulCompute::PrepareForRun() {
    scale_value = param.Y->data<zynqmp::float16>()[0];
    // std::cout << "FP16 \n";
  }

  // std::cout << "channel:" << channel << std::endl;
  // std::cout << "production:" << param.Y->dims().production() << std::endl;
  // std::cout << "scale_value:"
  //           << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
  // exit(-1);

  for (int i = 0; i < channel; i++) {
@@ -112,7 +113,8 @@ void ElementwiseMulCompute::PrepareForRun() {
        scale_value = param.Y->data<zynqmp::float16>()[i];
      }
    }
    // std::cout << "scale_value:"
    //           << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
    // exit(-1);
    scale_data[i] = scale_value;
    bias_data[i] = zero_;
@@ -128,13 +130,13 @@ void ElementwiseMulCompute::Run() {
  if (!param.Y->persistable()) {
    // TODO
    scale_.copyFrom(param.Y->ZynqTensor());
    scale_.flush();  // TODO
  }
  pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
  zynqmp::ScaleParam& scale_param = pe_.param();
  // Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
  // Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
#endif
}
@@ -214,8 +216,7 @@ REGISTER_LITE_KERNEL(elementwise_mul,
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
......
@@ -28,7 +28,14 @@ void FeedCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
  Tensor& x = param.feed_list->at(param.col);
  param.out->Resize(x.dims());
  auto in_type = x.ZynqTensor()->dataType();
  if (in_type == zynqmp::FP32 || in_type == zynqmp::FP16) {
    param.out->mutable_data<float16>();
  }
  if (in_type == zynqmp::INT32) {
    param.out->mutable_data<int32_t>();
  }
  // ====================================================
  zynqmp::InputParam& feed_param = pe_.param();
  feed_param.input = x.ZynqTensor();
@@ -68,12 +75,18 @@ REGISTER_LITE_KERNEL(
        DATALAYOUT(kNHWC))})
    .Finalize();

REGISTER_LITE_KERNEL(feed,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::FeedCompute,
                     feed_int32)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kAny))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kHost),
                                       PRECISION(kInt32),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
\ No newline at end of file
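Review note: the previously commented-out host feed kernel is replaced by a live feed_int32 variant, and PrepareForRun() now chooses the output type from the input tensor (float16 for FP32/FP16 feeds, int32_t for INT32 feeds), presumably so integer inputs such as yolo_box's ImgSize can be fed without an FP16 round-trip.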
@@ -49,7 +49,6 @@ void BilinearInterpCompute::Run() {
  //                interp_method);
}

void nearest_interp(const float16* src,
                    int w_in,
                    int h_in,
@@ -85,7 +84,7 @@ void nearest_interp(const float16* src,
    int near_y = static_cast<int>(scale_h_new * h);
    for (int w = 0; w < w_out; ++w) {
      int near_x = static_cast<int>(scale_w_new * w);
      const float16* src_n = src + (near_y * w_in + near_x) * c;
      memcpy(dst_p, src_n, c * sizeof(float16));
      dst_p += c;
@@ -133,8 +132,6 @@ inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
  return vec_new_data;
}

void interpolate(lite::Tensor* X,
                 lite::Tensor* OutSize,
                 std::vector<const lite::Tensor*> SizeTensor,
@@ -188,19 +185,18 @@ void interpolate(lite::Tensor* X,
  int spatial_in = in_h * in_w;
  int spatial_out = out_h * out_w;

  for (int i = 0; i < count; ++i) {
    nearest_interp(din + spatial_in * i,
                   in_w,
                   in_h,
                   out_c,
                   dout + spatial_out * i,
                   out_w,
                   out_h,
                   1.f / width_scale,
                   1.f / height_scale,
                   with_align);
  }
}

void NearestInterpCompute::Run() {
@@ -215,27 +211,24 @@ void NearestInterpCompute::Run() {
  int out_h = param.out_h;
  bool align_corners = param.align_corners;
  std::string interp_method = "";
  X->ZynqTensor()->invalidate();  // TODO
  X->ZynqTensor()->saveToFile("n_in", true);
  interpolate(X,
              OutSize,
              SizeTensor,
              Scale,
              Out,
              out_h,
              out_w,
              scale,
              align_corners,
              interp_method);
  Out->ZynqTensor()->flush();
  Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor());
  Out->ZynqTensor()->saveToFile("n_out", true);
}

} /* namespace fpga */
@@ -249,15 +242,17 @@ REGISTER_LITE_KERNEL(bilinear_interp,
                     kNHWC,
                     paddle::lite::kernels::fpga::BilinearInterpCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindInput("OutSize",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("SizeTensor",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
@@ -268,15 +263,17 @@ REGISTER_LITE_KERNEL(nearest_interp,
                     kNHWC,
                     paddle::lite::kernels::fpga::NearestInterpCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindInput("OutSize",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("SizeTensor",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
@@ -14,9 +14,9 @@
#pragma once

#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
@@ -34,12 +34,12 @@ class BilinearInterpCompute
class NearestInterpCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  void PrepareForRun() override;
  void Run() override;
  virtual ~NearestInterpCompute() = default;

 private:
  zynqmp::ResizePE pe_;
};
......
@@ -44,8 +44,6 @@ class IoCopyHostCHWToFpgaHWCCompute
          param.x->target() == TARGET(kFPGA));
    param.x->ZynqTensor()->flush();
    if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
      param.y->mutable_data<int>();
      param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
@@ -86,7 +84,7 @@ class IoCopyFpgaToHostCompute
    auto& param = Param<operators::IoCopyParam>();
    CHECK(param.x->target() == TARGET(kHost) ||
          param.x->target() == TARGET(kFPGA));
    param.x->ZynqTensor()->syncToDevice();
    param.y->mutable_data<float>();
    param.y->ZynqTensor()->setDataType(zynqmp::FP32);
@@ -104,7 +102,7 @@ class IoCopyFpgaToHostCompute
    } else {
      param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
    }
    param.y->ZynqTensor()->invalidate();
    copy_properties(param);
  }
@@ -141,16 +139,22 @@ class IoCopyFpgaToHostCHWCompute
    CHECK(param.x->target() == TARGET(kHost) ||
          param.x->target() == TARGET(kFPGA));

    param.x->ZynqTensor()->syncToDevice();
    if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
      param.y->mutable_data<int32_t>();
      param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
      return;
    }

    Tensor hwc;
    hwc.Resize(param.y->dims());
    float* hwc_data = hwc.mutable_data<float>();
    float* chw_data = param.y->mutable_data<float>();
    param.y->ZynqTensor()->setDataType(zynqmp::FP32);

    hwc.ZynqTensor()->setDataLocation(zynqmp::CPU);
    param.y->ZynqTensor()->setDataLocation(zynqmp::CPU);

    if (param.x->ZynqTensor()->aligned() &&
        param.x->ZynqTensor()->shape().shouldAlign()) {
      zynqmp::Tensor tempTensor;
@@ -158,15 +162,15 @@ class IoCopyFpgaToHostCHWCompute
                         param.x->ZynqTensor()->shape());
      tempTensor.copyFrom(param.x->ZynqTensor());
      tempTensor.setAligned(true);
      tempTensor.saveToFile("temp_1", true);
      // tempTensor.unalignImage();
      tempTensor.saveToFile("temp_2", true);
      hwc.ZynqTensor()->copyFrom(&tempTensor);
    } else {
      // hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
      float16* in_data = param.x->ZynqTensor()->data<float16>();
      // float* f_data =
      param.x->ZynqTensor()->flush();
      float max = 0;
@@ -198,6 +202,7 @@ class IoCopyFpgaToHostCHWCompute
                 dims.height(),
                 dims.width());

    param.y->ZynqTensor()->copyFrom(hwc.ZynqTensor());
    // param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
    param.y->ZynqTensor()->flush();
    copy_properties(param);
@@ -205,8 +210,8 @@ class IoCopyFpgaToHostCHWCompute
    param.x->ZynqTensor()->invalidate();
    param.x->ZynqTensor()->flush();
    // hwc.ZynqTensor()->saveToFile("hwc", true);
    param.x->ZynqTensor()->saveToFile("io2_x", true);
    param.y->ZynqTensor()->saveToFile("io2_y", true);
  }
  std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
@@ -238,15 +243,16 @@ REGISTER_LITE_KERNEL(io_copy,
                     kAny,
                     paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute,
                     host_float_chw_to_device_fp16_hwc)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();

REGISTER_LITE_KERNEL(io_copy,
                     kFPGA,
                     kAny,
@@ -311,25 +317,24 @@ REGISTER_LITE_KERNEL(io_copy,
//                                        DATALAYOUT(kAny))})
//     .Finalize();

// ==========================================================

// std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
//   std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
//   *res = [](const std::map<std::string, const Type*>& inputs,
//             const std::string& out) -> const Type* {
//     CHECK(!inputs.empty());
//     auto* type = inputs.at("Input");
//     CHECK(type->target() == TARGET(kHost));
//     auto out_place = type->place();
//     out_place.target = TARGET(kFPGA);
//     auto* out_type = Type::Get(type->id(),
//                                out_place.target,
//                                out_place.precision,
//                                out_place.layout,
//                                out_place.device);
//     return out_type;
//   };
//   return res;
// }
\ No newline at end of file
@@ -132,4 +132,3 @@ REGISTER_LITE_KERNEL(prior_box,
    .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
@@ -23,7 +23,6 @@ namespace fpga {
using float16 = zynqmp::float16;

void FlattenCompute::Run() {
  auto& param = Param<operators::ReshapeParam>();
  auto x = param.x;
@@ -45,12 +44,10 @@ void FlattenCompute::Run() {
  output->Resize(output_dims);
#ifdef FPGA_PRINT_TENSOR
  Debugger::get_instance().registerOutput("flatten", output->ZynqTensor());
#endif
}

void ReshapeCompute::Run() {
  auto& param = Param<operators::ReshapeParam>();
  auto x = param.x;
@@ -69,17 +66,14 @@ void ReshapeCompute::Run() {
  } else {
    // output->CopyDataFrom(*x);
  }
  output->ZynqTensor()->copyFrom(x->ZynqTensor());
  // output->ZynqTensor()->saveToFile("ro", true);
  output->ZynqTensor()->flush();
  output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned());
#ifdef FPGA_PRINT_TENSOR
  Debugger::get_instance().registerOutput("reshape", output->ZynqTensor());
#endif
}
@@ -163,7 +157,7 @@ REGISTER_LITE_KERNEL(flatten2,
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindInput("Shape",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kAny),
                                      DATALAYOUT(kAny))})
    .BindOutput("Out",
......
@@ -38,9 +38,9 @@ void SoftmaxCompute::Run() {
  zynqmp::SoftmaxParam& softmax_param = pe_.param();
  // softmax_param.input->saveToFile("softmax_in", true);
  pe_.dispatch();
  softmax_param.output->flush();
  // softmax_param.output->saveToFile("softmax", true);
#ifdef FPGA_PRINT_TENSOR
  Debugger::get_instance().registerOutput("softmax", softmax_param.output);
#endif
@@ -61,17 +61,9 @@ REGISTER_LITE_KERNEL(softmax,
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
// .BindOutput("Out",
//             {LiteType::GetTensorTy(TARGET(kFPGA),
//                                    PRECISION(kFP16),
......
@@ -104,7 +104,7 @@ void Transpose2Compute::Run() {
  // param.x->ZynqTensor()->saveToFile("t_unaligned", true);
  param.x->ZynqTensor()->flush();
  param.x->ZynqTensor()->invalidate();

  if (param.x->dims().size() != 4) {
    transposeCompute(param);
    param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned());
@@ -115,7 +115,7 @@ void Transpose2Compute::Run() {
  // param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
  param.output->ZynqTensor()->flush();
  param.output->ZynqTensor()->saveToFile("Transpose2", true);
}

}  // namespace fpga
@@ -151,8 +151,9 @@ REGISTER_LITE_KERNEL(transpose2,
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/yolo_box_compute.h"
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
void YoloBoxCompute::PrepareForRun() {
auto& param = Param<operators::YoloBoxParam>();
lite::Tensor* X = param.X;
lite::Tensor* ImgSize = param.ImgSize;
lite::Tensor* Boxes = param.Boxes;
lite::Tensor* Scores = param.Scores;
Boxes->mutable_data<float>();
Scores->mutable_data<float>();
zynqmp::YoloBoxParam& yolobox_param = pe_.param();
yolobox_param.input = X->ZynqTensor();
yolobox_param.imgSize = ImgSize->ZynqTensor();
yolobox_param.outputBoxes = Boxes->ZynqTensor();
yolobox_param.outputScores = Scores->ZynqTensor();
yolobox_param.downsampleRatio = param.downsample_ratio;
yolobox_param.anchors = param.anchors;
yolobox_param.classNum = param.class_num;
yolobox_param.confThresh = param.conf_thresh;
pe_.init();
pe_.apply();
}
void YoloBoxCompute::Run() {
pe_.dispatch();
zynqmp::YoloBoxParam& yolobox_param = pe_.param();
yolobox_param.imgSize->saveToFile("img_size", true);
// exit(-1);
yolobox_param.outputBoxes->saveToFile("yolo_boxes", true);
yolobox_param.outputScores->saveToFile("yolo_scores", true);
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
// REGISTER_LITE_KERNEL(yolo_box,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::YoloBoxCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindInput("ImgSize",
// {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
// .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
// .BindOutput("Scores", {LiteType::GetTensorTy(TARGET(kARM))})
// .Finalize();
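Review note: yolo_box_compute.cc is added to the build (see the add_kernel line in the CMake hunk above), but this registration block is left commented out, so the FPGA yolo_box kernel is not yet selectable; the YoloBoxPE path appears to still be under validation via the saveToFile dumps in Run().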
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/yolobox_pe.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
class YoloBoxCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
  virtual ~YoloBoxCompute() = default;
private:
zynqmp::YoloBoxPE pe_;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle