diff --git a/lite/backends/fpga/KD/dispatch/action.hpp b/lite/backends/fpga/KD/dispatch/action.hpp
index 0235439a0704c65cabcd397d97141f48dc254dec..9a3653be804b750a6eb487c02ee5a8a8b88c4e52 100644
--- a/lite/backends/fpga/KD/dispatch/action.hpp
+++ b/lite/backends/fpga/KD/dispatch/action.hpp
@@ -18,19 +18,14 @@ namespace paddle {
 namespace zynqmp {
 
 class Action {
-public:
-  void readScale(float* scale) {
+ public:
+  void readScale(float* scale) {}
 
-  }
+  void writeScale(float* scale) {}
 
-  void writeScale(float* scale) {
-
-  }
-
-private:
+ private:
   int id_ = -1;
   int scaleIndex_ = -1;
 }
-
 }
-}
\ No newline at end of file
+}
diff --git a/lite/backends/fpga/KD/dispatch/transaction.hpp b/lite/backends/fpga/KD/dispatch/transaction.hpp
index c5f19e0e4eb2457472c86de811dd08017bc835bd..6081c720ca62f08ceff8505c7ed410233bd29627 100644
--- a/lite/backends/fpga/KD/dispatch/transaction.hpp
+++ b/lite/backends/fpga/KD/dispatch/transaction.hpp
@@ -21,20 +21,16 @@ namespace paddle {
 namespace zynqmp {
 
 class Transaction {
+ public:
+  void appendAction(Action* action) { actions_.push_back(action); };
 
-public:
-  void appendAction(Action* action) {
-    actions_.push_back(action);
-  };
+  void startTraction(){
 
-  void startTraction() {
-    
   };
 
-private:
+ private:
   std::std::vector<Action*> actions_;
   int id_ = -1;
 }
-
 }
-}
\ No newline at end of file
+}
diff --git a/lite/backends/fpga/KD/dispatch/transaction_manager.hpp b/lite/backends/fpga/KD/dispatch/transaction_manager.hpp
index b24e154402f335c8e95d195e9dd677f3fd3a6280..149f0e0073c28abed5d2f0cc8f253374c77f0555 100644
--- a/lite/backends/fpga/KD/dispatch/transaction_manager.hpp
+++ b/lite/backends/fpga/KD/dispatch/transaction_manager.hpp
@@ -20,7 +20,7 @@ namespace paddle {
 namespace zynqmp {
 
 class TransactionManager {
-public:
+ public:
   static TransactionManager& get_instance() {
     static TransactionManager s_instance;
     return s_instance;
@@ -34,14 +34,11 @@ public:
     return currentTransaction_;
   };
 
-  void endTransaction() {
-    currentTransaction_ = nullptr;
-  }
+  void endTransaction() { currentTransaction_ = nullptr; }
 
-private:
+ private:
   Transaction* currentTransaction_ = nullptr;
   std::vector<Transaction*> transactions_;
 }
-
 }
-}
\ No newline at end of file
+}
diff --git a/lite/backends/fpga/KD/io.cpp b/lite/backends/fpga/KD/io.cpp
index 868287d937bf73a6e34af736b4f94a29f34036ca..2a36a8ec89627d9e07c516c31fb5ca04c2a8d0d9 100644
--- a/lite/backends/fpga/KD/io.cpp
+++ b/lite/backends/fpga/KD/io.cpp
@@ -15,8 +15,5 @@ limitations under the License. */
 #include "io.hpp"
 
 namespace paddle {
-namespace zynqmp {
-
-
-}  // namespace zynqmp
+namespace zynqmp {}  // namespace zynqmp
 }  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp
index 4c5da08a4b1f5ae02965b03da8ff95c09a721f5c..210c02fac5ff3fa135cfc626dbd2d00cbeac12ba 100644
--- a/lite/backends/fpga/KD/pes/conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/conv_pe.hpp
@@ -70,6 +70,10 @@ class ConvPE : public PE {
         param_.input->shape().channel() >= 2048) {
       use_cpu_ = true;
     }
+    if (param_.filter->shape().width() == 1 &&
+        param_.filter->shape().num() % 16 != 0) {
+      use_cpu_ = true;
+    }
     if (!use_cpu_) {
       // param_.filter->releaseData();
     }
@@ -93,34 +97,38 @@ class ConvPE : public PE {
 
     float* filter_data = param_.filter->data<float>();
     float* mi = new float[in_channel];
+
+    int wh = input->shape().width() * input->shape().height();
+    float max = 0;
     for (int i = 0; i < out_channel; i++) {
-      float* image = image_addr;
       float* filter_ptr = filter_data + i * in_channel;
-      float* out_ptr = mi;
-#pragma omp parallel for
-      for (int j = 0; j < in_channel; j++) {
-        // float32x4_t x0 = vld1q_f32(image);
-        // float32x4_t x1 = vld1q_f32(filter_ptr);
-
-        // float32x4_t r = vmulq_f32(x0, x1);
-
-        // vst1q_f32(out_ptr, r);
-        // image += 4;
-        // filter_ptr += 4;
-        // out_ptr += 4;
-        float value = image_addr[j] * filter_ptr[j];
-        mi[j] = value;
+      // #pragma omp parallel for
+
+      for (int k = 0; k < wh; k++) {
+        float* image = image_addr;
+        float* out_ptr = mi;
+
+        for (int j = 0; j < in_channel; j++) {
+          float value = image_addr[k * in_channel + j] * filter_ptr[j];
+          mi[j] = value;
+        }
+
+        float sum = 0;
+        for (int j = 0; j < in_channel; j++) {
+          sum += mi[j];
+        }
+        sum *= param_.scale()->data<float>()[i];
+        sum += param_.bias()->data<float>()[i];
+        out[i * wh + k] = sum;
+        max = std::max(max, std::abs(sum));
       }
-
-      float sum = 0;
-      for (int j = 0; j < in_channel; j++) {
-        sum += mi[j];
-      }
-      out[i] = sum;
     }
     delete[] mi;
     float_output.flush();
     output->copyFrom(&float_output);
+    output->scale()[0] = max / 127.0;
+    output->scale()[1] = 127.0 / max;
+    // output->saveToFile("cpu", true);
   }
 
   bool dispatch() {
@@ -206,7 +214,6 @@ class ConvPE : public PE {
       // std::cout << "\n ================== EW ================== \n";
       // }
     }
-
     return ret == 0;
   }
 
diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp
index 6c2f99087d779e529208f486b5223621abb6afa1..0ee29b3de885390f0adafc167da513521a2d2184 100644
--- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp
+++ b/lite/backends/fpga/KD/pes/prior_box_pe.cpp
@@ -262,7 +262,7 @@ bool PriorBoxPE::dispatch() {
 
   param_.outputBoxes->copyFrom(this->cachedBoxes_);
   param_.outputVariances->copyFrom(this->cachedVariances_);
-  
+
   param_.outputBoxes->flush();
   // param_.outputBoxes->syncToCPU();
   param_.outputVariances->flush();
diff --git a/lite/backends/fpga/KD/pes/resize_pe.hpp b/lite/backends/fpga/KD/pes/resize_pe.hpp
index 98728202b62d2ba819848cf4ccb6658800bb8493..e27c60c7694b13ffd02e156e68abe9e29a7af43f 100644
--- a/lite/backends/fpga/KD/pes/resize_pe.hpp
+++ b/lite/backends/fpga/KD/pes/resize_pe.hpp
@@ -84,28 +84,26 @@ class ResizePE : public PE {
 
     param_.input->syncToCPU();
 
-        for (int h = 0; h < in_height; h++) {
-            for (int w = 0; w < in_width; w++) {
-                int src_index = in_width * channel * h + w * channel;
-                float16* src = param_.input->data<float16>() + src_index;
-                // std::cout << "src_index:" << src_index << std::endl;
-                for (int v = 0; v < factor; v++) {
-                    for (int i =0; i < factor; i++) {
-                        int dst_index = out_width * channel * h * factor +
-                            out_width * channel * v +
-                            w * channel * factor +
+    for (int h = 0; h < in_height; h++) {
+      for (int w = 0; w < in_width; w++) {
+        int src_index = in_width * channel * h + w * channel;
+        float16* src = param_.input->data<float16>() + src_index;
+        // std::cout << "src_index:" << src_index << std::endl;
+        for (int v = 0; v < factor; v++) {
+          for (int i = 0; i < factor; i++) {
+            int dst_index = out_width * channel * h * factor +
+                            out_width * channel * v + w * channel * factor +
                             channel * i;
-                        float16* dst = param_.output->data<float16>() + dst_index;
-                        memcpy(dst, src, channel * sizeof(float16));
-                        // std::cout << "dst_index:" << dst_index << std::endl;
-                    }
-                }
-            }
+            float16* dst = param_.output->data<float16>() + dst_index;
+            memcpy(dst, src, channel * sizeof(float16));
+            // std::cout << "dst_index:" << dst_index << std::endl;
+          }
         }
-        param_.output->flush();
-        param_.output->copyScaleFrom(param_.input);
+      }
     }
-
+    param_.output->flush();
+    param_.output->copyScaleFrom(param_.input);
+  }
 
   bool dispatch() {
     cpu_compute();
diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp
index b6b2daa6a2827bb38a2bee93c4e99797243edcf3..8a2c0c92ec5ee051021ba622cc191f074543cabb 100755
--- a/lite/backends/fpga/KD/pes/scale_pe.hpp
+++ b/lite/backends/fpga/KD/pes/scale_pe.hpp
@@ -158,8 +158,9 @@ class ScalePE : public PE {
         int index = i * input->shape().channel() + c;
         float x = image_addr[index];
         float y = half_to_float(scale_data[c]);
-        float value =  x * y;
-        // std::cout << " x = " << std::to_string(x) << " y = " << std::to_string(y) << " v = " << std::to_string(value) << std::endl;
+        float value = x * y;
+        // std::cout << " x = " << std::to_string(x) << " y = " <<
+        // std::to_string(y) << " v = " << std::to_string(value) << std::endl;
         // float value = half_to_float(in_data[index]) * 19.3598f;
         data_out[index] = float_to_half(value);
 
@@ -188,9 +189,9 @@ class ScalePE : public PE {
     //   dw_param.quantizedFilter()->flush();
     // }
     // param_.input->syncToDevice();
-    // return dw_pe_.dispatch();
+    return dw_pe_.dispatch();
 
-    cpu_compute();
+    // cpu_compute();
     return true;
   }
 
diff --git a/lite/backends/fpga/KD/pes/yolobox_pe.hpp b/lite/backends/fpga/KD/pes/yolobox_pe.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d997fe07833f63bf6843c226477a402d6aeb0357
--- /dev/null
+++ b/lite/backends/fpga/KD/pes/yolobox_pe.hpp
@@ -0,0 +1,205 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "lite/backends/fpga/KD/pe.hpp"
+#include "lite/backends/fpga/KD/pe_params.hpp"
+
+namespace paddle {
+namespace zynqmp {
+
+
+float sigmoid(float x) {
+  return 1.0 / (1.0 + std::exp(-x));
+}
+
+inline void GetYoloBox(float* box, const float* x, const int* anchors, int w,
+                                  int h, int an_idx, int grid_size,
+                                  int input_size, int index, 
+                                  int img_height, int img_width) {
+  box[0] = (w + sigmoid(x[index])) * img_width * 1.0f/ grid_size;
+  box[1] = (h + sigmoid(x[index + 1])) * img_height * 1.0f / grid_size;
+  box[2] = std::exp(x[index + 2 ]) * anchors[2 * an_idx] * img_width * 1.0f/
+           input_size;
+  box[3] = std::exp(x[index + 3]) * anchors[2 * an_idx + 1] *
+           img_height * 1.0f / input_size;
+}
+
+inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
+                                    int an_num, int an_stride, int stride,
+                                    int entry) {
+  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
+}
+
+inline void CalcDetectionBox(float* boxes, float* box, const int box_idx,
+                                        const int img_height,
+                                        const int img_width) {
+  boxes[box_idx] = box[0] - box[2] / 2;
+  boxes[box_idx + 1] = box[1] - box[3] / 2;
+  boxes[box_idx + 2] = box[0] + box[2] / 2;
+  boxes[box_idx + 3] = box[1] + box[3] / 2;
+
+  boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : 0;
+  boxes[box_idx + 1] =
+      boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : 0;
+  boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1
+                           ? boxes[box_idx + 2]
+                           : (img_width - 1);
+  boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
+                           ? boxes[box_idx + 3]
+                           : (img_height - 1);
+}
+
+inline void CalcLabelScore(float* scores, const float* input,
+                                      const int label_idx, const int score_idx,
+                                      const int class_num, const float conf) {
+  for (int i = 0; i < class_num; i++) {
+    scores[score_idx + i] = conf * sigmoid(input[label_idx + i]);
+    // std::cout << scores[score_idx + i] << " ";
+  }
+  // std::cout << std::endl;
+}
+
+
+class YoloBoxPE : public PE {
+ public:
+  bool init() {
+    param_.outputBoxes->setAligned(false);
+    param_.outputScores->setAligned(false);
+    param_.outputBoxes->setDataLocation(CPU);
+    param_.outputScores->setDataLocation(CPU);
+    return true;
+  }
+
+  bool dispatch() {
+    auto* input = param_.input;
+    auto* imgsize = param_.imgSize;
+    auto* boxes = param_.outputBoxes;
+    auto* scores = param_.outputScores;
+    auto anchors = param_.anchors;
+    int class_num = param_.classNum;
+    float conf_thresh = param_.confThresh;
+    int downsample_ratio = param_.downsampleRatio;
+
+
+    const int num = input->shape().num();
+    const int height = input->shape().height();
+    const int width = input->shape().width();
+    const int box_num = boxes->shape().channel();
+    const int an_num = anchors.size() / 2;
+    int input_size = downsample_ratio * height;
+
+    const int stride = height * width;
+    const int an_stride = (class_num + 5) * stride;
+
+    Tensor anchors_;
+    Shape anchors_shape(N, {an_num * 2});
+    auto anchors_data = anchors_.mutableData<int32_t>(INT32, anchors_shape);
+    std::copy(anchors.begin(), anchors.end(), anchors_data);
+
+    input->syncToCPU();
+    input->unalignImage();
+    // input->setAligned(false);
+    Tensor input_float;
+    input_float.setDataLocation(CPU);
+    float* input_data = input_float.mutableData<float>(FP32, input->shape());
+    input_float.copyFrom(input);
+    // input_float.saveToFile("input_yolobox_half", "true");
+
+    // input_float.setAligned(input->aligned());
+    // input_float.unalignImage();
+    // std::cout << "-------------unalignImage-----------------" << std::endl;
+    // for (int i = 0; i < input_float.shape().numel(); ++i)
+    // {
+    //   std::cout << input_data[i] << " ";
+    // }
+    //  std::cout << "-" << std::endl;
+    //  std::cout << "-------------unalignImage-----------------" << std::endl;
+    // input_float.setAligned(false);
+    // input_float.saveToFile("input_yolobox_float", "true");
+    // input_float.syncToCPU();
+    // input_float.invalidate();
+
+    imgsize->saveToFile("img_size", true);
+    const int32_t* imgsize_data = imgsize->data<int32_t>();
+    
+    Tensor boxes_float;
+    Tensor scores_float;
+
+    boxes_float.setDataLocation(CPU);
+    float* boxes_float_data = boxes_float.mutableData<float>(FP32, boxes->shape());
+    memset(boxes_float_data, 0, boxes->shape().numel() * sizeof(float));
+
+    scores_float.setDataLocation(CPU);
+    float* scores_float_data = scores_float.mutableData<float>(FP32, scores->shape());
+    memset(scores_float_data, 0, scores->shape().numel() * sizeof(float));
+
+    // float* boxes_data = boxes->mutableData<float>();
+    // memset(boxes_data, 0, boxes->shape().numel() * sizeof(float));
+    
+    // float* scores_data = scores->mutableData<float>();
+    // memset(scores_data, 0, scores->shape().numel() * sizeof(float));
+
+    float box[4];
+    // for (int n = 0; n < num; n++) {
+      // int img_height = imgsize_data[2 * i];
+      // int img_width = imgsize_data[2 * i + 1];
+    int img_height = imgsize_data[0];
+    int img_width = imgsize_data[1];
+    std::cout << "YoloBoxPE imgsize:" << img_height << "," << img_width << std::endl;
+
+    int channel = input_float.shape().channel();
+    int count = 0;
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width ; w++) {
+        for (int n = 0; n < an_num; n++) {
+     
+          int obj_idx = channel * width * h + channel * w + n * (5 + class_num) + 4;
+          // std::cout << obj_idx << " ";
+          float conf = sigmoid(input_data[obj_idx]);
+          if (conf < conf_thresh) {
+            count++;
+            continue;
+          }
+
+          int box_idx = channel * width * h + channel * w + n * (5 + class_num) + 0;
+          GetYoloBox(box, input_data, anchors_data, w, h, n, height, input_size,
+                        box_idx, img_height, img_width);
+   
+          box_idx = h * an_num * 4 * width + an_num * 4 * w +  n * 4;
+          CalcDetectionBox(boxes_float_data, box, box_idx, img_height,img_width);
+
+          int label_idx = channel * width * h + channel * w + n * (5 + class_num) + 5;
+          int score_idx = h * an_num * class_num * width + an_num * class_num * w + n * class_num;
+          CalcLabelScore(scores_float_data, input_data, label_idx, score_idx, class_num, conf);
+        }
+      }
+    }
+
+    boxes->copyFrom(&boxes_float);
+    scores->copyFrom(&scores_float);
+    input->setAligned(true);
+  }
+
+  void apply(){};
+
+ YoloBoxParam& param() { return param_; }
+
+ private:
+  YoloBoxParam param_;
+
+};
+}  // namespace zynqmp
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
index 2cee46fb55a9dae9a2bd4020b01c4099ea2525c3..3a2996fed4965b019aeae386103e595ba4c6d3d9 100644
--- a/lite/backends/fpga/KD/tensor.hpp
+++ b/lite/backends/fpga/KD/tensor.hpp
@@ -266,23 +266,21 @@ class Tensor {
       return;
     }
     BypassArgs args;
-    args.input_data_type = src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
+    args.input_data_type =
+        src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
     args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
     args.input_layout_type = LAYOUT_HWC;
     args.output_layout_type = LAYOUT_HWC;
-    args.image = {
-      .address = src->data<void>(),
-      .scale_address = src->scale(),
-      .channels = (uint32_t)src->shape().numel(),
-      .width = 1,
-      .height = 1,
-      .pad_width = 0U,
-      .pad_height = 0U
-    };
+    args.image = {.address = src->data<void>(),
+                  .scale_address = src->scale(),
+                  .channels = (uint32_t)src->shape().numel(),
+                  .width = 1,
+                  .height = 1,
+                  .pad_width = 0U,
+                  .pad_height = 0U};
 
     ImageOutputArgs output = {
-      .address = data<void>(),
-      .scale_address = scale(),
+        .address = data<void>(), .scale_address = scale(),
     };
 
     args.output = output;
@@ -385,10 +383,11 @@ class Tensor {
   void save_file_with_name(std::string path) {
     // std::cout << "saving file: " << path << std::endl;
     void* add = (void*)this;
-    // printf("tensor @: %p  data: %p \n", (void *)add, (void*)data<void>());  
+    // printf("tensor @: %p  data: %p \n", (void *)add, (void*)data<void>());
     // return;
     std::ofstream ofs;
     ofs.open(path);
+    ofs << "data type: " << dataType() << std::endl;
     ofs << scale()[0] << " / " << scale()[1] << std::endl;
 
     for (int i = 0; i < shape_->numel(); i++) {
@@ -406,13 +405,14 @@ class Tensor {
       if (dataType_ == INT32) {
         value = data<int32_t>()[i];
       }
-      
+
       if (i < 10) {
         std::cout << value << ",";
       }
-
+      //   if (i > 1000) {
+      //       break;
+      //   }
       ofs << value << std::endl;
-
     }
     usleep(30000);
     ofs.close();
@@ -465,7 +465,6 @@ class Tensor {
         value = half_to_float(tensor.data<float16>()[i]);
       }
       os << value << " ";
-
     }
     os << "\n";
     return os;
diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h
index c6f837db75c4e321ab17f417d218f9222971688d..f3d409edf756d53c72637504c630e6bad7f4393a 100644
--- a/lite/backends/fpga/lite_tensor.h
+++ b/lite/backends/fpga/lite_tensor.h
@@ -166,6 +166,9 @@ class TensorLite {
 
   void clear() {
     // zynq_tensor_->releaseData();
+    if (zynq_tensor_) {
+      memset(zynq_tensor_->data<void>(), 0, zynq_tensor_->memorySize());
+    }
   }
 
   template <typename T>
diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h
index 88dbc1bc171eddb4c5666ba7f3a23e17e7383e9d..aa2ac583b2d7a53a4529ad6a4ef3e59293950f39 100644
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.h
+++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h
@@ -105,7 +105,6 @@ class ChannelWiseDequantOpFuser : public FuseBase {
 */
 class DeleteQuantDequantOpFuser : public FuseBase {
  public:
-
   void BuildPattern() override;
   void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
 
diff --git a/lite/core/mir/kernel_place_correct_pass.h b/lite/core/mir/kernel_place_correct_pass.h
index 4f7d9d110caa0850ba1d3d30eeac85d34725a451..1c4439643fcd8aa16d3d81480679760e79fda611 100644
--- a/lite/core/mir/kernel_place_correct_pass.h
+++ b/lite/core/mir/kernel_place_correct_pass.h
@@ -84,13 +84,13 @@ class KernelPlaceCorrectPass : public DebugPass {
 
       std::string node_name = out->AsArg().name;
       std::string arg_name = get_argname(node_name, inst.op_info()->outputs());
-      
+
       auto op_type = inst.op_type();
 
       if (op_type == "reshape" || op_type == "reshape2") {
         for (auto* x_in : x->inlinks) {
-          
-          std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs());
+          std::string in_name =
+              get_argname(x_in->AsArg().name, inst.op_info()->inputs());
           // std::cout << "name: " << x_in->AsArg().name  << std::endl;
           // std::cout << "in_name: " << in_name  << std::endl;
           if (in_name == "X") {
@@ -101,9 +101,11 @@ class KernelPlaceCorrectPass : public DebugPass {
         }
 
         p = in->AsArg().type->precision();
-        if ( p != PrecisionType::kFP16) {
-          // std::cout << "found an arm ............... : " << inst.kernels().size() << std::endl;
-          // std::cout << "tt:" <<  TargetRepr(inst.kernels()[0]->target()) << std::endl;
+        if (p != PrecisionType::kFP16) {
+          // std::cout << "found an arm ............... : " <<
+          // inst.kernels().size() << std::endl;
+          // std::cout << "tt:" <<  TargetRepr(inst.kernels()[0]->target()) <<
+          // std::endl;
           UpdateTarget(inst, TargetType::kHost);
           UpdateTensor(inst, in, out, TargetType::kHost);
         }
@@ -113,8 +115,9 @@ class KernelPlaceCorrectPass : public DebugPass {
         UpdateTarget(inst, TargetType::kFPGA);
       }
 
-      if (inst.op_type() == "split" || inst.op_type() == "transpose") {
-        if ( p != PrecisionType::kFP16) {
+      if (inst.op_type() == "split" || inst.op_type() == "transpose" ||
+          inst.op_type() == "transpose2") {
+        if (p != PrecisionType::kFP16) {
           UpdateTarget(inst, TargetType::kARM);
           for (auto* x_out : x->outlinks) {
             UpdateTensor(inst, in, x_out, TargetType::kARM);
@@ -123,9 +126,12 @@ class KernelPlaceCorrectPass : public DebugPass {
       }
 
       if (inst.op_type() == "concat") {
-        std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target()) << std::endl;
-        std::cout << "concat p:" << PrecisionToStr(inst.kernels()[0]->precision()) << std::endl;
-        if ( p != PrecisionType::kFP16) {
+        std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target())
+                  << std::endl;
+        std::cout << "concat p:"
+                  << PrecisionToStr(inst.kernels()[0]->precision())
+                  << std::endl;
+        if (p != PrecisionType::kFP16) {
           UpdateTarget(inst, TargetType::kARM);
           UpdateTensor(inst, in, out, TargetType::kARM);
         }
@@ -134,8 +140,9 @@ class KernelPlaceCorrectPass : public DebugPass {
       // if (inst.op_type() == "elementwise_mul") {
 
       //   for (auto* x_in : x->inlinks) {
-          
-      //     std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs());
+
+      //     std::string in_name = get_argname(x_in->AsArg().name,
+      //     inst.op_info()->inputs());
       //     std::cout << "name: " << x_in->AsArg().name  << std::endl;
       //     std::cout << "in_name: " << in_name  << std::endl;
       //     if (in_name == "Y") {
@@ -150,7 +157,6 @@ class KernelPlaceCorrectPass : public DebugPass {
       //     UpdateTensor(inst, in, out, TargetType::kARM);
       //   }
       // }
-      
 
       std::vector<TargetType> in_types;
       std::vector<TargetType> out_types;
@@ -164,11 +170,13 @@ class KernelPlaceCorrectPass : public DebugPass {
 
         auto type = inst.picked_kernel().GetInputDeclType(arg_name);
 
-        // std::cout << arg_name <<" is weight:: " << std::to_string(x_in->AsArg().is_weight) 
-        //     << "     is persist: " << std::to_string(x_in->AsArg().is_persist) << std::endl;
+        // std::cout << arg_name <<" is weight:: " <<
+        // std::to_string(x_in->AsArg().is_weight)
+        //     << "     is persist: " <<
+        //     std::to_string(x_in->AsArg().is_persist) << std::endl;
 
         // std::cout << " type: "<< inst.op_type() << std::endl;
- 
+
         if (!x_in->AsArg().is_weight) {
           auto p = x_in->AsArg().type->precision();
           auto t = x_in->AsArg().type->target();
@@ -224,10 +232,10 @@ class KernelPlaceCorrectPass : public DebugPass {
     }
   }
 
-
   // Update me's kUnk fields by other's fields.
   void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) {  // NOLINT
-    // std::cout << "1 kernels: " << std::to_string(inst.kernels().size()) << std::endl;
+    // std::cout << "1 kernels: " << std::to_string(inst.kernels().size()) <<
+    // std::endl;
     auto new_place = inst.place();
 
     new_place.target = new_target;
@@ -244,25 +252,30 @@ class KernelPlaceCorrectPass : public DebugPass {
     std::vector<Place> places;
     places.push_back(new_place);
     inst.ResetKernels(places);
-    // std::cout << "2 kernels: " << std::to_string(inst.kernels().size()) << std::endl;
+    // std::cout << "2 kernels: " << std::to_string(inst.kernels().size()) <<
+    // std::endl;
   }
 
-  void UpdateTensor(mir::Node::Stmt& inst, Node* in, Node* out, TargetType new_target = TargetType::kUnk) {
-
+  void UpdateTensor(mir::Node::Stmt& inst,
+                    Node* in,
+                    Node* out,
+                    TargetType new_target = TargetType::kUnk) {
     auto get_argname = [&](
-    const std::string& node_name,
-    const std::map<std::string, std::vector<std::string>>& argname_map)
-    -> std::string {
-      for (auto& ele : argname_map) {
-        auto it =
-            std::find(ele.second.begin(), ele.second.end(), node_name);
-        if (it != ele.second.end()) return ele.first;
-      }
-      return "";
-    };
+        const std::string& node_name,
+        const std::map<std::string, std::vector<std::string>>& argname_map)
+        -> std::string {
+          for (auto& ele : argname_map) {
+            auto it =
+                std::find(ele.second.begin(), ele.second.end(), node_name);
+            if (it != ele.second.end()) return ele.first;
+          }
+          return "";
+        };
 
-    std::string arg_name = get_argname(out->AsArg().name, inst.op_info()->outputs());
-    std::string in_name = get_argname(in->AsArg().name, inst.op_info()->inputs());
+    std::string arg_name =
+        get_argname(out->AsArg().name, inst.op_info()->outputs());
+    std::string in_name =
+        get_argname(in->AsArg().name, inst.op_info()->inputs());
 
     auto type = inst.picked_kernel().GetInputDeclType(in_name);
     auto tmp_ptype = in->AsArg().type->precision();
@@ -281,7 +294,8 @@ class KernelPlaceCorrectPass : public DebugPass {
       tmp_layout = DataLayoutType::kNCHW;
     }
 
-    out->AsArg().type = LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout);
+    out->AsArg().type =
+        LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout);
   }
 };
 
diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc
index 1cc8942d611db389a44cbf6a244775a5b666b587..8f18c3982d26e80fdfc7656a7b694d3a9b54d4a2 100755
--- a/lite/core/mir/static_kernel_pick_pass.cc
+++ b/lite/core/mir/static_kernel_pick_pass.cc
@@ -80,6 +80,8 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
     std::sort(scored.begin(), scored.end(), KernelScoreCmp);
     instruct.kernels().clear();
 
+    VLOG(2) << "picking kernel " << scored.front().second->name() << "\n\n";
+
     if (!instruct.op_info()->HasAttr("enable_int8")) {
       // Move kernel back
       // Just keep a single best kernel.
diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h
index a5e057a11be969ab5e963c3cee68fc7e80016af4..3ac8978ce9d5cab44b693f5d17cab529f4eb4376 100644
--- a/lite/core/mir/static_kernel_pick_pass.h
+++ b/lite/core/mir/static_kernel_pick_pass.h
@@ -157,7 +157,21 @@ class StaticKernelPickPass : public mir::StmtPass {
         }
       }
       if (in_match) {
-        final_score = 5000;
+        final_score += 1000;
+      }
+      bool out_match = true;
+      for (size_t i = 0; i < out_names.size(); ++i) {
+        std::string tmp;
+        CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp));
+        if (out_types.count(out_names[i]) &&
+            out_types.at(out_names[i]) !=
+                kernel.GetOutputDeclType(tmp)->precision()) {
+          out_match = false;
+        }
+      }
+
+      if (out_match) {
+        final_score += 1000;
       }
     }
 
diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc
index 89dbb4a420c610981d5a15ef6e961e7668f21fe7..3d67e98de3654e753d48e5da4ba490be65aceb5a 100644
--- a/lite/core/mir/type_target_cast_pass.cc
+++ b/lite/core/mir/type_target_cast_pass.cc
@@ -53,7 +53,6 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
       ComplementInputs(graph.get(), node, in, &copied_nodes);
     }
   }
-
 }
 
 void TypeTargetTransformPass::ComplementInputs(
@@ -74,6 +73,7 @@ void TypeTargetTransformPass::ComplementInputs(
   auto in_arg_name = in->AsArg().name;
   std::string tmp;
   CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp));
+  VLOG(4) << "in_arg_name: " << in_arg_name << " tmp:" << tmp;
   auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp);
   CHECK(in->AsArg().type);
   if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) {
diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h
index 875bf23082a24cb6fcae878b46cc9dcdbb2b76f7..ae0a79d7c6254b85b7e65ebc7aae63bd81765d09 100644
--- a/lite/core/mir/variable_place_inference_pass.h
+++ b/lite/core/mir/variable_place_inference_pass.h
@@ -141,6 +141,7 @@ class VariablePlaceInferencePass : public DebugPass {
             x_in->AsArg().type = type;
           } else {
             PrecisionType tmp_ptype = x_in->AsArg().type->precision();
+            VLOG(4) << "tmp_ptype:" << PrecisionToStr(tmp_ptype);
             x_in->AsArg().type = LiteType::GetTensorTy(
                 type->target(), tmp_ptype, type->layout());
           }
@@ -172,6 +173,9 @@ class VariablePlaceInferencePass : public DebugPass {
             x_out->AsArg().type = type;
           } else {
             PrecisionType tmp_ptype = x_out->AsArg().type->precision();
+            tmp_ptype = type->precision();
+            // inst.picked_kernel().precision();
+            VLOG(4) << "tmp_ptype:" << PrecisionToStr(tmp_ptype);
             x_out->AsArg().type = LiteType::GetTensorTy(
                 type->target(), tmp_ptype, type->layout());
           }
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index 83e4abb3dafb619c5d0dfb3a7d93a0a1f113705e..5015b633e7b028ffe98a5c0a156c471271e16b0f 100755
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -134,7 +134,6 @@ class Optimizer {
 
            "mlu_postprocess_pass"}};
 
-
       if (passes.size() == 1) {
         // multi_stream_analysis_pass must be in the front of
         // runtime_context_assign_pass
diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt
index fd1f3263c843b07c33e26ebc7a2b2b7ffdfcb423..d34ba83d0e78627e9419571960c1f9ef63b0275c 100755
--- a/lite/kernels/fpga/CMakeLists.txt
+++ b/lite/kernels/fpga/CMakeLists.txt
@@ -42,6 +42,8 @@ add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_dep
 add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps})
 add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps})
 
+add_kernel(yolo_box_compute_fpga FPGA basic SRCS yolo_box_compute.cc DEPS ${fpga_deps})
+
 # add_kernel(while_compute_fpga FPGA extra SRCS while_compute.cc DEPS ${fpga_deps})
 # add_kernel(write_to_array_compute_fpga FPGA extra SRCS write_to_array_compute.cc DEPS ${fpga_deps})
 
diff --git a/lite/kernels/fpga/activation_compute.cc b/lite/kernels/fpga/activation_compute.cc
index f6704204d34c309835c1de0ef61afed97c0b29e3..9b5503a7e9946c77f62e9bbdafe4d59c457001e3 100644
--- a/lite/kernels/fpga/activation_compute.cc
+++ b/lite/kernels/fpga/activation_compute.cc
@@ -35,6 +35,28 @@ void ReluCompute::PrepareForRun() {
 
 void ReluCompute::Run() { pe_.dispatch(); }
 
+void SigmoidCompute::Run() {
+  // TODO(chonwhite) use fpga and arm implementation;
+  auto& param = this->Param<param_t>();
+  auto output_data = param.Out->mutable_data<float16>();
+  int numel = param.Out->numel();
+
+  float16* in_data = param.X->ZynqTensor()->data<float16>();
+  float16* out_data = param.Out->ZynqTensor()->data<float16>();
+  param.X->ZynqTensor()->syncToCPU();
+  float max = 0.0f;
+  for (int i = 0; i < numel; i++) {
+    /* code */
+    float value = zynqmp::half_to_float(in_data[i]);
+    value = 1 / (1 + exp(-value));
+    out_data[i] = zynqmp::float_to_half(value);
+    max = std::max(std::abs(value), max);
+  }
+  param.Out->ZynqTensor()->scale()[0] = max / 127.0;
+  param.Out->ZynqTensor()->scale()[1] = 127.0 / max;
+  param.Out->ZynqTensor()->flush();
+}
+
 }  // namespace fpga
 }  // namespace kernels
 }  // namespace lite
@@ -51,3 +73,19 @@ REGISTER_LITE_KERNEL(
                                        PRECISION(kFP16),
                                        DATALAYOUT(kNHWC))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(sigmoid,
+                     kFPGA,
+                     kFP16,
+                     kNHWC,
+                     paddle::lite::kernels::fpga::SigmoidCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kFPGA),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kNHWC))})
+    .Finalize();
diff --git a/lite/kernels/fpga/activation_compute.h b/lite/kernels/fpga/activation_compute.h
index 5cc431e2d41e2de1d841ee386de3aae4434e3865..f39a324f569a48f3c0d582a7376b5128259a504e 100644
--- a/lite/kernels/fpga/activation_compute.h
+++ b/lite/kernels/fpga/activation_compute.h
@@ -49,6 +49,16 @@ class ReluCompute
   zynqmp::Tensor output_;
 };
 
+class SigmoidCompute
+    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  void Run() override;
+
+  virtual ~SigmoidCompute() = default;
+};
+
 }  // namespace fpga
 }  // namespace kernels
 }  // namespace lite
diff --git a/lite/kernels/fpga/calib_compute.cc b/lite/kernels/fpga/calib_compute.cc
index e4f13aedd8c36126e995a45d83d41c55c957b771..7605f8cd52366d6200cf647c1efd7259e70568fe 100755
--- a/lite/kernels/fpga/calib_compute.cc
+++ b/lite/kernels/fpga/calib_compute.cc
@@ -49,7 +49,7 @@ void CalibComputeFloat2Int::Run() {
   const auto* din = param.input->data<float>();
   auto* dout = param.output->mutable_data<int>();
   // param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
-  //TODO
+  // TODO
   auto out_lod = param.output->mutable_lod();
   *out_lod = param.input->lod();
   return;
diff --git a/lite/kernels/fpga/concat_compute.cc b/lite/kernels/fpga/concat_compute.cc
index 523d3577095a4c4cbb9b21e8f67a92a22295108f..6af4e7d69d2d3352d44bf67e09d63cc4b26ea50b 100755
--- a/lite/kernels/fpga/concat_compute.cc
+++ b/lite/kernels/fpga/concat_compute.cc
@@ -45,11 +45,11 @@ void ConcatCompute::PrepareForRun() {
 
 void ConcatCompute::Run() {
   pe_.dispatch();
-#ifdef FPGA_PRINT_TENSOR
+  // #ifdef FPGA_PRINT_TENSOR
   zynqmp::ConcatParam& concat_param = pe_.param();
   concat_param.output->flush();
-  // Debugger::get_instance().registerOutput("concat", concat_param.output);
-#endif
+  Debugger::get_instance().registerOutput("concat", concat_param.output);
+  // #endif
 }
 
 }  // namespace fpga
diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc
index 14de934eb3c9a3a27d2395612ef97204aac4689a..7754d70d822ebc22e3c1bc4db50504ea2d71e14c 100644
--- a/lite/kernels/fpga/conv_compute.cc
+++ b/lite/kernels/fpga/conv_compute.cc
@@ -53,7 +53,8 @@ void ConvCompute::PrepareForRun() {
 
     if (param.activation_param.Leaky_relu_alpha > 0.001) {
       conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
-      conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha;
+      conv_param.activeParam.leaky_relu_factor =
+          param.activation_param.Leaky_relu_alpha;
     }
 
     dw_conv_pe_.init();
@@ -79,13 +80,15 @@ void ConvCompute::PrepareForRun() {
 
     if (param.activation_param.Leaky_relu_alpha > 0.001) {
       conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
-      conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha;
+      conv_param.activeParam.leaky_relu_factor =
+          param.activation_param.Leaky_relu_alpha;
     }
 
     conv_pe_.init();
     conv_pe_.apply();
   }
-  // std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha << std::endl;
+  // std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha
+  // << std::endl;
 }
 
 void ConvCompute::Run() {
diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc
index 1bcb7f2ae78114055c1c740d3f434594d7a02b64..569a6e049e1ede1608a3d62e66b705ddec4c43b2 100755
--- a/lite/kernels/fpga/elementwise_compute.cc
+++ b/lite/kernels/fpga/elementwise_compute.cc
@@ -96,11 +96,12 @@ void ElementwiseMulCompute::PrepareForRun() {
     scale_value = param.Y->data<zynqmp::float16>()[0];
     // std::cout << "FP16 \n";
   }
-  
+
   // std::cout << "channel:" << channel << std::endl;
   // std::cout << "production:" << param.Y->dims().production() << std::endl;
 
-  // std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
+  // std::cout << "scale_value:" <<
+  // std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
   // exit(-1);
 
   for (int i = 0; i < channel; i++) {
@@ -112,7 +113,8 @@ void ElementwiseMulCompute::PrepareForRun() {
         scale_value = param.Y->data<zynqmp::float16>()[i];
       }
     }
-    // std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
+    // std::cout << "scale_value:" <<
+    // std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
     // exit(-1);
     scale_data[i] = scale_value;
     bias_data[i] = zero_;
@@ -128,13 +130,13 @@ void ElementwiseMulCompute::Run() {
   if (!param.Y->persistable()) {
     // TODO
     scale_.copyFrom(param.Y->ZynqTensor());
-    scale_.flush();//TODO
+    scale_.flush();  // TODO
   }
   pe_.dispatch();
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::ScaleParam& scale_param = pe_.param();
-  // Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
-  // Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
+// Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
+// Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
 #endif
 }
 
@@ -214,8 +216,7 @@ REGISTER_LITE_KERNEL(elementwise_mul,
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
-    .BindInput("Y",
-               {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out",
                 {LiteType::GetTensorTy(TARGET(kFPGA),
                                        PRECISION(kFP16),
diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc
index f28b4c9b3c92ea19d90034e9381e533597765e0f..5a029cd5189b5bb8ddbf920b7b174aab5e49dc18 100755
--- a/lite/kernels/fpga/feed_compute.cc
+++ b/lite/kernels/fpga/feed_compute.cc
@@ -28,7 +28,14 @@ void FeedCompute::PrepareForRun() {
   auto& param = this->Param<param_t>();
   Tensor& x = param.feed_list->at(param.col);
   param.out->Resize(x.dims());
-  param.out->mutable_data<float16>();
+
+  auto in_type = x.ZynqTensor()->dataType();
+  if (in_type == zynqmp::FP32 || in_type == zynqmp::FP16) {
+    param.out->mutable_data<float16>();
+  }
+  if (in_type == zynqmp::INT32) {
+    param.out->mutable_data<int32_t>();
+  }
   // ====================================================
   zynqmp::InputParam& feed_param = pe_.param();
   feed_param.input = x.ZynqTensor();
@@ -68,12 +75,18 @@ REGISTER_LITE_KERNEL(
                                        DATALAYOUT(kNHWC))})
     .Finalize();
 
-// REGISTER_LITE_KERNEL(feed,
-//                      kFPGA,
-//                      kFP16,
-//                      kNHWC,
-//                      paddle::lite::kernels::fpga::FeedCompute,
-//                      def_host)
-//     .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
-//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
-//     .Finalize();
+REGISTER_LITE_KERNEL(feed,
+                     kFPGA,
+                     kFP16,
+                     kNHWC,
+                     paddle::lite::kernels::fpga::FeedCompute,
+                     feed_int32)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kInt32),
+                                       DATALAYOUT(kNCHW))})
+    .Finalize();
\ No newline at end of file
diff --git a/lite/kernels/fpga/interpolate_compute.cc b/lite/kernels/fpga/interpolate_compute.cc
index 7358ec1bf3f1e146b3b1399b1a51428880484aca..b7eb36c91021f953ff9f17edef8248f7d4f44bdd 100644
--- a/lite/kernels/fpga/interpolate_compute.cc
+++ b/lite/kernels/fpga/interpolate_compute.cc
@@ -49,7 +49,6 @@ void BilinearInterpCompute::Run() {
   //                              interp_method);
 }
 
-
 void nearest_interp(const float16* src,
                     int w_in,
                     int h_in,
@@ -85,7 +84,7 @@ void nearest_interp(const float16* src,
       int near_y = static_cast<int>(scale_h_new * h);
       for (int w = 0; w < w_out; ++w) {
         int near_x = static_cast<int>(scale_w_new * w);
-        
+
         const float16* src_n = src + (near_y * w_in + near_x) * c;
         memcpy(dst_p, src_n, c * sizeof(float16));
         dst_p += c;
@@ -133,8 +132,6 @@ inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
   return vec_new_data;
 }
 
-
-
 void interpolate(lite::Tensor* X,
                  lite::Tensor* OutSize,
                  std::vector<const lite::Tensor*> SizeTensor,
@@ -188,19 +185,18 @@ void interpolate(lite::Tensor* X,
   int spatial_in = in_h * in_w;
   int spatial_out = out_h * out_w;
 
-
   for (int i = 0; i < count; ++i) {
-      nearest_interp(din + spatial_in * i,
-                     in_w,
-                     in_h,
-                     out_c,
-                     dout + spatial_out * i,
-                     out_w,
-                     out_h,
-                     1.f / width_scale,
-                     1.f / height_scale,
-                     with_align);
-    }
+    nearest_interp(din + spatial_in * i,
+                   in_w,
+                   in_h,
+                   out_c,
+                   dout + spatial_out * i,
+                   out_w,
+                   out_h,
+                   1.f / width_scale,
+                   1.f / height_scale,
+                   with_align);
+  }
 }
 
 void NearestInterpCompute::Run() {
@@ -215,27 +211,24 @@ void NearestInterpCompute::Run() {
   int out_h = param.out_h;
   bool align_corners = param.align_corners;
 
-
   std::string interp_method = "";
 
-  X->ZynqTensor()->invalidate();//TODO
+  X->ZynqTensor()->invalidate();  // TODO
   X->ZynqTensor()->saveToFile("n_in", true);
   interpolate(X,
-                               OutSize,
-                               SizeTensor,
-                               Scale,
-                               Out,
-                               out_h,
-                               out_w,
-                               scale,
-                               align_corners,
-                               interp_method);
+              OutSize,
+              SizeTensor,
+              Scale,
+              Out,
+              out_h,
+              out_w,
+              scale,
+              align_corners,
+              interp_method);
 
-  
   Out->ZynqTensor()->flush();
   Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor());
   Out->ZynqTensor()->saveToFile("n_out", true);
-
 }
 
 } /* namespace fpga */
@@ -249,15 +242,17 @@ REGISTER_LITE_KERNEL(bilinear_interp,
                      kNHWC,
                      paddle::lite::kernels::fpga::BilinearInterpCompute,
                      def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
-                                       PRECISION(kFP16),
-                                       DATALAYOUT(kNHWC))})
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
     .BindInput("OutSize",
                {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
     .BindInput("SizeTensor",
                {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
     .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kFPGA),
                                        PRECISION(kFP16),
                                        DATALAYOUT(kNHWC))})
     .Finalize();
@@ -268,15 +263,17 @@ REGISTER_LITE_KERNEL(nearest_interp,
                      kNHWC,
                      paddle::lite::kernels::fpga::NearestInterpCompute,
                      def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
-                                       PRECISION(kFP16),
-                                       DATALAYOUT(kNHWC))})
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
     .BindInput("OutSize",
                {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
     .BindInput("SizeTensor",
                {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
     .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kFPGA),
                                        PRECISION(kFP16),
                                        DATALAYOUT(kNHWC))})
     .Finalize();
diff --git a/lite/kernels/fpga/interpolate_compute.h b/lite/kernels/fpga/interpolate_compute.h
index cc904f936452debc8c1cfb2b8c30a867afcd5163..86f83c71b0b98848d715178ab65c2fb82ab570d3 100644
--- a/lite/kernels/fpga/interpolate_compute.h
+++ b/lite/kernels/fpga/interpolate_compute.h
@@ -14,9 +14,9 @@
 
 #pragma once
 #include <string>
+#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
 #include "lite/core/kernel.h"
 #include "lite/core/op_registry.h"
-#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
 
 namespace paddle {
 namespace lite {
@@ -34,12 +34,12 @@ class BilinearInterpCompute
 class NearestInterpCompute
     : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
  public:
-  
   void PrepareForRun() override;
 
   void Run() override;
 
   virtual ~NearestInterpCompute() = default;
+
  private:
   zynqmp::ResizePE pe_;
 };
diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc
index a7dbf9359f6adf4bb89fbcd78c0ff03637fae173..84f75c2e2a257510c16907bd655863c5b51307de 100755
--- a/lite/kernels/fpga/io_copy_compute.cc
+++ b/lite/kernels/fpga/io_copy_compute.cc
@@ -44,8 +44,6 @@ class IoCopyHostCHWToFpgaHWCCompute
           param.x->target() == TARGET(kFPGA));
     param.x->ZynqTensor()->flush();
 
-    
-    
     if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
       param.y->mutable_data<int>();
       param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
@@ -86,7 +84,7 @@ class IoCopyFpgaToHostCompute
     auto& param = Param<operators::IoCopyParam>();
     CHECK(param.x->target() == TARGET(kHost) ||
           param.x->target() == TARGET(kFPGA));
-    
+
     param.x->ZynqTensor()->syncToDevice();
     param.y->mutable_data<float>();
     param.y->ZynqTensor()->setDataType(zynqmp::FP32);
@@ -104,7 +102,7 @@ class IoCopyFpgaToHostCompute
     } else {
       param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
     }
-    
+
     param.y->ZynqTensor()->invalidate();
     copy_properties(param);
   }
@@ -141,16 +139,22 @@ class IoCopyFpgaToHostCHWCompute
     CHECK(param.x->target() == TARGET(kHost) ||
           param.x->target() == TARGET(kFPGA));
 
-    Tensor hwc;    
+    param.x->ZynqTensor()->syncToDevice();
+    if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
+      param.y->mutable_data<int32_t>();
+      param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
+      return;
+    }
+
+    Tensor hwc;
     hwc.Resize(param.y->dims());
     float* hwc_data = hwc.mutable_data<float>();
     float* chw_data = param.y->mutable_data<float>();
     param.y->ZynqTensor()->setDataType(zynqmp::FP32);
-    param.x->ZynqTensor()->syncToDevice();
 
     hwc.ZynqTensor()->setDataLocation(zynqmp::CPU);
     param.y->ZynqTensor()->setDataLocation(zynqmp::CPU);
-    
+
     if (param.x->ZynqTensor()->aligned() &&
         param.x->ZynqTensor()->shape().shouldAlign()) {
       zynqmp::Tensor tempTensor;
@@ -158,15 +162,15 @@ class IoCopyFpgaToHostCHWCompute
                                       param.x->ZynqTensor()->shape());
       tempTensor.copyFrom(param.x->ZynqTensor());
       tempTensor.setAligned(true);
-      // tempTensor.saveToFile("temp_1", true);
-      tempTensor.unalignImage();
-      // tempTensor.saveToFile("temp_2", true);
-      
+      tempTensor.saveToFile("temp_1", true);
+      //   tempTensor.unalignImage();
+      tempTensor.saveToFile("temp_2", true);
+
       hwc.ZynqTensor()->copyFrom(&tempTensor);
     } else {
       // hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
       float16* in_data = param.x->ZynqTensor()->data<float16>();
-      // float* f_data = 
+      // float* f_data =
       param.x->ZynqTensor()->flush();
       float max = 0;
 
@@ -198,6 +202,7 @@ class IoCopyFpgaToHostCHWCompute
                dims.height(),
                dims.width());
 
+    param.y->ZynqTensor()->copyFrom(hwc.ZynqTensor());
     // param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
     param.y->ZynqTensor()->flush();
     copy_properties(param);
@@ -205,8 +210,8 @@ class IoCopyFpgaToHostCHWCompute
     param.x->ZynqTensor()->invalidate();
     param.x->ZynqTensor()->flush();
     // hwc.ZynqTensor()->saveToFile("hwc", true);
-    // param.x->ZynqTensor()->saveToFile("io2_x", true);
-    // param.y->ZynqTensor()->saveToFile("io2_y", true);
+    param.x->ZynqTensor()->saveToFile("io2_x", true);
+    param.y->ZynqTensor()->saveToFile("io2_y", true);
   }
   std::string doc() const override { return "Copy IO from FPGA to HOST"; }
 };
@@ -238,15 +243,16 @@ REGISTER_LITE_KERNEL(io_copy,
                      kAny,
                      paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute,
                      host_float_chw_to_device_fp16_hwc)
-    .BindInput("Input", {LiteType::GetTensorTy(
-                   TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))})
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW))})
     .BindOutput("Out",
                 {LiteType::GetTensorTy(TARGET(kFPGA),
                                        PRECISION(kFP16),
                                        DATALAYOUT(kNHWC))})
     .Finalize();
 
-
 REGISTER_LITE_KERNEL(io_copy,
                      kFPGA,
                      kAny,
@@ -311,25 +317,24 @@ REGISTER_LITE_KERNEL(io_copy,
 //                                        DATALAYOUT(kAny))})
 //     .Finalize();
 
-
 //  ==========================================================
 
-  // std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
-  //   std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
-  //   *res = [](const std::map<std::string, const Type*>& inputs,
-  //             const std::string& out) -> const Type* {
-  //     CHECK(!inputs.empty());
-  //     auto* type = inputs.at("Input");
-  //     CHECK(type->target() == TARGET(kHost));
-
-  //     auto out_place = type->place();
-  //     out_place.target = TARGET(kFPGA);
-  //     auto* out_type = Type::Get(type->id(),
-  //                                out_place.target,
-  //                                out_place.precision,
-  //                                out_place.layout,
-  //                                out_place.device);
-  //     return out_type;
-  //   };
-  //   return res;
-  // }
\ No newline at end of file
+// std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
+//   std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
+//   *res = [](const std::map<std::string, const Type*>& inputs,
+//             const std::string& out) -> const Type* {
+//     CHECK(!inputs.empty());
+//     auto* type = inputs.at("Input");
+//     CHECK(type->target() == TARGET(kHost));
+
+//     auto out_place = type->place();
+//     out_place.target = TARGET(kFPGA);
+//     auto* out_type = Type::Get(type->id(),
+//                                out_place.target,
+//                                out_place.precision,
+//                                out_place.layout,
+//                                out_place.device);
+//     return out_type;
+//   };
+//   return res;
+// }
\ No newline at end of file
diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc
index c19744fa520e1333180d97a72d8f1f45f9f3a9bf..ac4a5d18a9c7dc9b45684a1ae8072542245fd645 100644
--- a/lite/kernels/fpga/prior_box_compute.cc
+++ b/lite/kernels/fpga/prior_box_compute.cc
@@ -132,4 +132,3 @@ REGISTER_LITE_KERNEL(prior_box,
     .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
-
diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc
index 24c60f54efde70f67aa12a308437ea55ff5f47b7..3f088700f1ee9787d0d19eff00a809cb3daf9ddd 100644
--- a/lite/kernels/fpga/reshape_compute.cc
+++ b/lite/kernels/fpga/reshape_compute.cc
@@ -23,7 +23,6 @@ namespace fpga {
 
 using float16 = zynqmp::float16;
 
-
 void FlattenCompute::Run() {
   auto& param = Param<operators::ReshapeParam>();
   auto x = param.x;
@@ -45,12 +44,10 @@ void FlattenCompute::Run() {
   output->Resize(output_dims);
 
 #ifdef FPGA_PRINT_TENSOR
-  Debugger::get_instance().registerOutput("flatten",
-                                          output->ZynqTensor());
+  Debugger::get_instance().registerOutput("flatten", output->ZynqTensor());
 #endif
 }
 
-
 void ReshapeCompute::Run() {
   auto& param = Param<operators::ReshapeParam>();
   auto x = param.x;
@@ -69,17 +66,14 @@ void ReshapeCompute::Run() {
   } else {
     // output->CopyDataFrom(*x);
   }
-  
-  
 
   output->ZynqTensor()->copyFrom(x->ZynqTensor());
   // output->ZynqTensor()->saveToFile("ro", true);
   output->ZynqTensor()->flush();
   output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned());
-  
+
 #ifdef FPGA_PRINT_TENSOR
-  Debugger::get_instance().registerOutput("reshape",
-                                          output->ZynqTensor());
+  Debugger::get_instance().registerOutput("reshape", output->ZynqTensor());
 #endif
 }
 
@@ -163,7 +157,7 @@ REGISTER_LITE_KERNEL(flatten2,
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
     .BindInput("Shape",
-                {LiteType::GetTensorTy(TARGET(kHost),
+               {LiteType::GetTensorTy(TARGET(kHost),
                                       PRECISION(kAny),
                                       DATALAYOUT(kAny))})
     .BindOutput("Out",
diff --git a/lite/kernels/fpga/softmax_compute.cc b/lite/kernels/fpga/softmax_compute.cc
index 25fceda569a832e0f415fd878705b8a1b99ce34f..8e51a716b092489920f0cfbc729f0cb22e5c71c8 100755
--- a/lite/kernels/fpga/softmax_compute.cc
+++ b/lite/kernels/fpga/softmax_compute.cc
@@ -38,9 +38,9 @@ void SoftmaxCompute::Run() {
   zynqmp::SoftmaxParam& softmax_param = pe_.param();
   // softmax_param.input->saveToFile("softmax_in", true);
   pe_.dispatch();
-  
+
   softmax_param.output->flush();
-  // softmax_param.output->saveToFile("softmax", true);
+// softmax_param.output->saveToFile("softmax", true);
 #ifdef FPGA_PRINT_TENSOR
   Debugger::get_instance().registerOutput("softmax", softmax_param.output);
 #endif
@@ -61,17 +61,9 @@ REGISTER_LITE_KERNEL(softmax,
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
-    .BindOutput("Out",
-                {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
 
-
-
-
-
-
-
-
 // .BindOutput("Out",
 //                 {LiteType::GetTensorTy(TARGET(kFPGA),
 //                                        PRECISION(kFP16),
diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc
index 5f55ae3d9f18f15f82ef15941610a2aecca5e474..d2e9d9c16662a5e8a91ee43af59544c4c1efcecc 100644
--- a/lite/kernels/fpga/transpose_compute.cc
+++ b/lite/kernels/fpga/transpose_compute.cc
@@ -104,7 +104,7 @@ void Transpose2Compute::Run() {
   // param.x->ZynqTensor()->saveToFile("t_unaligned", true);
   param.x->ZynqTensor()->flush();
   param.x->ZynqTensor()->invalidate();
-  
+
   if (param.x->dims().size() != 4) {
     transposeCompute(param);
     param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned());
@@ -115,7 +115,7 @@ void Transpose2Compute::Run() {
   // param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
 
   param.output->ZynqTensor()->flush();
-  // param.output->ZynqTensor()->saveToFile("Transpose2", true);
+  param.output->ZynqTensor()->saveToFile("Transpose2", true);
 }
 
 }  // namespace fpga
@@ -151,8 +151,9 @@ REGISTER_LITE_KERNEL(transpose2,
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
-                                      PRECISION(kFP16),
-                                      DATALAYOUT(kNHWC))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kFPGA),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kNHWC))})
     .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
diff --git a/lite/kernels/fpga/yolo_box_compute.cc b/lite/kernels/fpga/yolo_box_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1e90cf30c4b0d6b5bf93d85811ef8fbe7324ba34
--- /dev/null
+++ b/lite/kernels/fpga/yolo_box_compute.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/fpga/yolo_box_compute.h"
+#include <vector>
+#include "lite/backends/arm/math/funcs.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace fpga {
+
+void YoloBoxCompute::PrepareForRun() {
+  auto& param = Param<operators::YoloBoxParam>();
+  lite::Tensor* X = param.X;
+  lite::Tensor* ImgSize = param.ImgSize;
+  lite::Tensor* Boxes = param.Boxes;
+  lite::Tensor* Scores = param.Scores;
+  
+
+  Boxes->mutable_data<float>();
+  Scores->mutable_data<float>();
+
+  zynqmp::YoloBoxParam& yolobox_param = pe_.param();
+  yolobox_param.input = X->ZynqTensor();
+  yolobox_param.imgSize = ImgSize->ZynqTensor();
+  yolobox_param.outputBoxes = Boxes->ZynqTensor();
+  yolobox_param.outputScores = Scores->ZynqTensor();
+  yolobox_param.downsampleRatio = param.downsample_ratio;
+  yolobox_param.anchors = param.anchors;
+  yolobox_param.classNum = param.class_num;
+  yolobox_param.confThresh = param.conf_thresh;
+
+  pe_.init();
+  pe_.apply();
+
+}
+
+void YoloBoxCompute::Run() {
+
+  pe_.dispatch();
+
+  zynqmp::YoloBoxParam& yolobox_param = pe_.param();
+  yolobox_param.imgSize->saveToFile("img_size", true);
+//   exit(-1);
+  yolobox_param.outputBoxes->saveToFile("yolo_boxes", true);
+  yolobox_param.outputScores->saveToFile("yolo_scores", true);
+}
+
+}  // namespace fpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+// REGISTER_LITE_KERNEL(yolo_box,
+//                      kFPGA,
+//                      kFP16,
+//                      kNHWC,
+//                      paddle::lite::kernels::fpga::YoloBoxCompute,
+//                      def)
+//     .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
+//                                       PRECISION(kFP16),
+//                                       DATALAYOUT(kNHWC))})
+//     .BindInput("ImgSize",
+//                {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+//     .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
+//     .BindOutput("Scores", {LiteType::GetTensorTy(TARGET(kARM))})
+//     .Finalize();
diff --git a/lite/kernels/fpga/yolo_box_compute.h b/lite/kernels/fpga/yolo_box_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4c573cf6719ea1b49fb83b431182ff57c8f4796
--- /dev/null
+++ b/lite/kernels/fpga/yolo_box_compute.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+#include "lite/backends/fpga/KD/float16.hpp"
+#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
+#include "lite/backends/fpga/KD/pes/yolobox_pe.hpp"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace fpga {
+
+using float16 = zynqmp::float16;
+
+class YoloBoxCompute 
+    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
+ public:
+  void PrepareForRun() override;
+  void Run() override;
+
+  virtual ~YoloBoxCompute() {
+
+  };
+
+ private:
+  zynqmp::YoloBoxPE pe_;
+};
+
+}  // namespace fpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle