From a59d6fabd4a287f3676d0ffffb92a63e2fcd7619 Mon Sep 17 00:00:00 2001 From: chonwhite Date: Fri, 19 Jun 2020 10:18:11 +0800 Subject: [PATCH] arm & fpga kernel works together --- lite/backends/fpga/KD/debugger.hpp | 1 + lite/backends/fpga/KD/dispatch/action.hpp | 36 +++ .../backends/fpga/KD/dispatch/transaction.hpp | 40 +++ .../fpga/KD/dispatch/transaction_manager.hpp | 47 +++ lite/backends/fpga/KD/llapi/filter.cpp | 4 +- lite/backends/fpga/KD/pes/conv_process.hpp | 14 +- lite/backends/fpga/KD/pes/input_pe.hpp | 4 +- lite/backends/fpga/KD/pes/norm_pe.hpp | 1 + lite/backends/fpga/KD/pes/output_pe.hpp | 4 +- lite/backends/fpga/KD/pes/prior_box_pe.cpp | 7 +- lite/backends/fpga/KD/pes/prior_box_pe.hpp | 7 + .../fpga/KD/pes/{resize.hpp => resize_pe.hpp} | 36 ++- lite/backends/fpga/KD/pes/scale_pe.hpp | 39 ++- lite/backends/fpga/KD/pes/softmax_pe.cpp | 1 + lite/backends/fpga/KD/pes/split_pe.hpp | 2 +- lite/backends/fpga/KD/tensor.hpp | 35 ++- lite/backends/fpga/lite_tensor.cc | 1 + lite/backends/fpga/lite_tensor.h | 27 +- lite/core/mir/kernel_place_correct_pass.h | 143 ++++++++- lite/core/mir/static_kernel_pick_pass.h | 17 ++ lite/core/mir/type_precision_cast_pass.cc | 10 + lite/core/mir/type_target_cast_pass.cc | 11 +- lite/kernels/arm/concat_compute.cc | 32 +- lite/kernels/fpga/CMakeLists.txt | 2 + lite/kernels/fpga/calib_compute.cc | 29 +- lite/kernels/fpga/calib_compute.h | 12 + lite/kernels/fpga/concat_compute.cc | 3 +- lite/kernels/fpga/conv_compute.cc | 11 + lite/kernels/fpga/elementwise_compute.cc | 52 +++- lite/kernels/fpga/fetch_compute.cc | 24 +- lite/kernels/fpga/interpolate_compute.cc | 282 ++++++++++++++++++ lite/kernels/fpga/interpolate_compute.h | 50 ++++ lite/kernels/fpga/io_copy_compute.cc | 196 ++++++------ lite/kernels/fpga/multiclass_nms_compute.cc | 171 ++++++----- lite/kernels/fpga/prior_box_compute.cc | 3 +- lite/kernels/fpga/reshape_compute.cc | 97 ++++-- lite/kernels/fpga/reshape_compute.h | 8 + lite/kernels/fpga/scale_compute.cc | 4 +- lite/kernels/fpga/scale_compute.h | 2 + lite/kernels/fpga/softmax_compute.cc | 25 +- lite/kernels/fpga/transpose_compute.cc | 34 ++- 41 files changed, 1235 insertions(+), 289 deletions(-) create mode 100644 lite/backends/fpga/KD/dispatch/action.hpp create mode 100644 lite/backends/fpga/KD/dispatch/transaction.hpp create mode 100644 lite/backends/fpga/KD/dispatch/transaction_manager.hpp rename lite/backends/fpga/KD/pes/{resize.hpp => resize_pe.hpp} (64%) create mode 100644 lite/kernels/fpga/interpolate_compute.cc create mode 100644 lite/kernels/fpga/interpolate_compute.h diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index 004536fc8d..454e5db8c6 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -73,6 +73,7 @@ class Debugger { op_config["nms"] = true; op_config["pb_boxes"] = true; op_config["pb_variances"] = true; + op_config["reshape"] = true; op_config["softmax"] = true; op_config["split"] = true; } diff --git a/lite/backends/fpga/KD/dispatch/action.hpp b/lite/backends/fpga/KD/dispatch/action.hpp new file mode 100644 index 0000000000..0235439a07 --- /dev/null +++ b/lite/backends/fpga/KD/dispatch/action.hpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace zynqmp {
+
+class Action {
+ public:
+  void readScale(float* scale) {}
+
+  void writeScale(float* scale) {}
+
+ private:
+  int id_ = -1;
+  int scaleIndex_ = -1;
+};
+
+}  // namespace zynqmp
+}  // namespace paddle
\ No newline at end of file
diff --git a/lite/backends/fpga/KD/dispatch/transaction.hpp b/lite/backends/fpga/KD/dispatch/transaction.hpp
new file mode 100644
index 0000000000..c5f19e0e4e
--- /dev/null
+++ b/lite/backends/fpga/KD/dispatch/transaction.hpp
@@ -0,0 +1,40 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "lite/backends/fpga/KD/dispatch/action.hpp"
+
+namespace paddle {
+namespace zynqmp {
+
+class Transaction {
+ public:
+  void appendAction(Action* action) { actions_.push_back(action); }
+
+  void startTransaction() {}
+
+ private:
+  std::vector<Action*> actions_;
+  int id_ = -1;
+};
+
+}  // namespace zynqmp
+}  // namespace paddle
\ No newline at end of file
diff --git a/lite/backends/fpga/KD/dispatch/transaction_manager.hpp b/lite/backends/fpga/KD/dispatch/transaction_manager.hpp
new file mode 100644
index 0000000000..b24e154402
--- /dev/null
+++ b/lite/backends/fpga/KD/dispatch/transaction_manager.hpp
@@ -0,0 +1,47 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#pragma once
+
+#include <vector>
+
+#include "lite/backends/fpga/KD/dispatch/transaction.hpp"
+
+namespace paddle {
+namespace zynqmp {
+
+class TransactionManager {
+ public:
+  static TransactionManager& get_instance() {
+    static TransactionManager s_instance;
+    return s_instance;
+  }
+
+  Transaction* getTransaction() {
+    if (currentTransaction_ == nullptr) {
+      currentTransaction_ = new Transaction();
+      transactions_.push_back(currentTransaction_);
+    }
+    return currentTransaction_;
+  }
+
+  void endTransaction() { currentTransaction_ = nullptr; }
+
+ private:
+  Transaction* currentTransaction_ = nullptr;
+  std::vector<Transaction*> transactions_;
+};
+
+}  // namespace zynqmp
+}  // namespace paddle
\ No newline at end of file
diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp
index e09b9d67d1..a6dd5e7241 100755
--- a/lite/backends/fpga/KD/llapi/filter.cpp
+++ b/lite/backends/fpga/KD/llapi/filter.cpp
@@ -240,8 +240,8 @@ int8_t* format_filter(float* data_in,
   for (int n = 0; n < num; n++) {
     float* filter_start = data_in + n * chw;
     int8_t* quantized_start = quantized_data + n * chw;
-    // float f_max = find_max(filter_start, chw);
-    float f_max = max;
+    float f_max = find_max(filter_start, chw);
+    // float f_max = max;
     quantize(filter_start, quantized_start, chw, f_max);
     filter_max.push_back(f_max);
   }
diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp
index cea22e0edc..6eed7d6080 100755
--- a/lite/backends/fpga/KD/pes/conv_process.hpp
+++ b/lite/backends/fpga/KD/pes/conv_process.hpp
@@ -264,10 +264,10 @@ inline void format_filter(Tensor* filter,
   quantized_filter->flush();
   fpga_free(quantized_data);

-  // for (size_t i = 0; i < max_values.size(); i++) {
-  //   // scales.push_back(max_values[i] / max_value);
-  //   scales.push_back(1.0f);
-  // }
+  for (size_t i = 0; i < max_values.size(); i++) {
+    scales.push_back(max_values[i] / max_value);
+    // scales.push_back(1.0f);
+  }

   // filter->saveToFile("filter.txt");
   // std::ofstream ofs;
@@ -374,17 +374,15 @@ inline void split_filter_num(const ConvParam& c_param) {
   std::vector<float> v;  // TODO(chonwhite) change variable name;
   format_filter(&new_filter, &(conv_param->filter), param.groups, v, max);
   conv_param->filter.setDataType(INT8);
-
   Tensor scale;
   Tensor bias;

   int chnnnel_start = i * filter_num_per_div;
-
   Shape s_shape(NC, {1, filter_num});
   float* scale_data = scale.mutableData<float>(FP32, s_shape);
   float* bias_data = bias.mutableData<float>(FP32, s_shape);
   for (int n = 0; n < filter_num; n++) {
-    scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
+    scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
   }
   for (int n = 0; n < filter_num; n++) {
     bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
   }
@@ -513,7 +511,7 @@ inline void pack_channel_filter(const ConvParam& c_param) {
   float* scale_data = scale.mutableData<float>(FP32, s_shape);
   float* bias_data = bias.mutableData<float>(FP32, s_shape);
   for (int n = 0; n < filter_current_pack; n++) {
-    scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
+    scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
   }
   for (int n = 0; n < filter_current_pack; n++) {
     bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
diff --git a/lite/backends/fpga/KD/pes/input_pe.hpp b/lite/backends/fpga/KD/pes/input_pe.hpp
index 380c85e17e..ec32658de1 100755
--- a/lite/backends/fpga/KD/pes/input_pe.hpp
+++ b/lite/backends/fpga/KD/pes/input_pe.hpp
@@ -41,7 +41,9 @@ class InputPE : public PE {
       src = &half_tensor;
     }
     output->mutableData<void>();
-    src->alignImage(output, true);
+    src->alignImage();
+    output->copyFrom(src);
+    // src->alignImage(output, true);
    return
true; } diff --git a/lite/backends/fpga/KD/pes/norm_pe.hpp b/lite/backends/fpga/KD/pes/norm_pe.hpp index 0537df27e2..a3da530736 100644 --- a/lite/backends/fpga/KD/pes/norm_pe.hpp +++ b/lite/backends/fpga/KD/pes/norm_pe.hpp @@ -103,6 +103,7 @@ class NormPE : public PE { float_out.flush(); // float_out.saveToFile("normalize_", true); param_.output->copyFrom(&float_out); + param_.output->flush(); } bool dispatch() { diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp index 2d02d30fba..015c934a5b 100755 --- a/lite/backends/fpga/KD/pes/output_pe.hpp +++ b/lite/backends/fpga/KD/pes/output_pe.hpp @@ -56,8 +56,8 @@ class OutputPE : public PE { fpga_reset(); - auto max = fpga_get_memory_size_max(); - std::cout << "PL ===== Max: ===== :: " << max << std::endl; + // auto max = fpga_get_memory_size_max(); + // std::cout << "PL ===== Max: ===== :: " << max << std::endl; return true; } diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp index 00dfe1830f..6c2f99087d 100644 --- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp +++ b/lite/backends/fpga/KD/pes/prior_box_pe.cpp @@ -241,7 +241,7 @@ void PriorBoxPE::compute_prior_box() { } boxes.flush(); - boxes.syncToCPU(); + // boxes.syncToCPU(); variances.flush(); output_boxes->copyFrom(&boxes); output_variances->copyFrom(&variances); @@ -261,11 +261,12 @@ bool PriorBoxPE::dispatch() { } param_.outputBoxes->copyFrom(this->cachedBoxes_); - param_.outputVariances->copyFrom(this->cachedVariances_); + param_.outputBoxes->flush(); - param_.outputBoxes->syncToCPU(); + // param_.outputBoxes->syncToCPU(); param_.outputVariances->flush(); + return true; } } // namespace zynqmp diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.hpp b/lite/backends/fpga/KD/pes/prior_box_pe.hpp index 8afe40dd30..1fe789084b 100755 --- a/lite/backends/fpga/KD/pes/prior_box_pe.hpp +++ b/lite/backends/fpga/KD/pes/prior_box_pe.hpp @@ -35,6 +35,13 @@ class PriorBoxPE : public PE { PriorBoxParam& param() { return param_; } + ~PriorBoxPE() { + if (cachedBoxes_ != nullptr) { + delete cachedBoxes_; + delete cachedVariances_; + } + } + private: PriorBoxParam param_; Tensor* cachedBoxes_ = nullptr; diff --git a/lite/backends/fpga/KD/pes/resize.hpp b/lite/backends/fpga/KD/pes/resize_pe.hpp similarity index 64% rename from lite/backends/fpga/KD/pes/resize.hpp rename to lite/backends/fpga/KD/pes/resize_pe.hpp index f83896d2c7..98728202b6 100644 --- a/lite/backends/fpga/KD/pes/resize.hpp +++ b/lite/backends/fpga/KD/pes/resize_pe.hpp @@ -73,9 +73,43 @@ class ResizePE : public PE { scale[0] = max / 127.0; scale[1] = 127.0 / max; } + void cpu_compute() { + Shape& in_shape = param_.input->shape(); + Shape& out_shape = param_.output->shape(); + int channel = in_shape.channel(); + int in_height = in_shape.height(); + int in_width = in_shape.width(); + int out_width = out_shape.width(); + int factor = out_shape.width() / in_shape.width(); + + param_.input->syncToCPU(); + + for (int h = 0; h < in_height; h++) { + for (int w = 0; w < in_width; w++) { + int src_index = in_width * channel * h + w * channel; + float16* src = param_.input->data() + src_index; + // std::cout << "src_index:" << src_index << std::endl; + for (int v = 0; v < factor; v++) { + for (int i =0; i < factor; i++) { + int dst_index = out_width * channel * h * factor + + out_width * channel * v + + w * channel * factor + + channel * i; + float16* dst = param_.output->data() + dst_index; + memcpy(dst, src, channel * sizeof(float16)); + // std::cout << 
"dst_index:" << dst_index << std::endl; + } + } + } + } + param_.output->flush(); + param_.output->copyScaleFrom(param_.input); + } + bool dispatch() { - bool ret = compute_fpga_resize(args_) == 0; + cpu_compute(); + // bool ret = compute_fpga_resize(args_) == 0; return true; } diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp index 09755c65a3..b6b2daa6a2 100755 --- a/lite/backends/fpga/KD/pes/scale_pe.hpp +++ b/lite/backends/fpga/KD/pes/scale_pe.hpp @@ -141,22 +141,26 @@ class ScalePE : public PE { Tensor* output = param_.output; Tensor float_input; float* image_addr = float_input.mutableData(FP32, input->shape()); - input->syncToCPU(); + // input->syncToCPU(); + // input->invalidate(); float_input.copyFrom(input); float16* data_out = output->data(); - float* scale_data = param_.scale->data(); + float16* scale_data = param_.scale->data(); int wh = input->shape().width() * input->shape().height(); float16* in_data = input->data(); - float max = 0; for (int i = 0; i < wh; i++) { for (int c = 0; c < input->shape().channel(); c++) { int index = i * input->shape().channel() + c; - float value = half_to_float(in_data[index]) * scale_data[c]; + float x = image_addr[index]; + float y = half_to_float(scale_data[c]); + float value = x * y; + // std::cout << " x = " << std::to_string(x) << " y = " << std::to_string(y) << " v = " << std::to_string(value) << std::endl; + // float value = half_to_float(in_data[index]) * 19.3598f; data_out[index] = float_to_half(value); if (value < 0) { @@ -167,24 +171,27 @@ class ScalePE : public PE { } } } + // exit(-1); output->flush(); output->scale()[0] = max / 127.0f; output->scale()[1] = 127.0f / max; } bool dispatch() { - if (param_.scale->dataType() == FP16) { - DepthwiseConvParam& dw_param = dw_pe_.param(); - memcpy(dw_param.quantizedFilter()->mutableData(), - param_.scale->data(), - param_.scale->shape().numel() * sizeof(float16)); - dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0]; - dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1]; - - dw_param.quantizedFilter()->flush(); - } - param_.input->syncToDevice(); - return dw_pe_.dispatch(); + // if (param_.scale->dataType() == FP16) { + // DepthwiseConvParam& dw_param = dw_pe_.param(); + // memcpy(dw_param.quantizedFilter()->mutableData(), + // param_.scale->data(), + // param_.scale->shape().numel() * sizeof(float16)); + // dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0]; + // dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1]; + // dw_param.quantizedFilter()->flush(); + // } + // param_.input->syncToDevice(); + // return dw_pe_.dispatch(); + + cpu_compute(); + return true; } ScaleParam& param() { return param_; } diff --git a/lite/backends/fpga/KD/pes/softmax_pe.cpp b/lite/backends/fpga/KD/pes/softmax_pe.cpp index 099ed20b8f..7a834169fb 100755 --- a/lite/backends/fpga/KD/pes/softmax_pe.cpp +++ b/lite/backends/fpga/KD/pes/softmax_pe.cpp @@ -154,6 +154,7 @@ bool SoftmaxPE::dispatch() { float_output.flush(); output->copyFrom(&float_output); + output->flush(); return true; } diff --git a/lite/backends/fpga/KD/pes/split_pe.hpp b/lite/backends/fpga/KD/pes/split_pe.hpp index 01a0367874..8c382bbf62 100644 --- a/lite/backends/fpga/KD/pes/split_pe.hpp +++ b/lite/backends/fpga/KD/pes/split_pe.hpp @@ -105,7 +105,7 @@ class SplitPE : public PE { in_stride, out_stride[axis]); input_offset += out_stride[axis]; - // out->flush(); + out->flush(); } return true; } diff --git a/lite/backends/fpga/KD/tensor.hpp 
b/lite/backends/fpga/KD/tensor.hpp index 19f8f3b250..2cee46fb55 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -266,22 +266,25 @@ class Tensor { return; } BypassArgs args; - args.input_data_type = - src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; + args.input_data_type = src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; args.input_layout_type = LAYOUT_HWC; args.output_layout_type = LAYOUT_HWC; - args.image = {.address = src->data(), - .scale_address = src->scale(), - .channels = (uint32_t)src->shape().numel(), - .width = 1, - .height = 1, - .pad_width = 0u, - .pad_height = 0u}; + args.image = { + .address = src->data(), + .scale_address = src->scale(), + .channels = (uint32_t)src->shape().numel(), + .width = 1, + .height = 1, + .pad_width = 0U, + .pad_height = 0U + }; ImageOutputArgs output = { - .address = data(), .scale_address = scale(), + .address = data(), + .scale_address = scale(), }; + args.output = output; size_t aligned_remainder = src->shape().numel() % 16; if (aligned_remainder > 0) { @@ -380,6 +383,10 @@ class Tensor { } void save_file_with_name(std::string path) { + // std::cout << "saving file: " << path << std::endl; + void* add = (void*)this; + // printf("tensor @: %p data: %p \n", (void *)add, (void*)data()); + // return; std::ofstream ofs; ofs.open(path); ofs << scale()[0] << " / " << scale()[1] << std::endl; @@ -399,8 +406,15 @@ class Tensor { if (dataType_ == INT32) { value = data()[i]; } + + if (i < 10) { + std::cout << value << ","; + } + ofs << value << std::endl; + } + usleep(30000); ofs.close(); } @@ -451,6 +465,7 @@ class Tensor { value = half_to_float(tensor.data()[i]); } os << value << " "; + } os << "\n"; return os; diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc index 5308640495..7b79ac8915 100755 --- a/lite/backends/fpga/lite_tensor.cc +++ b/lite/backends/fpga/lite_tensor.cc @@ -102,6 +102,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { Resize(other.dims()); auto shape = other.zynq_tensor_->shape(); zynq_tensor_->mutableData(zynq_tensor_->dataType(), shape); + precision_ = other.precision_; // this->ZynqTensor()->copyFrom(other.ZynqTensor()); memcpy(this->ZynqTensor()->data(), diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h index 3574d466e9..c6f837db75 100644 --- a/lite/backends/fpga/lite_tensor.h +++ b/lite/backends/fpga/lite_tensor.h @@ -109,6 +109,7 @@ class TensorLite { template const R *data() const { return zynq_tensor_->data() + offset_; + // return zynq_tensor_->data(); } void Resize(const DDimLite &ddim) { dims_ = ddim; } @@ -198,7 +199,8 @@ class TensorLite { // set values of precision_ and persistable_ after updating it. // If your tensor is just a temp tensor, such as activations, // you can ignore these two attributes. 
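  // Illustrative sketch (assumed example, not upstream code): with the
  // kFloat default below, a fresh TensorLite reports a concrete precision
  // before any data is attached, and mutable_data<T>() later refines it
  // through get_precistion_type<T>(), e.g.
  //
  //   TensorLite t;                        // precision() == kFloat
  //   t.Resize({1, 3, 16, 16});
  //   t.mutable_data<zynqmp::float16>();   // precision() becomes kFP16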
- PrecisionType precision_{PrecisionType::kUnk}; + // PrecisionType precision_{PrecisionType::kUnk}; + PrecisionType precision_{PrecisionType::kFloat}; bool persistable_{false}; DDimLite dims_; @@ -235,6 +237,28 @@ zynqmp::DataType get_date_type() { return data_type; } +template +PrecisionType get_precistion_type() { + PrecisionType data_type = PrecisionType::kUnk; + if (typeid(T) == typeid(float)) { + data_type = PrecisionType::kFloat; + } + if (typeid(T) == typeid(zynqmp::float16)) { + data_type = PrecisionType::kFP16; + } + if (typeid(T) == typeid(int)) { + data_type = PrecisionType::kInt32; + } + if (typeid(T) == typeid(int32_t)) { + data_type = PrecisionType::kInt32; + } + if (typeid(T) == typeid(int8_t)) { + data_type = PrecisionType::kInt8; + } + + return data_type; +} + template R *TensorLite::mutable_data() { std::vector v; @@ -261,6 +285,7 @@ R *TensorLite::mutable_data() { } zynqmp::Shape input_shape(layout_type, v); zynqmp::DataType data_type = get_date_type(); + precision_ = get_precistion_type(); if (zynq_tensor_.get() == nullptr) { zynq_tensor_.reset(new zynqmp::Tensor()); diff --git a/lite/core/mir/kernel_place_correct_pass.h b/lite/core/mir/kernel_place_correct_pass.h index 71c6ea9273..4f7d9d110c 100644 --- a/lite/core/mir/kernel_place_correct_pass.h +++ b/lite/core/mir/kernel_place_correct_pass.h @@ -50,6 +50,7 @@ class KernelPlaceCorrectPass : public DebugPass { VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"]; VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); + // std::cout << "" for (auto& x : graph->StmtTopologicalOrder()) { auto& inst = x->AsStmt(); // The IoCopyOp is a tool operator, it won't support the type inference. @@ -77,6 +78,80 @@ class KernelPlaceCorrectPass : public DebugPass { bool need_correct_place = true; + auto in = x->inlinks.front(); + auto out = x->outlinks.front(); + auto p = in->AsArg().type->precision(); + + std::string node_name = out->AsArg().name; + std::string arg_name = get_argname(node_name, inst.op_info()->outputs()); + + auto op_type = inst.op_type(); + + if (op_type == "reshape" || op_type == "reshape2") { + for (auto* x_in : x->inlinks) { + + std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs()); + // std::cout << "name: " << x_in->AsArg().name << std::endl; + // std::cout << "in_name: " << in_name << std::endl; + if (in_name == "X") { + in = x_in; + std::cout << "found input \n"; + // exit(-1); + } + } + + p = in->AsArg().type->precision(); + if ( p != PrecisionType::kFP16) { + // std::cout << "found an arm ............... 
: " << inst.kernels().size() << std::endl; + // std::cout << "tt:" << TargetRepr(inst.kernels()[0]->target()) << std::endl; + UpdateTarget(inst, TargetType::kHost); + UpdateTensor(inst, in, out, TargetType::kHost); + } + } + + if (inst.op_type() == "fetch") { + UpdateTarget(inst, TargetType::kFPGA); + } + + if (inst.op_type() == "split" || inst.op_type() == "transpose") { + if ( p != PrecisionType::kFP16) { + UpdateTarget(inst, TargetType::kARM); + for (auto* x_out : x->outlinks) { + UpdateTensor(inst, in, x_out, TargetType::kARM); + } + } + } + + if (inst.op_type() == "concat") { + std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target()) << std::endl; + std::cout << "concat p:" << PrecisionToStr(inst.kernels()[0]->precision()) << std::endl; + if ( p != PrecisionType::kFP16) { + UpdateTarget(inst, TargetType::kARM); + UpdateTensor(inst, in, out, TargetType::kARM); + } + } + + // if (inst.op_type() == "elementwise_mul") { + + // for (auto* x_in : x->inlinks) { + + // std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs()); + // std::cout << "name: " << x_in->AsArg().name << std::endl; + // std::cout << "in_name: " << in_name << std::endl; + // if (in_name == "Y") { + // in = x_in; + // std::cout << "found y \n"; + // // exit(-1); + // } + // } + + // if ( p != PrecisionType::kFP16) { + // UpdateTarget(inst, TargetType::kARM); + // UpdateTensor(inst, in, out, TargetType::kARM); + // } + // } + + std::vector in_types; std::vector out_types; for (auto* x_in : x->inlinks) { @@ -88,6 +163,21 @@ class KernelPlaceCorrectPass : public DebugPass { << "-- node name:" << node_name; auto type = inst.picked_kernel().GetInputDeclType(arg_name); + + // std::cout << arg_name <<" is weight:: " << std::to_string(x_in->AsArg().is_weight) + // << " is persist: " << std::to_string(x_in->AsArg().is_persist) << std::endl; + + // std::cout << " type: "<< inst.op_type() << std::endl; + + if (!x_in->AsArg().is_weight) { + auto p = x_in->AsArg().type->precision(); + auto t = x_in->AsArg().type->target(); + auto l = x_in->AsArg().type->layout(); + // std::cout << "p:" << PrecisionToStr(p) << std::endl; + // std::cout << "t:" << TargetRepr(t) << std::endl; + // std::cout << "layout:" << DataLayoutToStr(l) << std::endl; + } + if (!x_in->AsArg().type) { need_correct_place &= false; } else { @@ -129,18 +219,69 @@ class KernelPlaceCorrectPass : public DebugPass { need_correct_place &= (io_target_same && (in_types[0] != this_type)); if (need_correct_place) { // update this kernel's valid place; - UpdateTarget(inst, in_types[0]); + // UpdateTarget(inst, in_types[0]); } } } + // Update me's kUnk fields by other's fields. 
void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) { // NOLINT + // std::cout << "1 kernels: " << std::to_string(inst.kernels().size()) << std::endl; auto new_place = inst.place(); + new_place.target = new_target; + if (new_target == TargetType::kARM) { + new_place.precision = PrecisionType::kFloat; + new_place.layout = DataLayoutType::kNCHW; + } + + if (new_target == TargetType::kHost) { + new_place.precision = PrecisionType::kFloat; + new_place.layout = DataLayoutType::kNCHW; + } + std::vector places; places.push_back(new_place); inst.ResetKernels(places); + // std::cout << "2 kernels: " << std::to_string(inst.kernels().size()) << std::endl; + } + + void UpdateTensor(mir::Node::Stmt& inst, Node* in, Node* out, TargetType new_target = TargetType::kUnk) { + + auto get_argname = [&]( + const std::string& node_name, + const std::map>& argname_map) + -> std::string { + for (auto& ele : argname_map) { + auto it = + std::find(ele.second.begin(), ele.second.end(), node_name); + if (it != ele.second.end()) return ele.first; + } + return ""; + }; + + std::string arg_name = get_argname(out->AsArg().name, inst.op_info()->outputs()); + std::string in_name = get_argname(in->AsArg().name, inst.op_info()->inputs()); + + auto type = inst.picked_kernel().GetInputDeclType(in_name); + auto tmp_ptype = in->AsArg().type->precision(); + auto tmp_target = type->target(); + auto tmp_layout = type->layout(); + + if (new_target == TargetType::kARM) { + tmp_target = TargetType::kARM; + tmp_ptype = PrecisionType::kFloat; + tmp_layout = DataLayoutType::kNCHW; + } + + if (new_target == TargetType::kHost) { + tmp_target = TargetType::kHost; + tmp_ptype = PrecisionType::kFloat; + tmp_layout = DataLayoutType::kNCHW; + } + + out->AsArg().type = LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout); } }; diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index 6d45be3b89..a5e057a11b 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -144,6 +144,23 @@ class StaticKernelPickPass : public mir::StmtPass { } } + if (kernel.target() == TARGET(kFPGA)) { + final_score = 4000; + bool in_match = true; + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + if (in_types.count(in_names[i]) && + in_types.at(in_names[i]) != + kernel.GetInputDeclType(tmp)->precision()) { + in_match = false; + } + } + if (in_match) { + final_score = 5000; + } + } + VLOG(4) << "[score(final)]:" << final_score; VLOG(2) << "-------- pick summary for " << instruct.op_type() << " --------"; diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 121e64dc18..87ebaeeb4b 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -134,6 +134,12 @@ void PrecisionCastPass::Apply(const std::unique_ptr& graph) { // Start from inputs of the graph, those should have place set. std::list nodes; for (auto& node : graph->StmtTopologicalOrder()) { + + // if (node->IsStmt()) { + // auto& s = node->AsStmt(); + // std::cout << "type_precision type:" << s.op_type() << std::endl; + // } + // type_precision_cast_pass nodes.push_back(node); } @@ -231,6 +237,10 @@ void PrecisionCastPass::AddCastInst( // create Op and kernels. bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; std::string cast_type = in_persist ? 
"calib_once" : "calib"; + + // TODO + cast_type = "calib"; + cast_op_output_arg->AsArg().is_persist = in_persist; auto cast_op = LiteOpRegistry::Global().Create(cast_type); CHECK(cast_op) << "create op [" << cast_op << "] failed"; diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index ed16211de4..89dbb4a420 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -32,6 +32,12 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr& graph) { // Start from inputs of the graph, those should have place set. std::list nodes; for (auto& node : graph->StmtTopologicalOrder()) { + // if (node->IsStmt()) { + // auto& s = node->AsStmt(); + // // std::cout << "type_target type:" << s.op_type() << std::endl; + // }else { + // // std::cout << "type_target not a statement \n"; + // } nodes.push_back(node); } @@ -47,6 +53,7 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr& graph) { ComplementInputs(graph.get(), node, in, &copied_nodes); } } + } void TypeTargetTransformPass::ComplementInputs( @@ -127,7 +134,8 @@ void TypeTargetTransformPass::AddIoCopyInst( auto* io_copy_inst = graph->NewInstructNode(); bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy"; + // std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy"; + std::string io_copy_type = "io_copy"; io_copy_output_arg->AsArg().is_persist = in_persist; // create Op and kernels. auto io_copy_op = LiteOpRegistry::Global().Create(io_copy_type); @@ -147,6 +155,7 @@ void TypeTargetTransformPass::AddIoCopyInst( // fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type bool is_found = false; std::vector> selected_kernels; + std::cout << "kernels:" << std::to_string(kernels.size()) << std::endl; for (auto& kernel : kernels) { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); diff --git a/lite/kernels/arm/concat_compute.cc b/lite/kernels/arm/concat_compute.cc index dc78e1b955..f4d23548a9 100644 --- a/lite/kernels/arm/concat_compute.cc +++ b/lite/kernels/arm/concat_compute.cc @@ -64,6 +64,7 @@ void ConcatCompute::Run() { auto& param = Param(); std::vector inputs = param.x; CHECK_GE(inputs.size(), 1); + // std::cout << "concat size:" << std::to_string(inputs.size()) << std::endl; auto* out = param.output; int axis = param.axis; auto* axis_tensor = param.axis_tensor; @@ -72,21 +73,22 @@ void ConcatCompute::Run() { axis = axis_tensor_data[0]; } - switch (inputs.front()->precision()) { - case PRECISION(kFloat): - ConcatFunc(inputs, axis, out); - break; - case PRECISION(kInt32): - ConcatFunc(inputs, axis, out); - break; - case PRECISION(kInt64): - ConcatFunc(inputs, axis, out); - break; - default: - LOG(FATAL) << "Concat does not implement for the " - << "input type:" - << static_cast(inputs.front()->precision()); - } + ConcatFunc(inputs, axis, out); + // switch (inputs.front()->precision()) { + // case PRECISION(kFloat): + // ConcatFunc(inputs, axis, out); + // break; + // case PRECISION(kInt32): + // ConcatFunc(inputs, axis, out); + // break; + // case PRECISION(kInt64): + // ConcatFunc(inputs, axis, out); + // break; + // default: + // LOG(FATAL) << "Concat does not implement for the " + // << "input type:" + // << static_cast(inputs.front()->precision()); + // } } } // namespace arm diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt index 
7a37d19dbc..fd1f3263c8 100755 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -17,6 +17,8 @@ add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps}) add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps}) add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps}) +add_kernel(interpolate_compute_fpga FPGA basic SRCS interpolate_compute.cc DEPS ${fpga_deps}) + add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps}) diff --git a/lite/kernels/fpga/calib_compute.cc b/lite/kernels/fpga/calib_compute.cc index 25614711e7..e4f13aedd8 100755 --- a/lite/kernels/fpga/calib_compute.cc +++ b/lite/kernels/fpga/calib_compute.cc @@ -44,6 +44,17 @@ void CalibComputeFP16ToFp32::Run() { return; } +void CalibComputeFloat2Int::Run() { + auto& param = this->Param(); + const auto* din = param.input->data(); + auto* dout = param.output->mutable_data(); + // param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor()); + //TODO + auto out_lod = param.output->mutable_lod(); + *out_lod = param.input->lod(); + return; +} + } // namespace fpga } // namespace kernels } // namespace lite @@ -65,12 +76,28 @@ REGISTER_LITE_KERNEL(calib, DATALAYOUT(kNHWC))}) .Finalize(); +REGISTER_LITE_KERNEL(calib, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::CalibComputeFloat2Int, + float_2_int_fpga) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); + REGISTER_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::CalibComputeFP16ToFp32, - fp16_to_fp32_fpga) + float_to_int_fpga) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), diff --git a/lite/kernels/fpga/calib_compute.h b/lite/kernels/fpga/calib_compute.h index 3f5c399b9a..9701b52cd9 100644 --- a/lite/kernels/fpga/calib_compute.h +++ b/lite/kernels/fpga/calib_compute.h @@ -45,6 +45,18 @@ class CalibComputeFP16ToFp32 private: }; +class CalibComputeFloat2Int + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeFloat2Int() override{}; + + private: +}; + } // namespace fpga } // namespace kernels } // namespace lite diff --git a/lite/kernels/fpga/concat_compute.cc b/lite/kernels/fpga/concat_compute.cc index ad66e30981..523d357709 100755 --- a/lite/kernels/fpga/concat_compute.cc +++ b/lite/kernels/fpga/concat_compute.cc @@ -47,7 +47,8 @@ void ConcatCompute::Run() { pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::ConcatParam& concat_param = pe_.param(); - Debugger::get_instance().registerOutput("concat", concat_param.output); + concat_param.output->flush(); + // Debugger::get_instance().registerOutput("concat", concat_param.output); #endif } diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc index bd6adf6093..14de934eb3 100644 --- a/lite/kernels/fpga/conv_compute.cc +++ b/lite/kernels/fpga/conv_compute.cc @@ -51,6 +51,11 @@ void ConvCompute::PrepareForRun() { conv_param.activeParam.type = zynqmp::TYPE_RELU; } + if (param.activation_param.Leaky_relu_alpha > 0.001) { + conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU; + conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha; + } + dw_conv_pe_.init(); dw_conv_pe_.apply(); } 
else { @@ -72,9 +77,15 @@ void ConvCompute::PrepareForRun() { conv_param.activeParam.type = zynqmp::TYPE_RELU; } + if (param.activation_param.Leaky_relu_alpha > 0.001) { + conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU; + conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha; + } + conv_pe_.init(); conv_pe_.apply(); } + // std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha << std::endl; } void ConvCompute::Run() { diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc index 0c9df75949..1bcb7f2ae7 100755 --- a/lite/kernels/fpga/elementwise_compute.cc +++ b/lite/kernels/fpga/elementwise_compute.cc @@ -88,13 +88,33 @@ void ElementwiseMulCompute::PrepareForRun() { scale_.mutableData(zynqmp::FP16, shape); zynqmp::float16* bias_data = bias_.mutableData(zynqmp::FP16, shape); - float scale_value = param.Y->data()[0]; + zynqmp::float16 scale_value = 0; + if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) { + scale_value = zynqmp::float_to_half(param.Y->data()[0]); + // std::cout << "FP32 \n"; + } else { + scale_value = param.Y->data()[0]; + // std::cout << "FP16 \n"; + } + + // std::cout << "channel:" << channel << std::endl; + // std::cout << "production:" << param.Y->dims().production() << std::endl; + + // std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl; + // exit(-1); for (int i = 0; i < channel; i++) { if (param.Y->dims().production() != 1) { - scale_value = param.Y->ZynqTensor()->data()[i]; + // scale_value = param.Y->ZynqTensor()->data()[i]; + if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) { + scale_value = zynqmp::float_to_half(param.Y->data()[i]); + } else { + scale_value = param.Y->data()[i]; + } } - scale_data[i] = zynqmp::float_to_half(scale_value); + // std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl; + // exit(-1); + scale_data[i] = scale_value; bias_data[i] = zero_; } @@ -104,15 +124,17 @@ void ElementwiseMulCompute::PrepareForRun() { void ElementwiseMulCompute::Run() { auto& param = Param(); + // std::cout << "param.Y :" << param.Y->persistable() << std::endl; if (!param.Y->persistable()) { + // TODO scale_.copyFrom(param.Y->ZynqTensor()); - scale_.invalidate(); + scale_.flush();//TODO } pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::ScaleParam& scale_param = pe_.param(); - Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input); - Debugger::get_instance().registerOutput("ew_mul", scale_param.output); + // Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input); + // Debugger::get_instance().registerOutput("ew_mul", scale_param.output); #endif } @@ -181,3 +203,21 @@ REGISTER_LITE_KERNEL(elementwise_mul, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::ElementwiseMulCompute, + ew_mul_y_arm) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); \ No newline at end of file diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc index d5c8585aae..4ece2a780d 100755 --- a/lite/kernels/fpga/fetch_compute.cc +++ b/lite/kernels/fpga/fetch_compute.cc @@ -64,18 +64,18 @@ void FetchCompute::Run() { } // 
namespace lite
} // namespace paddle

-REGISTER_LITE_KERNEL(fetch,
-                     kFPGA,
-                     kFP16,
-                     kNHWC,
-                     paddle::lite::kernels::fpga::FetchCompute,
-                     fpga_host)
-    .BindInput("X",
-               {LiteType::GetTensorTy(TARGET(kFPGA),
-                                      PRECISION(kAny),
-                                      DATALAYOUT(kAny))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
-    .Finalize();
+// REGISTER_LITE_KERNEL(fetch,
+//                      kFPGA,
+//                      kFP16,
+//                      kNHWC,
+//                      paddle::lite::kernels::fpga::FetchCompute,
+//                      fpga_host)
+//     .BindInput("X",
+//                {LiteType::GetTensorTy(TARGET(kFPGA),
+//                                       PRECISION(kFP16),
+//                                       DATALAYOUT(kNHWC))})
+//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+//     .Finalize();

 REGISTER_LITE_KERNEL(fetch,
                      kFPGA,
diff --git a/lite/kernels/fpga/interpolate_compute.cc b/lite/kernels/fpga/interpolate_compute.cc
new file mode 100644
index 0000000000..7358ec1bf3
--- /dev/null
+++ b/lite/kernels/fpga/interpolate_compute.cc
@@ -0,0 +1,282 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/fpga/interpolate_compute.h"
+#include <string>
+#include <vector>
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace fpga {
+
+using float16 = zynqmp::float16;
+
+void BilinearInterpCompute::Run() {
+  // auto& param = Param<operators::InterpolateParam>();
+  // lite::Tensor* X = param.X;
+  // lite::Tensor* OutSize = param.OutSize;
+  // auto SizeTensor = param.SizeTensor;
+  // auto Scale = param.Scale;
+  // lite::Tensor* Out = param.Out;
+  // float scale = param.scale;
+  // int out_w = param.out_w;
+  // int out_h = param.out_h;
+  // bool align_corners = param.align_corners;
+  // std::string interp_method = "Bilinear";
+  // lite::arm::math::interpolate(X,
+  //                              OutSize,
+  //                              SizeTensor,
+  //                              Scale,
+  //                              Out,
+  //                              out_h,
+  //                              out_w,
+  //                              scale,
+  //                              align_corners,
+  //                              interp_method);
+}
+
+void nearest_interp(const float16* src,
+                    int w_in,
+                    int h_in,
+                    int c,
+                    float16* dst,
+                    int w_out,
+                    int h_out,
+                    float scale_x,
+                    float scale_y,
+                    bool with_align) {
+  float scale_w_new = (with_align)
+                          ? (static_cast<float>(w_in - 1) / (w_out - 1))
+                          : (static_cast<float>(w_in) / (w_out));
+  float scale_h_new = (with_align)
+                          ? (static_cast<float>(h_in - 1) / (h_out - 1))
+                          : (static_cast<float>(h_in) / (h_out));
+  if (with_align) {
+    for (int h = 0; h < h_out; ++h) {
+      float16* dst_p = dst + h * w_out * c;
+      int near_y = static_cast<int>(scale_h_new * h + 0.5);
+      for (int w = 0; w < w_out; ++w) {
+        int near_x = static_cast<int>(scale_w_new * w + 0.5);
+        // *dst_p++ = src[near_y * w_in + near_x];
+        const float16* src_n = src + (near_y * w_in + near_x) * c;
+        memcpy(dst_p, src_n, c * sizeof(float16));
+        dst_p += c;
+      }
+    }
+  } else {
+    for (int h = 0; h < h_out; ++h) {
+      float16* dst_p = dst + h * w_out * c;
+      int near_y = static_cast<int>(scale_h_new * h);
+      for (int w = 0; w < w_out; ++w) {
+        int near_x = static_cast<int>(scale_w_new * w);
+        const float16* src_n = src + (near_y * w_in + near_x) * c;
+        memcpy(dst_p, src_n, c * sizeof(float16));
+        dst_p += c;
+      }
+    }
+  }
+}
+
+void NearestInterpCompute::PrepareForRun() {
+  auto& param = Param<operators::InterpolateParam>();
+  lite::Tensor* X = param.X;
+  lite::Tensor* OutSize = param.OutSize;
+  lite::Tensor* Out = param.Out;
+
+  Out->mutable_data<float16>();
+
+  zynqmp::ResizeParam& norm_param = pe_.param();
+  norm_param.input = X->ZynqTensor();
+  norm_param.output = Out->ZynqTensor();
+
+  pe_.init();
+  pe_.apply();
+}
+
+// TODO(chonwhite) move these helpers to a shared header;
+inline std::vector<int> get_new_shape(
+    std::vector<const lite::Tensor*> list_new_shape_tensor) {
+  // get tensor from
+  std::vector<int> vec_new_shape;
+  for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
+    auto tensor = list_new_shape_tensor[i];
+    vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
+  }
+
+  return vec_new_shape;
+}
+
+template <typename T>
+inline std::vector<T> get_new_data_from_tensor(
+    const Tensor* new_data_tensor) {
+  std::vector<T> vec_new_data;
+  auto* new_data = new_data_tensor->data<T>();
+  lite::Tensor cpu_starts_tensor;
+  vec_new_data = std::vector<T>(
+      new_data, new_data + new_data_tensor->dims().production());
+  return vec_new_data;
+}
+
+void interpolate(lite::Tensor* X,
+                 lite::Tensor* OutSize,
+                 std::vector<const lite::Tensor*> SizeTensor,
+                 lite::Tensor* Scale,
+                 lite::Tensor* Out,
+                 int out_height,
+                 int out_width,
+                 float scale,
+                 bool with_align,
+                 std::string interpolate_type) {
+  int in_h = X->dims()[2];
+  int in_w = X->dims()[3];
+  if (SizeTensor.size() > 0) {
+    auto new_size = get_new_shape(SizeTensor);
+    out_height = new_size[0];
+    out_width = new_size[1];
+  } else {
+    auto scale_tensor = Scale;
+    if (scale_tensor != nullptr) {
+      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      scale = scale_data[0];
+    }
+    if (scale > 0) {
+      out_height = static_cast<int>(in_h * scale);
+      out_width = static_cast<int>(in_w * scale);
+    }
+    auto out_size = OutSize;
+    if (out_size != nullptr) {
+      auto out_size_data = get_new_data_from_tensor<int>(out_size);
+      out_height = out_size_data[0];
+      out_width = out_size_data[1];
+    }
+  }
+  float height_scale = scale;
+  float width_scale = scale;
+  if (out_width > 0 && out_height > 0) {
+    height_scale = static_cast<float>(out_height) / X->dims()[2];
+    width_scale = static_cast<float>(out_width) / X->dims()[3];
+  }
+  int num_cout = X->dims()[0];
+  int c_cout = X->dims()[1];
+  Out->Resize({num_cout, c_cout, out_height, out_width});
+
+  float16* dout = Out->mutable_data<float16>();
+  const float16* din = X->data<float16>();
+  int out_num = Out->dims()[0];
+  int out_c = Out->dims()[1];
+  int count = out_num;
+  int out_h = Out->dims()[2];
+  int out_w = Out->dims()[3];
+  int spatial_in = in_h * in_w;
+  int spatial_out = out_h * out_w;
+
+  for (int i = 0; i < count; ++i) {
+    nearest_interp(din + spatial_in * out_c * i,
+                   in_w,
+                   in_h,
+                   out_c,
+                   dout + spatial_out * out_c * i,
+                   out_w,
+                   out_h,
+                   1.f / width_scale,
+                   1.f / height_scale,
+                   with_align);
+  }
+}
+
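+// A minimal usage sketch for nearest_interp() above; the 16x16 -> 32x32
+// sizes are illustrative assumptions, not taken from any model. The function
+// recomputes the pixel mapping from the in/out sizes, so the trailing scale
+// arguments are effectively nominal, and each output pixel copies its whole
+// c-channel vector with one memcpy (channel-last FPGA layout).
+//
+//   void nearest_upscale_2x(const float16* src, float16* dst, int c) {
+//     nearest_interp(src, 16, 16, c, dst, 32, 32, 0.5f, 0.5f, false);
+//   }
+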
+void NearestInterpCompute::Run() { + auto& param = Param(); + lite::Tensor* X = param.X; + lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; + lite::Tensor* Out = param.Out; + float scale = param.scale; + int out_w = param.out_w; + int out_h = param.out_h; + bool align_corners = param.align_corners; + + + std::string interp_method = ""; + + X->ZynqTensor()->invalidate();//TODO + X->ZynqTensor()->saveToFile("n_in", true); + interpolate(X, + OutSize, + SizeTensor, + Scale, + Out, + out_h, + out_w, + scale, + align_corners, + interp_method); + + + Out->ZynqTensor()->flush(); + Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor()); + Out->ZynqTensor()->saveToFile("n_out", true); + +} + +} /* namespace fpga */ +} /* namespace kernels */ +} /* namespace lite */ +} /* namespace paddle */ + +REGISTER_LITE_KERNEL(bilinear_interp, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::BilinearInterpCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("OutSize", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(nearest_interp, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::NearestInterpCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("OutSize", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/interpolate_compute.h b/lite/kernels/fpga/interpolate_compute.h new file mode 100644 index 0000000000..cc904f9364 --- /dev/null +++ b/lite/kernels/fpga/interpolate_compute.h @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace fpga {
+
+class BilinearInterpCompute
+    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
+ public:
+  void Run() override;
+
+  virtual ~BilinearInterpCompute() = default;
+};
+
+class NearestInterpCompute
+    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
+ public:
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  virtual ~NearestInterpCompute() = default;
+
+ private:
+  zynqmp::ResizePE pe_;
+};
+
+} /* namespace fpga */
+} /* namespace kernels */
+} /* namespace lite */
+} /* namespace paddle */
diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc
index 2fd4b0afcf..a7dbf9359f 100755
--- a/lite/kernels/fpga/io_copy_compute.cc
+++ b/lite/kernels/fpga/io_copy_compute.cc
@@ -25,10 +25,17 @@ namespace fpga {

 using float16 = zynqmp::float16;

+void copy_properties(operators::IoCopyParam& param) {
+  param.y->set_persistable(param.x->persistable());
+  auto out_lod = param.y->mutable_lod();
+  *out_lod = param.x->lod();
+  param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
+}
+
 /*
  * This kernel copies a tensor from host to FPGA space.
  */
-class IoCopyHostToFpgaCompute
+class IoCopyHostCHWToFpgaHWCCompute
     : public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
  public:
   void Run() override {
@@ -37,52 +44,33 @@ class IoCopyHostToFpgaCompute
           param.x->target() == TARGET(kFPGA));

     param.x->ZynqTensor()->flush();
+
+
     if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
       param.y->mutable_data<int>();
       param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
+      param.y->ZynqTensor()->flush();
+      copy_properties(param);
       return;
     }
-    if (param.x->ZynqTensor()->dataType() == zynqmp::FP32) {
-      param.y->mutable_data<float16>();
-      if (param.x->ZynqTensor()->aligned() &&
-          param.x->ZynqTensor()->shape().shouldAlign()) {
-        zynqmp::Tensor tempTensor;
-        tempTensor.mutableData<void>(zynqmp::FP16,
-                                     param.x->ZynqTensor()->shape());
-        tempTensor.copyFrom(param.x->ZynqTensor());
-        tempTensor.setAligned(true);
-        tempTensor.unalignImage();
-        param.y->ZynqTensor()->copyFrom(&tempTensor);
-      } else {
-        param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
-      }
-      param.y->ZynqTensor()->invalidate();
-      param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
+    param.y->mutable_data<float16>();
+    param.y->ZynqTensor()->setDataLocation(zynqmp::Device);
+    if (param.x->ZynqTensor()->aligned() &&
+        param.x->ZynqTensor()->shape().shouldAlign()) {
+      zynqmp::Tensor tempTensor;
+      tempTensor.mutableData<void>(zynqmp::FP16,
+                                   param.x->ZynqTensor()->shape());
+      tempTensor.copyFrom(param.x->ZynqTensor());
+      tempTensor.setAligned(true);
+      tempTensor.unalignImage();
+      tempTensor.flush();
+      param.y->ZynqTensor()->copyFrom(&tempTensor);
+    } else {
+      param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
     }
-
-    auto out_lod = param.y->mutable_lod();
-    *out_lod = param.x->lod();
-  }
-
-  std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
-    std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
-    *res = [](const std::map<std::string, const Type*>& inputs,
-              const std::string& out) -> const Type* {
-      CHECK(!inputs.empty());
-      auto* type = inputs.at("Input");
-      CHECK(type->target() == TARGET(kHost));
-
-      auto out_place = type->place();
-      out_place.target = TARGET(kFPGA);
-      auto* out_type = Type::Get(type->id(),
-                                 out_place.target,
-                                 out_place.precision,
-                                 out_place.layout,
-                                 out_place.device);
-      return out_type;
-    };
-    return res;
+    copy_properties(param);
+    param.y->ZynqTensor()->invalidate();
   }

   std::string doc() const override {
return "Copy IO from HOST to FPGA"; } @@ -98,10 +86,11 @@ class IoCopyFpgaToHostCompute auto& param = Param(); CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kFPGA)); - + + param.x->ZynqTensor()->syncToDevice(); param.y->mutable_data(); param.y->ZynqTensor()->setDataType(zynqmp::FP32); - param.x->ZynqTensor()->syncToDevice(); + param.y->ZynqTensor()->setDataLocation(zynqmp::CPU); if (param.x->ZynqTensor()->aligned() && param.x->ZynqTensor()->shape().shouldAlign()) { @@ -115,10 +104,9 @@ class IoCopyFpgaToHostCompute } else { param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } - param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); - param.y->ZynqTensor()->flush(); - auto out_lod = param.y->mutable_lod(); - *out_lod = param.x->lod(); + + param.y->ZynqTensor()->invalidate(); + copy_properties(param); } std::string doc() const override { return "Copy IO from FPGA to HOST"; } }; @@ -153,14 +141,16 @@ class IoCopyFpgaToHostCHWCompute CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kFPGA)); - Tensor hwc; + Tensor hwc; hwc.Resize(param.y->dims()); float* hwc_data = hwc.mutable_data(); - float* chw_data = param.y->mutable_data(); param.y->ZynqTensor()->setDataType(zynqmp::FP32); param.x->ZynqTensor()->syncToDevice(); + hwc.ZynqTensor()->setDataLocation(zynqmp::CPU); + param.y->ZynqTensor()->setDataLocation(zynqmp::CPU); + if (param.x->ZynqTensor()->aligned() && param.x->ZynqTensor()->shape().shouldAlign()) { zynqmp::Tensor tempTensor; @@ -168,10 +158,30 @@ class IoCopyFpgaToHostCHWCompute param.x->ZynqTensor()->shape()); tempTensor.copyFrom(param.x->ZynqTensor()); tempTensor.setAligned(true); + // tempTensor.saveToFile("temp_1", true); tempTensor.unalignImage(); + // tempTensor.saveToFile("temp_2", true); + hwc.ZynqTensor()->copyFrom(&tempTensor); } else { - hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor()); + // hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor()); + float16* in_data = param.x->ZynqTensor()->data(); + // float* f_data = + param.x->ZynqTensor()->flush(); + float max = 0; + + for (int i = 0; i < param.x->dims().production(); i++) { + float value = zynqmp::half_to_float(in_data[i]); + hwc_data[i] = value; + if (value < 0) { + value = -value; + } + if (value > max) { + max = value; + } + } + param.x->ZynqTensor()->scale()[0] = max / 127; + param.x->ZynqTensor()->scale()[1] = 127 / max; } int num = 1; @@ -188,10 +198,15 @@ class IoCopyFpgaToHostCHWCompute dims.height(), dims.width()); - param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); + // param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); param.y->ZynqTensor()->flush(); - auto out_lod = param.y->mutable_lod(); - *out_lod = param.x->lod(); + copy_properties(param); + + param.x->ZynqTensor()->invalidate(); + param.x->ZynqTensor()->flush(); + // hwc.ZynqTensor()->saveToFile("hwc", true); + // param.x->ZynqTensor()->saveToFile("io2_x", true); + // param.y->ZynqTensor()->saveToFile("io2_y", true); } std::string doc() const override { return "Copy IO from FPGA to HOST"; } }; @@ -201,52 +216,36 @@ class IoCopyFpgaToHostCHWCompute } // namespace lite } // namespace paddle -// REGISTER_LITE_KERNEL(io_copy, -// kFPGA, -// kAny, -// kAny, -// paddle::lite::kernels::fpga::IoCopyHostToFpgaCompute, -// host_to_device) -// .BindInput("Input", -// {LiteType::GetTensorTy(TARGET(kHost), -// PRECISION(kAny), -// DATALAYOUT(kAny))}) -// .BindOutput("Out", -// {LiteType::GetTensorTy(TARGET(kFPGA), -// PRECISION(kAny), -// DATALAYOUT(kAny))}) -// .Finalize(); - 
REGISTER_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, - paddle::lite::kernels::fpga::IoCopyHostToFpgaCompute, - host_to_device_any_any) + paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute, + host_to_device) .BindInput("Input", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt32), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL(io_copy, + kFPGA, + kAny, + kAny, + paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute, + host_float_chw_to_device_fp16_hwc) + .BindInput("Input", {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) .Finalize(); -// REGISTER_LITE_KERNEL(io_copy, -// kFPGA, -// kAny, -// kAny, -// paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute, -// device_to_host) -// .BindInput("Input", -// {LiteType::GetTensorTy(TARGET(kFPGA), -// PRECISION(kFP16), -// DATALAYOUT(kNHWC))}) -// .BindOutput("Out", -// {LiteType::GetTensorTy(TARGET(kHost), -// PRECISION(kFloat), -// DATALAYOUT(kNHWC))}) -// .Finalize(); REGISTER_LITE_KERNEL(io_copy, kFPGA, @@ -311,3 +310,26 @@ REGISTER_LITE_KERNEL(io_copy, // PRECISION(kAny), // DATALAYOUT(kAny))}) // .Finalize(); + + +// ========================================================== + + // std::unique_ptr GetTypeInferHandler() override { + // std::unique_ptr res(new type_infer_handler_t); + // *res = [](const std::map& inputs, + // const std::string& out) -> const Type* { + // CHECK(!inputs.empty()); + // auto* type = inputs.at("Input"); + // CHECK(type->target() == TARGET(kHost)); + + // auto out_place = type->place(); + // out_place.target = TARGET(kFPGA); + // auto* out_type = Type::Get(type->id(), + // out_place.target, + // out_place.precision, + // out_place.layout, + // out_place.device); + // return out_type; + // }; + // return res; + // } \ No newline at end of file diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc index 23a5aad8e6..9e1e106223 100644 --- a/lite/kernels/fpga/multiclass_nms_compute.cc +++ b/lite/kernels/fpga/multiclass_nms_compute.cc @@ -94,6 +94,7 @@ T PolyIoU(const T* box1, const size_t box_size, const bool normalized) { LOG(FATAL) << "PolyIoU not implement."; + return *box1; } template @@ -128,34 +129,44 @@ void NMSFast(const Tensor& bbox, std::vector* selected_indices, const bool normalized) { // The total boxes for each instance. + // std::cout << "1\n"; int64_t num_boxes = bbox.dims()[0]; + // std::cout << "1,1\n"; // 4: [xmin ymin xmax ymax] // 8: [x1 y1 x2 y2 x3 y3 x4 y4] // 16, 24, or 32: [x1 y1 x2 y2 ... 
xn yn], n = 8, 12 or 16 int64_t box_size = bbox.dims()[1]; + // std::cout << "1,2\n"; std::vector scores_data(num_boxes); std::copy_n(scores.data(), num_boxes, scores_data.begin()); + // std::cout << "1,3\n"; std::vector> sorted_indices; + // std::cout << "1,4\n"; GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + // std::cout << "2\n"; selected_indices->clear(); T adaptive_threshold = nms_threshold; const T* bbox_data = bbox.data(); - + // std::cout << "3\n"; while (sorted_indices.size() != 0) { const int idx = sorted_indices.front().second; + // std::cout << "4\n"; bool keep = true; for (size_t k = 0; k < selected_indices->size(); ++k) { + // std::cout << "5\n"; if (keep) { const int kept_idx = (*selected_indices)[k]; T overlap = T(0.); + // std::cout << "6\n"; // 4: [xmin ymin xmax ymax] if (box_size == 4) { overlap = JaccardOverlap(bbox_data + idx * box_size, bbox_data + kept_idx * box_size, normalized); } + // std::cout << "7\n"; // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 if (box_size == 8 || box_size == 16 || box_size == 24 || box_size == 32) { @@ -168,10 +179,13 @@ void NMSFast(const Tensor& bbox, } else { break; } + // std::cout << "8\n"; } + // std::cout << "9\n"; if (keep) { selected_indices->push_back(idx); } + // std::cout << "10\n"; sorted_indices.erase(sorted_indices.begin()); if (keep && eta < 1 && adaptive_threshold > 0.5) { adaptive_threshold *= eta; @@ -195,21 +209,25 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param, T score_threshold = static_cast(param.score_threshold); int num_det = 0; - int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1]; + int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1]; + Tensor bbox_slice, score_slice; for (int64_t c = 0; c < class_num; ++c) { - Tensor bbox_slice, score_slice; if (c == background_label) continue; + + // std::cout << "------ 1 \n"; if (scores_size == 3) { + // std::cout << "------ scores_size = 3 \n"; scores.Slice(score_slice, c, c + 1); - bbox_slice = bboxes; + // bbox_slice = bboxes; } else { + // std::cout << "------ scores_size != 3 \n"; score_slice.Resize({scores.dims()[0], 1}); bbox_slice.Resize({scores.dims()[0], 4}); SliceOneClass(scores, c, &score_slice); SliceOneClass(bboxes, c, &bbox_slice); } - NMSFast(bboxes, + NMSFast(bboxes,// TODO score_slice, score_threshold, nms_threshold, @@ -226,8 +244,6 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param, *num_nmsed_out = num_det; const T* scores_data = scores.data(); if (keep_top_k > -1 && num_det > keep_top_k) { - Tensor score_slice; - const T* sdata; std::vector>> score_index_pairs; for (const auto& it : *indices) { @@ -275,7 +291,9 @@ void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, const std::map>& selected_indices, const int scores_size, - Tensor* outs) { + Tensor* outs, + int* oindices = nullptr, + const int offset = 0) { int64_t class_num = scores.dims()[1]; int64_t predict_dim = scores.dims()[1]; int64_t box_size = bboxes.dims()[1]; @@ -305,9 +323,15 @@ void MultiClassOutput(const Tensor& scores, if (scores_size == 3) { bdata = bboxes_data + idx * box_size; odata[count * out_dim + 1] = sdata[idx]; // score + if (oindices != nullptr) { + oindices[count] = offset + idx; + } } else { bdata = bbox.data() + idx * box_size; odata[count * out_dim + 1] = *(scores_data + idx * class_num + label); + if (oindices != nullptr) { + oindices[count] = offset + idx * class_num + label; + } } // xmin, ymin, xmax, ymax or multi-points coordinates std::memcpy(odata + 
count * out_dim + 2, bdata, box_size * sizeof(T)); @@ -318,36 +342,18 @@ void MultiClassOutput(const Tensor& scores, void MulticlassNmsCompute::Run() { auto& param = Param(); - auto* boxes_in = param.bboxes; - auto* scores_in = param.scores; + auto* boxes = param.bboxes; + auto* scores = param.scores; auto* outs = param.out; - outs->mutable_data(); - - auto score_dims = boxes_in->dims(); + bool return_index = param.index ? true : false; + auto* index = param.index; + auto score_dims = scores->dims(); auto score_size = score_dims.size(); - Tensor boxes_float; - Tensor scores_float; - - boxes_float.Resize(boxes_in->dims()); - scores_float.Resize(scores_in->dims()); - - boxes_float.mutable_data(); - scores_float.mutable_data(); - - boxes_float.ZynqTensor()->copyFrom(boxes_in->ZynqTensor()); - scores_float.ZynqTensor()->copyFrom(scores_in->ZynqTensor()); - - Tensor* boxes = &boxes_float; - Tensor* scores = &scores_float; - - auto box_dims = boxes->dims(); - int64_t box_dim = boxes->dims()[2]; - std::vector>> all_indices; std::vector batch_starts = {0}; int64_t batch_size = score_dims[0]; - + int64_t box_dim = boxes->dims()[2]; int64_t out_dim = box_dim + 2; int num_nmsed_out = 0; Tensor boxes_slice, scores_slice; @@ -372,79 +378,104 @@ void MulticlassNmsCompute::Run() { uint64_t num_kept = batch_starts.back(); if (num_kept == 0) { - outs->Resize({1, 1}); - float* od = outs->mutable_data(); - od[0] = -1; - batch_starts = {0, 1}; + if (return_index) { + outs->Resize({0, out_dim}); + index->Resize({0, 1}); + } else { + outs->Resize({1, 1}); + float* od = outs->mutable_data(); + od[0] = -1; + batch_starts = {0, 1}; + } } else { outs->Resize({static_cast(num_kept), out_dim}); + outs->mutable_data(); + int offset = 0; + int* oindices = nullptr; for (int i = 0; i < n; ++i) { if (score_size == 3) { scores->Slice(scores_slice, i, i + 1); boxes->Slice(boxes_slice, i, i + 1); scores_slice.Resize({score_dims[1], score_dims[2]}); boxes_slice.Resize({score_dims[2], box_dim}); + if (return_index) { + offset = i * score_dims[2]; + } } else { auto boxes_lod = boxes->lod().back(); scores->Slice(scores_slice, boxes_lod[i], boxes_lod[i + 1]); boxes->Slice(boxes_slice, boxes_lod[i], boxes_lod[i + 1]); + if (return_index) { + offset = boxes_lod[i] * score_dims[1]; + } } int64_t s = static_cast(batch_starts[i]); int64_t e = static_cast(batch_starts[i + 1]); - if (e > s) { Tensor out; outs->Slice(out, s, e); - MultiClassOutput( - scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out); + if (return_index) { + index->Resize({static_cast(num_kept), 1}); + int* output_idx = index->mutable_data(); + oindices = output_idx + s; + } + MultiClassOutput(scores_slice, + boxes_slice, + all_indices[i], + score_dims.size(), + &out, + oindices, + offset); + // out.ZynqTensor()->saveToFile("nms_o", true); outs->ZynqTensor()->copyFrom(out.ZynqTensor()); - out.ZynqTensor()->saveToFile("nms_oo", true); + outs->ZynqTensor()->flush(); } - outs->Resize({static_cast(e - s), out_dim}); } } + LoD lod; lod.emplace_back(batch_starts); + if (return_index) { + index->set_lod(lod); + } outs->set_lod(lod); -#ifdef FPGA_PRINT_TENSOR - Debugger::get_instance().registerOutput("boxes", boxes->ZynqTensor()); - Debugger::get_instance().registerOutput("scores", scores->ZynqTensor()); - Debugger::get_instance().registerOutput("nms", outs->ZynqTensor()); -#endif + // boxes->ZynqTensor()->saveToFile("boxes", true); + // scores->ZynqTensor()->saveToFile("scores", true); + // outs->ZynqTensor()->saveToFile("nms", true); } } // namespace fpga } // 
namespace kernels } // namespace lite } // namespace paddle -// REGISTER_LITE_KERNEL(multiclass_nms, -// kFPGA, -// kFP16, -// kNHWC, -// paddle::lite::kernels::fpga::MulticlassNmsCompute, -// def) -// .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))}) -// .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))}) -// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) -// .Finalize(); - REGISTER_LITE_KERNEL(multiclass_nms, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::MulticlassNmsCompute, - def2) - .BindInput("BBoxes", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) - .BindInput("Scores", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFloat), - DATALAYOUT(kNHWC))}) + def) + .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +// REGISTER_LITE_KERNEL(multiclass_nms, +// kFPGA, +// kFP16, +// kNHWC, +// paddle::lite::kernels::fpga::MulticlassNmsCompute, +// def2) +// .BindInput("BBoxes", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFP16), +// DATALAYOUT(kNHWC))}) +// .BindInput("Scores", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFP16), +// DATALAYOUT(kNHWC))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFloat), +// DATALAYOUT(kNHWC))}) +// .Finalize(); diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc index e1f361440c..c19744fa52 100644 --- a/lite/kernels/fpga/prior_box_compute.cc +++ b/lite/kernels/fpga/prior_box_compute.cc @@ -64,7 +64,7 @@ void PriorBoxCompute::PrepareForRun() { float offset = param.offset; std::vector aspect_ratios_vec; ExpandAspectRatios(aspect_ratio, is_flip, &aspect_ratios_vec); - size_t prior_num = aspect_ratios_vec.size() * min_size.size(); + int prior_num = aspect_ratios_vec.size() * min_size.size(); prior_num += max_size.size(); std::vector order = param.order; bool min_max_aspect_ratios_order = param.min_max_aspect_ratios_order; @@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() { param.boxes->mutable_data(); param.variances->mutable_data(); + zynqmp::PriorBoxParam& priobox_param = pe_.param(); priobox_param.input = param.input->ZynqTensor(); priobox_param.image = param.image->ZynqTensor(); diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc index b79051f5b1..24c60f54ef 100644 --- a/lite/kernels/fpga/reshape_compute.cc +++ b/lite/kernels/fpga/reshape_compute.cc @@ -23,31 +23,64 @@ namespace fpga { using float16 = zynqmp::float16; -void ReshapeCompute::Run() { + +void FlattenCompute::Run() { auto& param = Param(); - param.output->mutable_data(); auto x = param.x; - // auto actual_shape = param.actual_shape; - Tensor* actual_shape = nullptr; // TODO(chonwhite) change it. 
auto output = param.output; - bool inplace = param.inplace; - auto x_dims = x->dims(); + output->mutable_data(); auto output_dims = output->dims(); - if (actual_shape) { - auto actual_shape_dims = actual_shape->dims(); - auto* actual_shape_data = actual_shape->data(); - auto shape = std::vector( - actual_shape_data, actual_shape_data + actual_shape_dims.production()); - // output_dims = lite::operators::ValidateShape(shape, x_dims); //TODO - output->Resize(output_dims); + if (param.inplace) { + output->ShareDataWith(*x); + } else { + // output->CopyDataFrom(*x); } - // if (inplace) { - // output->ShareDataWith(*x); - // } else { - // output->CopyDataFrom(*x); - // } + x->ZynqTensor()->unalignImage(); + // x->ZynqTensor()->saveToFile("fi", true); + output->ZynqTensor()->copyFrom(x->ZynqTensor()); + // output->ZynqTensor()->saveToFile("fo", true); + output->ZynqTensor()->flush(); + output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned()); output->Resize(output_dims); + +#ifdef FPGA_PRINT_TENSOR + Debugger::get_instance().registerOutput("flatten", + output->ZynqTensor()); +#endif +} + + +void ReshapeCompute::Run() { + auto& param = Param(); + auto x = param.x; + auto output = param.output; + auto output_dims = output->dims(); + + x->ZynqTensor()->unalignImage(); + + // x->ZynqTensor()->saveToFile("ri", true); + + output->Resize(output_dims); + output->mutable_data(); + + if (param.inplace) { + output->ShareDataWith(*x); + } else { + // output->CopyDataFrom(*x); + } + + + + output->ZynqTensor()->copyFrom(x->ZynqTensor()); + // output->ZynqTensor()->saveToFile("ro", true); + output->ZynqTensor()->flush(); + output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned()); + +#ifdef FPGA_PRINT_TENSOR + Debugger::get_instance().registerOutput("reshape", + output->ZynqTensor()); +#endif } } // namespace fpga @@ -66,9 +99,9 @@ REGISTER_LITE_KERNEL(reshape, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindInput("Shape", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), @@ -86,9 +119,9 @@ REGISTER_LITE_KERNEL(reshape2, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindInput("Shape", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), @@ -103,16 +136,16 @@ REGISTER_LITE_KERNEL(flatten, kFPGA, kFP16, kNHWC, - paddle::lite::kernels::fpga::ReshapeCompute, + paddle::lite::kernels::fpga::FlattenCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindInput("Shape", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), @@ -123,16 +156,16 @@ REGISTER_LITE_KERNEL(flatten2, kFPGA, kFP16, kNHWC, - paddle::lite::kernels::fpga::ReshapeCompute, + paddle::lite::kernels::fpga::FlattenCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindInput("Shape", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", 
{LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), diff --git a/lite/kernels/fpga/reshape_compute.h b/lite/kernels/fpga/reshape_compute.h index cc5ed0b565..8a3b3c266e 100755 --- a/lite/kernels/fpga/reshape_compute.h +++ b/lite/kernels/fpga/reshape_compute.h @@ -30,6 +30,14 @@ class ReshapeCompute virtual ~ReshapeCompute() = default; }; +class FlattenCompute + : public KernelLite { + public: + void Run() override; + + virtual ~FlattenCompute() = default; +}; + class ReshapeComputeFpgaToHost : public KernelLite { public: diff --git a/lite/kernels/fpga/scale_compute.cc b/lite/kernels/fpga/scale_compute.cc index 991c73f295..f28fbf736c 100755 --- a/lite/kernels/fpga/scale_compute.cc +++ b/lite/kernels/fpga/scale_compute.cc @@ -29,8 +29,8 @@ void ScaleCompute::PrepareForRun() { scale_param.output = param.output->ZynqTensor(); int channel = scale_param.input->shape().channel(); - zynqmp::Tensor* scale = new zynqmp::Tensor(); - zynqmp::Tensor* bias = new zynqmp::Tensor(); + zynqmp::Tensor* scale = &scale_; + zynqmp::Tensor* bias = &bias_; zynqmp::Shape shape(zynqmp::N, {channel}); float* scale_data = scale->mutableData(zynqmp::FP32, shape); float* bias_data = bias->mutableData(zynqmp::FP32, shape); diff --git a/lite/kernels/fpga/scale_compute.h b/lite/kernels/fpga/scale_compute.h index 217399db72..10ddf04ca7 100755 --- a/lite/kernels/fpga/scale_compute.h +++ b/lite/kernels/fpga/scale_compute.h @@ -37,6 +37,8 @@ class ScaleCompute private: zynqmp::ScalePE pe_; + zynqmp::Tensor scale_; + zynqmp::Tensor bias_; }; } // namespace fpga diff --git a/lite/kernels/fpga/softmax_compute.cc b/lite/kernels/fpga/softmax_compute.cc index b13b5f0f46..25fceda569 100755 --- a/lite/kernels/fpga/softmax_compute.cc +++ b/lite/kernels/fpga/softmax_compute.cc @@ -26,7 +26,8 @@ void SoftmaxCompute::PrepareForRun() { zynqmp::SoftmaxParam& softmax_param = pe_.param(); auto& param = Param(); - param.output->mutable_data(); + // param.output->mutable_data(); + param.output->mutable_data(); softmax_param.input = param.x->ZynqTensor(); softmax_param.output = param.output->ZynqTensor(); pe_.init(); @@ -34,9 +35,13 @@ void SoftmaxCompute::PrepareForRun() { } void SoftmaxCompute::Run() { + zynqmp::SoftmaxParam& softmax_param = pe_.param(); + // softmax_param.input->saveToFile("softmax_in", true); pe_.dispatch(); + + softmax_param.output->flush(); + // softmax_param.output->saveToFile("softmax", true); #ifdef FPGA_PRINT_TENSOR - zynqmp::SoftmaxParam& softmax_param = pe_.param(); Debugger::get_instance().registerOutput("softmax", softmax_param.output); #endif } @@ -57,7 +62,17 @@ REGISTER_LITE_KERNEL(softmax, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + + + + + + + + +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFP16), +// DATALAYOUT(kNHWC))}) \ No newline at end of file diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc index 4ffeb4c82b..5f55ae3d9f 100644 --- a/lite/kernels/fpga/transpose_compute.cc +++ b/lite/kernels/fpga/transpose_compute.cc @@ -34,17 +34,17 @@ void transposeCompute(operators::TransposeParam param) { input_x->ZynqTensor()->invalidate(); input_x->ZynqTensor()->unalignImage(); - Tensor float_input; - float_input.Resize(input_x_dims); - float_input.mutable_data(); - float_input.ZynqTensor()->copyFrom(input_x->ZynqTensor()); + // Tensor float_input; + // float_input.Resize(input_x_dims); 
+ // float_input.mutable_data(); + // float_input.ZynqTensor()->copyFrom(input_x->ZynqTensor()); - const auto* input_x_data = float_input.data(); + const auto* input_x_data = input_x->data(); auto* out = param.output; const auto axis = param.axis; - auto* out_data = out->mutable_data(); + auto* out_data = out->mutable_data(); size_t ndim = axis.size(); std::vector xdim(ndim); @@ -84,10 +84,11 @@ void transposeCompute(operators::TransposeParam param) { void TransposeCompute::Run() { auto& param = this->Param(); param.output->mutable_data(); - param.x->ZynqTensor()->invalidate(); + // param.x->ZynqTensor()->invalidate(); param.x->ZynqTensor()->unalignImage(); if (param.x->dims().size() != 4) { transposeCompute(param); + param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned()); } else { param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } @@ -96,14 +97,25 @@ void TransposeCompute::Run() { // Transpose2 void Transpose2Compute::Run() { auto& param = this->Param(); - param.output->mutable_data(); - param.x->ZynqTensor()->invalidate(); + param.output->mutable_data(); + // param.x->ZynqTensor()->syncToCPU(); + // param.x->ZynqTensor()->saveToFile("t_in", true); param.x->ZynqTensor()->unalignImage(); + // param.x->ZynqTensor()->saveToFile("t_unaligned", true); + param.x->ZynqTensor()->flush(); + param.x->ZynqTensor()->invalidate(); + if (param.x->dims().size() != 4) { transposeCompute(param); + param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned()); } else { param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } + + // param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor()); + + param.output->ZynqTensor()->flush(); + // param.output->ZynqTensor()->saveToFile("Transpose2", true); } } // namespace fpga @@ -139,6 +151,8 @@ REGISTER_LITE_KERNEL(transpose2, {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -- GitLab
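// ==========================================================================
// [editor's notes on the patch above]
//
// 1) NMSFast keeps the greedy suppression loop with an adaptive threshold:
// after each kept box, if eta < 1 and the threshold is still above 0.5, the
// threshold decays by eta, suppressing more aggressively as boxes
// accumulate. A self-contained sketch of that logic for the common 4-coord
// normalized-box case (function and parameter names are illustrative, not
// the patch's API):
#include <algorithm>
#include <utility>
#include <vector>

// IoU of two axis-aligned boxes stored as [xmin, ymin, xmax, ymax].
static float iou(const float* a, const float* b) {
  float ix0 = std::max(a[0], b[0]);
  float iy0 = std::max(a[1], b[1]);
  float ix1 = std::min(a[2], b[2]);
  float iy1 = std::min(a[3], b[3]);
  float inter = std::max(0.0f, ix1 - ix0) * std::max(0.0f, iy1 - iy0);
  float area_a = (a[2] - a[0]) * (a[3] - a[1]);
  float area_b = (b[2] - b[0]) * (b[3] - b[1]);
  float uni = area_a + area_b - inter;
  return uni > 0.0f ? inter / uni : 0.0f;
}

std::vector<int> greedy_nms(const std::vector<float>& boxes,   // n * 4
                            const std::vector<float>& scores,  // n
                            float score_threshold,
                            float nms_threshold,
                            float eta) {
  std::vector<std::pair<float, int>> order;
  for (int i = 0; i < static_cast<int>(scores.size()); ++i) {
    if (scores[i] > score_threshold) order.emplace_back(scores[i], i);
  }
  std::sort(order.begin(), order.end(),
            [](const std::pair<float, int>& a,
               const std::pair<float, int>& b) { return a.first > b.first; });
  std::vector<int> keep;
  float adaptive = nms_threshold;  // decays when eta < 1, as in NMSFast
  for (const auto& cand : order) {
    bool ok = true;
    for (int k : keep) {
      if (iou(&boxes[cand.second * 4], &boxes[k * 4]) > adaptive) {
        ok = false;
        break;
      }
    }
    if (ok) {
      keep.push_back(cand.second);
      if (eta < 1.0f && adaptive > 0.5f) adaptive *= eta;
    }
  }
  return keep;
}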
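// 2) For non-4D tensors, transposeCompute permutes axes element-by-element
// using source strides. A generic sketch of that index-remapping scheme
// (element type generalized to float here; the patched version now reads
// and writes float16 buffers directly):
#include <vector>

void transpose_nd(const float* in, float* out,
                  const std::vector<int>& in_dims,  // source shape
                  const std::vector<int>& axis) {   // out dim i = in dim axis[i]
  size_t ndim = axis.size();
  std::vector<int> out_dims(ndim);
  for (size_t i = 0; i < ndim; ++i) out_dims[i] = in_dims[axis[i]];

  // Row-major strides of the source shape.
  std::vector<int> in_stride(ndim, 1);
  for (int i = static_cast<int>(ndim) - 2; i >= 0; --i) {
    in_stride[i] = in_stride[i + 1] * in_dims[i + 1];
  }

  int total = 1;
  for (int d : out_dims) total *= d;

  std::vector<int> idx(ndim, 0);  // current N-D index into the output
  for (int o = 0; o < total; ++o) {
    int src = 0;
    for (size_t i = 0; i < ndim; ++i) src += idx[i] * in_stride[axis[i]];
    out[o] = in[src];
    // Advance the output index, last dimension fastest.
    for (int i = static_cast<int>(ndim) - 1; i >= 0; --i) {
      if (++idx[i] < out_dims[i]) break;
      idx[i] = 0;
    }
  }
}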
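// 3) The scale_compute change swaps the `new zynqmp::Tensor()` locals for
// the new scale_/bias_ members, so the buffers are no longer leaked on each
// PrepareForRun call. A compilable sketch of the ownership pattern (the
// Tensor stand-in and channel count below are hypothetical):
#include <vector>

struct Tensor {
  std::vector<float> data;  // stand-in for zynqmp::Tensor storage
};

class ScaleKernel {
 public:
  void PrepareForRun() {
    // Before: Tensor* scale = new Tensor();  // leaked; nothing deleted it.
    // After: point at members whose lifetime matches the kernel object.
    Tensor* scale = &scale_;
    Tensor* bias = &bias_;
    scale->data.assign(channel_, 1.0f);  // identity per-channel scale
    bias->data.assign(channel_, 0.0f);   // zero per-channel bias
  }

 private:
  int channel_ = 8;  // hypothetical channel count
  Tensor scale_;
  Tensor bias_;
};
// ==========================================================================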