Commit 4ea3ca7a authored by xiaogang, committed by GitHub

fix: update fpga backend and kernel (#2848)

* fix: update fpga backend and kernel
test=develop

* style: style fix
test=develop
Parent 4c1bf784
@@ -32,7 +32,8 @@ class Debugger {
   }
   void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
-    if (op_type != "conv") {  // NOLINT
+    if (op_config[op_type]) {
+      tensor->saveToFile(op_type, true);
     }
   }
@@ -40,8 +41,21 @@ class Debugger {
   std::unordered_map<std::string, bool> op_config;
   Debugger() {
     op_config["concat"] = true;
+    op_config["pooling"] = true;
     op_config["conv"] = true;
+    op_config["dwconv"] = true;
+    op_config["ew_add"] = true;
     op_config["crop"] = true;
+    op_config["feed"] = true;
+    op_config["mul"] = true;
+    op_config["fetch"] = true;
+    op_config["boxes"] = true;
+    op_config["scores"] = true;
+    op_config["nms"] = true;
+    op_config["pb_boxes"] = true;
+    op_config["pb_variances"] = true;
+    // op_config["fc"] = true;
+    op_config["softmax"] = true;
   }
 };
@@ -131,9 +145,7 @@ inline void save_tensor(const lite::Tensor* t,
     chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
     data = dst;
   }
   save_float(data, name, t->numel());
   delete[] dst;
 }
 }  // namespace lite
......
@@ -31,7 +31,7 @@ void saveToFile(std::string name, void* data_in, int size) {
   std::ofstream ofs;
   ofs.open(name);
-  int8_t* data = static_cast<int8_t*> data_in;
+  int8_t* data = static_cast<int8_t*>(data_in);
   for (int i = 0; i < size; i++) {
     float value = data[i];
     ofs << value << std::endl;
@@ -84,6 +84,14 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
   }
 }
+
+int calc_pack_num(int num_per_group, int group, int division_capacity) {
+  auto n = 1;
+  while ((num_per_group * (group + n - 1) / n) > division_capacity) {
+    n++;
+  }
+  return (n);
+}
 void convert_to_hwc(int8_t* chw_data,
                     int8_t* hwc_data,
                     int num,
@@ -231,10 +239,9 @@ int8_t* format_filter(float* data_in,
   for (int n = 0; n < num; n++) {
     float* filter_start = data_in + n * chw;
-    float f_max = find_max(filter_start, chw);
     int8_t* quantized_start = quantized_data + n * chw;
     quantize(filter_start, quantized_start, chw, max);
-    filter_max.push_back(max);
+    filter_max.push_back(1);
   }
   int8_t* hwc_data =
@@ -256,6 +263,7 @@ int8_t* format_filter(float* data_in,
   int filter_num_alignment = get_filter_num_alignment();
   int num_per_div_after_alignment =
       align_to_x(num_per_div_before_alignment, filter_num_alignment);
+
   int num_element = div_num * num_per_div_after_alignment * chw_aligned;
   int8_t* num_aligned_data =
       reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
......
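The new calc_pack_num searches upward for the smallest pack count n at which one pack's worth of filters, estimated by the integer expression num_per_group * (group + n - 1) / n, no longer exceeds the hardware division capacity. A minimal standalone sketch of the same search with made-up numbers (the capacity and group sizes below are illustrative, not taken from the commit):

#include <iostream>

// Same loop as the new calc_pack_num: grow n until one pack's worth of
// filters fits within division_capacity.
int calc_pack_num(int num_per_group, int group, int division_capacity) {
  int n = 1;
  while ((num_per_group * (group + n - 1) / n) > division_capacity) {
    n++;
  }
  return n;
}

int main() {
  // Hypothetical: 32 aligned filters per group, 4 groups, capacity 96.
  // n=1 gives 32*4=128 > 96; n=2 gives 32*5/2=80 <= 96, so 2 packs.
  std::cout << calc_pack_num(32, 4, 96) << std::endl;  // prints 2
  return 0;
}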
@@ -31,6 +31,7 @@ int calc_division_capacity(int chw);
 int calc_split_num(int num, int division_capacity);
 int calc_division_number(int num, int group_num, int division_capacity);
 int calc_num_per_div(int num, int group_num, int division_capacity);
+int calc_pack_num(int num_per_group, int group, int division_capacity);
 float find_max(float* data_in, int data_size);
 int8_t* format_filter(float* data_in,
@@ -40,11 +41,13 @@ int8_t* format_filter(float* data_in,
                       int height,
                       int width,
                       int group_num,
-                      float max,  // NOLINT
+                      float max,
                       std::vector<float>& filter_max);  // NOLINT
 void convert_to_hwn(int16_t** data_in, int num, int height, int width);
 size_t align_element_n(int16_t** data_in, int num, int height, int width);
+// void quantize_to_fp16(float** data_in, int num, int height, int width,
+//                       float* scale_ptr);
 size_t format_dwconv_filter(
     float** data_in, int num, int height, int width, float* scale_ptr);
......
@@ -28,7 +28,7 @@ limitations under the License. */
 namespace paddle {
 namespace zynqmp {
-#define PADDLE_OS_LINUX
+#define PADDLE_MOBILE_OS_LINUX
 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
@@ -38,7 +38,7 @@ static size_t memory_size_max = 0;
 static size_t memory_size = 0;
 static inline int do_ioctl(uint64_t req, const void *arg) {
-#ifdef PADDLE_OS_LINUX
+#ifdef PADDLE_MOBILE_OS_LINUX
   return ioctl(fd, req, arg);
 #else
   return -1;
@@ -61,17 +61,33 @@ void reset_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
-#ifdef ENABLE_DEBUG
-#endif
-#ifdef PADDLE_OS_LINUX
+#ifdef PADDLE_MOBILE_OS_LINUX
   void *ptr = reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
-  if (ptr == NULL) {
+  if (ptr == MAP_FAILED) {
     std::cout << "not enough memory !";
     exit(-1);
   }
+  if (errno == ENOMEM) {
+    std::cout << "mmap failed with not enough memory !";
+    exit(-1);
+  }
+  if (errno == EINVAL) {
+    std::cout << "mmap failed with invalid arguments ! (size=" << size << ")"
+              << std::endl;
+    exit(-1);
+  }
+  if (ptr == NULL) {
+    std::cout << "NULL returned, errno=" << errno
+              << ", mmap failed with other errors other than memory usage !"
+              << std::endl;
+    exit(-1);
+  }
   memory_map.insert(std::make_pair(ptr, size));
   memory_size += size;
   if (memory_size > memory_size_max) {
     memory_size_max = memory_size;
   }
@@ -87,7 +103,7 @@ size_t fpga_get_memory_size_max() { return memory_size_max; }
 size_t fpga_diagnose_memory(int detailed) {
   size_t total = 0;
-  auto iter = memory_map.begin();  // std::map<void *, size_t>::iterator
+  auto iter = memory_map.begin();
   while (iter != memory_map.end()) {
     total += iter->second;
     iter++;
@@ -97,13 +113,15 @@ size_t fpga_diagnose_memory(int detailed) {
 void fpga_free(void *ptr) {
   size_t size = 0;
-  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
+  auto iter = memory_map.find(ptr);
   if (iter != memory_map.end()) {
     size = iter->second;
     memory_map.erase(iter);
   }
   memory_size -= size;
-#ifdef PADDLE_OS_LINUX
+
+#ifdef PADDLE_MOBILE_OS_LINUX
   munmap(ptr, size);
 #else
   free(ptr);
@@ -230,6 +248,7 @@ int perform_bypass(const struct BypassArgs &args) {
     ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
     scale = std::max(scale, scales[0]);
   }
+
   args.output.scale_address[0] = scale;
   args.output.scale_address[1] = 1.0f / scale;
   return ret;
@@ -238,6 +257,26 @@ int perform_bypass(const struct BypassArgs &args) {
 int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
 int compute_fpga_scale(const struct ScaleArgs &args) {
+#ifdef ENABLE_DEBUG
+  std::cout << "======Compute Scale======";
+  std::cout << "scale_address:" << args.scale_address << std::endl;
+  std::cout << "bias_address:" << args.bias_address << std::endl;
+  std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
+  std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
+  std::cout << " image_address:" << args.image.address
+            << " image_scale_address:" << args.image.scale_address
+            << " image_channels:" << args.image.channels
+            << " image_height:" << args.image.height
+            << " image_width:" << args.image.width
+            << " pad_height:" << args.image.pad_height
+            << " pad_width:" << args.image.pad_width;
+  std::cout << " out_address:" << args.output.address
+            << " out_scale_address:" << args.output.scale_address;
+#endif
+
   return do_ioctl(IOCTL_CONFIG_SCALE, &args);
 }
......
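The fpga_malloc fix above hinges on the mmap contract: on failure mmap returns MAP_FAILED ((void *)-1) and sets errno, so the old `ptr == NULL` test let failed mappings slip through. A minimal sketch of the corrected check, independent of the FPGA driver (the fd and size here are placeholders):

#include <sys/mman.h>
#include <cstdio>
#include <cstdlib>

void *checked_mmap(int fd, size_t size) {
  void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (ptr == MAP_FAILED) {
    // errno distinguishes the cause, e.g. ENOMEM vs. EINVAL.
    perror("mmap");
    exit(-1);
  }
  return ptr;
}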
@@ -28,7 +28,6 @@ namespace zynqmp {
 typedef int16_t half;
 #define IMAGE_ALIGNMENT 16           // Aligned to 16
-#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
 #define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
 #define BS_NUM_ALIGNMENT 8
 #define BIAS_NUM_ALIGNMENT 16
@@ -44,14 +43,11 @@ enum DLayoutType {
 };
 enum ActiveType {
-  TYPE_RELU = 0,
-  TYPE_RELU6 = 1,
-  TYPE_LEAK_RELU = 2,
-  TYPE_SIGMOID = 3,
-};
-
-struct VersionArgs {
-  void* buffer;
-};
+  TYPE_NONE = 0,
+  TYPE_RELU = 1,
+  TYPE_RELU6 = 2,
+  TYPE_LEAKY_RELU = 3,
+  TYPE_SIGMOID = 4,
+};
 struct DeviceInfo {
@@ -67,6 +63,11 @@ struct DeviceInfo {
   uint32_t reserved6;
 };
+struct VersionArgs {
+  void* buffer;
+  size_t size;
+};
+
 struct MemoryCopyArgs {
   void* src;
   void* dest;
@@ -78,7 +79,9 @@ struct MemoryCacheArgs {
   size_t size;
 };
-struct MemoryBarrierArgs {};
+struct MemoryBarrierArgs {
+  uint16_t dummy;
+};
 struct BNArgs {
   bool enabled;
......
@@ -30,8 +30,13 @@ struct ReLUParam {
   float leaky_relu_factor = 0.0f;
 };
+struct ActiveParam {
+  enum ActiveType type = TYPE_NONE;
+  float leaky_relu_factor;
+};
 struct PEParam {
-  ReLUParam relu;
+  ActiveParam activeParam;
 };
 struct InputParam : PEParam {
@@ -100,24 +105,6 @@ struct DepthwiseConvParam : ConvParam {
   Tensor* quantizedFilter_ = new Tensor();
 };
-struct GRUParam : PEParam {
- public:
-  Tensor* input = nullptr;
-  Tensor* h0 = nullptr;
-  Tensor* weight = nullptr;
-  Tensor* bias = nullptr;
-  Tensor* batch_gate = nullptr;
-  Tensor* batch_reset_hidden_prev = nullptr;
-  Tensor* batch_hidden = nullptr;
-  Tensor* hidden = nullptr;
-
-  std::string gate_activation = "sigmoid";
-  std::string activation = "tanh";
-  bool is_reverse = false;
-  bool origin_mode = false;
-};
 enum PoolingType : int {
   MAX = 0,
   AVERAGE = 1,
@@ -155,7 +142,8 @@ struct ElementwiseAddParam : PEParam {
 struct ElementwiseMulParam : PEParam {
  public:
-  std::vector<Tensor*> inputs;
+  Tensor* input_x;
+  Tensor* input_y = nullptr;
   Tensor* output = nullptr;
 };
@@ -223,6 +211,17 @@ struct PriorBoxParam : PEParam {
   float offset;
 };
+struct YoloBoxParam : PEParam {
+  Tensor* input;
+  Tensor* imgSize;
+  Tensor* outputBoxes;
+  Tensor* outputScores;
+  int downsampleRatio;
+  std::vector<int> anchors;
+  int classNum;
+  float confThresh;
+};
 struct ScaleParam : PEParam {
  public:
   Tensor* input = nullptr;
@@ -255,5 +254,24 @@ struct CropParam : PEParam {
   std::vector<int> offsets;
   std::vector<int> shape;
 };
+struct GRUParam : PEParam {
+ public:
+  Tensor* input = nullptr;
+  Tensor* h0 = nullptr;
+  Tensor* weight = nullptr;
+  Tensor* bias = nullptr;
+  Tensor* batch_gate = nullptr;
+  Tensor* batch_reset_hidden_prev = nullptr;
+  Tensor* batch_hidden = nullptr;
+  Tensor* hidden = nullptr;
+
+  std::string gate_activation = "sigmoid";
+  std::string activation = "tanh";
+  bool is_reverse = false;
+  bool origin_mode = false;
+};
 }  // namespace zynqmp
 }  // namespace paddle
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "lite/backends/fpga/KD/pes/conv_process.hpp"
 #include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
 #include "lite/backends/fpga/KD/pes/scale_pe.hpp"
+#include "lite/backends/fpga/KD/pes/split_pe.hpp"
 namespace paddle {
 namespace zynqmp {
@@ -41,6 +42,8 @@ class ConvPE : public PE {
   void apply() {
     split_axis = fill_split_arg(param_);
+    split_channel = param_.groups != 1 && param_.splitParams().size() > 1;
     if (split_axis == 0 && param_.splitParams().size() > 1) {
       ConcatParam& concat_param = concatPE_.param();
       for (auto conv_param : param_.splitParams()) {
@@ -51,107 +54,28 @@ class ConvPE : public PE {
       concatPE_.apply();
     }
+    if (split_channel) {
+      SplitParam& split_param = splitPE_.param();
+      split_param.input = param_.input;
+      for (auto conv_param : param_.splitParams()) {
+        split_param.outputs.push_back(&conv_param->input);
+      }
+      splitPE_.init();
+      splitPE_.apply();
+    }
     if (DLEngine::get_instance().isZU3() &&
         param_.input->shape().dimSize() == 4 &&
         param_.input->shape().width() == 1 &&
-        param_.input->shape().width() >= 2048) {
+        param_.input->shape().channel() >= 2048) {
       use_cpu_ = true;
     }
-    if (!use_cpu_) {
-      if (param_.filter->shape().width() == 1 &&
-          param_.filter->shape().height() == 1) {  // NOLINT
-      }
-      if (!use_cpu_) {  // NOLINT
-      }
-    }
+    // param_.filter->releaseData();
   }
-  void cpu_conv_hwc() {
-    Tensor* input = param_.input;
-    Tensor* output = param_.output;
-    input->syncToCPU();
-    Tensor float_input;
-    Tensor float_output;
-    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
-    float_input.copyFrom(input);
-    float_input.syncToCPU();
-    float* out = float_output.mutableData<float>(FP32, output->shape());
-    int out_width = output->shape().width();
-    int out_channel = output->shape().channel();
-    int in_channel = input->shape().channel();
-    float* filter_data = param_.filter->data<float>();
-    int image_height = input->shape().height();  // exit(-1);
-    int image_width = input->shape().width();
-    int image_channels = input->shape().channel();
-    int image_pad_h = param_.paddings[0];
-    int image_pad_w = param_.paddings[1];
-    int kernel_height = param_.filter->shape().height();
-    int kernel_width = param_.filter->shape().width();
-    int kernel_step_h = param_.strides[0];
-    int kernel_step_w = param_.strides[1];
-    int pooled_height_ = output->shape().height();
-    int pooled_width_ = out_width;
-    int filter_chw = image_channels * kernel_height * kernel_width;
-    float max = 0;
-    for (int ph = 0; ph < pooled_height_; ph++) {
-      for (int pw = 0; pw < pooled_width_; pw++) {
-        int hstart = ph * kernel_step_h - image_pad_h;
-        int wstart = pw * kernel_step_w - image_pad_w;
-        int hend =
-            std::min(hstart + kernel_height, static_cast<int>(image_height));
-        int wend =
-            std::min(wstart + kernel_width, static_cast<int>(image_width));
-        hstart = std::max(hstart, static_cast<int>(0));
-        wstart = std::max(wstart, static_cast<int>(0));
-        for (int oc = 0; oc < out_channel; oc++) {
-          float sum = 0.0f;
-          const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
-          for (int c = 0; c < image_channels; c++) {
-            for (int h = hstart; h < hend; h++) {
-              int hi = 0;
-              if (ph == 0) {
-                hi = h - hstart + image_pad_h;
-              } else {
-                hi = h - hstart;
-              }
-              for (int w = wstart; w < wend; w++) {
-                int wi = 0;
-                if (pw == 0) {
-                  wi = w - wstart + image_pad_w;
-                } else {
-                  wi = w - wstart;
-                }
-                const int index = (h * image_width + w) * image_channels + c;
-                int weight_index = oc * filter_chw +
-                                   kernel_width * kernel_height * c +
-                                   kernel_width * hi + wi;
-                float value = image_addr[index] * filter_data[weight_index];
-                sum += value;
-              }
-            }
-          }
-          if (param_.relu.enabled && sum < 0) {
-            sum = -sum;
-          }
-          if (sum > max) {
-            max = sum;
-          }
-          out[pool_index] = sum;
-        }
-      }
-    }
-    float_output.flush();
-    output->copyFrom(&float_output);
-    output->scale()[0] = max / 127;
-    output->scale()[1] = 127 / max;
-  }
   void cpu_compute() {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
@@ -161,93 +85,98 @@ class ConvPE : public PE {
     Tensor float_output;
     float* image_addr = float_input.mutableData<float>(FP32, input->shape());
     float_input.copyFrom(input);
-    float_input.syncToCPU();
+    // float16* data_out = output->data<float16>();
     float* out = float_output.mutableData<float>(FP32, output->shape());
-    float* bias_data = param_.bias()->data<float>();
-    int out_width = output->shape().width();
     int out_channel = output->shape().channel();
     int in_channel = input->shape().channel();
     float* filter_data = param_.filter->data<float>();
     float* mi = new float[in_channel];
-    float max = 0;
-    int out_index = 0;
     for (int i = 0; i < out_channel; i++) {
       float* image = image_addr;
       float* filter_ptr = filter_data + i * in_channel;
       float* out_ptr = mi;
-      for (int h = 0; h < output->shape().height(); h++) {
-        for (int w = 0; w < output->shape().width(); w++) {
-          float sum = 0;
-          // #pragma omp parallel for
-          for (int j = 0; j < in_channel; j++) {
-            int image_index = h * out_width * in_channel + w * in_channel + j;
-            float value = image_addr[image_index] * filter_ptr[j];
-            sum += value;
-          }
-          sum += bias_data[i];
-          if (param_.relu.enabled && sum < 0) {
-            sum = 0;
-          }
-          if (sum > max) {
-            max = sum;
-          }
-          out_index = h * out_width * out_channel + w * out_channel + i;
-          out[out_index] = sum;
-        }
-      }
+#pragma omp parallel for
+      for (int j = 0; j < in_channel; j++) {
+        // float32x4_t x0 = vld1q_f32(image);
+        // float32x4_t x1 = vld1q_f32(filter_ptr);
+        // float32x4_t r = vmulq_f32(x0, x1);
+        // vst1q_f32(out_ptr, r);
+        // image += 4;
+        // filter_ptr += 4;
+        // out_ptr += 4;
+        float value = image_addr[j] * filter_ptr[j];
+        mi[j] = value;
+      }
+      float sum = 0;
+      for (int j = 0; j < in_channel; j++) {
+        sum += mi[j];
+      }
+      out[i] = sum;
     }
     delete[] mi;
     float_output.flush();
     output->copyFrom(&float_output);
-    output->scale()[0] = max / 127;
-    output->scale()[1] = 127 / max;
   }
   bool dispatch() {
+    fpga_reset();
     if (use_cpu_) {
       cpu_compute();
       return true;
     }
-    inplace_.leaky_relu_enable =
-        (param_.relu.leaky_relu_factor != 0) ? true : false;
-    inplace_.relu_enable =
-        inplace_.leaky_relu_enable ? false : param_.relu.enabled;
-    inplace_.power_enable = false;
-    inplace_.normalize_enable = false;
-    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
+    if (param_.activeParam.type == TYPE_RELU) {
+      inplace_.relu_enable = true;
+    } else if (param_.activeParam.type == TYPE_RELU6) {
+      inplace_.relu6_enable = true;
+    } else if (param_.activeParam.type == TYPE_SIGMOID) {
+      inplace_.sigmoid_enable = true;
+    } else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
+      inplace_.leaky_relu_enable = true;
+    }
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       config_inplace(inplace_);
       if (inplace_.leaky_relu_enable) {
-        activeParamterArgs.type = TYPE_LEAK_RELU;
+        activeParamterArgs.type = TYPE_LEAKY_RELU;
         activeParamterArgs.leaky_relu_factor =
-            fp32_2_fp16(param_.relu.leaky_relu_factor);
+            fp32_2_fp16(param_.activeParam.leaky_relu_factor);
         config_activation(activeParamterArgs);
       }
     }
     std::vector<BasicConvParam*>& params = param_.splitParams();
+    if (split_channel) {
+      // splitPE_.param().input->saveToFile("input_image",true);
+      splitPE_.dispatch();
+    }
     int ret = 0;
     for (auto conv_param : params) {
+      // conv_param->input.printScale();
+      // if (split_channel) {
+      //   conv_param->input.saveToFile("pack_image",true);
+      // }
       ret |= compute_fpga_conv_basic(conv_param->args);
     }
-    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       inplace_.relu_enable = false;
       inplace_.leaky_relu_enable = false;
+      inplace_.relu6_enable = false;
+      inplace_.sigmoid_enable = false;
       config_inplace(inplace_);
       if (inplace_.leaky_relu_enable) {
-        activeParamterArgs.type = TYPE_LEAK_RELU;
+        activeParamterArgs.type = TYPE_LEAKY_RELU;
         activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
         config_activation(activeParamterArgs);
       }
@@ -255,16 +184,29 @@ class ConvPE : public PE {
     size_t size = params.size();
     if (split_axis == 0 && ret == 0 && size > 1) {
+      // std::cout << "concat size:" << size << std::endl;
       concatPE_.dispatch();
     }
     if (split_axis == 1 && ret == 0 && size > 1) {
+      // for (int n = 0; n < size - 1; n++) {
       ElementwiseAddParam& add_param = addPE_.param();
       add_param.inputs = {&params[0]->output, &params[1]->output};
       add_param.output = param_.output;
       addPE_.init();
       addPE_.apply();
       addPE_.dispatch();
+      // param_.output->printScale();
+      // params[0]->input.saveToFile("conv_1.txt");
+      // params[1]->input.saveToFile("conv_2.txt");
+      // params[0]->output.saveToFile("ew_o1.txt");
+      // params[1]->output.saveToFile("ew_o2.txt");
+      // std::cout << "\n ================== EW ================== \n";
+      // }
     }
     return ret == 0;
   }
@@ -272,8 +214,10 @@ class ConvPE : public PE {
  private:
   bool use_cpu_ = false;
+  bool split_channel = false;
   ConvParam param_;
   ConcatPE concatPE_;
+  SplitPE splitPE_;
   ElementwiseAddPE addPE_;
   int split_axis = 0;
   InplaceArgs inplace_ = {0};
......
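The dispatch code above translates the requested ActiveParam into InplaceArgs flags before the launch and clears them all afterwards, and DepthwiseConvPE and ElementwiseAddPE below repeat the same sequence. A hedged sketch of that shared mapping, assuming only the flag fields this commit uses (relu_enable, relu6_enable, sigmoid_enable, leaky_relu_enable); the helper is illustrative and not part of the commit:

// Set exactly one activation flag from the requested type; passing
// TYPE_NONE clears all four, which matches the post-dispatch reset.
static void set_active_flags(InplaceArgs* inplace, ActiveType type) {
  inplace->relu_enable = (type == TYPE_RELU);
  inplace->relu6_enable = (type == TYPE_RELU6);
  inplace->sigmoid_enable = (type == TYPE_SIGMOID);
  inplace->leaky_relu_enable = (type == TYPE_LEAKY_RELU);
}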
@@ -53,6 +53,16 @@ inline int get_split_num(Tensor* filter) {
   return filter::calc_split_num(aligned_num, div_capacity);
 }
+inline int get_pack_num(Tensor* filter, int group_num) {
+  auto chw = filter->shape().channel() * filter->shape().height() *
+             filter->shape().width();
+  auto num = filter->shape().num();
+  int div_capacity = filter::calc_division_capacity(chw);
+  int filter_num_alignment = filter::get_filter_num_alignment();
+  int aligned_num_per_group = align_to_x(num / group_num, filter_num_alignment);
+  return filter::calc_pack_num(aligned_num_per_group, group_num, div_capacity);
+}
+
 inline void fill_scale_bias_const(ConvParam* param_) {
   int channel = param_->output->shape().channel();
   Shape sb_shape(N, {channel});
@@ -117,6 +127,50 @@ inline void combine_add_bn_params(BatchnormParam* bn,
   param_->bias()->setDataLocation(CPU);
 }
+inline int gcd_(int a, int b) {
+  while (b) {
+    int temp = a;
+    a = b;
+    b = temp % b;
+  }
+  return a;
+}
+
+inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
+
+inline void format_bias_scale_new(Tensor* bias,
+                                  Tensor* scale,
+                                  Tensor* scale_bias) {
+  Shape& bias_shape = bias->shape();
+  int channel = bias_shape.channel();
+  int repeat = 1;
+  int alignment = 16;
+  int length = channel;
+  if (channel % alignment != 0 || channel < alignment) {
+    int c_lcm = lcm_(channel, alignment);
+    repeat = c_lcm / (channel);
+  }
+  Shape shape(N, {2 * channel * repeat});
+  float16* scale_bias_data = scale_bias->mutableData<float16>(FP16, shape);
+  float* bias_data_float = bias->data<float>();
+  float* scale_data_float = scale->data<float>();
+  for (int i = 0; i < repeat; i++) {
+    for (int j = 0; j < length; j++) {
+      float16 value_bias = float_to_half(bias_data_float[j]);
+      scale_bias_data[i * length + j] = value_bias;
+    }
+  }
+  for (int i = 0; i < repeat; i++) {
+    for (int j = 0; j < length; j++) {
+      float16 value_scale = float_to_half(scale_data_float[j]);
+      scale_bias_data[i * length + j + length * repeat] = value_scale;
+    }
+  }
+}
 inline void format_scale_bias(Tensor* scale,
                               Tensor* bias,
                               Tensor* filter,
@@ -180,8 +234,10 @@ inline void format_scale_bias(Tensor* scale,
 inline void format_filter(Tensor* filter,
                           Tensor* quantized_filter,
                           int group,
-                          std::vector<float>& scales) {  // NOLINT
+                          std::vector<float>& scales,  // NOLINT
+                          float max) {
   float max_value = find_max(*filter);
+  // max_value = max; //TODO: global quantization for filter
   Shape& filter_shape = filter->shape();
   int mem_size;
@@ -206,10 +262,22 @@ inline void format_filter(Tensor* filter,
   memcpy(src, quantized_data, mem_size);
   quantized_filter->flush();
+  fpga_free(quantized_data);
-  for (size_t i = 0; i < max_values.size(); i++) {
-    scales.push_back(max_values[i] / max_value);
-  }
+  // for (size_t i = 0; i < max_values.size(); i++) {
+  //   // scales.push_back(max_values[i] / max_value);
+  //   scales.push_back(1.0f);
+  // }
+  // filter->saveToFile("filter.txt");
+  // std::ofstream ofs;
+  // ofs.open("quant.txt");
+  // for (int i = 0; i < mem_size; i++) {
+  //   float value = quantized_data[i];
+  //   ofs << value << std::endl;
+  // }
+  // ofs.close();
+  // exit(-1);
 }
 inline void format_dw_filter(Tensor* filter,
@@ -256,17 +324,10 @@ inline void split_filter_num(const ConvParam& c_param) {
   Tensor* out = param.output;
   Tensor* filter = param.filter;
   auto channel = out->shape().channel();
-  int split_num = param.groups == 1 ? get_split_num(param.filter) : 1;
+  int split_num = get_split_num(param.filter);
   int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
-  auto chw = filter->shape().channel() * filter->shape().height() *
-             filter->shape().width();
-  auto num = filter->shape().num();
-  int div_capacity = filter::calc_division_capacity(chw);
-  int filter_num_alignment = filter::get_filter_num_alignment();
-  int aligned_num =
-      align_to_x(num / param.groups, filter_num_alignment) * param.groups;
-  split_num = filter::calc_split_num(aligned_num, div_capacity);
+  float max = find_max(*filter);
   Shape& out_shape = out->shape();
   for (int i = 0; i < split_num; i++) {
@@ -310,45 +371,29 @@ inline void split_filter_num(const ConvParam& c_param) {
     new_filter.flush();
     conv_param->filter.mutableData<float>(FP32, f_shape);
-    if (param.groups != 1) {
-      int mem_factor =
-          32 / filter_num_per_div;  // TODO(chonwhite): change 32 to param;
-      conv_param->filter.setMemScale(mem_factor);
-    }
-    std::vector<float> v;  // TODO(chonwhite): change local variable name
-    format_filter(&new_filter, &(conv_param->filter), param.groups, v);
+    std::vector<float> v;  // TODO(chonwhite) change variable name;
+    format_filter(&new_filter, &(conv_param->filter), param.groups, v, max);
     conv_param->filter.setDataType(INT8);
-    int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
     Tensor scale;
     Tensor bias;
     int chnnnel_start = i * filter_num_per_div;
-    Shape s_shape(N, {filter_num});
+    Shape s_shape(NC, {1, filter_num});
     float* scale_data = scale.mutableData<float>(FP32, s_shape);
     float* bias_data = bias.mutableData<float>(FP32, s_shape);
     for (int n = 0; n < filter_num; n++) {
-      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
+      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
     }
     for (int n = 0; n < filter_num; n++) {
       bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
     }
-    Shape sb_shape(N, {sb_num});
-    format_scale_bias(&scale,
-                      &bias,
-                      &conv_param->filter,
-                      &conv_param->scaleBias,
-                      param.groups);
+    format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
     conv_param->scaleBias.flush();
-    float* bs_data = conv_param->scaleBias.data<float>();
     args.group_num = param.groups;
-    args.relu_enabled = param.relu.enabled;
-    args.sb_address = conv_param->scaleBias.data<float>();
-    args.sb_address = bs_data;
+    args.sb_address = conv_param->scaleBias.data<float16>();
     args.kernel.stride_h = param.strides[1];
     args.kernel.stride_w = param.strides[0];
     args.kernel.height = new_filter.shape().height();
@@ -372,6 +417,135 @@ inline void split_filter_num(const ConvParam& c_param) {
   }
 }
+
+inline void pack_channel_filter(const ConvParam& c_param) {
+  ConvParam& param = const_cast<ConvParam&>(c_param);
+  Tensor* input = param.input;
+  Tensor* out = param.output;
+  Tensor* filter = param.filter;
+  int filter_num_alignment = filter::get_filter_num_alignment();
+  auto filter_num = filter->shape().num();
+  int pack_num = get_pack_num(param.filter, param.groups);
+  int group_per_pack = (param.groups + pack_num - 1) / pack_num;
+  int filter_per_group = filter_num / param.groups;
+  int filter_per_pack = filter_per_group * group_per_pack;
+  int channel_per_pack = filter->shape().channel() * group_per_pack;
+  float max = find_max(*filter);
+  Shape& out_shape = out->shape();
+  for (int i = 0; i < pack_num; i++) {
+    BasicConvParam* conv_param = new BasicConvParam();
+    conv_param->output.setDataLocation(Device);
+    conv_param->output.setAligned(true);
+    float16* out_address = nullptr;
+    float* out_scale_address = nullptr;
+    float16* input_address = nullptr;
+    ConvArgs& args = conv_param->args;
+    if (pack_num == 1) {
+      out_address = out->data<float16>();
+      out_scale_address = out->scale();
+    }
+    int new_group = param.groups;
+    int filter_current_pack = filter->shape().num();
+    int channel_current_pack = input->shape().channel();
+    new_group = i == pack_num - 1
+                    ? param.groups - (pack_num - 1) * group_per_pack
+                    : group_per_pack;
+    filter_current_pack = new_group * filter_per_group;
+    channel_current_pack = new_group * filter->shape().channel();
+    if (pack_num == 1) {
+      input_address = input->data<float16>();
+    } else {
+      Shape in_shape(NCHW,
+                     {1,
+                      channel_current_pack,
+                      input->shape().height(),
+                      input->shape().width()});
+      input_address = conv_param->input.mutableData<float16>(FP16, in_shape);
+    }
+    if (pack_num != 1) {
+      Shape shape(
+          NHWC,
+          {1, out_shape.height(), out_shape.width(), filter_current_pack});
+      out_address = conv_param->output.mutableData<float16>(FP16, shape);
+      out_scale_address = conv_param->output.scale();
+    }
+    Shape f_shape(NCHW,
+                  {filter_current_pack,
+                   filter->shape().channel(),
+                   filter->shape().height(),
+                   filter->shape().width()});
+    Tensor new_filter;
+    float* new_filter_data = new_filter.mutableData<float>(FP32, f_shape);
+    int filter_hwc = filter->shape().height() * filter->shape().width() *
+                     filter->shape().channel();
+    memcpy(new_filter_data,
+           filter->data<float>() + i * filter_per_pack * filter_hwc,
+           filter_current_pack * filter_hwc * sizeof(float));
+    new_filter.flush();
+    conv_param->filter.mutableData<float>(FP32, f_shape);
+    float mem_factor = filter_num_alignment / filter_per_pack;
+    conv_param->filter.setMemScale(mem_factor);
+    std::vector<float> v;  // TODO(chonwhite) change variable name
+    format_filter(&new_filter, &(conv_param->filter), new_group, v, max);
+    conv_param->filter.setDataType(INT8);
+    Tensor scale;
+    Tensor bias;
+    int chnnnel_start = i * filter_per_pack;
+    Shape s_shape(NC, {1, filter_current_pack});
+    float* scale_data = scale.mutableData<float>(FP32, s_shape);
+    float* bias_data = bias.mutableData<float>(FP32, s_shape);
+    for (int n = 0; n < filter_current_pack; n++) {
+      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
+    }
+    for (int n = 0; n < filter_current_pack; n++) {
+      bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
+    }
+    format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
+    conv_param->scaleBias.flush();
+    args.group_num = new_group;
+    args.sb_address = conv_param->scaleBias.data<float16>();
+    args.kernel.stride_h = param.strides[1];
+    args.kernel.stride_w = param.strides[0];
+    args.kernel.height = new_filter.shape().height();
+    args.kernel.width = new_filter.shape().width();
+    args.filter_address = conv_param->filter.data<int8_t>();
+    args.filter_num = filter_current_pack;
+    args.filter_scale_address = conv_param->filter.scale();
+    args.image.address = input_address;
+    args.image.scale_address = input->scale();
+    args.image.channels = channel_current_pack;
+    args.image.width = input->shape().width();
+    args.image.height = input->shape().height();
+    args.image.pad_width = param.paddings[1];
+    args.image.pad_height = param.paddings[0];
+    args.dilation = param.dilations[0];
+    args.output.address = out_address;
+    args.output.scale_address = out_scale_address;
+    param.splitParams().push_back(conv_param);
+  }
+}
 inline void split_channel(const ConvParam& c_param) {
   ConvParam& param = const_cast<ConvParam&>(c_param);
   Tensor* input = param.input;
@@ -383,6 +557,8 @@ inline void split_channel(const ConvParam& c_param) {
   Shape bs_shape(N, {channel});
+  float max = 1.0f;
   for (int i = 0; i < num; i++) {
     BasicConvParam* conv_param = new BasicConvParam();
@@ -406,7 +582,8 @@ inline void split_channel(const ConvParam& c_param) {
     }
     new_filter.flush();
     std::vector<float> scales;
-    format_filter(&new_filter, &(conv_param->filter), param.groups, scales);
+    format_filter(
+        &new_filter, &(conv_param->filter), param.groups, scales, max);
     Tensor bias;
     Tensor scale;
@@ -428,7 +605,6 @@ inline void split_channel(const ConvParam& c_param) {
     ConvArgs& args = conv_param->args;
     args.group_num = param.groups;
-    args.relu_enabled = param.relu.enabled;
     args.sb_address = conv_param->scaleBias.data<float>();
     args.kernel.stride_h = param.strides[1];
     args.kernel.stride_w = param.strides[0];
@@ -457,13 +633,17 @@ inline int fill_split_arg(const ConvParam& c_param) {
   ConvParam& param = const_cast<ConvParam&>(c_param);
   Tensor* input = param.input;
   Tensor* output = param.output;
   if (output->shape().dimSize() == 4 && input->shape().channel() > 2047 &&
       input->shape().width() == 1) {
     split_channel(c_param);
     return 1;
-  } else {
+  } else if (param.groups == 1) {
     split_filter_num(c_param);
     return 0;
+  } else {
+    pack_channel_filter(c_param);
+    return 0;
   }
 }
......
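format_bias_scale_new above tiles the per-channel bias and scale until their length is a multiple of the 16-element alignment, using repeat = lcm(channel, 16) / channel. A small worked example of that arithmetic (the channel counts are chosen for illustration):

#include <iostream>

int gcd_(int a, int b) {
  while (b) {
    int t = a % b;
    a = b;
    b = t;
  }
  return a;
}

int lcm_(int a, int b) { return a * b / gcd_(a, b); }

int main() {
  const int alignment = 16;
  // channel=24: lcm(24,16)=48, so the data is tiled 2x; channel=32 is
  // already a multiple of 16, so repeat stays 1; channel=10 needs 8 copies.
  for (int channel : {24, 32, 10}) {
    int repeat = 1;
    if (channel % alignment != 0 || channel < alignment) {
      repeat = lcm_(channel, alignment) / channel;
    }
    std::cout << "channel=" << channel << " repeat=" << repeat << std::endl;
  }
  return 0;
}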
@@ -24,6 +24,17 @@ namespace zynqmp {
 class DepthwiseConvPE : public PE {
  public:
+  inline int gcd_(int a, int b) {
+    while (b) {
+      int temp = a;
+      a = b;
+      b = temp % b;
+    }
+    return a;
+  }
+
+  inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
+
   bool init() {
     Tensor* output = param_.output;
     output->setAligned(true);
@@ -37,17 +48,40 @@ class DepthwiseConvPE : public PE {
     Tensor* output = param.output;
     int channel = output->shape().channel();
-    float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
+    int repeat = 1;
+    int alignment = 16;
+    int length = channel;
+    if (channel % alignment != 0 || channel < alignment) {
+      int c_lcm = lcm_(channel, alignment);
+      repeat = c_lcm / (channel);
+    }
+    Shape shape(N, {channel * repeat});
+    float16* b_data = bias_.mutableData<float16>(FP16, shape);
     if (param_.bias()->dataType() == FP32) {
       float* new_bias_data = param_.bias()->data<float>();
       // convert the bias from float to float16
-      for (int i = 0; i < channel; i++) {
-        b_data[i] = float_to_half(new_bias_data[i]);
-      }
+      // for (int i = 0; i < channel; i++) {
+      //   b_data[i] = float_to_half(new_bias_data[i]);
+      // }
+      // pad the bias to the 16-element hw alignment
+      for (int i = 0; i < repeat; i++) {
+        for (int j = 0; j < length; j++) {
+          float16 value = float_to_half(new_bias_data[j]);
+          b_data[i * length + j] = value;
+        }
+      }
       bias_.flush();
     } else {
       float16* new_bias_data = param_.bias()->data<float16>();
-      memcpy(b_data, new_bias_data, channel * sizeof(float16));
+      // memcpy(b_data, new_bias_data, channel * sizeof(float16));
+      for (int i = 0; i < repeat; i++) {
+        for (int j = 0; j < length; j++) {
+          // float16 value = float_to_half(bias_data_float[j]);
+          b_data[i * length + j] = new_bias_data[j];
+        }
+      }
       bias_.flush();
     }
@@ -62,6 +96,8 @@ class DepthwiseConvPE : public PE {
     float16* scale_data = param_.scale()->data<float16>();
     float16* filter_data = param.quantizedFilter()->mutableData<float16>(
         FP16, param.filter->shape());
+    // memcpy(filter_data, scale_data, channel * sizeof(float16));
+
     memcpy(filter_data,
            scale_data,
            param.filter->shape().numel() * sizeof(float16));
@@ -89,20 +125,33 @@ class DepthwiseConvPE : public PE {
     args.sub_conv_num = 1;
     param.args = args;
-    inplace_.relu_enable = param_.relu.enabled;
     inplace_.power_enable = false;
     inplace_.normalize_enable = false;
   }
   bool dispatch() {
     param_.input->syncToDevice();
-    if (param_.relu.enabled) {
-      inplace_.relu_enable = param_.relu.enabled;
+    if (param_.activeParam.type == TYPE_RELU) {
+      inplace_.relu_enable = true;
+    } else if (param_.activeParam.type == TYPE_RELU6) {
+      inplace_.relu6_enable = true;
+    } else if (param_.activeParam.type == TYPE_SIGMOID) {
+      inplace_.sigmoid_enable = true;
+    } else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
+      inplace_.leaky_relu_enable = true;
+    }
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       config_inplace(inplace_);
     }
     bool ret = compute_fpga_dwconv(param_.args) == 0;
-    if (param_.relu.enabled) {
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       inplace_.relu_enable = false;
+      inplace_.leaky_relu_enable = false;
+      inplace_.relu6_enable = false;
+      inplace_.sigmoid_enable = false;
       config_inplace(inplace_);
     }
     return ret;
......
@@ -58,15 +58,29 @@ class ElementwiseAddPE : public PE {
   bool dispatch() {
     param_.inputs[0]->syncToDevice();
     param_.inputs[1]->syncToDevice();
-    InplaceArgs inplace_args = {0};
-    if (param_.relu.enabled) {
-      inplace_args.relu_enable = true;
-      config_inplace(inplace_args);
+    // InplaceArgs inplace_ = {0};
+    if (param_.activeParam.type == TYPE_RELU) {
+      inplace_.relu_enable = true;
+    } else if (param_.activeParam.type == TYPE_RELU6) {
+      inplace_.relu6_enable = true;
+    } else if (param_.activeParam.type == TYPE_SIGMOID) {
+      inplace_.sigmoid_enable = true;
+    } else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
+      inplace_.leaky_relu_enable = true;
+    }
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
+      config_inplace(inplace_);
     }
     compute_fpga_ewadd(param_.ewargs);
-    if (param_.relu.enabled) {
-      inplace_args.relu_enable = false;
-      config_inplace(inplace_args);
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
+      inplace_.relu_enable = false;
+      inplace_.relu6_enable = false;
+      inplace_.sigmoid_enable = false;
+      inplace_.leaky_relu_enable = false;
+      config_inplace(inplace_);
     }
     return true;
   }
@@ -75,6 +89,7 @@ class ElementwiseAddPE : public PE {
  private:
   ElementwiseAddParam param_;
+  InplaceArgs inplace_ = {0};
 };
 }  // namespace zynqmp
......
@@ -29,17 +29,17 @@ class ElementwiseMulPE : public PE {
   }
   void apply() {
-    Tensor* input = param_.inputs[0];
+    Tensor* input = param_.input_x;
     Tensor* output = param_.output;
-    int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
+    int wc_aligned = align_to_x(param_.input_x->shape().numel(), 32);
     Shape s(N, {wc_aligned});
     float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
     memset(bias_data, 0, wc_aligned * sizeof(float16));
     ScaleArgs& args = args_;
-    args.scale_address = param_.inputs[1]->data<void>();
+    args.scale_address = param_.input_y->data<void>();
     args.bias_address = bias_tensor.data<void>();
     args.wc_alignment = wc_aligned;
     args.channel_alignment = wc_aligned;
......
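The ElementwiseMulPE change above wires the second operand through the scale op: input_y becomes the per-element scale and the bias buffer is zero-filled, so a scale of the form x * scale + bias degenerates to an elementwise multiply. A hedged sketch of the equivalent CPU arithmetic under that assumption (the function and names are illustrative, not part of the commit):

#include <cstddef>

// out[i] = in[i] * scale[i] + bias[i]; with bias zeroed, this is the
// elementwise multiply the PE offloads to the scale hardware.
void scale_op(const float* in, const float* scale, const float* bias,
              float* out, size_t n) {
  for (size_t i = 0; i < n; i++) {
    out[i] = in[i] * scale[i] + bias[i];
  }
}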
@@ -37,10 +37,9 @@ class FullyConnectedPE : public PE {
     ConvParam& convParam_ = convPE_.param();
     Tensor* input = param_.input;
     convParam_.input = param_.input;
+    num_ = param_.input->shape().num();
     convParam_.output = param_.output;
-    convParam_.relu = param_.relu;
+    // convParam_.relu = param_.relu;
+    convParam_.activeParam.type = param_.activeParam.type;
     convParam_.groups = 1;
     convParam_.strides = {1, 1};
     convParam_.paddings = {0, 0};
@@ -49,6 +48,9 @@ class FullyConnectedPE : public PE {
     int num = param_.filter->shape().channel();
     int chw = param_.filter->shape().num();
+    // if (num == 2) {
+    //   return;
+    // }
     int height = param_.input->shape().height();
     int width = param_.input->shape().width();
@@ -66,6 +68,7 @@ class FullyConnectedPE : public PE {
         new_filter_data[i * chw + j] = scale;
       }
     }
+
     conv_filter->flush();
     convParam_.filter = conv_filter;
@@ -84,15 +87,51 @@ class FullyConnectedPE : public PE {
     convPE_.apply();
   }
-  bool dispatch() { return convPE_.dispatch(); }
+  void cpu_compute() {
+    int num = param_.filter->shape().channel();
+    int chw = param_.filter->shape().num();
+
+    float* filter_data = param_.filter->data<float>();
+    float max = 0.0f;
+    Tensor* input = param_.input;
+    Tensor* output = param_.output;
+    float16* input_data = input->data<float16>();
+    float16* output_data = output->data<float16>();
+
+    for (int i = 0; i < num; i++) {
+      float sum = 0;
+      float bias = param_.bias->data<float>()[i];
+      for (int j = 0; j < chw; j++) {
+        float scale = filter_data[j * num + i];
+        float data = half_to_float(input_data[j]);
+        sum += scale * data;
+      }
+      output_data[i] = float_to_half(sum + bias);
+      if (max < output_data[i]) {
+        max = output_data[i];
+      }
+    }
+
+    output->flush();
+    output->scale()[0] = max / 127.0f;
+    output->scale()[1] = 127.0f / max;
+  }
+
+  bool dispatch() {
+    // int num = param_.filter->shape().channel();
+    // if (num == 2) {
+    //   cpu_compute();
+    //   return 1;
+    // } else {
+    return convPE_.dispatch();
+    // }
+  }
 
   FullyConnectedParam& param() { return param_; }
 
  private:
   FullyConnectedParam param_;
   ConvPE convPE_;
+  Tensor tempOut_;
+  int num_ = 1;
 };
 }  // namespace zynqmp
 }  // namespace paddle
@@ -47,8 +47,10 @@ class GRUPE : public PE {
     zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
     float16* prev_hidden_data =
         prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
+    // set previous hidden data to 0;
     memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
+    // copy 2/3 weight from param.weight;
     zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
     float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
     memset(weight_data, 0, weight_shape.numel() * sizeof(float));
@@ -74,7 +76,9 @@ class GRUPE : public PE {
     reset_hidden_.mutableData<void>(FP16, hidden_shape);
     ElementwiseMulParam& mul_param = mul_pe_.param();
-    mul_param.inputs = {&reset_gate_, &prev_hidden_};
+    // mul_param.inputs = {&reset_gate_, &prev_hidden_};
+    mul_param.input_x = &reset_gate_;
+    mul_param.input_y = &prev_hidden_;
     mul_param.output = &reset_hidden_;
     mul_pe_.init();
     mul_pe_.apply();
@@ -115,11 +119,8 @@ class GRUPE : public PE {
     if (hidden_prev) {
       // TODO(chonwhite): change to pre_out;
       prev_hidden_.copyFrom(value.pre_output);
-      prev_hidden_.saveToFile("prev_.txt");
     }
     mul_pe_.dispatch();
-    reset_hidden_.saveToFile("reset_hidden_.txt");
     update_gate_data += stride_update;
     reset_gate_data += stride_update;
......
@@ -72,8 +72,10 @@ class NormPE : public PE {
     input_float.mutableData<float>(FP32, param_.input->shape());
     float_out.mutableData<float>(FP32, param_.output->shape());
+    // param_.input->syncToDevice();
     input_float.copyFrom(param_.input);
     input_float.syncToCPU();
+    // input_float.saveToFile("normalize_", true);
     int channel = input_float.shape().channel();
     int height = input_float.shape().height();
@@ -85,6 +87,7 @@ class NormPE : public PE {
     float* out_ptr = float_out.data<float>();
     int loop = height * width;
+#pragma omp parallel for
     for (int i = 0; i < loop; i++) {
       float sum = param_.epsilon;
       for (int c = 0; c < channel; c++) {
@@ -98,11 +101,26 @@ class NormPE : public PE {
       }
     }
     float_out.flush();
+    // float_out.saveToFile("normalize_", true);
     param_.output->copyFrom(&float_out);
   }
   bool dispatch() {
     cpuCompute();
+    // std::cout << "CPU normalize ---------------------" << std::endl;
+    // param_.input->syncToDevice();
+    // // param_.input->saveToFile("normalize_fpga_", true);
+    // config_norm_param(norm_param_args_);
+    // inplace_args_.normalize_enable = true;
+    // config_inplace(inplace_args_);
+    // perform_bypass(bypass_args_);
+    // inplace_args_.normalize_enable = false;
+    // config_inplace(inplace_args_);
+    // compute_norm(norm_args_);
+    // param_.output->saveToFile("normalize_fpga_", true);
+    // std::cout << "FPGA normalize ---------------------" << std::endl;
     return true;
   }
......
@@ -35,14 +35,17 @@ class PoolingPE : public PE {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
-    uint32_t k_height = param_.kernelSize[0];
-    uint32_t k_width = param_.kernelSize[1];
+    uint32_t k_height = 1;
+    uint32_t k_width = 1;
     if (param_.globalPooling) {
       k_width = input->shape().width();
       k_height = input->shape().height();
       param_.kernelSize[0] = k_height;
       param_.kernelSize[1] = k_width;
+    } else {
+      k_height = param_.kernelSize[0];
+      k_width = param_.kernelSize[1];
     }
     PoolingArgs args = {0};
@@ -65,9 +68,12 @@ class PoolingPE : public PE {
     args.out_width = output->shape().width();
     param_.poolingArgs = args;
+    // use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
+    // &&
+    //            (k_width > 7 || k_height > 7);
     use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
-               (k_width > 7 || k_height > 7);
-    use_cpu_ = param_.type == AVERAGE;
+               (k_width > 255 || k_height > 255);
+    // use_cpu_ = param_.type == AVERAGE;
   }
   void compute() {
@@ -76,6 +82,7 @@ class PoolingPE : public PE {
     input->syncToCPU();
     Tensor float_input;
+    // Tensor float_output;
     float* image_addr = float_input.mutableData<float>(FP32, input->shape());
     float_input.copyFrom(input);
     float16* data_out = output->data<float16>();
@@ -110,6 +117,8 @@ class PoolingPE : public PE {
       for (int c = 0; c < image_channels; ++c) {
         const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
         float sum = 0;
+        // const int index =
+        //     (hstart * image_width + wstart) * image_channels + c;
         for (int h = hstart; h < hend; ++h) {
           for (int w = wstart; w < wend; ++w) {
             const int index = (h * image_width + w) * image_channels + c;
...@@ -138,7 +147,9 @@ class PoolingPE : public PE { ...@@ -138,7 +147,9 @@ class PoolingPE : public PE {
Tensor float_input; Tensor float_input;
float_input.mutableData<float>(FP32, input->shape()); float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input); float_input.copyFrom(input);
// float_input.saveToFile("pool_float.txt");
float16* data_out = output->data<float16>(); float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1]; int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0; float scale_max = 0;
...@@ -155,6 +166,7 @@ class PoolingPE : public PE { ...@@ -155,6 +166,7 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f; output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max; output->scale()[1] = 127.0f / scale_max;
output->flush(); output->flush();
// exit(-1);
} }
void cpu_compute() { void cpu_compute() {
...@@ -184,11 +196,14 @@ class PoolingPE : public PE { ...@@ -184,11 +196,14 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f; output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max; output->scale()[1] = 127.0f / scale_max;
output->flush(); output->flush();
// exit(-1);
} }
bool dispatch() { bool dispatch() {
if (use_cpu_) { if (use_cpu_) {
// cpu_compute();
compute(); compute();
// exit(-1);
return true; return true;
} }
param_.input->syncToDevice(); param_.input->syncToDevice();
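For context, the CPU fallback reduces each NHWC window per channel and divides by the window size. A minimal sketch assuming stride equals kernel size and no padding (the real code also tracks the fp16 quantization scale):

```cpp
// NHWC average pooling, stride == kernel, no padding (sketch only).
void avg_pool_nhwc(const float* in, float* out,
                   int width, int channels,
                   int k_h, int k_w, int pooled_h, int pooled_w) {
  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      for (int c = 0; c < channels; ++c) {
        float sum = 0;
        for (int h = ph * k_h; h < (ph + 1) * k_h; ++h) {
          for (int w = pw * k_w; w < (pw + 1) * k_w; ++w) {
            sum += in[(h * width + w) * channels + c];  // NHWC indexing
          }
        }
        out[(ph * pooled_w + pw) * channels + c] = sum / (k_h * k_w);
      }
    }
  }
}
```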
...
@@ -89,7 +89,6 @@ class ScalePE : public PE {
 }
 }
 }
-
 float* scale_data_float = param_.scale->data<float>();
 for (int i = 0; i < repeat; i++) {
 for (int j = 0; j < length; j++) {
...
@@ -53,20 +53,37 @@ class SplitPE : public PE {
 int64_t src_after = src_stride_numel[axis];
 int64_t dst_after = dst_stride_numel[axis];
+// PADDLE_MOBILE_ENFORCE(src_stride_numel.size() == dst_stride_numel.size(),
+// "src and dst tensor should have the same dims
+// size.");
 for (int64_t i = 0; i < axis; ++i) {
 if (i < axis) {
+// PADDLE_MOBILE_ENFORCE(src_stride_numel[i] / src_stride_numel[axis] ==
+// dst_stride_numel[i] /
+// dst_stride_numel[axis],
+// "src and dst should have the same elements "
+// "except the specified axis.");
 } else if (i == axis) {
 continue;
 } else {
+// PADDLE_MOBILE_ENFORCE(src_stride_numel[i] == dst_stride_numel[i],
+// "src and dst should have the same elements "
+// "except the specified axis.");
 }
 }
 for (int64_t i = 0; i < before; ++i) {
-memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
+memcpy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
 }
 }
-void split3D() { int axis = param_.axis; }
+void split3D() {
+int axis = param_.axis;
+// float16* dst = param_.output->data<float16>();
+// std::vector<int>& dst_dims = ;
+// StridedNumelCopyWithAxis();
+}
 bool dispatch() {
 Tensor* input = param_.input;
@@ -88,6 +105,7 @@ class SplitPE : public PE {
 in_stride,
 out_stride[axis]);
 input_offset += out_stride[axis];
+// out->flush();
 }
 return true;
 }
@@ -95,21 +113,26 @@ class SplitPE : public PE {
 std::vector<Tensor*> outputs = param_.outputs;
 int in_channel = input->shape().channel();
-int split_channel = input->shape().channel() / param_.num;
+// int split_channel = input->shape().channel() / param_.num;
 int hw = input->shape().height() * input->shape().width();
 float16* in_data = input->data<float16>();
 for (int i = 0; i < hw; i++) {
+int channel_stride = 0;
 for (int n = 0; n < outputs.size(); n++) {
 Tensor* out = outputs[n];
 float16* out_data = out->data<float16>();
-memcpy(out_data + i * split_channel,
-in_data + i * in_channel + n * split_channel,
-split_channel * sizeof(float16));
+memcpy(out_data + i * out->shape().channel(),
+in_data + i * in_channel + channel_stride,
+out->shape().channel() * sizeof(float16));
+channel_stride += out->shape().channel();
 }
 }
 for (int n = 0; n < outputs.size(); n++) {
 Tensor* out = outputs[n];
+out->flush();
 out->copyScaleFrom(input);
 }
 return true;
@@ -120,5 +143,6 @@ class SplitPE : public PE {
 private:
 SplitParam param_;
 };
+
 } // namespace zynqmp
 } // namespace paddle
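The substantive fix here: each output now advances the source offset by its own channel count (channel_stride), so splits whose outputs have unequal channel counts copy from the correct slice; the old code assumed one uniform split_channel. A self-contained sketch of the pattern, with uint16_t standing in for the 2-byte float16 type:

```cpp
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

using float16 = uint16_t;  // stand-in for the zynqmp half type

// Split `in` (hw positions x in_channel, NHWC) into outputs with
// possibly unequal channel counts; channel_stride tracks the offset.
void split_channels(const float16* in, int in_channel, int hw,
                    std::vector<std::pair<float16*, int>>& outs) {
  for (int i = 0; i < hw; i++) {
    int channel_stride = 0;
    for (auto& [out_data, out_channel] : outs) {
      std::memcpy(out_data + i * out_channel,
                  in + i * in_channel + channel_stride,
                  out_channel * sizeof(float16));
      channel_stride += out_channel;
    }
  }
}
```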
@@ -300,14 +300,12 @@ class Tensor {
 }
 void flush() {
-size_t memorySize =
-shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
+size_t memorySize = placeHolder_->memorySize();
 fpga_flush(placeHolder_->data(), memorySize);
 }
 void invalidate() {
-size_t memorySize =
-shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
+size_t memorySize = placeHolder_->memorySize();
 fpga_invalidate(placeHolder_->data(), memorySize);
 }
@@ -385,6 +383,7 @@ class Tensor {
 invalidate();
 std::ofstream ofs;
 ofs.open(path);
+ofs << scale()[0] << " / " << scale()[1] << std::endl;
 for (int i = 0; i < shape_->numel(); i++) {
 float value = 0;
...
@@ -46,7 +46,10 @@ void ConvCompute::PrepareForRun() {
 conv_param.dilations = *param.dilations;
 fill_scale_bias_const(&conv_param);
 conv_param.bias()->copyFrom(param.bias->ZynqTensor());
-conv_param.relu.enabled = param.fuse_relu;
+if (param.fuse_relu) {
+conv_param.activeParam.type = zynqmp::TYPE_RELU;
+}
 dw_conv_pe_.init();
 dw_conv_pe_.apply();
@@ -65,7 +68,16 @@ void ConvCompute::PrepareForRun() {
 conv_param.bias()->copyFrom(param.bias->ZynqTensor());
 }
-conv_param.relu.enabled = param.fuse_relu;
+if (param.fuse_relu) {
+conv_param.activeParam.type = zynqmp::TYPE_RELU;
+}
+// conv_param.filter->saveToFile("conv_filter_", true);
+// if (param.bias != nullptr) {
+// std::cout << "param.bias != nullptr" << std::endl;
+// conv_param.bias()->saveToFile("conv_bias_", true);
+// }
 conv_pe_.init();
 conv_pe_.apply();
 }
@@ -76,8 +88,14 @@ void ConvCompute::Run() {
 if (param.x->ZynqTensor()->shape().channel() != 1 &&
 param.groups == param.x->ZynqTensor()->shape().channel()) {
 dw_conv_pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+zynqmp::DepthwiseConvParam& dwconv_param = dw_conv_pe_.param();
+Debugger::get_instance().registerOutput("dwconv", dwconv_param.output);
+#endif
 } else {
+// zynqmp::ConvParam& conv_param = conv_pe_.param();
 conv_pe_.dispatch();
 #ifdef FPGA_PRINT_TENSOR
 zynqmp::ConvParam& conv_param = conv_pe_.param();
 Debugger::get_instance().registerOutput("conv", conv_param.output);
@@ -103,3 +121,21 @@ REGISTER_LITE_KERNEL(
 PRECISION(kFP16),
 DATALAYOUT(kNHWC))})
 .Finalize();
+
+REGISTER_LITE_KERNEL(depthwise_conv2d,
+kFPGA,
+kFP16,
+kNHWC,
+paddle::lite::kernels::fpga::ConvCompute,
+def)
+.BindInput("Input",
+{LiteType::GetTensorTy(TARGET(kFPGA),
+PRECISION(kFP16),
+DATALAYOUT(kNHWC))})
+.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+.BindOutput("Output",
+{LiteType::GetTensorTy(TARGET(kFPGA),
+PRECISION(kFP16),
+DATALAYOUT(kNHWC))})
+.Finalize();
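With this registration, depthwise_conv2d shares ConvCompute with conv2d, and Run() picks the depthwise PE when every input channel forms its own group. The selection reduces to a predicate equivalent to this sketch:

```cpp
// Depthwise path is taken when groups == input channels (and the
// input is not single-channel), mirroring the check in Run() above.
bool use_depthwise_path(int groups, int input_channels) {
  return input_channels != 1 && groups == input_channels;
}
```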
@@ -33,7 +33,8 @@ void ElementwiseAddCompute::PrepareForRun() {
 ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
 ew_param.output = param.Out->ZynqTensor();
 ew_param.axis = param.axis;
-ew_param.relu.enabled = false;
+ew_param.activeParam.type = zynqmp::TYPE_NONE;
+
 pe_.init();
 pe_.apply();
@@ -56,7 +57,7 @@ void ElementwiseAddActivationCompute::PrepareForRun() {
 ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
 ew_param.output = param.Out->ZynqTensor();
 ew_param.axis = param.axis;
-ew_param.relu.enabled = true;
+ew_param.activeParam.type = zynqmp::TYPE_RELU;
 pe_.init();
 pe_.apply();
 }
@@ -76,7 +77,7 @@ void ElementwiseMulCompute::PrepareForRun() {
 scale_param.input = param.X->ZynqTensor();
 scale_param.output = param.Out->ZynqTensor();
-scale_param.relu.enabled = false;
+scale_param.activeParam.type = zynqmp::TYPE_NONE;
 int channel = scale_param.input->shape().channel();
 zynqmp::Tensor* scale = new zynqmp::Tensor();
@@ -124,7 +125,10 @@ REGISTER_LITE_KERNEL(elementwise_add,
 {LiteType::GetTensorTy(TARGET(kFPGA),
 PRECISION(kFP16),
 DATALAYOUT(kNHWC))})
-.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+.BindInput("Y",
+{LiteType::GetTensorTy(TARGET(kFPGA),
+PRECISION(kFP16),
+DATALAYOUT(kNHWC))})
 .BindOutput("Out",
 {LiteType::GetTensorTy(TARGET(kFPGA),
 PRECISION(kFP16),
...
@@ -93,18 +93,25 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param,
 }
 // do elementwise add/sub/max...
 if (elt_type == "add") {
-for (int i = 0; i < batch; ++i) {
-for (int j = 0; j < channels; ++j) {
-int offset = (i * channels + j) * num;
-const dtype* din_ptr = x_data + offset;
-const dtype diny_data = y_data[j];
-dtype* dout_ptr = out_data + offset;
-for (int k = 0; k < num; ++k) {
-*dout_ptr = sum(*din_ptr, diny_data);
-dout_ptr++;
-din_ptr++;
-}
-}
+// for (int i = 0; i < batch; ++i) {
+// for (int j = 0; j < channels; ++j) {
+// int offset = (i * channels + j) * num;
+// const dtype* din_ptr = x_data + offset;
+// const dtype diny_data = y_data[j];
+// dtype* dout_ptr = out_data + offset;
+// for (int k = 0; k < num; ++k) {
+// *dout_ptr =
+// zynqmp::float_to_half(sum(zynqmp::half_to_float(*din_ptr),
+// zynqmp::half_to_float(diny_data)));
+// dout_ptr++;
+// din_ptr++;
+// }
+// }
+// }
+int count = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
+for (int i = 0; i < count; ++i) {
+out_data[i] = zynqmp::float_to_half(sum(
+zynqmp::half_to_float(x_data[i]), zynqmp::half_to_float(y_data[i])));
 }
 } else if (elt_type == "sub") {
 for (int i = 0; i < batch; ++i) {
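The reference now compares in float space: each float16 operand is widened, added, and narrowed back, matching what the FPGA kernel produces. A self-contained sketch with minimal half<->float helpers (normals only, truncating; the real zynqmp helpers are assumed to be more careful):

```cpp
#include <cstdint>
#include <cstring>

// Minimal fp16 -> fp32 (normals and inf/NaN; denormals flushed to zero).
static float half_to_float(uint16_t h) {
  uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exp = (h >> 10) & 0x1fu;
  uint32_t mant = h & 0x3ffu;
  uint32_t f = sign;
  if (exp == 31) f |= 0x7f800000u | (mant << 13);             // inf / NaN
  else if (exp != 0) f |= ((exp + 112u) << 23) | (mant << 13);  // normal
  float out;
  std::memcpy(&out, &f, sizeof(out));
  return out;
}

// Minimal fp32 -> fp16 (truncates the mantissa; no rounding).
static uint16_t float_to_half(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  uint16_t sign = static_cast<uint16_t>((bits >> 16) & 0x8000u);
  int32_t exp = static_cast<int32_t>((bits >> 23) & 0xffu) - 127 + 15;
  uint32_t mant = bits & 0x7fffffu;
  if (exp <= 0) return sign;             // underflow -> signed zero
  if (exp >= 31) return sign | 0x7c00u;  // overflow -> inf
  return sign | static_cast<uint16_t>(exp << 10) | (mant >> 13);
}

// Elementwise add in float space, as the new reference path does.
void add_ref_fp16(const uint16_t* x, const uint16_t* y,
                  uint16_t* out, int count) {
  for (int i = 0; i < count; ++i) {
    out[i] = float_to_half(half_to_float(x[i]) + half_to_float(y[i]));
  }
}
```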
@@ -148,9 +155,9 @@ TEST(elementwise_add, compute) {
 lite::Tensor x, y, output, output_ref;
 for (auto n : {1}) {
-for (auto c : {8}) {
-for (auto h : {8}) {
-for (auto w : {8}) {
+for (auto h : {72}) {
+for (auto w : {192}) {
+for (auto c : {24}) {
 for (auto axis : {0}) {
 for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
 auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
@@ -174,10 +181,16 @@ TEST(elementwise_add, compute) {
 auto* output_ref_data =
 output_ref.mutable_data<float16>(TARGET(kFPGA));
 for (int i = 0; i < x_dim.production(); i++) {
-x_data[i] = zynqmp::float_to_half(i);
+float sign = i % 3 == 0 ? -0.03 : 0.05f;
+float x = sign * (i % 128);
+std::cout << "x:" << x << std::endl;
+x_data[i] = zynqmp::float_to_half(x);
 }
 for (int i = 0; i < y_dim.production(); i++) {
-y_data[i] = zynqmp::float_to_half(i);
+float sign = i % 3 == 0 ? -0.03 : 0.05f;
+float y = sign * (i % 128);
+std::cout << "y:" << y << std::endl;
+y_data[i] = zynqmp::float_to_half(y);
 }
 param.X = &x;
 param.Y = &y;
@@ -190,7 +203,14 @@ TEST(elementwise_add, compute) {
 elementwise_compute_ref<float16>(param, "add", "");
 for (int i = 0; i < output.dims().production(); i++) {
-EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+std::cout << "output_data:"
+<< zynqmp::half_to_float(output_data[i])
+<< ",output_ref_data:"
+<< zynqmp::half_to_float(output_ref_data[i])
+<< std::endl;
+EXPECT_NEAR(zynqmp::half_to_float(output_data[i]),
+zynqmp::half_to_float(output_ref_data[i]),
+1e-5);
 }
 }
 }
@@ -209,73 +229,74 @@ TEST(fusion_elementwise_add_activation_fpga, retrive_op) {
 ASSERT_TRUE(fusion_elementwise_add_activation.front());
 }
-TEST(fusion_elementwise_add_activation_fpga, init) {
-ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
-ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
-}
-TEST(fusion_elementwise_add_activation_fpga, compute) {
-ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-operators::FusionElementwiseActivationParam param;
-lite::Tensor x, y, output, output_ref;
-for (auto act_type : {"relu"}) {
-for (auto n : {1}) {
-for (auto c : {8}) {
-for (auto h : {8}) {
-for (auto w : {8}) {
-for (auto axis : {0}) {
-for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
-auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
-auto y_dim = DDim(yd);
-int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
-if (axis_t + y_dim.size() > 4) continue;
-bool flag = false;
-for (int i = 0; i < y_dim.size(); i++) {
-if (x_dim[i + axis_t] != y_dim[i]) flag = true;
-}
-if (flag) continue;
-x.Resize(x_dim);
-y.Resize(y_dim);
-output.Resize(x_dim);
-output_ref.Resize(x_dim);
-auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
-auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
-auto* output_data = output.mutable_data<float16>(TARGET(kFPGA));
-auto* output_ref_data =
-output_ref.mutable_data<float16>(TARGET(kFPGA));
-for (int i = 0; i < x_dim.production(); i++) {
-float sign = i % 3 == 0 ? -1.0f : 1.0f;
-x_data[i] = zynqmp::float_to_half(i * sign);
-}
-for (int i = 0; i < y_dim.production(); i++) {
-float sign = i % 2 == 0 ? 0.5f : -0.5f;
-y_data[i] = zynqmp::float_to_half(i * sign);
-}
-param.X = &x;
-param.Y = &y;
-param.axis = axis;
-param.Out = &output;
-param.act_type = act_type;
-fusion_elementwise_add_activation.SetParam(param);
-fusion_elementwise_add_activation.PrepareForRun();
-fusion_elementwise_add_activation.Run();
-param.Out = &output_ref;
-elementwise_compute_ref<float16>(param, "add", act_type);
-for (int i = 0; i < output.dims().production(); i++) {
-EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
-}
-}
-}
-}
-}
-}
-}
-}
-}
+// TEST(fusion_elementwise_add_activation_fpga, init) {
+// ElementwiseAddActivationCompute fusion_elementwise_add_activation;
+// ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
+// ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
+// }
+// TEST(fusion_elementwise_add_activation_fpga, compute) {
+// ElementwiseAddActivationCompute fusion_elementwise_add_activation;
+// operators::FusionElementwiseActivationParam param;
+// lite::Tensor x, y, output, output_ref;
+// for (auto act_type : {"relu"}) {
+// for (auto n : {1}) {
+// for (auto c : {8}) {
+// for (auto h : {8}) {
+// for (auto w : {8}) {
+// for (auto axis : {0}) {
+// for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
+// auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
+// auto y_dim = DDim(yd);
+// int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
+// if (axis_t + y_dim.size() > 4) continue;
+// bool flag = false;
+// for (int i = 0; i < y_dim.size(); i++) {
+// if (x_dim[i + axis_t] != y_dim[i]) flag = true;
+// }
+// if (flag) continue;
+// x.Resize(x_dim);
+// y.Resize(y_dim);
+// output.Resize(x_dim);
+// output_ref.Resize(x_dim);
+// auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
+// auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
+// auto* output_data =
+// output.mutable_data<float16>(TARGET(kFPGA));
+// auto* output_ref_data =
+// output_ref.mutable_data<float16>(TARGET(kFPGA));
+// for (int i = 0; i < x_dim.production(); i++) {
+// float sign = i % 3 == 0 ? -1.0f : 1.0f;
+// x_data[i] = zynqmp::float_to_half(i * sign);
+// }
+// for (int i = 0; i < y_dim.production(); i++) {
+// float sign = i % 2 == 0 ? 0.5f : -0.5f;
+// y_data[i] = zynqmp::float_to_half(i * sign);
+// }
+// param.X = &x;
+// param.Y = &y;
+// param.axis = axis;
+// param.Out = &output;
+// param.act_type = act_type;
+// fusion_elementwise_add_activation.SetParam(param);
+// fusion_elementwise_add_activation.PrepareForRun();
+// fusion_elementwise_add_activation.Run();
+// param.Out = &output_ref;
+// elementwise_compute_ref<float16>(param, "add", act_type);
+// for (int i = 0; i < output.dims().production(); i++) {
+// EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+// }
+// }
+// }
+// }
+// }
+// }
+// }
+// }
+// }
+// }
 } // namespace fpga
 } // namespace kernels
@@ -283,4 +304,4 @@ TEST(fusion_elementwise_add_activation_fpga, compute) {
 } // namespace paddle
 USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def);
-USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
+// USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
@@ -67,3 +67,13 @@ REGISTER_LITE_KERNEL(
 PRECISION(kFP16),
 DATALAYOUT(kNHWC))})
 .Finalize();
+
+REGISTER_LITE_KERNEL(feed,
+kFPGA,
+kFP16,
+kNHWC,
+paddle::lite::kernels::fpga::FeedCompute,
+def_host)
+.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+.Finalize();
@@ -43,8 +43,14 @@ void FetchCompute::PrepareForRun() {
 }
 void FetchCompute::Run() {
-pe_.dispatch();
 auto& param = this->Param<param_t>();
+auto fetch_list = param.fetch_list;
+if (fetch_list->size() <= static_cast<size_t>(param.col)) {
+fetch_list->resize(param.col + 1);
+}
+Tensor& out = param.fetch_list->at(param.col);
+out.Resize(param.input->dims());
+pe_.dispatch();
 #ifdef FPGA_PRINT_TENSOR
 zynqmp::OutputParam& fetch_param = pe_.param();
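The reordering above makes Run() size the fetch list before dispatching: slot param.col must exist (and be shaped to the input dims) before the PE writes into it. The guard reduces to this pattern:

```cpp
#include <vector>

// Grow `fetch_list` so that index `col` is valid before writing to it
// (T stands in for lite::Tensor here).
template <typename T>
T& ensure_fetch_slot(std::vector<T>* fetch_list, int col) {
  if (fetch_list->size() <= static_cast<size_t>(col)) {
    fetch_list->resize(col + 1);
  }
  return fetch_list->at(col);
}
```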
@@ -67,10 +73,7 @@ REGISTER_LITE_KERNEL(fetch,
 {LiteType::GetTensorTy(TARGET(kFPGA),
 PRECISION(kAny),
 DATALAYOUT(kAny))})
-.BindOutput("Out",
-{LiteType::GetTensorTy(TARGET(kHost),
-PRECISION(kAny),
-DATALAYOUT(kAny))})
+.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
 .Finalize();
 REGISTER_LITE_KERNEL(fetch,
@@ -79,12 +82,6 @@ REGISTER_LITE_KERNEL(fetch,
 kNHWC,
 paddle::lite::kernels::fpga::FetchCompute,
 host_host)
-.BindInput("X",
-{LiteType::GetTensorTy(TARGET(kHost),
-PRECISION(kAny),
-DATALAYOUT(kAny))})
-.BindOutput("Out",
-{LiteType::GetTensorTy(TARGET(kHost),
-PRECISION(kAny),
-DATALAYOUT(kAny))})
+.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
 .Finalize();
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <unistd.h>
+#include <iostream>
 #include <string>
 #include <vector>
@@ -83,6 +84,7 @@ void GRUCompute::PrepareForRun() {
 void GRUCompute::Run() {
 auto& param = this->Param<param_t>();
 param.hidden->mutable_data<float>();
+
 // inputs
 auto input = param.input;
 auto h0 = param.h0;
@@ -130,6 +132,7 @@ void GRUCompute::Run() {
 // //3.
 gru_value.prev_out_value = ordered_h0.mutable_data<float>();
 gru_tensors.pre_output = ordered_h0.ZynqTensor();
+
 } else {
 gru_value.prev_out_value = nullptr;
 gru_tensors.pre_output = nullptr;
@@ -169,6 +172,7 @@ void GRUCompute::Run() {
 float* hidden_data =
 hidden_out.mutableData<float>(zynqmp::FP32, float_input_shape);
 gru_tensors.gate = &float_input;
+
 gru_tensors.output = &hidden_out;
@@ -187,11 +191,6 @@ void GRUCompute::Run() {
 *(batch_hidden->mutable_lod()) = batch_gate->lod();
 batch_hidden->mutable_data<float>();
 to_seq(*batch_hidden, hidden);
-
-save_tensor(const_cast<Tensor*>(input), "_input.txt");
-save_tensor(hidden, "_gru.txt");
-exit(-1);
 }
 } // namespace fpga
...
@@ -119,7 +119,79 @@ class IoCopyFpgaToHostCompute
 auto out_lod = param.y->mutable_lod();
 *out_lod = param.x->lod();
 }
+std::string doc() const override { return "Copy IO from FPGA to HOST"; }
+};
+
+void hwc_to_chw(float* chw_data,
+float* hwc_data,
+int num,
+int channel,
+int height,
+int width) {
+int chw = channel * height * width;
+int wc = width * channel;
+int wh = width * height;
+int index = 0;
+for (int n = 0; n < num; n++) {
+for (int h = 0; h < height; h++) {
+for (int w = 0; w < width; w++) {
+for (int c = 0; c < channel; c++) {
+chw_data[n * chw + c * wh + h * width + w] = hwc_data[index];
+index++;
+}
+}
+}
+}
+}
+
+class IoCopyFpgaToHostCHWCompute
+: public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
+ public:
+void Run() override {
+auto& param = Param<operators::IoCopyParam>();
+CHECK(param.x->target() == TARGET(kHost) ||
+param.x->target() == TARGET(kFPGA));
+Tensor hwc;
+hwc.Resize(param.y->dims());
+float* hwc_data = hwc.mutable_data<float>();
+float* chw_data = param.y->mutable_data<float>();
+param.y->ZynqTensor()->setDataType(zynqmp::FP32);
+param.x->ZynqTensor()->syncToDevice();
+if (param.x->ZynqTensor()->aligned() &&
+param.x->ZynqTensor()->shape().shouldAlign()) {
+zynqmp::Tensor tempTensor;
+tempTensor.mutableData<float16>(zynqmp::FP16,
+param.x->ZynqTensor()->shape());
+tempTensor.copyFrom(param.x->ZynqTensor());
+tempTensor.setAligned(true);
+tempTensor.unalignImage();
+hwc.ZynqTensor()->copyFrom(&tempTensor);
+} else {
+hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
+}
+int num = 1;
+int channel = 1;
+int height = 1;
+int width = 1;
+auto dims = param.y->ZynqTensor()->shape();
+hwc_to_chw(chw_data,
+hwc_data,
+dims.num(),
+dims.channel(),
+dims.height(),
+dims.width());
+param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
+param.y->ZynqTensor()->flush();
+auto out_lod = param.y->mutable_lod();
+*out_lod = param.x->lod();
+}
 std::string doc() const override { return "Copy IO from FPGA to HOST"; }
 };
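A quick way to sanity-check the hwc_to_chw helper introduced above (hypothetical 2x2x2 values, chosen so the transposition is visible; assumes the function is in scope):

```cpp
#include <cstdio>

int main() {
  // One 2(H) x 2(W) x 2(C) image in HWC order; channels interleaved.
  float hwc[8] = {0, 4, 1, 5, 2, 6, 3, 7};
  float chw[8] = {0};
  hwc_to_chw(chw, hwc, /*num=*/1, /*channel=*/2, /*height=*/2, /*width=*/2);
  for (float v : chw) std::printf("%.0f ", v);  // prints: 0 1 2 3 4 5 6 7
  return 0;
}
```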
@@ -170,7 +242,7 @@ REGISTER_LITE_KERNEL(io_copy,
 PRECISION(kFP16),
 DATALAYOUT(kNHWC))})
 .BindOutput("Out",
-{LiteType::GetTensorTy(TARGET(kARM),
+{LiteType::GetTensorTy(TARGET(kHost),
 PRECISION(kFloat),
 DATALAYOUT(kNHWC))})
 .Finalize();
@@ -179,8 +251,8 @@ REGISTER_LITE_KERNEL(io_copy,
 kFPGA,
 kAny,
 kAny,
-paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute,
-device_to_host_22)
+paddle::lite::kernels::fpga::IoCopyFpgaToHostCHWCompute,
+device_to_host_chw)
 .BindInput("Input",
 {LiteType::GetTensorTy(TARGET(kFPGA),
 PRECISION(kFP16),
...
@@ -384,6 +384,7 @@ void MulticlassNmsCompute::Run() {
 scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
 outs->ZynqTensor()->copyFrom(out.ZynqTensor());
 }
+outs->Resize({static_cast<int64_t>(e - s), out_dim});
 }
 }
 LoD lod;
...
@@ -34,7 +34,9 @@ void PoolCompute::PrepareForRun() {
 zynqmp::PoolingParam& pool_param = pe_.param();
 pool_param.input = param.x->ZynqTensor();
 pool_param.output = param.output->ZynqTensor();
-pool_param.relu.enabled = false;
+
+pool_param.activeParam.type = zynqmp::TYPE_RELU;
+
 pool_param.type = param.pooling_type == "max" ? zynqmp::PoolingType::MAX
 : zynqmp::PoolingType::AVERAGE;
 pool_param.globalPooling = param.global_pooling;
...