Commit 2e2c9d4b authored by MyPandaShaoxiang

fix: update backend fpga patch

Parent 41826a31
......@@ -3,13 +3,35 @@ if (NOT LITE_WITH_FPGA)
endif()
set(LITE_FPGA_KD_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD")
set(LITE_FPGA_KD_LLAPI_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/llapi")
set(LITE_FPGA_KD_PE_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/pes")
set(LITE_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga")
message("fpga_kd_path ${LITE_FPGA_KD_PATH}")
message("fpga_path ${LITE_FPGA_PATH}")
file(GLOB_RECURSE KD_CPP *.cpp *.cc)
file(GLOB KD_CPP "${LITE_FPGA_KD_PATH}/*.cpp")
file(GLOB PE_CPP "${LITE_FPGA_KD_PE_PATH}/*.cpp")
file(GLOB LLAPI_CPP "${LITE_FPGA_KD_LLAPI_PATH}/*.cpp")
file(GLOB FPGA_CPP "${LITE_FPGA_PATH}/*.cc")
cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP})
set(FPGA_ALL_CPP "")
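# Re-root each globbed absolute path as a path relative to this directory,
# since the cc_library() SRCS below are resolved against the current source dir.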
FOREACH(FILE_PATH ${KD_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list(APPEND FPGA_ALL_CPP KD/${FILE_NAME})
ENDFOREACH(FILE_PATH)
FOREACH(FILE_PATH ${PE_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list(APPEND FPGA_ALL_CPP KD/pes/${FILE_NAME})
ENDFOREACH(FILE_PATH)
FOREACH(FILE_PATH ${LLAPI_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list(APPEND FPGA_ALL_CPP KD/llapi/${FILE_NAME})
ENDFOREACH(FILE_PATH)
FOREACH(FILE_PATH ${FPGA_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list( APPEND FPGA_ALL_CPP ${FILE_NAME})
ENDFOREACH(FILE_PATH)
message("fpga kd: ${FPGA_ALL_CPP}")
cc_library(kernel_fpga SRCS ${FPGA_ALL_CPP})
#cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP})
cc_library(lite_tensor_fpga SRCS lite_tensor.cc DEPS memory)
cc_library(fpga_target_wrapper SRCS ${LITE_FPGA_PATH}/target_wrapper.cc DEPS kernel_fpga)
cc_library(fpga_target_wrapper SRCS target_wrapper.cc DEPS kernel_fpga)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
// #include "lite/backends/fpga/lite_tensor.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
#define FPGA_PRINT_TENSOR
class Debugger {
public:
static Debugger& get_instance() {
static Debugger s_instance;
return s_instance;
}
void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
// tensor->printScale();
if (op_type != "conv") {
// tensor->saveToFile(op_type, true);
}
}
private:
std::unordered_map<std::string, bool> op_config;
Debugger() {
op_config["concat"] = true;
op_config["conv"] = true;
op_config["crop"] = true;
}
};
inline void chw_to_hwc(Tensor* t, float* dst) {
int num = t->dims()[0];
int channel = t->dims()[1];
int height = 1;
int width = 1;
if (t->dims().size() > 2) {
height = t->dims()[2];
}
if (t->dims().size() > 3) {
width = t->dims()[3];
}
// int width = t->dims()[3];
const float* chw_data = t->data<float>();
float* hwc_data = dst;
int chw = channel * height * width;
int wc = width * channel;
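// NCHW -> NHWC: element (n, c, h, w) lands at n * chw + h * wc + w * channel + c.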
int index = 0;
for (int n = 0; n < num; n++) {
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
index++;
}
}
}
}
}
inline void read_from_file(lite::Tensor* t, const std::string& path) {
std::ifstream file_stream;
file_stream.open(path);
if (!file_stream) {
return;
}
float* data = t->mutable_data<float>();
int num = t->numel();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
// flush();
}
inline void save_float(float* data, const std::string& name, int len) {
// return;
static int counter = 0;
std::string old_string = std::to_string(counter);
std::string new_string =
std::string(3 - old_string.length(), '0') + old_string;
std::string file = "arm_" + new_string + name;
counter++;
std::cout
<< "-------------------------- saving file: --------------------------"
<< file << std::endl;
std::ofstream ofs;
ofs.open(file);
// float* data = dst;
for (int i = 0; i < len; i++) {
float value = data[i];
ofs << value << std::endl;
}
ofs.close();
}
inline void save_tensor(lite::Tensor* t,
const std::string& name,
bool convert = true) {
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
if (convert) {
chw_to_hwc(t, dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
inline void save_tensor(const lite::Tensor* t,
const std::string& name,
bool convert = true) {
// return;
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
if (convert) {
chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
} // namespace lite
} // namespace paddle
......@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/dl_engine.hpp"
namespace paddle {
namespace zynqmp {
DLEngine::DLEngine() {
open_device();
struct DeviceInfo info;
int ret = get_device_info(info);
filter::set_filter_capacity(info.filter_cap);
int ret = get_device_info(info_);
filter::set_filter_capacity(info_.filter_cap);
filter::set_colunm(info_.colunm);
}
} // namespace zynqmp
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -29,8 +28,15 @@ class DLEngine {
return s_instance;
}
DeviceInfo& deviceInfo();
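// device_type encodes the ZU series in its hundreds digit, e.g. 3xx => ZU3.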
bool isZU3() { return info_.device_type / 100 == 3; }
float* out_data = nullptr;
private:
DLEngine();
DeviceInfo info_;
};
} // namespace zynqmp
} // namespace paddle
......@@ -22,6 +22,7 @@ namespace paddle {
namespace zynqmp {
enum LayoutType {
None,
N,
NC,
NCHW,
......@@ -39,6 +40,15 @@ class Layout {
virtual int elementCount(const std::vector<int>& dims) = 0;
};
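// Scalar/no-layout case: one logical element whose aligned storage is padded to 16.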
struct None : Layout {
int numIndex() { return -1; }
int channelIndex() { return -1; }
int heightIndex() { return -1; }
int widthIndex() { return -1; }
int alignedElementCount(const std::vector<int>& dims) { return 16; }
virtual int elementCount(const std::vector<int>& dims) { return 1; }
};
struct NCHW : Layout {
int numIndex() { return 0; }
int channelIndex() { return 1; }
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <memory.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/bias_scale.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -54,7 +55,7 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
*data_in = ptr_aligned;
}
void interleave(float **data_in, int num_after_alignment) {
size_t interleave(float **data_in, int num_after_alignment) {
float *ptr_uninterleaved = *data_in;
float *ptr_interleaved =
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
......@@ -69,6 +70,7 @@ void interleave(float **data_in, int num_after_alignment) {
fpga_free(ptr_uninterleaved);
*data_in = ptr_interleaved;
return 2 * num_after_alignment * sizeof(float);
}
void format_bias_scale_array(float **bias_scale_array,
......@@ -78,8 +80,9 @@ void format_bias_scale_array(float **bias_scale_array,
int div_num = (num + element_num_per_division - 1) / element_num_per_division;
int element_num_after_division =
align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
interleave(bias_scale_array, div_num * element_num_after_division);
fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
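// interleave() returns the total byte count it wrote, so the flush covers every division.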
size_t mem =
interleave(bias_scale_array, div_num * element_num_after_division);
fpga_flush(*bias_scale_array, mem);
}
void format_bias_array(float **bias_array, int num) {
float *ptr_unaligned = *bias_array;
......
......@@ -19,7 +19,7 @@ namespace zynqmp {
namespace bias_scale {
void align_element(float** data_in, int num_per_div_before_alignment, int num);
void interleave(float** data_in, int num_after_alignment);
size_t interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division,
int num);
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "lite/backends/fpga/KD/llapi/filter.h"
#include <memory.h>
#include <algorithm>
#include <fstream>
#include <string>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -23,11 +25,42 @@ namespace zynqmp {
namespace filter {
static int FILTER_SIZE = 2048;
static int COLUMN = 4;
void saveToFile(std::string name, void* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
// int8_t* data = (int8_t*)data_in;
// for (int i = 0; i < size; i++) {
// float value = data[i];
// ofs << value << std::endl;
// }
// ofs.close();
}
void saveFloatToFile(std::string name, float* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
// for (int i = 0; i < size; i++) {
// float value = data_in[i];
// ofs << value << std::endl;
// }
// ofs.close();
}
void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; }
void set_colunm(uint32_t column) { COLUMN = column; }
// replace zynqmp_api.h #define FILTER_NUM_ALIGNMENT
int get_filter_num_alignment() { return COLUMN * 4; }
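// Filters-per-division capacity: FILTER_SIZE / ceil(chw / 16) groups, each
// holding filter_num_alignment filters, capped at FILTER_SIZE.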
int calc_division_capacity(int chw) {
int n = FILTER_SIZE / ((chw + 15) / 16) * 32;
// int n = FILTER_SIZE / ((chw + 15) / 16) * 32;
int filter_num_alignment = get_filter_num_alignment();
int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment;
return n < FILTER_SIZE ? n : FILTER_SIZE;
}
......@@ -52,28 +85,28 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
}
}
void convert_to_hwc(
char **data_in, int num, int channel, int height, int width) {
char *tmp = *data_in;
void convert_to_hwc(int8_t* chw_data,
int8_t* hwc_data,
int num,
int channel,
int height,
int width) {
int chw = channel * height * width;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
int wc = width * channel;
int index = 0;
for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + n * chw + offset_height + w * channel + c) =
*((*data_in)++);
hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
index++;
}
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
float find_max(float *data_in, int data_size) {
float find_max(float* data_in, int data_size) {
float max = 0.0;
for (int i = 0; i < data_size; ++i) {
float value = data_in[i];
......@@ -83,166 +116,194 @@ float find_max(float *data_in, int data_size) {
return max;
}
signed char float_to_int8(float fdata) {
int8_t float_to_int8(float fdata) {
if (fdata < 0.0) {
fdata -= 0.5;
} else {
fdata += 0.5;
}
return (signed char)fdata;
return (int8_t)fdata;
}
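// Symmetric int8 quantization: values in [-max, max] map to [-127, 127],
// rounded half away from zero by float_to_int8 above.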
void quantize(float **data_in, int data_size, float max) {
float *tmp = *data_in;
void quantize(float* src, int8_t* dst, int len, float max) {
float fix_range = 127;
float scale = fix_range / max;
signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
for (int i = 0; i < data_size; i++) {
tmp_data[i] = float_to_int8(
(*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale);
for (int i = 0; i < len; i++) {
dst[i] = float_to_int8(src[i] * scale);
}
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
void align_element(char **data_in, int num, int chw) {
int j = 0;
bool should_align_chw(int chw) {
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (align_chw != chw) {
char *tmp = *data_in;
char *data_tmp =
(char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT
memset(data_tmp, 0, num * align_chw);
for (j = 0; j < num; j++) {
memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw);
}
*data_in = data_tmp;
fpga_free(tmp);
return align_chw != chw;
}
void align_chw(int8_t* src, int8_t* dst, int num, int chw) {
int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
memset(dst, 0, num * aligned_chw);
for (int j = 0; j < num; j++) {
memcpy((dst + j * aligned_chw), (src + j * chw), chw);
}
}
void align_num(char **data_in,
void align_num(int8_t* src,
int8_t* dst,
int num_per_div_before_alignment,
int num,
int chw) {
int i = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int align_chw) {
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
align_to_x(num_per_div_before_alignment, filter_num_alignment);
char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(char));
memset(dst, 0, num_element * sizeof(int8_t));
int i = 0;
for (i = 0; i < div_num - 1; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
memcpy(dst + num_per_div_after_alignment * align_chw * i,
src + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
memcpy(dst + num_per_div_after_alignment * align_chw * i,
src + num_per_div_before_alignment * align_chw * i,
(num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
*data_in = data_tmp;
fpga_free(tmp);
}
void reorder(char **data_in, int num_after_alignment, int chw) {
void reorder(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
int index = 0;
int new_index = 0;
int filter_num_alignment = get_filter_num_alignment();
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
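// Within each filter_num_alignment block, the first half of the output takes
// the even 4-filter chunks of the source and the second half the odd chunks.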
for (index = 0; index < num_after_alignment; index++) {
new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
(index / 16 % 2 * 4);
memcpy(data_tmp + index * chw_align,
*data_in + new_index * chw_align,
chw_align);
new_index = index / filter_num_alignment * filter_num_alignment +
(index % (filter_num_alignment / 2) / 4 * 8) +
(index % (filter_num_alignment / 2) % 4) +
(index / (filter_num_alignment / 2) % 2 * 4);
memcpy((dst + index * chw_align), (src + new_index * chw_align), chw_align);
}
*data_in = data_tmp;
fpga_free(tmp);
}
size_t interleave(char **data_in, int num_after_alignment, int chw) {
int i = 0;
int j = 0;
int k = 0;
void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
int interleave_per_num = 16;
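// Interleave each pair of adjacent filters in 16-byte pieces: output chunk
// 2k comes from filter i, chunk 2k+1 from filter i+1.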
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
int interleave_num = chw_align * 2 / interleave_per_num;
for (i = 0; i < num_after_alignment; i += 2) {
for (j = 0, k = 0; j < interleave_num; j += 2, k++) {
memcpy(data_tmp + i * chw_align + interleave_per_num * j,
*data_in + i * chw_align + interleave_per_num * k,
for (int i = 0; i < num_after_alignment; i += 2) {
for (int j = 0, k = 0; j < interleave_num; j += 2, k++) {
memcpy(dst + i * chw_align + interleave_per_num * j,
src + i * chw_align + interleave_per_num * k,
interleave_per_num);
memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1),
*data_in + (i + 1) * chw_align + interleave_per_num * k,
memcpy(dst + i * chw_align + interleave_per_num * (j + 1),
src + (i + 1) * chw_align + interleave_per_num * k,
interleave_per_num);
}
}
*data_in = data_tmp;
fpga_free(tmp);
return chw_align * num_after_alignment;
}
size_t format_filter(float **data_in,
int num,
int channel,
int height,
int width,
int group_num,
float max) {
int8_t* format_filter(float* data_in,
int& mem_size_a, // NOLINT
int num,
int channel,
int height,
int width,
int group_num,
float max,
std::vector<float>& filter_max) { // NOLINT
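// Pipeline: per-filter int8 quantization -> CHW-to-HWC transpose -> chw
// alignment -> filter-count alignment -> reorder -> pairwise interleave ->
// cache flush. Returns the interleaved buffer; its size goes to mem_size_a.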
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
align_to_x(num_per_div_before_alignment, filter_num_alignment);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
// int num_after_alignment = num_per_div_after_alignment * div_num;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
align_to_x(residual, filter_num_alignment);
// saveFloatToFile("quantize_before", data_in, data_size);
int8_t* quantized_data =
reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
for (int n = 0; n < num; n++) {
float* filter_start = data_in + n * chw;
float f_max = find_max(filter_start, chw);
int8_t* quantized_start = quantized_data + n * chw;
// quantize(filter_start, quantized_start, chw, f_max);
quantize(filter_start, quantized_start, chw, max);
// filter_max.push_back(f_max);
filter_max.push_back(max);
}
reorder(quantize_data, num_after_alignment, chw);
size_t mem_size = interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data,
// saveToFile("chw.txt", quantized_data, data_size);
int8_t* hwc_data =
reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
convert_to_hwc(quantized_data, hwc_data, num, channel, height, width);
fpga_free(quantized_data);
// saveToFile("hwc.txt", hwc_data, data_size);
int8_t* temp_data = hwc_data; // NOLINT
int chw_aligned = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (should_align_chw(chw)) {
int8_t* hwc_aligned_data = reinterpret_cast<int8_t*>(
fpga_malloc(num * chw_aligned * sizeof(int8_t)));
align_chw(hwc_data, hwc_aligned_data, num, chw);
// saveToFile("align_el.txt", hwc_aligned_data, data_size * 2);
temp_data = hwc_aligned_data;
fpga_free(hwc_data);
}
if (num_after_alignment != num) {
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, filter_num_alignment);
// int div_num =
// (num + num_per_div_before_alignment - 1) /
// num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * chw_aligned;
int8_t* num_aligned_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
align_num(temp_data,
num_aligned_data,
num_per_div_before_alignment,
num,
chw_aligned);
// saveToFile("align_num.txt", num_aligned_data, data_size * 8);
fpga_free(temp_data);
temp_data = num_aligned_data;
}
int8_t* aligned_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
reorder(temp_data, aligned_data, num_after_alignment, chw);
// saveToFile("reorder.txt", aligned_data, data_size * 8);
fpga_free(temp_data);
int8_t* interleaved_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
interleave(aligned_data, interleaved_data, num_after_alignment, chw);
// saveToFile("interleave.txt", interleaved_data, data_size * 8);
fpga_free(aligned_data);
fpga_flush(interleaved_data,
align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment *
sizeof(char));
return mem_size;
mem_size_a = num_after_alignment * chw_aligned;
return interleaved_data;
}
void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
int16_t *tmp = *data_in;
int16_t *data_tmp =
(int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT
void convert_to_hwn(int16_t** data_in, int num, int height, int width) {
int16_t* tmp = *data_in;
int16_t* data_tmp =
(int16_t*)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
......@@ -254,16 +315,16 @@ void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
fpga_free(tmp);
}
size_t align_element_n(int16_t **data_in, int num, int height, int width) {
size_t align_element_n(int16_t** data_in, int num, int height, int width) {
int unalign_n = num;
int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
int num_element = height * width * align_n;
if (unalign_n != align_n) {
int16_t *tmp = *data_in;
int16_t* tmp = *data_in;
int num_element = height * width * align_n;
int16_t *data_tmp =
(int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT
int16_t* data_tmp =
(int16_t*)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(int16_t));
for (int h = 0; h < height; h++) {
......@@ -276,17 +337,37 @@ size_t align_element_n(int16_t **data_in, int num, int height, int width) {
}
}
*data_in = data_tmp;
free(tmp);
fpga_free(tmp);
}
return num_element * sizeof(int16_t);
}
void to_fp16(float* src,
float16* dst,
int num,
int height,
int width,
float* scale_ptr) {
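// Scale each filter by its per-filter factor, convert to fp16, and flush
// the result for the device.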
int size = num * height * width;
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int index = n * height * width + h * width + w;
float value = src[index] * scale_val;
dst[index] = float_to_half(value);
}
}
}
fpga_flush(dst, size * sizeof(int16_t));
}
void quantize_to_fp16(
float **data_in, int num, int height, int width, float *scale_ptr) {
float *tmp = *data_in;
float** data_in, int num, int height, int width, float* scale_ptr) {
float* tmp = *data_in;
int size = num * height * width;
float16 *tmp_data = (float16 *)fpga_malloc(size * sizeof(float16)); // NOLINT
float16* tmp_data = (float16*)fpga_malloc(size * sizeof(float16)); // NOLINT
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
......@@ -298,13 +379,14 @@ void quantize_to_fp16(
}
}
fpga_flush(tmp_data, size * sizeof(int16_t));
*data_in = (float *)tmp_data; // NOLINT
*data_in = (float*)tmp_data; // NOLINT
fpga_free(tmp);
}
size_t format_dwconv_filter(
float **data_in, int num, int height, int width, float *scale_ptr) {
float** data_in, int num, int height, int width, float* scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t **quantize_data = (int16_t **)data_in; // NOLINT
int16_t** quantize_data = reinterpret_cast<int16_t**>(data_in);
convert_to_hwn(quantize_data, num, height, width);
size_t size = align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data,
......
......@@ -18,38 +18,35 @@ limitations under the License. */
#include <cstdlib>
#include <cwchar>
#include <vector>
namespace paddle {
namespace zynqmp {
namespace filter {
void set_filter_capacity(uint32_t cap);
void set_colunm(uint32_t column);
int get_filter_num_alignment();
int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
void convert_to_hwc(
char** data_in, int num, int channel, int height, int width);
float find_max(float* data_in, int data_size);
void quantize(float** data_in, int data_size, float max);
void align_element(char** data_in, int num, int chw);
void align_num(char** data_in,
int num_per_div_before_alignment,
int num,
int chw);
void reorder(char** data_in, int num_after_alignment, int chw);
size_t interleave(char** data_in, int num_after_alignment, int chw);
size_t format_filter(float** data_in,
int num,
int channel,
int height,
int width,
int group_num,
float max);
int8_t* format_filter(float* data_in,
int& mem_size, // NOLINT
int num,
int channel,
int height,
int width,
int group_num,
float max, // NOLINT
std::vector<float>& filter_max); // NOLINT
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
size_t align_element_n(int16_t** data_in, int num, int height, int width);
void quantize_to_fp16(
float** data_in, int num, int height, int width, float* scale_ptr);
// void quantize_to_fp16(float** data_in, int num, int height, int width,
// float* scale_ptr);
size_t format_dwconv_filter(
float** data_in, int num, int height, int width, float* scale_ptr);
......
......@@ -23,13 +23,12 @@ limitations under the License. */
#include <map>
#include <utility>
#include "lite/backends/fpga/KD/llapi/config.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
namespace paddle {
namespace zynqmp {
#define PADDLE_LITE_OS_LINUX
#define PADDLE_OS_LINUX
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
......@@ -39,14 +38,10 @@ static size_t memory_size_max = 0;
static size_t memory_size = 0;
static inline int do_ioctl(uint64_t req, const void *arg) {
int ret = -1;
#ifdef PADDLE_LITE_OS_LINUX
ret = ioctl(fd, req, arg);
if (ret != 0) {
throw - 1;
}
#ifdef PADDLE_OS_LINUX
return ioctl(fd, req, arg);
#else
return ret;
return -1;
#endif
}
......@@ -66,7 +61,10 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef PADDLE_LITE_OS_LINUX
#ifdef ENABLE_DEBUG
// std::cout << "fpga_malloc:" << size << std::endl;
#endif
#ifdef PADDLE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
if (ptr == NULL) {
......@@ -105,11 +103,8 @@ void fpga_free(void *ptr) {
size = iter->second;
memory_map.erase(iter);
}
memory_size -= size;
#ifdef PADDLE_LITE_OS_LINUX
#ifdef PADDLE_OS_LINUX
munmap(ptr, size);
#else
free(ptr);
......@@ -150,6 +145,11 @@ void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int fpga_reset() {
struct FpgaResetArgs args;
return do_ioctl(IOCTL_FPGA_RESET, &args);
}
int ioctl_conv(const struct ConvArgs &args) {
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
......@@ -166,7 +166,6 @@ int compute_fpga_conv(const struct SplitConvArgs &args) {
}
if (split_num > 1) {
std::cout << "Split num > 1 !!!!!!!!!!!!!!!!!!" << std::endl;
exit(-1);
}
return ret;
......@@ -186,6 +185,7 @@ int get_device_info(const struct DeviceInfo &args) {
}
int perform_bypass(const struct BypassArgs &args) {
int ret = -1;
int size = args.image.channels * args.image.width * args.image.height;
int max_size = 1 << 21;
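// Inputs larger than max_size elements are pushed through the bypass engine
// in max_size chunks, tracking the maximum output scale across chunks.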
......@@ -213,7 +213,7 @@ int perform_bypass(const struct BypassArgs &args) {
reinterpret_cast<char *>(input_address + i * max_size * type_size);
bypassArgs.output.address =
reinterpret_cast<char *>(output_address + i * max_size * out_type_size);
int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
if (ret != 0) {
......@@ -222,13 +222,15 @@ int perform_bypass(const struct BypassArgs &args) {
}
int remainder = size - max_size * count;
bypassArgs.image.channels = remainder;
bypassArgs.image.address =
reinterpret_cast<char *>(input_address + count * max_size * type_size);
bypassArgs.output.address = reinterpret_cast<char *>(
output_address + count * max_size * out_type_size);
int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
if (remainder > 0) {
bypassArgs.image.channels = remainder;
bypassArgs.image.address =
reinterpret_cast<char *>(input_address + count * max_size * type_size);
bypassArgs.output.address = reinterpret_cast<char *>(
output_address + count * max_size * out_type_size);
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
}
args.output.scale_address[0] = scale;
args.output.scale_address[1] = 1.0f / scale;
return ret;
......@@ -237,52 +239,21 @@ int perform_bypass(const struct BypassArgs &args) {
int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
int compute_fpga_scale(const struct ScaleArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Scale======";
std::cout << "scale_address:" << args.scale_address << std::endl;
std::cout << "bias_address:" << args.bias_address << std::endl;
std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_SCALE, &args);
}
int compute_fpga_dwconv(const struct DWconvArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Basic Conv======";
std::cout << " relu_enabled:" << args.relu_enabled
<< " filter_address:" << args.filter_address;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_DWCONV, &args);
}
int config_activation(const struct ActiveParamterArgs &args) {
return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
}
// int config_power(const struct PowerArgs& args) {
// return do_ioctl(IOCTL_CONFIG_POWER, &args);
// }
int config_inplace(const struct InplaceArgs &args) {
return do_ioctl(IOCTL_CONFIG_INPLACE, &args);
}
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#ifndef PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#define PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#include <stdint.h>
#include <cstddef>
#include <iostream>
......@@ -40,6 +43,13 @@ enum DLayoutType {
LAYOUT_HWC = 0,
};
enum ActiveType {
TYPE_RELU = 0,
TYPE_RELU6 = 1,
TYPE_LEAK_RELU = 2,
TYPE_SIGMOID = 3,
};
struct VersionArgs {
void* buffer;
};
......@@ -48,7 +58,7 @@ struct DeviceInfo {
uint32_t filter_cap;
uint32_t version;
uint16_t device_type;
uint32_t reserved0;
uint32_t colunm;
uint32_t reserved1;
uint32_t reserved2;
uint32_t reserved3;
......@@ -108,6 +118,7 @@ struct ConvArgs {
void* filter_scale_address;
uint32_t filter_num;
uint32_t group_num;
uint32_t dilation;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
......@@ -199,9 +210,16 @@ struct NormalizeParameterArgs {
uint32_t hight_width;
};
struct ActiveParamterArgs {
ActiveType type;
uint16_t leaky_relu_factor;
};
struct InplaceArgs {
bool leaky_relu_enable;
bool relu_enable;
bool sigmoid_enable;
bool relu6_enable;
bool power_enable;
bool normalize_enable;
};
......@@ -216,7 +234,9 @@ struct FpgaRegReadArgs {
uint64_t value;
};
struct FpgaResetArgs {};
struct FpgaResetArgs {
uint32_t val;
};
#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4)
......@@ -248,6 +268,8 @@ struct FpgaResetArgs {};
_IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs)
#define IOCTL_CONFIG_NORMALIZE_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs)
#define IOCTL_CONFIG_ACTIVATION_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 43, struct ActiveParamterArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs)
#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs)
......@@ -331,6 +353,7 @@ int compute_fpga_scale(const struct ScaleArgs& args);
int compute_fpga_concat(const struct ConcatArgs& args);
int compute_fpga_resize(const struct ResizeArgs& args);
int config_activation(const struct ActiveParamterArgs& args);
int config_power(const struct PowerArgs& args);
int compute_fpga_dwconv(const struct DWconvArgs& args);
int config_norm_param(const struct NormalizeParameterArgs& args);
......@@ -341,7 +364,11 @@ int config_inplace(const struct InplaceArgs& args);
int flush_cache(void* addr, int size);
int invalidate_cache(void* addr, int size);
int fpga_reset();
int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
} // namespace zynqmp
} // namespace paddle
#endif // PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
......@@ -32,6 +32,5 @@ class PE {
virtual ~PE() {}
};
} // namespace zynqmp
} // namespace paddle
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include <string>
#include <vector>
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -26,6 +27,7 @@ namespace zynqmp {
struct ReLUParam {
public:
bool enabled = false;
float leaky_relu_factor = 0.0f;
};
struct PEParam {
......@@ -98,6 +100,24 @@ struct DepthwiseConvParam : ConvParam {
Tensor* quantizedFilter_ = new Tensor();
};
struct GRUParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* h0 = nullptr;
Tensor* weight = nullptr;
Tensor* bias = nullptr;
Tensor* batch_gate = nullptr;
Tensor* batch_reset_hidden_prev = nullptr;
Tensor* batch_hidden = nullptr;
Tensor* hidden = nullptr;
std::string gate_activation = "sigmoid";
std::string activation = "tanh";
bool is_reverse = false;
bool origin_mode = false;
};
enum PoolingType : int {
MAX = 0,
AVERAGE = 1,
......@@ -133,6 +153,12 @@ struct ElementwiseAddParam : PEParam {
EWAddArgs ewargs;
};
struct ElementwiseMulParam : PEParam {
public:
std::vector<Tensor*> inputs;
Tensor* output = nullptr;
};
struct FullyConnectedParam : PEParam {
public:
Tensor* input = nullptr;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <arm_neon.h>
#include <algorithm>
#include <vector>
#include "lite/backends/fpga/KD/pe.hpp"
......@@ -49,7 +50,111 @@ class ConvPE : public PE {
concatPE_.init();
concatPE_.apply();
}
if (DLEngine::get_instance().isZU3() &&
param_.input->shape().dimSize() == 4 &&
param_.input->shape().width() == 1 &&
param_.input->shape().channel() >= 2048) {
use_cpu_ = true;
}
if (param_.filter->shape().width() == 1 &&
param_.filter->shape().height() == 1) {
// use_cpu_ = true;
}
if (!use_cpu_) {
// param_.filter->releaseData();
}
}
void cpu_conv_hwc() {
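// CPU reference implementation: naive NHWC convolution with padding and
// stride, computed in float and re-quantized into the output scale.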
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
Tensor float_input;
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
float* out = float_output.mutableData<float>(FP32, output->shape());
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
int image_pad_h = param_.paddings[0];
int image_pad_w = param_.paddings[1];
int kernel_height = param_.filter->shape().height();
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
// int out_channel = param_.strides[1];
int pooled_height_ = output->shape().height();
int pooled_width_ = out_width;
int filter_chw = image_channels * kernel_height * kernel_width;
float max = 0;
for (int ph = 0; ph < pooled_height_; ph++) {
for (int pw = 0; pw < pooled_width_; pw++) {
int hstart = ph * kernel_step_h - image_pad_h;
int wstart = pw * kernel_step_w - image_pad_w;
int hend =
std::min(hstart + kernel_height, static_cast<int>(image_height));
int wend =
std::min(wstart + kernel_width, static_cast<int>(image_width));
hstart = std::max(hstart, static_cast<int>(0));
wstart = std::max(wstart, static_cast<int>(0));
for (int oc = 0; oc < out_channel; oc++) {
float sum = 0.0f;
const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
for (int c = 0; c < image_channels; c++) {
for (int h = hstart; h < hend; h++) {
int hi = 0;
if (ph == 0) {
hi = h - hstart + image_pad_h;
} else {
hi = h - hstart;
}
for (int w = wstart; w < wend; w++) {
int wi = 0;
if (pw == 0) {
wi = w - wstart + image_pad_w;
} else {
wi = w - wstart;
}
const int index = (h * image_width + w) * image_channels + c;
int weight_index = oc * filter_chw +
kernel_width * kernel_height * c +
kernel_width * hi + wi;
float value = image_addr[index] * filter_data[weight_index];
sum += value;
}
}
}
if (param_.relu.enabled && sum < 0) {
sum = 0;
}
if (sum > max) {
max = sum;
}
out[pool_index] = sum;
}
}
}
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
}
void cpu_compute() {
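// CPU fallback used when use_cpu_ is set: treats the conv as a per-pixel
// dot product over input channels (1x1 kernel).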
Tensor* input = param_.input;
Tensor* output = param_.output;
......@@ -59,43 +164,79 @@ class ConvPE : public PE {
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
float* out = float_output.mutableData<float>(FP32, output->shape());
float* bias_data = param_.bias()->data<float>();
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
float* mi = new float[in_channel];
float max = 0;
int out_index = 0;
for (int i = 0; i < out_channel; i++) {
float* image = image_addr;
float* filter_ptr = filter_data + i * in_channel;
float* out_ptr = mi;
#pragma omp parallel for
for (int j = 0; j < in_channel; j++) {
float value = image_addr[j] * filter_ptr[j];
mi[j] = value;
}
float sum = 0;
for (int j = 0; j < in_channel; j++) {
sum += mi[j];
for (int h = 0; h < output->shape().height(); h++) {
for (int w = 0; w < output->shape().width(); w++) {
float sum = 0;
// #pragma omp parallel for
for (int j = 0; j < in_channel; j++) {
int image_index = h * out_width * in_channel + w * in_channel + j;
float value = image_addr[image_index] * filter_ptr[j];
sum += value;
}
sum += bias_data[i];
if (param_.relu.enabled && sum < 0) {
sum = 0;
}
if (sum > max) {
max = sum;
}
out_index = h * out_width * out_channel + w * out_channel + i;
out[out_index] = sum;
// out_index++;
}
}
out[i] = sum;
}
delete[] mi;
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
}
bool dispatch() {
inplace_.relu_enable = param_.relu.enabled;
if (use_cpu_) {
cpu_compute();
return true;
}
inplace_.leaky_relu_enable =
(param_.relu.leaky_relu_factor != 0) ? true : false;
inplace_.relu_enable =
inplace_.leaky_relu_enable ? false : param_.relu.enabled;
inplace_.power_enable = false;
inplace_.normalize_enable = false;
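// Leaky ReLU takes precedence over plain ReLU; the inplace/activation
// registers are configured before the conv runs and cleared again afterwards.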
if (param_.relu.enabled) {
inplace_.relu_enable = param_.relu.enabled;
if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
activeParamterArgs.type = TYPE_LEAK_RELU;
activeParamterArgs.leaky_relu_factor =
fp32_2_fp16(param_.relu.leaky_relu_factor);
config_activation(activeParamterArgs);
}
}
std::vector<BasicConvParam*>& params = param_.splitParams();
......@@ -104,9 +245,16 @@ class ConvPE : public PE {
ret |= compute_fpga_conv_basic(conv_param->args);
}
if (param_.relu.enabled) {
if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
activeParamterArgs.type = TYPE_LEAK_RELU;
activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
config_activation(activeParamterArgs);
}
}
size_t size = params.size();
......@@ -127,11 +275,13 @@ class ConvPE : public PE {
ConvParam& param() { return param_; }
private:
bool use_cpu_ = false;
ConvParam param_;
ConcatPE concatPE_;
ElementwiseAddPE addPE_;
int split_axis = 0;
InplaceArgs inplace_ = {0};
ActiveParamterArgs activeParamterArgs;
};
} // namespace zynqmp
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#ifndef conv_process_hpp
#define conv_process_hpp
#include <string.h>
#include <cmath>
#include <vector>
......@@ -45,7 +48,9 @@ inline int get_split_num(Tensor* filter) {
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_split_num(num, div_capacity);
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num = align_to_x(num, filter_num_alignment);
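// Split on the aligned filter count, so padded filters are included in the
// per-division capacity check.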
return filter::calc_split_num(aligned_num, div_capacity);
}
inline void fill_scale_bias_const(ConvParam* param_) {
......@@ -126,41 +131,87 @@ inline void format_scale_bias(Tensor* scale,
bias_data = bias->data<float>();
}
int channel = filter->shape().num();
Shape bias_scale_shape(N, {2 * channel});
int scale_bias_len = align_to_x(channel / group, BS_NUM_ALIGNMENT) * group;
int c_per_group = channel / group;
int aligned_c_per_group = align_to_x(channel / group, BS_NUM_ALIGNMENT);
Shape bias_scale_shape(N, {2 * scale_bias_len});
float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape);
for (int i = 0; i < channel; i++) {
float scale_value = scale_data == nullptr ? 1 : scale_data[i];
float bias_value = bias_data == nullptr ? 0 : bias_data[i];
bs_data[i + channel] = scale_value;
bs_data[i] = bias_value;
float* temp_data =
reinterpret_cast<float*>(fpga_malloc(2 * scale_bias_len * sizeof(float)));
memset(temp_data, 0, 2 * scale_bias_len * sizeof(float));
std::vector<float> scales;
if (scale_data != nullptr) {
for (int i = 0; i < channel; ++i) {
scales.push_back(scale_data[i]);
}
for (int i = 0; i < scale_bias_len - channel; i++) {
scales.push_back(1);
}
} else {
for (int i = 0; i < scale_bias_len; i++) {
scales.push_back(1);
}
}
int element_num_per_div = get_filter_num_per_div(filter, group);
bias_scale::format_bias_scale_array(&bs_data, element_num_per_div, channel);
for (int i = 0; i < scale_bias_len; ++i) {
temp_data[i + scale_bias_len] = 1;
temp_data[i] = 0;
}
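// Scatter the per-channel values into group-aligned slots: biases occupy the
// first scale_bias_len entries, scales the second.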
for (int g = 0; g < group; g++) {
for (int c = 0; c < c_per_group; c++) {
int src_index = g * c_per_group + c;
int dst_index = g * aligned_c_per_group + c;
float scale_value = scales[src_index];
float bias_value = bias_data == nullptr ? 0 : bias_data[src_index];
temp_data[dst_index + scale_bias_len] = scale_value;
temp_data[dst_index] = bias_value;
}
}
// int element_num_per_div = get_filter_num_per_div(filter, group);
// int scale_bias_len = align_to_x(channel / group, 8) * group;
bias_scale::format_bias_scale_array(
&temp_data, scale_bias_len / group, scale_bias_len);
memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float));
}
inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group) {
inline void format_filter(Tensor* filter,
Tensor* quantized_filter,
int group,
std::vector<float>& scales) { // NOLINT
float max_value = find_max(*filter);
Shape& filter_shape = filter->shape();
int mem_size;
std::vector<float> max_values;
int8_t* quantized_data = filter::format_filter(filter->data<float>(),
mem_size,
filter_shape.num(),
filter_shape.channel(),
filter_shape.height(),
filter_shape.width(),
group,
max_value,
max_values);
float mem_factor = mem_size * 1.0f / filter->shape().numel();
quantized_filter->setMemScale(mem_factor);
quantized_filter->setAligned(true);
quantized_filter->mutableData<int8_t>(INT8, filter->shape());
int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
quantized_filter->scale()[0] = max_value / 127.0f;
quantized_filter->scale()[1] = 127.0f / max_value;
auto memory_size = filter->shape().memorySize(sizeof(float));
auto new_data = reinterpret_cast<float*>(fpga_malloc(memory_size));
memcpy(new_data, filter->data<float>(), memory_size);
size_t mem_size = filter::format_filter(&new_data,
filter_shape.num(),
filter_shape.channel(),
filter_shape.height(),
filter_shape.width(),
group,
max_value);
int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
memcpy(src, new_data, mem_size);
fpga_free(new_data);
memcpy(src, quantized_data, mem_size);
quantized_filter->flush();
for (size_t i = 0; i < max_values.size(); i++) {
scales.push_back(max_values[i] / max_value);
}
}
inline void format_dw_filter(Tensor* filter,
......@@ -207,10 +258,20 @@ inline void split_filter_num(const ConvParam& c_param) {
Tensor* out = param.output;
Tensor* filter = param.filter;
auto channel = out->shape().channel();
int split_num = param.groups == 1 ? get_split_num(param.filter) : 1;
int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
auto chw = filter->shape().channel() * filter->shape().height() *
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num =
align_to_x(num / param.groups, filter_num_alignment) * param.groups;
// int aligned_num = align_to_x(num / param.groups ,FILTER_NUM_ALIGNMENT) *
// param.groups;
split_num = filter::calc_split_num(aligned_num, div_capacity);
Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
......@@ -251,9 +312,17 @@ inline void split_filter_num(const ConvParam& c_param) {
filter->data<float>() + i * filter_num_per_div * filter_hwc,
filter_num * filter_hwc * sizeof(float));
new_filter.flush();
conv_param->filter.mutableData<float>(FP32, f_shape);
format_filter(&new_filter, &(conv_param->filter), param.groups);
if (param.groups != 1) {
int mem_factor =
32 / filter_num_per_div; // TODO(chonwhite): change 32 to param;
conv_param->filter.setMemScale(mem_factor);
}
std::vector<float> v; // TODO(chonwhite): change local variable name
format_filter(&new_filter, &(conv_param->filter), param.groups, v);
conv_param->filter.setDataType(INT8);
int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
Tensor scale;
......@@ -265,7 +334,7 @@ inline void split_filter_num(const ConvParam& c_param) {
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
}
for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
......@@ -276,11 +345,14 @@ inline void split_filter_num(const ConvParam& c_param) {
&conv_param->filter,
&conv_param->scaleBias,
param.groups);
conv_param->scaleBias.flush();
float* bs_data = conv_param->scaleBias.data<float>();
args.group_num = param.groups;
args.relu_enabled = param.relu.enabled;
args.sb_address = conv_param->scaleBias.data<float>();
args.sb_address = bs_data;
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
args.kernel.height = new_filter.shape().height();
......@@ -294,17 +366,13 @@ inline void split_filter_num(const ConvParam& c_param) {
args.image.channels = input->shape().channel();
args.image.width = input->shape().width();
args.image.height = input->shape().height();
auto paddings = *param.padding;
args.image.pad_width = param.paddings[2];
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
// dilations[0] = dilations[1] ;
args.dilation = param.dilations[0];
args.output.address = out_address;
args.output.scale_address = out_scale_address;
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
if (!pad_equal) {
LOG(FATAL) << "This pad is not supported! " << paddings[0] << ", "
<< paddings[1] << ", " << paddings[2] << ", " << paddings[3];
}
param.splitParams().push_back(conv_param);
}
}
......@@ -317,7 +385,7 @@ inline void split_channel(const ConvParam& c_param) {
int num = ceil(input->shape().channel() * 1.0f / 2047);
int channel = input->shape().channel() / num;
std::cout << "channel::" << channel << "num::" << num << std::endl;
Shape bs_shape(N, {channel});
for (int i = 0; i < num; i++) {
......@@ -331,6 +399,7 @@ inline void split_channel(const ConvParam& c_param) {
// filter transformation;
Shape f_shape(NCHW, {param.filter->shape().num(), channel, 1, 1});
Tensor new_filter;
float* dst = new_filter.mutableData<float>(FP32, f_shape);
......@@ -341,7 +410,8 @@ inline void split_channel(const ConvParam& c_param) {
src += param.filter->shape().channel();
}
new_filter.flush();
format_filter(&new_filter, &(conv_param->filter), param.groups);
std::vector<float> scales;
format_filter(&new_filter, &(conv_param->filter), param.groups, scales);
Tensor bias;
Tensor scale;
......@@ -354,6 +424,7 @@ inline void split_channel(const ConvParam& c_param) {
}
scale.flush();
bias.flush();
// Shape sb_shape(N, {2 * channel});
format_scale_bias(&scale,
&bias,
&conv_param->filter,
......@@ -379,18 +450,12 @@ inline void split_channel(const ConvParam& c_param) {
args.image.channels = conv_param->input.shape().channel();
args.image.width = conv_param->input.shape().width();
args.image.height = conv_param->input.shape().height();
auto paddings = *param.paddings;
args.image.pad_width = paddings[2];
args.image.pad_height = paddings[0];
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
// dilations[0] = dilations[1]
args.dilation = param.dilations[0];
args.output.address = conv_param->output.mutableData<void>();
args.output.scale_address = conv_param->output.scale();
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
if (!pad_equal) {
LOG(FATAL) << "This pad is not supported! " << paddings[0] << ", "
<< paddings[1] << ", " << paddings[2] << ", " << paddings[3];
}
param.splitParams().push_back(conv_param);
}
}
......@@ -418,11 +483,11 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
}
size_t size = params.size();
if (ret == 0 && size > 1) {
// Tensor* output = conv_params.output;
Tensor& img = params[0]->output;
for (int i = 0; i < 1; i++) {
for (int i = 0; i < img.shape().numel(); i++) {
float value = half_to_float(img.data<float16>()[i]);
std::cout << "value:" << value << std::endl;
}
}
}
......@@ -431,3 +496,5 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
} // namespace zynqmp
} // namespace paddle
#endif /* conv_process_hpp */
......@@ -14,8 +14,6 @@ limitations under the License. */
#include "lite/backends/fpga/KD/pes/crop_pe.hpp"
#include <vector>
namespace paddle {
namespace zynqmp {
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <cstring>
#include <vector>
......
......@@ -37,18 +37,37 @@ class DepthwiseConvPE : public PE {
Tensor* output = param.output;
int channel = output->shape().channel();
float* new_scale_data = param_.scale()->data<float>();
float* new_bias_data = param_.bias()->data<float>();
float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
for (int i = 0; i < channel; i++) {
b_data[i] = float_to_half(new_bias_data[i]);
if (param_.bias()->dataType() == FP32) {
float* new_bias_data = param_.bias()->data<float>();
// convert the bias from float to float16
for (int i = 0; i < channel; i++) {
b_data[i] = float_to_half(new_bias_data[i]);
}
bias_.flush();
} else {
float16* new_bias_data = param_.bias()->data<float16>();
memcpy(b_data, new_bias_data, channel * sizeof(float16));
bias_.flush();
}
bias_.flush();
Tensor* quantized_filter = param.quantizedFilter();
quantized_filter->mutableData<float16>(FP16, param.filter->shape());
format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
if (param_.scale()->dataType() == FP32) {
float* new_scale_data = param_.scale()->data<float>();
Tensor* quantized_filter = param.quantizedFilter();
quantized_filter->mutableData<float16>(FP16, param.filter->shape());
format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
} else {
// used when the filter is all ones and the channel count is already aligned
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
// memcpy(filter_data, scale_data, channel * sizeof(float16));
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
param.quantizedFilter()->flush();
}
DWconvArgs args = {0};
args.bias_address = b_data;
......@@ -61,21 +80,14 @@ class DepthwiseConvPE : public PE {
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
auto paddings = *param.paddings;
args.image.pad_width = param.paddings[2];
args.image.pad_height = param.paddings[0];
args.image.pad_width = param.paddings[0];
args.image.pad_height = param.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->data<void>();
args.output.scale_address = output->scale();
args.out_width = param.output->shape().width();
args.out_height = param.output->shape().height();
args.sub_conv_num = 1;
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
if (!pad_equal) {
LOG(FATAL) << "This pad is not supported! " << paddings[0] << ", "
<< paddings[1] << ", " << paddings[2] << ", " << paddings[3];
}
param.args = args;
inplace_.relu_enable = param_.relu.enabled;
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
namespace paddle {
namespace zynqmp {
class ElementwiseMulPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
output->setDataLocation(Device);
return true;
}
void apply() {
Tensor* input = param_.inputs[0];
Tensor* output = param_.output;
int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
// int wc_aligned = / 32 * 32;
Shape s(N, {wc_aligned});
float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
memset(bias_data, 0, wc_aligned * sizeof(float16));
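// Map elementwise multiply onto the FPGA scale engine: the whole input is
// treated as one 1x1 pixel with wc_aligned channels, inputs[1] supplies the
// per-channel scale, and the bias vector is zeroed.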
ScaleArgs& args = args_;
args.scale_address = param_.inputs[1]->data<void>();
args.bias_address = bias_tensor.data<void>();
args.wc_alignment = wc_aligned;
args.channel_alignment = wc_aligned;
args.image.address = input->data<void>();
args.image.scale_address = input->scale();
args.image.channels = wc_aligned;
args.image.height = 1;
args.image.width = 1;
args.image.pad_width = 0;
args.image.pad_height = 0;
args.output.address = output->data<void>();
args.output.scale_address = output->scale();
}
void updateInput(Tensor* t, int index) {
if (index == 0) {
args_.scale_address = t->data<void>(); // replace inputs?
}
}
bool dispatch() {
return compute_fpga_scale(args_) == 0;
}
ElementwiseMulParam& param() { return param_; }
private:
ElementwiseMulParam param_;
ScaleArgs args_ = {0};
Tensor bias_tensor;
};
} // namespace zynqmp
} // namespace paddle
......@@ -37,7 +37,10 @@ class FullyConnectedPE : public PE {
ConvParam& convParam_ = convPE_.param();
Tensor* input = param_.input;
convParam_.input = param_.input;
num_ = param_.input->shape().num();
convParam_.output = param_.output;
convParam_.groups = 1;
convParam_.strides = {1, 1};
convParam_.paddings = {0, 0};
......@@ -63,7 +66,6 @@ class FullyConnectedPE : public PE {
new_filter_data[i * chw + j] = scale;
}
}
conv_filter->flush();
convParam_.filter = conv_filter;
......@@ -89,6 +91,8 @@ class FullyConnectedPE : public PE {
private:
FullyConnectedParam param_;
ConvPE convPE_;
Tensor tempOut_;
int num_ = 1;
};
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/backends/arm/math/sgemm.h"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp"
#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp"
#include "lite/backends/fpga/KD/pes/relu_pe.hpp"
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace zynqmp {
struct GRUTensors {
Tensor* gate;
Tensor* pre_output;
Tensor* output;
Tensor* reset_output;
};
class GRUPE : public PE {
public:
bool init() {
// Tensor* output = param_.output;
// output->setAligned(true);
// output->setDataLocation(Device);
return true;
}
void apply() {
auto hidden = param_.hidden;
// auto hidden_dims = hidden->dims();
int frame_size = hidden->shape().channel();
zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
float16* prev_hidden_data =
prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
// set previous hidden data to 0;
memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
// copy 2/3 weight from param.weight;
zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memset(weight_data, 0, weight_shape.numel() * sizeof(float));
weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memcpy(weight_data,
param_.weight->data<float>(),
weight_shape.numel() * sizeof(float));
Shape gate_shape(zynqmp::NC, {1, frame_size * 2});
gate_ping_.mutableData<void>(FP32, gate_shape);
gate_pong_.mutableData<void>(FP16, gate_shape);
zynqmp::FullyConnectedParam& pre_out_param = pre_out_pe_.param();
pre_out_param.input = &prev_hidden_;
pre_out_param.output = &gate_pong_;
pre_out_param.filter = &weight_;
pre_out_param.bias = &gate_ping_;
pre_out_pe_.init();
pre_out_pe_.apply();
// // ============= C
// ElementwiseAddParam& bias_add_param = bias_ew_pe_.param();
// bias_add_param.inputs = {&pre_output_, &pre_input_};
// bias_add_param.output = &pre_input_;
// bias_ew_pe_.init();
// bias_ew_pe_.apply();
// // ====================
// Shape state_weight_shape(NC,{frame_size, frame_size});
// float* state_weight_data = state_weight_.mutableData<float>(FP32,
// state_weight_shape);
// memcpy(state_weight_data, weight_data + 2 * frame_size * frame_size,
// state_weight_shape.numel() * sizeof(float));
// FullyConnectedParam& reset_out_param = reset_out_pe_.param();
// reset_out_param.input = &prev_hidden;
// reset_out_param.output = &gate_ping;
// reset_out_param.filter = &state_weight_;
// // ============== unit reset;
// update_gate_.mutableData<void>(FP16, pre_input_shape);
// InputParam& relu_param = update_relu_pe_.param();
// relu_param.input = &tempTensor;
// relu_param.output = &update_gate_;
// update_relu_pe_.init();
// update_relu_pe_.apply();
reset_gate_.mutableData<void>(FP16, hidden_shape);
prev_hidden_.mutableData<void>(FP16, hidden_shape);
reset_hidden_.mutableData<void>(FP16, hidden_shape);
// InputParam& reset_param = reset_relu_pe_.param();
// reset_param.input = &tempTensor;
// reset_param.output = &reset_gate_;
// reset_relu_pe_.init();
// reset_relu_pe_.apply();
// float16* prev_data = prev_.mutableData<float16>(FP16, pre_input_shape);
// memset(prev_data, 0, (pre_input_shape.numel() + 32) * sizeof(float16));
// // TODO
// reset_hidden_prev_.mutableData<float16>(FP16, pre_input_shape);
ElementwiseMulParam& mul_param = mul_pe_.param();
mul_param.inputs = {&reset_gate_, &prev_hidden_};
mul_param.output = &reset_hidden_;
mul_pe_.init();
mul_pe_.apply();
// ==============
}
bool dispatch() { return true; }
void gru_unit_reset_act(const lite_api::ActivationType active_gate,
GRUTensors& value, // NOLINT
int frame_size,
int batch_size) {
int stride_update = 3 * frame_size;
int stride_cell_state = 3 * frame_size;
int stride_hidden_prev = frame_size;
int stride_hidden = frame_size;
// Tensor* gate = value.gate;
// value.gate->saveToFile("value_input.txt");
float* update_gate_data = gate_ping_.data<float>();
float* reset_gate_data = update_gate_data + frame_size;
for (int b = 0; b < batch_size; b++) {
// memcpy(tempTensor.data<void>(), reset_gate_data, gate->shape().numel()
// * sizeof(float));
// tempTensor.flush();
Tensor tmp;
Shape s(NC, {1, frame_size});
float* tmp_data = tmp.mutableData<float>(FP32, s);
for (int i = 0; i < frame_size; i++) {
// f(x) = x / (1 + abs(x))?
update_gate_data[i] =
lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
update_gate_data[i]);
reset_gate_data[i] =
lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
reset_gate_data[i]);
}
memcpy(tmp_data, reset_gate_data, frame_size * sizeof(float));
tmp.flush();
reset_gate_.copyFrom(&tmp);
// reset_gate_.copyFrom(&tempTensor);
Tensor* hidden_prev = value.pre_output;
if (hidden_prev) {
// memcpy(prev_data, )
// TODO(chonwhite): change to pre_out;
prev_hidden_.copyFrom(value.pre_output);
prev_hidden_.saveToFile("prev_.txt");
}
// // 4.0 reset_date * hidden_prev;
// // reset_hidden_prev[i] = reset_gate[i] * prev;
mul_pe_.dispatch();
reset_hidden_.saveToFile("reset_hidden_.txt");
update_gate_data += stride_update;
reset_gate_data += stride_update;
// reset_hidden_prev += stride_hidden;// TODO
}
}
void gru_unit_out_act(const lite_api::ActivationType active_node,
bool origin_mode,
GRUTensors& value, // NOLINT
int frame_size,
int batch_size) {
// int stride_update = 3 * frame_size;
// int stride_cell_state = 3 * frame_size;
// int stride_hidden_prev = frame_size;
// int stride_hidden = frame_size;
// Tensor* hidden = value.output_value;
// float* hidden_prev = nullptr;
// if (hidden) {
// hidden_prev = hidden->data<float>();
// }
// float* cell_state = value.gate->data<float>() + 2 * frame_size;
// float* updata_gate = value.gate->data<float>();
// // float* reset_gate_data = update_gate_data + frame_size;
// float prev = 0.0f;
// for (int b = 0; b < batch_size; ++b) {
// if (origin_mode) {
// // for (int i = 0; i < frame_size; i++) {
// // float prev = 0;
// // if (hidden_prev) {
// // prev = hidden_prev[i];
// // }
// // cell_state[i] =
// lite::arm::math::active_f32<kSigmoid>(cell_state[i]);
// // hidden[i] =
// // cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] *
// prev;
// // }
// } else {
// for (int i = 0; i < frame_size; ++i) {
// cell_state[i] =
// lite::arm::math::active_f32<lite_api::ActivationType::kRelu>(cell_state[i]);
// if (hidden_prev) {
// prev = hidden_prev[i];
// }
// float hidden_value =
// prev * (1.f - updata_gate[i]) + updata_gate[i] * cell_state[i];
// hidden_prev[i] = hidden_value;
// std::cout << "hidden_value::" << hidden_value << std::endl;
// }
// }
// updata_gate += stride_update;
// cell_state += stride_cell_state;
// hidden_prev += frame_size;
// }
}
void copy_input(GRUTensors& value) { // NOLINT
float max = find_max(*(value.gate));
gate_ping_.mutableData<void>(FP32, value.gate->shape());
gate_ping_.copyFrom(value.gate);
// update input pointer?
// gate_.readFromFile("input/in.txt");
// // pre_input_.saveToFile("pppp_in.txt");
// gate_.scale()[0] = max / 127;
// gate_.scale()[1] = 127 / max;
// gate_.printScale("pre_input_");
// gate_.saveToFile("pre_input_.txt");
// pre_out_pe_.dispatch();
// pre_output_.saveToFile("pp_out.txt");
}
void GRUCompute(GRUTensors& value, // NOLINT
int frame_size,
int batch_size,
const lite_api::ActivationType active_node,
const lite_api::ActivationType active_gate,
bool origin_mode) {
copy_input(value);
if (value.pre_output) {
// copy by batch;
pre_out_pe_.dispatch();
gate_ping_.copyFrom(&gate_pong_);
}
gru_unit_reset_act(active_gate, value, frame_size, batch_size);
// if (value.pre_output) {
// // state weight;
// reset_out_pe_.dispatch();
// }
// gru_unit_out_act(active_node, origin_mode, value, frame_size,
// batch_size);
}
GRUParam& param() { return param_; }
// Tensor* preOutput() {
// return &pre_output_;
// }
// Tensor* gate() {
// return &gate_;
// }
Tensor* updateGate() { return &update_gate_; }
Tensor* resetGate() { return &reset_gate_; }
private:
GRUParam param_;
zynqmp::Tensor gate_ping_;
zynqmp::Tensor gate_pong_;
zynqmp::Tensor bias_;
zynqmp::Tensor weight_;
zynqmp::Tensor state_weight_;
// =================================
zynqmp::Tensor update_gate_;
zynqmp::Tensor reset_gate_;
zynqmp::Tensor cell_state_;
zynqmp::Tensor prev_hidden_;
zynqmp::Tensor reset_hidden_;
Tensor tempTensor;
// =================================
ReluPE update_relu_pe_;
ReluPE reset_relu_pe_;
zynqmp::ElementwiseMulPE mul_pe_;
zynqmp::FullyConnectedPE pre_out_pe_;
zynqmp::FullyConnectedPE reset_out_pe_;
zynqmp::ElementwiseAddPE bias_ew_pe_;
};
} // namespace zynqmp
} // namespace paddle
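// For reference (added for illustration, not part of the patch): the GRU cell
// this PE targets, matching the fused [update | reset | cell] gate layout and
// the 3 * frame_size strides used in gru_unit_reset_act above:
//   u_t = sigmoid(W_u x_t + U_u h_{t-1})        update gate
//   r_t = sigmoid(W_r x_t + U_r h_{t-1})        reset gate
//   c_t = act(W_c x_t + U_c (r_t * h_{t-1}))    cell / candidate state
//   h_t = c_t * (1 - u_t) + u_t * h_{t-1}       when origin_mode == true
//   h_t = h_{t-1} * (1 - u_t) + u_t * c_t       when origin_mode == false
// where * on vectors is elementwise; this is the computation sketched in the
// commented-out gru_unit_out_act body.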
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/backends/arm/math/gru_utils.h"
namespace paddle {
namespace lite {
namespace fpga {}
}  // namespace lite
}  // namespace paddle
......@@ -25,6 +25,8 @@ class OutputPE : public PE {
bool init() {
Tensor* output = param_.output;
output->setAligned(false);
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
return true;
}
......@@ -41,6 +43,15 @@ class OutputPE : public PE {
} else {
output->copyFrom(input);
}
//
output->syncToCPU();
if (DLEngine::get_instance().out_data == nullptr) {
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
}
memcpy(DLEngine::get_instance().out_data,
output->data<void>(),
output->shape().numel() * sizeof(float));
return true;
}
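// Host-side view (a sketch under assumptions; only the out_data member is
// visible in this patch): after dispatch(), the final result has been copied
// into a CPU buffer owned by DLEngine, sized output->shape().numel() floats
// and reused across inferences:
//   float* result = DLEngine::get_instance().out_data;
//   // read result[0] .. result[numel - 1]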
......
......@@ -35,24 +35,25 @@ class PoolingPE : public PE {
Tensor* input = param_.input;
Tensor* output = param_.output;
uint32_t k_height = param_.kernelSize[0];
uint32_t k_width = param_.kernelSize[1];
if (param_.globalPooling) {
k_width = input->shape().width();
k_height = input->shape().height();
param_.kernelSize[0] = k_height;
param_.kernelSize[1] = k_width;
}
PoolingArgs args = {0};
args.mode = param_.type;
auto paddings = *param_.paddings;
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
args.image.address = input->data<float16>();
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
args.image.pad_height = paddings[0];
args.image.pad_width = paddings[2];
args.image.scale_address = input->scale();
args.output.address = output->mutableData<float16>();
args.output.scale_address = output->scale();
......@@ -66,6 +67,10 @@ class PoolingPE : public PE {
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 7 || k_height > 7);
// use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
// &&
// (k_width > 255 || k_height > 255);
use_cpu_ = param_.type == AVERAGE;
}
void compute() {
......@@ -74,16 +79,16 @@ class PoolingPE : public PE {
input->syncToCPU();
Tensor float_input;
// Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
auto paddings = *param_.paddings;
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
int image_pad_h = paddings[0];
int image_pad_w = paddings[2];
int kernel_height = param_.kernelSize[1];
int kernel_width = param_.kernelSize[0];
int kernel_step_h = param_.strides[0];
......@@ -129,7 +134,7 @@ class PoolingPE : public PE {
output->flush();
}
void cpu_compute() {
void cpu_compute1() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
......@@ -138,7 +143,6 @@ class PoolingPE : public PE {
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0;
......@@ -154,13 +158,43 @@ class PoolingPE : public PE {
}
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
std::cout << "pool scale:" << scale_max / 127.0f << std::endl;
output->flush();
}
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
Tensor float_input;
float* float_input_data =
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0;
for (int i = 0; i < output->shape().channel(); i++) {
float sum = 0;
for (int j = 0; j < kernel_hw; j++) {
sum += float_input_data[i * kernel_hw + j];
}
float value = sum / kernel_hw;
data_out[i] = float_to_half(value);
scale_max = std::max(scale_max, std::abs(value));
}
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
}
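// Note (inferred from the code above, not stated in the patch): KD tensors
// carry a two-element scale pair, with m = max |value| over the output:
//   scale()[0] = m / 127.0f   // dequantization step, int8 -> float
//   scale()[1] = 127.0f / m   // quantization step, float -> int8 range
// Both CPU pooling paths recompute this pair after writing the output, just
// as the FPGA kernels do through args.output.scale_address.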
bool dispatch() {
if (use_cpu_) {
// cpu_compute();
compute();
// exit(-1);
return true;
}
param_.input->syncToDevice();
......
......@@ -253,9 +253,8 @@ bool PriorBoxPE::dispatch() {
if (cachedBoxes_ == nullptr) {
cachedBoxes_ = new Tensor();
cachedVariances_ = new Tensor();
cachedBoxes_->mutableData<float>(FP32, param_.outputBoxes->shape());
cachedVariances_->mutableData<float>(FP32, param_.outputVariances->shape());
cachedBoxes_->setDataLocation(CPU);
cachedVariances_->setDataLocation(CPU);
compute_prior_box();
......
......@@ -14,11 +14,16 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
#include "lite/backends/fpga/KD/tensor.hpp"
namespace paddle {
namespace zynqmp {
class ScalePE : public PE {
public:
inline int gcd(int a, int b) {
......@@ -42,6 +47,8 @@ class ScalePE : public PE {
Tensor* input = param_.input;
Tensor* output = param_.output;
Shape& input_shape = input->shape();
DepthwiseConvParam& dw_param = dw_pe_.param();
int channel = input_shape.channel();
int repeat = 1;
int alignment = 16;
......@@ -51,70 +58,154 @@ class ScalePE : public PE {
int c_lcm = lcm(channel, alignment);
repeat = c_lcm / (channel);
}
// FPGA limits: if H > 2047, W > 1023, or W*C > 65536, the CPU implementation must be used.
Shape shape(N, {channel * repeat});
float* filter_data = filter.mutableData<float>(FP32, shape);
std::fill_n(filter_data, input->shape().channel(), 1.0f);
Tensor* scale = dw_param.scale();
float16* scale_data = scale->mutableData<float16>(FP16, shape);
// memcpy(scale_data, param_.scale->data<float>(), input->shape().channel()
// * sizeof(float));
Tensor* bias = dw_param.bias();
float16* bias_data = bias->mutableData<float16>(FP16, shape);
std::fill_n(bias_data, input->shape().channel(), 0);
    if (param_.scale->dataType() == FP32) {
      if (param_.bias != nullptr) {
        float* bias_data_float = param_.bias->data<float>();
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            float16 value = float_to_half(bias_data_float[j]);
            bias_data[i * length + j] = value;
          }
        }
      } else {
        float16 zero = float_to_half(0.0f);
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = zero;
          }
        }
      }
      float* scale_data_float = param_.scale->data<float>();
      for (int i = 0; i < repeat; i++) {
        for (int j = 0; j < length; j++) {
          float16 value = float_to_half(scale_data_float[j]);
          scale_data[i * length + j] = value;
        }
      }
    } else {
      if (param_.bias != nullptr) {
        float16* bias_data_float = param_.bias->data<float16>();
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = bias_data_float[j];
          }
        }
      } else {
        float16 zero = float_to_half(0.0f);
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = zero;
          }
        }
      }
      float16* scale_data_float = param_.scale->data<float16>();
      for (int i = 0; i < repeat; i++) {
        for (int j = 0; j < length; j++) {
          scale_data[i * length + j] = scale_data_float[j];
        }
      }
    }
// if (param_.bias != nullptr) {
// memcpy(bias_data, param_.bias->data<float>(), input->shape().channel()
// * sizeof(float));
// }
dw_param.input = param_.input;
dw_param.output = param_.output;
dw_param.filter = &filter;
dw_param.strides = {1, 1};
dw_param.paddings = {0, 0};
dw_param.kernelSize = {1, 1};
dw_param.dilations = {1, 1};
dw_pe_.init();
dw_pe_.apply();
}
  void cpu_compute() {
    Tensor* input = param_.input;
    Tensor* output = param_.output;
    Tensor float_input;
    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
    input->syncToCPU();
    float_input.copyFrom(input);

    float16* data_out = output->data<float16>();
    float* scale_data = param_.scale->data<float>();
    int wh = input->shape().width() * input->shape().height();
    float16* in_data = input->data<float16>();

    float max = 0;
    for (int i = 0; i < wh; i++) {
      for (int c = 0; c < input->shape().channel(); c++) {
        int index = i * input->shape().channel() + c;
        float value = half_to_float(in_data[index]) * scale_data[c];
        data_out[index] = float_to_half(value);
        if (value < 0) {
          value = -value;
        }
        if (value > max) {
          max = value;
        }
      }
    }
    output->flush();
    output->scale()[0] = max / 127.0f;
    output->scale()[1] = 127.0f / max;
  }
bool dispatch() {
// cpu_compute();
// return true;
if (param_.scale->dataType() == FP16) {
DepthwiseConvParam& dw_param = dw_pe_.param();
memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
param_.scale->data<float16>(),
param_.scale->shape().numel() * sizeof(float16));
dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
dw_param.quantizedFilter()->flush();
// apply();
}
param_.input->syncToDevice();
return dw_pe_.dispatch();
}
ScaleParam& param() { return param_; }
private:
ScaleParam param_;
Tensor filter;
DepthwiseConvPE dw_pe_;
};
} // namespace zynqmp
} // namespace paddle
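// Reference semantics of ScalePE (a sketch for illustration, not FPGA code):
// a per-channel affine transform. This is why it can be lowered to the 1x1
// depthwise convolution above, with an all-ones filter and the per-channel
// scale/bias carrying the actual weights.
inline void scale_reference(const float* in, const float* scale,
                            const float* bias, float* out, int hw,
                            int channel) {
  for (int i = 0; i < hw; ++i) {
    for (int c = 0; c < channel; ++c) {
      const int idx = i * channel + c;  // NHWC layout, as elsewhere in KD
      out[idx] = in[idx] * scale[c] + bias[c];
    }
  }
}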
......@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle {
namespace zynqmp {
static struct None none_;
static struct NCHW nchw_;
static struct NHWC nhwc_;
static struct NC nc_;
......@@ -82,6 +83,9 @@ class Shape {
void setLayoutType(LayoutType layout) {
this->layoutType_ = layout;
switch (layout) {
case None:
layout_ = &none_;
break;
case NCHW:
layout_ = &nchw_;
break;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include <unistd.h>
#include <algorithm>
#include <cmath>
#include <cstring>
......@@ -24,13 +25,10 @@ limitations under the License. */
#include <string>
#include <vector>
// #include "lite/core/tensor.h"
#include "lite/backends/fpga/KD/dl_engine.hpp"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/shape.hpp"
// #include "lite/backends/fpga/KD/types.hpp"
namespace paddle {
namespace zynqmp {
......@@ -117,7 +115,8 @@ class Tensor {
template <typename Dtype>
Dtype* mutableData() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
if (placeHolder_ != nullptr) {
if (memorySize > placeHolder_->memorySize()) {
placeHolder_.reset(new PlaceHolder(memorySize));
......@@ -241,6 +240,10 @@ class Tensor {
}
}
void setMemScale(float scale_factor) {
this->mem_scale_factor_ = scale_factor;
}
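  // A hypothetical use of setMemScale (inferred from mutableData(), flush()
  // and invalidate() above, which all multiply memorySize by this factor):
  //   Tensor t;
  //   t.setMemScale(2.0f);  // reserve 2x so in-place alignment padding fits
  //   t.mutableData<float16>(FP16, shape);
  // The factor must be set before the first allocation; the 2.0f here is
  // illustrative only.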
void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
......@@ -276,9 +279,11 @@ class Tensor {
.height = 1,
.pad_width = 0u,
.pad_height = 0u};
ImageOutputArgs output = {
.address = data<void>(), .scale_address = scale(),
};
args.output = output;
src->syncToDevice();
size_t aligned_remainder = src->shape().numel() % 16;
if (aligned_remainder > 0) {
......@@ -294,10 +299,16 @@ class Tensor {
this->invalidate();
}
void flush() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
fpga_flush(placeHolder_->data(), memorySize);
}
void invalidate() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
fpga_invalidate(placeHolder_->data(), memorySize);
}
void sync() {
......@@ -337,6 +348,18 @@ class Tensor {
if (placeHolder_ == nullptr) {
return;
}
std::cout << scale()[0] << " , " << scale()[1] << std::endl;
}
void printScale(std::string type) {
std::cout << type << " : "
<< std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width())
<< std::endl;
printScale();
}
std::string dimsFileName() {
......@@ -358,33 +381,14 @@ class Tensor {
saveToFile(path);
}
void saveToFile(std::string path) {
syncToCPU();
invalidate();
std::ofstream ofs;
static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path;
counter++;
std::cout << "======== saving file:" << npath << " ============\n";
save_file_with_name(npath);
}
......@@ -392,14 +396,16 @@ class Tensor {
// return;
invalidate();
std::ofstream ofs;
ofs.open(path);
for (int i = 0; i < shape_->numel(); i++) {
float value = 0;
if (dataType_ == FP32) {
value = data<float>()[i];
} else {
} else if (dataType_ == FP16) {
value = half_to_float(data<float16>()[i]);
} else {
value = data<int8_t>()[i];
}
ofs << value << std::endl;
}
......@@ -415,18 +421,49 @@ class Tensor {
int num = shape_->numel();
invalidate();
float max = 0.0f;
if (dataType_ == FP16) {
float16* data = mutableData<float16>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = float_to_half(value);
}
} else {
float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = value;
}
}
flush();
placeHolder_->scale_[0] = max / 127.0f;
placeHolder_->scale_[1] = 127.0f / max;
}
friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) {
os << "tensor:"
<< "\n";
os << "dims: {";
for (int i = 0; i < tensor.shape().dimSize(); ++i) {
os << tensor.shape()[i] << " ";
}
os << "}\n";
for (int i = 0; i < tensor.shape().numel(); i++) {
float value = 0;
if (tensor.dataType() == FP32) {
value = tensor.data<float>()[i];
} else {
value = half_to_float(tensor.data<float16>()[i]);
}
os << value << " ";
}
os << "\n";
return os;
}
~Tensor() {
if (shape_ != nullptr) {
delete shape_;
......@@ -436,6 +473,7 @@ class Tensor {
private:
int offset = 0;
float mem_scale_factor_ = 1.0f;
std::shared_ptr<PlaceHolder> placeHolder_;
Shape* shape_ = nullptr;
DataType dataType_ = FP32;
......
......@@ -95,16 +95,14 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
dims_ = other.dims_;
target_ = other.target_;
lod_ = other.lod_;
  auto dt = zynq_tensor_->dataType();
  auto shape = other.zynq_tensor_->shape();
  Resize(other.dims());
  zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
  this->ZynqTensor()->copyFrom(other.ZynqTensor());
}
} // namespace lite
} // namespace paddle
......@@ -106,7 +106,7 @@ class TensorLite {
// For other devices, T and R may be the same type.
template <typename T, typename R = T>
const R *data() const {
return zynq_tensor_->data<R>();
return zynq_tensor_->data<R>() + offset_;
}
void Resize(const DDimLite &ddim) { dims_ = ddim; }
......@@ -125,6 +125,7 @@ class TensorLite {
bool persistable() const { return persistable_; }
void set_persistable(bool persistable) { persistable_ = persistable; }
// T is the data type and R is the return type
// For OpenCL, the return type can be cl::Buffer
// and the data type can be float/int8_t.
......@@ -147,6 +148,8 @@ class TensorLite {
size_t memory_size() const { return zynq_tensor_->memorySize(); }
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); }
// Other share data to this.
......@@ -157,8 +160,14 @@ class TensorLite {
template <typename T>
TensorLite Slice(int64_t begin, int64_t end) const;
template <typename T>
void Slice(TensorLite &dst, int64_t begin, int64_t end) const; // NOLINT
TargetType target() const { return target_; }
// template <typename T>
// TensorLite Slice(int64_t begin, int64_t end) const;
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
......@@ -173,16 +182,21 @@ class TensorLite {
private:
TargetType target_{TargetType::kHost};
// precision_ and persistable_ are only used for persistable vars.
// If your tensor wants to be saved and loaded correctly, you must
// set values of precision_ and persistable_ after updating it.
// If your tensor is just a temp tensor, such as activations,
// you can ignore these two attributes.
PrecisionType precision_{PrecisionType::kUnk};
bool persistable_{false};
DDimLite dims_;
std::shared_ptr<Buffer> buffer_;
LoD lod_;
size_t memory_size_{};
size_t offset_{0};
zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
template <typename T>
......@@ -197,6 +211,9 @@ R *TensorLite::mutable_data() {
}
zynqmp::LayoutType layout_type = zynqmp::NCHW;
switch (v.size()) {
case 0:
layout_type = zynqmp::None;
break;
case 1:
layout_type = zynqmp::N;
break;
......@@ -228,24 +245,63 @@ R *TensorLite::mutable_data(TargetType target) {
return mutable_data<T>();
}
template <typename T>
TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
  throw -1;  // this overload appears unsupported on FPGA; use Slice(dst, begin, end)
  CHECK_GE(begin, 0);
  CHECK_LE(end, dims_[0]);
  CHECK_LT(begin, end);
  if (dims_[0] == 1) {
    return *this;
  } else {
    int64_t base = numel() / dims_[0];
    TensorLite dst;
    dst.target_ = target_;
    auto dst_dims = dims_;
    dst_dims[0] = end - begin;
    dst.Resize(dst_dims);
    void *dst_data = dst.mutable_data<T>();
    T *src_data = const_cast<T *>(data<T>());
    // src_data is a T*, so the offset is already in units of T.
    memcpy(dst_data,
           src_data + static_cast<size_t>(begin * base),
           dst_dims.production() * sizeof(T));
    return dst;
  }
}
template <typename T>
void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {  // NOLINT
  CHECK_GE(begin, 0);
  CHECK_LE(end, dims_[0]);
  CHECK_LT(begin, end);
  dst.target_ = target_;
  auto dst_dims = dims_;
  dst_dims[0] = end - begin;
  dst.Resize(dst_dims);
  void *dst_data = dst.mutable_data<T>();
  int64_t base = numel() / dims_[0];
  T *src_data = const_cast<T *>(data<T>());
  // Copy rows [begin, end); base is the element count of one outer-dim row.
  memcpy(dst_data,
         src_data + static_cast<size_t>(begin * base),
         dst_dims.production() * sizeof(T));
}
template <typename TensorT>
bool TensorCompareWith(const TensorT &a, const TensorT &b) {
if (a.dims() != b.dims()) return false;
if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
return true;
}
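// Usage sketch for the copying Slice overload above (names illustrative):
//   TensorLite src, rows;
//   src.Resize(DDimLite(std::vector<int64_t>({4, 8})));
//   src.mutable_data<float>();
//   src.Slice<float>(rows, 1, 3);  // rows <- src rows [1, 3), dims {2, 8}
// Unlike the host tensor, this overload copies rather than sharing the
// buffer, since zynqmp tensors manage FPGA-visible memory.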
} // namespace lite
} // namespace paddle