未验证 提交 cde383dc 编写于 作者: C Chon 提交者: GitHub

[FPGA]merge Edgeboard internal codebase 1.5 into develop (#4392)


* classifications and yolov3 works
上级 af2770d3
......@@ -14,15 +14,19 @@
#pragma once
#include <map>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include "lite/core/program.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
#define FPGA_PRINT_TENSOR
// uncomment line below to print tensors;
// #define FPGA_PRINT_TENSOR
class Debugger {
public:
......@@ -37,25 +41,34 @@ class Debugger {
}
}
void setEnable(bool en) { enabled_ = en; }
private:
std::map<std::string, bool> op_config;
bool enabled_ = false;
std::unordered_map<std::string, bool> op_config;
std::unordered_map<std::string, float> tick_tock_map;
Debugger() {
op_config["concat"] = true;
op_config["pooling"] = true;
op_config["conv"] = true;
op_config["dropout"] = true;
op_config["dwconv"] = true;
op_config["ew_add"] = true;
op_config["ew_mul"] = true;
op_config["crop"] = true;
op_config["feed"] = true;
op_config["mul"] = true;
op_config["fetch"] = true;
op_config["fc"] = true;
op_config["mul"] = true;
op_config["boxes"] = true;
op_config["scores"] = true;
op_config["nms"] = true;
op_config["pb_boxes"] = true;
op_config["pb_variances"] = true;
// op_config["fc"] = true;
op_config["reshape"] = true;
op_config["softmax"] = true;
op_config["split"] = true;
}
};
......
......@@ -21,7 +21,7 @@ DLEngine::DLEngine() {
open_device();
int ret = get_device_info(info_);
filter::set_filter_capacity(info_.filter_cap);
filter::set_colunm(info_.colunm);
filter::set_colunm(info_.column);
}
} // namespace zynqmp
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -28,15 +29,13 @@ class DLEngine {
return s_instance;
}
DeviceInfo& deviceInfo();
DeviceInfoArgs& deviceInfo();
bool isZU3() { return info_.device_type / 100 == 3; }
float* out_data = nullptr;
private:
DLEngine();
DeviceInfo info_;
DeviceInfoArgs info_;
};
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/fpga_cv.hpp"

#include <cstring>
using paddle::zynqmp::float16;
// Resizes an fp32 image of (input_height x input_width x input_channel) to
// (output_height x output_width) on the FPGA resize engine, writing the
// result as uint8 into |output|.
// Fixes vs. previous version:
//  - both fpga_malloc'd buffers were leaked on every call; they are now freed,
//  - removed dead ImageInputArgs and unused `index` local,
//  - ratio bit-pattern transfer uses memcpy instead of a strict-aliasing
//    violating reinterpret_cast on float*.
void fpga_resize(float* input,
                 int input_width,
                 int input_height,
                 int input_channel,
                 uint8_t* output,
                 int output_width,
                 int output_height) {
  // Clear any leftover in-place activation configuration before driving the
  // resize engine.
  paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0};
  paddle::zynqmp::config_inplace(inplace_args);

  int input_size = input_width * input_height * input_channel;
  // Convert the fp32 input to fp16 in FPGA-visible memory.
  float16* input_image_address =
      reinterpret_cast<float16*>(paddle::zynqmp::fpga_malloc(
          input_size * sizeof(float16)));
  for (int i = 0; i < input_size; i++) {
    input_image_address[i] = float16(1.0 * input[i]);
  }

  paddle::zynqmp::ResizeArgs resize_args = {0};
  resize_args.input_width = input_width;
  resize_args.input_height = input_height;
  resize_args.image_channel = input_channel;
  resize_args.output_width = output_width;
  resize_args.output_height = output_height;
  float height_ratio = static_cast<float>(input_height) /
                       static_cast<float>(resize_args.output_height);
  float width_ratio = static_cast<float>(input_width) /
                      static_cast<float>(resize_args.output_width);
  // The driver expects the raw fp32 bit pattern in these uint32 fields;
  // memcpy performs the pun without undefined behavior.
  memcpy(&resize_args.height_ratio, &height_ratio, sizeof(height_ratio));
  memcpy(&resize_args.width_ratio, &width_ratio, sizeof(width_ratio));

  int output_size =
      resize_args.output_width * resize_args.output_height * input_channel;
  float16* fpga_output = reinterpret_cast<float16*>(
      paddle::zynqmp::fpga_malloc(output_size * sizeof(float16)));
  resize_args.input_image_address = input_image_address;
  resize_args.output_image_address = fpga_output;
  memset(fpga_output, 0, output_size * sizeof(float16));

  // Push both buffers out of the CPU cache before the FPGA touches them.
  paddle::zynqmp::fpga_flush(input_image_address,
                             input_size * sizeof(float16));
  paddle::zynqmp::fpga_flush(resize_args.output_image_address,
                             output_size * sizeof(float16));
  int ret = paddle::zynqmp::compute_fpga_resize(resize_args);
  if (ret == 0) {
    paddle::zynqmp::fpga_invalidate(resize_args.output_image_address,
                                    output_size * sizeof(float16));
  }
  // Narrow fp16 results to uint8 for the caller.
  for (int i = 0; i < output_size; i++) {
    output[i] = fpga_output[i];
  }
  // Fix: these two buffers previously leaked on every call.
  paddle::zynqmp::fpga_free(input_image_address);
  paddle::zynqmp::fpga_free(fpga_output);
}
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
// Resizes an fp32 image of (input_height x input_width x input_channel) to
// (output_height x output_width) on the FPGA resize engine, writing the
// result as uint8 into |output|.
// NOTE(review): layout is presumably HWC-interleaved — confirm against the
// implementation in fpga_cv.cpp.
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height);
......@@ -26,6 +26,7 @@ enum LayoutType {
N,
NC,
NCHW,
CNHW,
NHWC,
NHW,
};
......@@ -75,6 +76,19 @@ struct NHWC : Layout {
}
};
// Layout descriptor for tensors stored in CNHW order:
// dims = {channel, num, height, width}.
struct CNHW : Layout {
  int numIndex() { return 1; }
  int channelIndex() { return 0; }
  int heightIndex() { return 2; }
  int widthIndex() { return 3; }
  // Plain element count: product of all four dimensions.
  int elementCount(const std::vector<int>& dims) {
    int count = 1;
    for (int i = 0; i < 4; ++i) {
      count *= dims[i];
    }
    return count;
  }
  // The innermost C*W plane is what the hardware aligns.
  int alignedElementCount(const std::vector<int>& dims) {
    int aligned_cw = align_image(dims[0] * dims[3]);
    return dims[1] * dims[2] * aligned_cw;
  }
};
struct NC : Layout {
int numIndex() { return 0; }
int channelIndex() { return 1; }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define PADDLE_LITE_ZU5
#define FPGA_PRINT_MODE
#define PADDLE_LITE_PROFILE
......@@ -31,7 +31,7 @@ void saveToFile(std::string name, void* data_in, int size) {
std::ofstream ofs;
ofs.open(name);
int8_t* data = static_cast<int8_t*>(data_in);
int8_t* data = reinterpret_cast<int8_t*>(data_in);
for (int i = 0; i < size; i++) {
float value = data[i];
ofs << value << std::endl;
......@@ -86,6 +86,11 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
int calc_pack_num(int num_per_group, int group, int division_capacity) {
auto n = 1;
if (num_per_group * group % division_capacity == 0) {
n = num_per_group * group / division_capacity;
return n;
}
while ((num_per_group * (group + n - 1) / n) > division_capacity) {
n++;
}
......@@ -239,9 +244,10 @@ int8_t* format_filter(float* data_in,
for (int n = 0; n < num; n++) {
float* filter_start = data_in + n * chw;
float f_max = find_max(filter_start, chw);
int8_t* quantized_start = quantized_data + n * chw;
quantize(filter_start, quantized_start, chw, max);
filter_max.push_back(1);
quantize(filter_start, quantized_start, chw, f_max);
filter_max.push_back(f_max);
}
int8_t* hwc_data =
......@@ -377,7 +383,6 @@ size_t format_dwconv_filter(
float** data_in, int num, int height, int width, float* scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t** quantize_data = reinterpret_cast<int16_t**>(data_in);
convert_to_hwn(quantize_data, num, height, width);
size_t size = align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data,
......@@ -385,6 +390,7 @@ size_t format_dwconv_filter(
sizeof(int16_t));
return size;
}
} // namespace filter
} // namespace zynqmp
} // namespace paddle
......@@ -28,7 +28,7 @@ limitations under the License. */
namespace paddle {
namespace zynqmp {
#define PADDLE_MOBILE_OS_LINUX
#define PADDLE_OS_LINUX
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
......@@ -38,7 +38,7 @@ static size_t memory_size_max = 0;
static size_t memory_size = 0;
static inline int do_ioctl(uint64_t req, const void *arg) {
#ifdef PADDLE_MOBILE_OS_LINUX
#ifdef PADDLE_OS_LINUX
return ioctl(fd, req, arg);
#else
return -1;
......@@ -48,6 +48,11 @@ static inline int do_ioctl(uint64_t req, const void *arg) {
int open_device() {
if (fd == -1) {
fd = open(device_path, O_RDWR);
if (fd == -1) {
std::cout << "please check if driver has insmoded!" << std::endl;
exit(-1);
}
}
return fd;
}
......@@ -61,28 +66,33 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef PADDLE_MOBILE_OS_LINUX
#ifdef PADDLE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
if (ptr == MAP_FAILED) {
std::cout << "not enough memory !";
exit(-1);
}
if (errno == ENOMEM) {
std::cout << "mmap failed with not enough memory !";
exit(-1);
}
if (errno == EINVAL) {
std::cout << "mmap failed with invalid arguments ! (size=" << size << ")"
<< std::endl;
exit(-1);
if (errno == ENOMEM) {
std::cout << "mmap failed with not enough memory ! (size=" << size << ")"
<< std::endl;
throw(-1);
}
if (errno == EINVAL) {
std::cout << "mmap failed with invalid arguments ! (size=" << size << ")"
<< std::endl;
throw(-1);
}
std::cout << "mmap failed with other than memory usage and invalid "
"arguments! errno="
<< errno << ", (size=" << size << ")" << std::endl;
throw(-1);
}
if (ptr == NULL) {
std::cout << "NULL returned, errno=" << errno
<< ", mmap failed with other errors other than memory usage !"
<< ", null retured, mmap failed with other errors other than "
"memory usage !"
<< std::endl;
exit(-1);
throw(-1);
}
memory_map.insert(std::make_pair(ptr, size));
......@@ -103,7 +113,7 @@ size_t fpga_get_memory_size_max() { return memory_size_max; }
size_t fpga_diagnose_memory(int detailed) {
size_t total = 0;
auto iter = memory_map.begin();
auto iter = memory_map.begin(); // std::map<void *, size_t>::iterator
while (iter != memory_map.end()) {
total += iter->second;
iter++;
......@@ -113,7 +123,7 @@ size_t fpga_diagnose_memory(int detailed) {
void fpga_free(void *ptr) {
size_t size = 0;
auto iter = memory_map.find(ptr);
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) {
size = iter->second;
memory_map.erase(iter);
......@@ -121,7 +131,8 @@ void fpga_free(void *ptr) {
memory_size -= size;
#ifdef PADDLE_MOBILE_OS_LINUX
#ifdef PADDLE_OS_LINUX
munmap(ptr, size);
#else
free(ptr);
......@@ -175,19 +186,6 @@ int compute_fpga_conv_basic(const struct ConvArgs &args) {
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int compute_fpga_conv(const struct SplitConvArgs &args) {
int split_num = args.split_num;
int ret = -1;
for (int i = 0; i < split_num; i++) {
ret = compute_fpga_conv_basic(args.conv_arg[i]);
}
if (split_num > 1) {
exit(-1);
}
return ret;
}
int compute_fpga_pool(const struct PoolingArgs &args) {
return do_ioctl(IOCTL_CONFIG_POOLING, &args);
}
......@@ -196,9 +194,8 @@ int compute_fpga_ewadd(const struct EWAddArgs &args) {
return do_ioctl(IOCTL_CONFIG_EW, &args);
}
int get_device_info(const struct DeviceInfo &args) {
int ret = do_ioctl(IOCTL_DEVICE_INFO, &args);
return ret;
int get_device_info(const struct DeviceInfoArgs &args) {
return do_ioctl(IOCTL_DEVICE_INFO, &args);
}
int perform_bypass(const struct BypassArgs &args) {
......@@ -257,26 +254,6 @@ int perform_bypass(const struct BypassArgs &args) {
int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
int compute_fpga_scale(const struct ScaleArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Scale======";
std::cout << "scale_address:" << args.scale_address << std::endl;
std::cout << "bias_address:" << args.bias_address << std::endl;
std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_SCALE, &args);
}
......@@ -288,6 +265,10 @@ int config_activation(const struct ActiveParamterArgs &args) {
return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
}
int config_global_pool(const struct GlobalPoolArgs &args) {
return do_ioctl(IOCTL_CONFIG_GLOBAL_POOL_PARAMETER, &args);
}
int config_inplace(const struct InplaceArgs &args) {
return do_ioctl(IOCTL_CONFIG_INPLACE, &args);
}
......@@ -304,6 +285,10 @@ int compute_fpga_resize(const struct ResizeArgs &args) {
return do_ioctl(IOCTL_CONFIG_RESIZE, &args);
}
int compute_preprocess(const struct PreprocessArgs &args) {
return do_ioctl(IOCTL_PREPROCESS, &args);
}
int16_t fp32_2_fp16(float fp32_num) {
unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT
auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
......
......@@ -29,6 +29,9 @@ typedef int16_t half;
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
// #define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 replace
// by filter.hpp "get_filter_num_alignment()"
// #define FILTER_ELEMENT_ALIGNMENT 64 // Filter element number aligned to 64
#define BS_NUM_ALIGNMENT 8
#define BIAS_NUM_ALIGNMENT 16
......@@ -50,11 +53,11 @@ enum ActiveType {
TYPE_SIGMOID = 4,
};
struct DeviceInfo {
struct DeviceInfoArgs {
uint32_t filter_cap;
uint32_t version;
uint16_t device_type;
uint32_t colunm;
uint32_t column;
uint32_t reserved1;
uint32_t reserved2;
uint32_t reserved3;
......@@ -114,6 +117,14 @@ struct ImageOutputArgs {
float* scale_address; // output scale address;
};
// Deconvolution (transposed convolution) parameters attached to ConvArgs
// and consumed by the conv engine ioctl.
struct DeconvArgs {
bool enabled; // true when the conv engine runs in deconv mode
uint16_t sub_kernel_num; // deconv stride; the deconv is split into this
// many sub-conv operations on the FPGA
uint16_t invalid_col_num; // columns dropped on the left and right of each
// output row directly inside the FPGA
};
struct ConvArgs {
bool relu_enabled;
void* sb_address; // scale and bias are interlaced;
......@@ -123,6 +134,7 @@ struct ConvArgs {
uint32_t group_num;
uint32_t dilation;
struct DeconvArgs deconv;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
......@@ -189,6 +201,29 @@ struct NormalizeArgs {
uint32_t* output_scale_address;
};
// Arguments for the on-FPGA preprocessing pipeline (resize plus optional
// per-channel mean/scale normalization of a video frame), passed via
// IOCTL_PREPROCESS. This mirrors the kernel-driver ABI: field names —
// including the "vedio"/"fomat" misspellings — must match the driver and
// cannot be renamed here.
struct PreprocessArgs {
void* input_image_address;
void* output_image_address;
uint32_t input_width;
uint32_t input_height;
uint32_t output_width;
uint32_t output_height;
// NOTE(review): presumably fp32 bit patterns of input/output size ratios,
// as in ResizeArgs — confirm with the driver.
uint32_t height_ratio;
uint32_t width_ratio;
// Per-channel mean subtraction values (applied when mean_scale_enabled).
uint16_t mean0;
uint16_t mean1;
uint16_t mean2;
// Per-channel scale factors (applied when mean_scale_enabled).
uint16_t scale0;
uint16_t scale1;
uint16_t scale2;
uint32_t rd_ring_buf_size;
uint32_t wr_ring_buf_size;
uint32_t vedio_in_fomat;
uint32_t vedio_out_fomat;
uint32_t vedio_source;
bool mean_scale_enabled; // enables the mean/scale normalization stage
};
struct ResizeArgs {
void* input_image_address;
void* output_image_address;
......@@ -214,10 +249,14 @@ struct NormalizeParameterArgs {
};
struct ActiveParamterArgs {
ActiveType type;
enum ActiveType type;
uint16_t leaky_relu_factor;
};
// Argument block for IOCTL_CONFIG_GLOBAL_POOL_PARAMETER.
struct GlobalPoolArgs {
// NOTE(review): presumably an fp16-encoded scaling factor applied by the
// global-pool unit — confirm against the driver documentation.
uint16_t global_pool_factor;
};
struct InplaceArgs {
bool leaky_relu_enable;
bool relu_enable;
......@@ -225,6 +264,7 @@ struct InplaceArgs {
bool relu6_enable;
bool power_enable;
bool normalize_enable;
bool global_pool_en;
};
struct FpgaRegWriteArgs {
......@@ -238,13 +278,13 @@ struct FpgaRegReadArgs {
};
struct FpgaResetArgs {
uint32_t val;
uint32_t dummy;
};
#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4)
// #define IOCTL_MEMORY_MAGIC (('M' + 'E' + 'M' + 'Y') / 4)
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
#define IOCTL_DEVICE_INFO _IOW(IOCTL_FPGA_MAGIC, 100, struct DeviceInfo)
#define IOCTL_SEPARATOR_0 10
......@@ -263,7 +303,6 @@ struct FpgaResetArgs {
#define IOCTL_CONFIG_SCALE _IOW(IOCTL_FPGA_MAGIC, 25, struct ScaleArgs)
#define IOCTL_CONFIG_NORMALIZE _IOW(IOCTL_FPGA_MAGIC, 26, struct NormalizeArgs)
#define IOCTL_CONFIG_RESIZE _IOW(IOCTL_FPGA_MAGIC, 30, struct ResizeArgs)
#define IOCTL_CONFIG_DWCONV _IOW(IOCTL_FPGA_MAGIC, 31, struct DWconvArgs)
#define IOCTL_CONFIG_INPLACE _IOW(IOCTL_FPGA_MAGIC, 40, struct InplaceArgs)
......@@ -273,61 +312,19 @@ struct FpgaResetArgs {
_IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs)
#define IOCTL_CONFIG_ACTIVATION_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 43, struct ActiveParamterArgs)
#define IOCTL_CONFIG_GLOBAL_POOL_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 44, struct GlobalPoolArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs)
#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs)
//============================== API =============================
struct DeconvArgs {
uint32_t sub_conv_num;
uint32_t group_num;
uint32_t filter_num;
uint32_t omit_size;
uint32_t sub_output_width;
uint32_t sub_output_height;
struct ImageOutputArgs output;
struct SplitConvArgs* split_conv_args;
};
struct SplitArgs {
uint32_t image_num;
int16_t* image_in;
float* scale_in;
void** images_out;
float** scales_out;
uint32_t* out_channel_nums;
uint32_t height;
uint32_t width;
};
#define IOCTL_DEVICE_INFO _IOW(IOCTL_FPGA_MAGIC, 100, struct DeviceInfoArgs)
struct ConcatArgs {
uint32_t image_num;
half** images_in;
float** scales_in;
void* image_out;
float* scale_out;
uint32_t* channel_num;
uint32_t height;
uint32_t width;
};
#define IOCTL_SEPARATOR_2 200
#define IOCTL_PREPROCESS _IOW(IOCTL_FPGA_MAGIC, 201, struct PreprocessArgs)
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg;
};
struct GroupConvArgs {
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct SplitConvArgs* conv_args;
struct ConcatArgs concat_arg;
};
//============================== API =============================
inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
int open_device();
......@@ -345,11 +342,10 @@ void fpga_copy(void* dst, const void* src, int size);
int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size);
int get_device_info(const struct DeviceInfo& args);
int get_device_info(const struct DeviceInfoArgs& args);
int perform_bypass(const struct BypassArgs& args);
int compute_fpga_conv_basic(const struct ConvArgs& args);
int compute_fpga_conv(const struct SplitConvArgs& args);
int compute_fpga_pool(const struct PoolingArgs& args);
int compute_fpga_ewadd(const struct EWAddArgs& args);
int compute_fpga_scale(const struct ScaleArgs& args);
......@@ -357,6 +353,7 @@ int compute_fpga_concat(const struct ConcatArgs& args);
int compute_fpga_resize(const struct ResizeArgs& args);
int config_activation(const struct ActiveParamterArgs& args);
int config_global_pool(const struct GlobalPoolArgs& args);
int config_power(const struct PowerArgs& args);
int compute_fpga_dwconv(const struct DWconvArgs& args);
int config_norm_param(const struct NormalizeParameterArgs& args);
......@@ -368,6 +365,7 @@ int flush_cache(void* addr, int size);
int invalidate_cache(void* addr, int size);
int fpga_reset();
int compute_preprocess(const struct PreprocessArgs& args);
int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
......
......@@ -78,31 +78,59 @@ struct ConvParam : PEParam {
Tensor* filter = nullptr;
int groups = 1;
bool deconv = false;
std::vector<int> strides;
std::vector<int> paddings;
std::vector<int> kernelSize;
std::vector<int> dilations;
Tensor* scale() { return scale_; }
Tensor* scale() { return &scale_; }
Tensor* bias() { return bias_; }
Tensor* bias() { return &bias_; }
std::vector<BasicConvParam*>& splitParams() { return splitParams_; }
~ConvParam() {
for (BasicConvParam* p : splitParams_) {
delete p;
}
splitParams_.clear();
}
protected:
std::vector<BasicConvParam*> splitParams_;
Tensor* scale_ = new Tensor();
Tensor* bias_ = new Tensor();
Tensor scale_;
Tensor bias_;
};
struct BasicDWConvParam {
Tensor input;
Tensor output;
Tensor filter;
Tensor bias;
DWconvArgs args;
Tensor quantizedFilter;
Tensor quantizedBias;
};
struct DepthwiseConvSplitParam : ConvParam {
public:
DWconvArgs args;
std::vector<BasicDWConvParam*>& splitParams() { return splitParams_; }
protected:
std::vector<BasicDWConvParam*> splitParams_;
};
struct DepthwiseConvParam : ConvParam {
public:
Tensor* quantizedFilter() { return quantizedFilter_; }
Tensor* quantizedFilter() { return &quantizedFilter_; }
DWconvArgs args;
protected:
Tensor* quantizedFilter_ = new Tensor();
Tensor quantizedFilter_;
};
enum PoolingType : int {
......@@ -124,6 +152,16 @@ struct PoolingParam : PEParam {
PoolingArgs poolingArgs = {0};
};
struct PoolingSplitParam : ConvParam {
public:
PoolingArgs args;
std::vector<PoolingParam*>& splitParams() { return splitParams_; }
protected:
std::vector<PoolingParam*> splitParams_;
};
struct ConcatParam : PEParam {
public:
std::vector<Tensor*> inputs;
......@@ -154,13 +192,13 @@ struct FullyConnectedParam : PEParam {
Tensor* bias = nullptr;
Tensor* output = nullptr;
Tensor* quantizedFilter() { return quantizedFilter_; }
Tensor* quantizedFilter() { return &quantizedFilter_; }
Tensor* biasScale() { return biasScale_; }
Tensor* biasScale() { return &biasScale_; }
protected:
Tensor* quantizedFilter_ = new Tensor();
Tensor* biasScale_ = new Tensor();
Tensor quantizedFilter_;
Tensor biasScale_;
};
struct SoftmaxParam : PEParam {
......@@ -229,15 +267,15 @@ struct ScaleParam : PEParam {
Tensor* scale = nullptr;
Tensor* bias = nullptr;
Tensor* alignedScale() { return alignedScale_; }
Tensor* alignedScale() { return &alignedScale_; }
Tensor* alignedBias() { return alignedBias_; }
Tensor* alignedBias() { return &alignedBias_; }
ScaleArgs args = {0};
protected:
Tensor* alignedScale_ = new Tensor();
Tensor* alignedBias_ = new Tensor();
Tensor alignedScale_;
Tensor alignedBias_;
};
struct ResizeParam : PEParam {
......
......@@ -29,6 +29,11 @@ class ConcatPE : public PE {
Tensor* output = param_.output;
output->setAligned(false);
output->setDataLocation(CPU);
bool cacheable = true;
for (auto in : param_.inputs) {
cacheable &= in->cacheable();
}
output->setCacheable(cacheable);
return true;
}
......
......@@ -32,6 +32,107 @@ namespace zynqmp {
class ConvPE : public PE {
public:
// CPU reference implementation of this layer's convolution for HWC-layout
// tensors: copies the input up to fp32, runs a naive direct convolution with
// the per-output-channel scale and bias fused into the result, then copies
// the result back into the output tensor.
void cpu_conv_half_hwc() {
Tensor* input = param_.input;
Tensor* output = param_.output;
Shape& input_shape = input->shape();
Shape& out_shape = output->shape();
int image_height = input_shape.height();
int image_width = input_shape.width();
int image_channels = input_shape.channel();
int image_pad_h = param_.paddings[0];
// NOTE(review): pad_w reads paddings[0], not paddings[1] — only correct for
// symmetric padding; confirm for non-square pads.
int image_pad_w = param_.paddings[0];
int kernel_height = param_.filter->shape().height();
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
// Dilation is hard-coded to 1, so the dilated-kernel math below is inert.
int dilation_rate = 1;
int out_channel = out_shape.channel();
int pooled_height_ = out_shape.height();
int pooled_width_ = out_shape.width();
int filter_chw = image_channels * kernel_height * kernel_width;
// Effective (dilated) kernel extent in each direction.
int kernel_rw = kernel_width + (dilation_rate - 1) * (kernel_width - 1);
int kernel_rh = kernel_height + (dilation_rate - 1) * (kernel_height - 1);
float* weight = param_.filter->data<float>();
Tensor float_input;
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
input->syncToDevice();
float_input.copyFrom(input);
float_input.invalidate();
// NOTE(review): debug dump left enabled — writes a file on every call.
float_input.saveToFile("fi", true);
float* out = float_output.mutableData<float>(FP32, output->shape());
for (int ph = 0; ph < pooled_height_; ph++) {
for (int pw = 0; pw < pooled_width_; pw++) {
// Top-left corner of the receptive field in input coordinates
// (negative while inside the padding region).
int hstart = ph * kernel_step_h - image_pad_h;
int wstart = pw * kernel_step_w - image_pad_w;
int hend = std::min(hstart + kernel_rh, static_cast<int>(image_height));
int wend = std::min(wstart + kernel_rw, static_cast<int>(image_width));
// First in-bounds sample position when the window starts in the padding.
// NOTE(review): here ceil() closes before the division, unlike
// wstart_plus below where the division is inside ceil(); the two forms
// only coincide because dilation_rate == 1 — verify before ever enabling
// dilation.
int hstart_plus =
dilation_rate *
ceil(static_cast<float>(image_pad_h - ph * kernel_step_h)) /
static_cast<float>(dilation_rate) -
image_pad_h + ph * kernel_step_h;
int wstart_plus =
dilation_rate *
ceil(static_cast<float>(image_pad_w - pw * kernel_step_w) /
static_cast<float>(dilation_rate)) -
image_pad_w + pw * kernel_step_w;
int hstart_ = hstart < 0 ? hstart_plus : hstart;
int wstart_ = wstart < 0 ? wstart_plus : wstart;
for (int oc = 0; oc < out_channel; oc++) {
float sum = 0.0f;
const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
for (int c = 0; c < image_channels; c++) {
for (int h = hstart_; h < hend; h += dilation_rate) {
// Kernel row index corresponding to input row h.
int hi = 0;
if (hstart < 0) {
hi = (kernel_rh - (hend - h)) / dilation_rate;
} else {
hi = (h - hstart_) / dilation_rate;
}
for (int w = wstart_; w < wend; w += dilation_rate) {
// Kernel column index corresponding to input column w.
int wi = 0;
if (wstart < 0) {
wi = (kernel_rw - (wend - w)) / dilation_rate;
} else {
wi = (w - wstart_) / dilation_rate;
}
// Input is HWC-interleaved; filter is laid out [oc][c][kh][kw].
const int index = (h * image_width + w) * image_channels + c;
int weight_index = oc * filter_chw +
kernel_width * kernel_height * c +
kernel_width * hi + wi;
float value = image_addr[index] * weight[weight_index];
sum += value;
}
}
}
// Fuse the per-output-channel scale and bias into the accumulated sum.
float s = param_.scale()->data<float>()[oc];
float b = param_.bias()->data<float>()[oc];
out[pool_index] = sum * s + b;
}
}
}
float_output.flush();
// NOTE(review): more debug dumps left enabled below.
float_output.saveToFile("fo", true);
output->copyFrom(&float_output);
output->invalidate();
output->saveToFile("out", true);
// exit(-1);
}
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
......@@ -40,28 +141,30 @@ class ConvPE : public PE {
}
void apply() {
split_axis = fill_split_arg(param_);
split_channel = param_.groups != 1 && param_.splitParams().size() > 1;
if (split_axis == 0 && param_.splitParams().size() > 1) {
ConcatParam& concat_param = concatPE_.param();
for (auto conv_param : param_.splitParams()) {
concat_param.inputs.push_back(&conv_param->output);
if (param_.deconv == false) {
split_axis = fill_split_arg(param_);
split_channel = param_.groups != 1 && param_.splitParams().size() > 1;
if (split_axis == 0 && param_.splitParams().size() > 1) {
ConcatParam& concat_param = concatPE_.param();
for (auto conv_param : param_.splitParams()) {
concat_param.inputs.push_back(&conv_param->output);
}
concat_param.output = param_.output;
concatPE_.init();
concatPE_.apply();
}
concat_param.output = param_.output;
concatPE_.init();
concatPE_.apply();
}
if (split_channel) {
SplitParam& split_param = splitPE_.param();
split_param.input = param_.input;
for (auto conv_param : param_.splitParams()) {
split_param.outputs.push_back(&conv_param->input);
if (split_channel) {
SplitParam& split_param = splitPE_.param();
split_param.input = param_.input;
for (auto conv_param : param_.splitParams()) {
split_param.outputs.push_back(&conv_param->input);
}
splitPE_.init();
splitPE_.apply();
}
splitPE_.init();
splitPE_.apply();
}
if (DLEngine::get_instance().isZU3() &&
......@@ -70,8 +173,9 @@ class ConvPE : public PE {
param_.input->shape().channel() >= 2048) {
use_cpu_ = true;
}
if (!use_cpu_) {
// param_.filter->releaseData();
param_.filter->releaseData();
}
// exit(-1);
......@@ -120,16 +224,17 @@ class ConvPE : public PE {
}
delete[] mi;
float_output.flush();
output->flush();
output->copyFrom(&float_output);
output->invalidate();
}
bool dispatch() {
fpga_reset();
if (use_cpu_) {
cpu_compute();
return true;
}
inplace_.global_pool_en = false;
if (param_.activeParam.type == TYPE_RELU) {
inplace_.relu_enable = true;
} else if (param_.activeParam.type == TYPE_RELU6) {
......@@ -146,24 +251,20 @@ class ConvPE : public PE {
if (inplace_.leaky_relu_enable) {
activeParamterArgs.type = TYPE_LEAKY_RELU;
activeParamterArgs.leaky_relu_factor =
fp32_2_fp16(param_.activeParam.leaky_relu_factor);
float_to_half(param_.activeParam.leaky_relu_factor);
config_activation(activeParamterArgs);
}
}
std::vector<BasicConvParam*>& params = param_.splitParams();
if (split_channel) {
if (split_channel && param_.deconv == false) {
// splitPE_.param().input->saveToFile("input_image",true);
splitPE_.dispatch();
}
int ret = 0;
for (auto conv_param : params) {
// conv_param->input.printScale();
// if (split_channel) {
// conv_param->input.saveToFile("pack_image",true);
// }
ret |= compute_fpga_conv_basic(conv_param->args);
}
......@@ -173,18 +274,18 @@ class ConvPE : public PE {
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
inplace_.global_pool_en = false;
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
if (param_.activeParam.type == TYPE_LEAKY_RELU) {
activeParamterArgs.type = TYPE_LEAKY_RELU;
activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
activeParamterArgs.leaky_relu_factor = float_to_half(0);
config_activation(activeParamterArgs);
}
}
size_t size = params.size();
if (split_axis == 0 && ret == 0 && size > 1) {
// std::cout << "concat size:" << size << std::endl;
if (split_axis == 0 && ret == 0 && size > 1 && param_.deconv == false) {
concatPE_.dispatch();
}
if (split_axis == 1 && ret == 0 && size > 1) {
......
......@@ -171,6 +171,30 @@ inline void format_bias_scale_new(Tensor* bias,
}
}
// Replicates the per-channel fp32 bias into an fp16 buffer whose total
// length is a multiple of 16, matching the hardware's bias alignment.
inline void format_16_bias(Tensor* bias, Tensor* quantized_bias, int channel) {
  const int hw_align = 16;
  int repeat = 1;
  // When the channel count is not already 16-aligned, replicate the bias
  // vector lcm(channel, 16) / channel times so the total length aligns.
  if (channel % hw_align != 0 || channel < hw_align) {
    repeat = lcm_(channel, hw_align) / channel;
  }
  Shape shape(N, {channel * repeat});
  float16* dst = quantized_bias->mutableData<float16>(FP16, shape);
  float* src = bias->data<float>();
  for (int r = 0; r < repeat; ++r) {
    float16* block = dst + r * channel;
    for (int c = 0; c < channel; ++c) {
      block[c] = float_to_half(src[c]);
    }
  }
  quantized_bias->flush();
}
inline void format_scale_bias(Tensor* scale,
Tensor* bias,
Tensor* filter,
......@@ -237,7 +261,6 @@ inline void format_filter(Tensor* filter,
std::vector<float>& scales, // NOLINT
float max) {
float max_value = find_max(*filter);
// max_value = max; //TODO: global quantization for filter
Shape& filter_shape = filter->shape();
int mem_size;
......@@ -264,20 +287,9 @@ inline void format_filter(Tensor* filter,
quantized_filter->flush();
fpga_free(quantized_data);
// for (size_t i = 0; i < max_values.size(); i++) {
// // scales.push_back(max_values[i] / max_value);
// scales.push_back(1.0f);
// }
// filter->saveToFile("filter.txt");
// std::ofstream ofs;
// ofs.open("quant.txt");
// for (int i = 0; i < mem_size; i++) {
// float value = quantized_data[i];
// ofs << value << std::endl;
// }
// ofs.close();
// exit(-1);
for (size_t i = 0; i < max_values.size(); i++) {
scales.push_back(max_values[i] / max_value);
}
}
inline void format_dw_filter(Tensor* filter,
......@@ -371,8 +383,9 @@ inline void split_filter_num(const ConvParam& c_param) {
new_filter.flush();
conv_param->filter.mutableData<float>(FP32, f_shape);
std::vector<float> v; // TODO(chonwhite) change variable name;
format_filter(&new_filter, &(conv_param->filter), param.groups, v, max);
std::vector<float> quant_scale;
format_filter(
&new_filter, &(conv_param->filter), param.groups, quant_scale, max);
conv_param->filter.setDataType(INT8);
Tensor scale;
......@@ -384,14 +397,14 @@ inline void split_filter_num(const ConvParam& c_param) {
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
scale_data[n] =
param.scale()->data<float>()[n + chnnnel_start] * quant_scale[n];
}
for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
}
format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
conv_param->scaleBias.flush();
args.group_num = param.groups;
args.sb_address = conv_param->scaleBias.data<float16>();
args.kernel.stride_h = param.strides[1];
......@@ -431,7 +444,6 @@ inline void pack_channel_filter(const ConvParam& c_param) {
int channel_per_pack = filter->shape().channel() * group_per_pack;
float max = find_max(*filter);
Shape& out_shape = out->shape();
for (int i = 0; i < pack_num; i++) {
......@@ -500,8 +512,9 @@ inline void pack_channel_filter(const ConvParam& c_param) {
float mem_factor = filter_num_alignment / filter_per_pack;
conv_param->filter.setMemScale(mem_factor);
std::vector<float> v; // TODO(chonwhite) change variable name
format_filter(&new_filter, &(conv_param->filter), new_group, v, max);
std::vector<float> quant_scale;
format_filter(
&new_filter, &(conv_param->filter), new_group, quant_scale, max);
conv_param->filter.setDataType(INT8);
Tensor scale;
......@@ -513,7 +526,8 @@ inline void pack_channel_filter(const ConvParam& c_param) {
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_current_pack; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
scale_data[n] =
param.scale()->data<float>()[n + chnnnel_start] * quant_scale[n];
}
for (int n = 0; n < filter_current_pack; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
......@@ -591,7 +605,7 @@ inline void split_channel(const ConvParam& c_param) {
float* bias_data = bias.mutableData<float>(FP32, bs_shape);
float* scale_data = scale.mutableData<float>(FP32, bs_shape);
for (int c = 0; c < channel; c++) {
scale_data[c] = 1;
scale_data[c] = scales[c];
bias_data[c] = param.bias()->data<float>()[c] / num;
}
scale.flush();
......@@ -666,6 +680,105 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
return ret == 0;
}
// Splits a depthwise conv whose per-row working set (width * channel *
// kernel_h) exceeds the hardware buffer budget into `num` equal channel-wise
// slices. One BasicDWConvParam (filter slice, scale/bias slice and DWconv
// args) is built per slice and appended to param.splitParams(); the caller
// (DepthwiseConvSplitPE) scatters the input into the slice inputs and
// gathers the slice outputs back. Ownership of the new'd params passes to
// the splitParams() vector.
inline void dwconv_split_channel(DepthwiseConvSplitParam& param) { // NOLINT
  Tensor* input = param.input;
  Tensor* output = param.output;
  Tensor* filter = param.filter;
  input->syncToCPU();
  int h_kernel = filter->shape().height();
  int w_kernel = filter->shape().width();
  int c = input->shape().channel();
  int w = input->shape().width();
  // Working-set size the hardware must buffer for one output row.
  int wc_h_kernel = w * c * h_kernel;
  int dwconv_limit = 131072;  // 2^17; hardware capacity -- TODO confirm unit
  int num = ceil(wc_h_kernel * 1.0f / dwconv_limit);
  // Grow num until it divides the channel count evenly (equal slices).
  while (input->shape().channel() % num != 0) {
    num++;
  }
  int channel = input->shape().channel() / num;
  if (channel % 16 != 0) {
    // NOTE(review): alignment violation is only logged, not enforced;
    // consider failing hard here.
    std::cout << "input channel must div by 16" << std::endl;
    // throw -1;
  }
  Shape bs_shape(N, {channel});
  float16* output_address = nullptr;
  float16* input_address = nullptr;
  float* out_scale_address = nullptr;
  for (int i = 0; i < num; i++) {
    BasicDWConvParam* dwconv_param = new BasicDWConvParam();
    // input && output;
    Shape in_shape(
        NCHW, {1, channel, input->shape().height(), input->shape().width()});
    if (num == 1) {
      // Single slice: operate directly on the original tensors.
      input_address = input->data<float16>();
      output_address = output->data<float16>();
      out_scale_address = output->scale();
    } else {
      // Private per-slice tensors, filled by SplitPE / drained by ConcatPE.
      input_address = dwconv_param->input.mutableData<float16>(FP16, in_shape);
      output_address =
          dwconv_param->output.mutableData<float16>(FP16, in_shape);
      out_scale_address = dwconv_param->output.scale();
    }
    // filter transformation;
    Shape f_shape(NCHW, {channel, 1, h_kernel, w_kernel});
    Tensor split_filter;
    float* split_filter_data = split_filter.mutableData<float>(FP32, f_shape);
    int filter_hwc = h_kernel * w_kernel * channel;
    memcpy(split_filter_data,
           filter->data<float>() + i * filter_hwc,
           filter_hwc * sizeof(float));
    split_filter.flush();
    // Per-slice views of the scale/bias vectors.
    Tensor split_scale;
    Tensor split_bias;
    float* scale_data = split_scale.mutableData<float>(FP32, bs_shape);
    float* bias_data = split_bias.mutableData<float>(FP32, bs_shape);
    for (int c = 0; c < channel; c++) {
      scale_data[c] = param.scale()->data<float>()[i * channel + c];
      bias_data[c] = param.bias()->data<float>()[i * channel + c];
    }
    split_bias.flush();
    // NOTE(review): these are copies of the member tensors sized via
    // mutableData below, yet the format_* calls write into the members
    // directly; whether the copies share storage with the members depends
    // on Tensor's copy semantics -- verify this is not dead (or
    // conflicting) work.
    Tensor quantized_filter = dwconv_param->quantizedFilter;
    Tensor quantized_bias = dwconv_param->quantizedBias;
    quantized_filter.mutableData<float16>(FP16, f_shape);
    quantized_bias.mutableData<float16>(FP16, f_shape);
    format_dw_filter(
        &split_filter, &(dwconv_param->quantizedFilter), scale_data);
    format_16_bias(&split_bias, &(dwconv_param->quantizedBias), channel);
    DWconvArgs& args = dwconv_param->args;
    args.bias_address = dwconv_param->quantizedBias.data<float16>();
    args.filter_address = dwconv_param->quantizedFilter.data<float16>();
    // NOTE(review): width takes f_shape.height() and height takes
    // f_shape.width() -- looks swapped relative to the NCHW f_shape above;
    // confirm against the driver's expected kernel layout.
    args.kernel.width = f_shape.height();
    args.kernel.height = f_shape.width();
    args.kernel.stride_w = param.strides[0];
    args.kernel.stride_h = param.strides[1];
    args.image.address = input_address;
    args.image.channels = channel;
    args.image.height = input->shape().height();
    args.image.width = input->shape().width();
    args.image.pad_width = param.paddings[0];
    args.image.pad_height = param.paddings[1];
    // NOTE(review): every slice reuses the whole input's scale; assumed
    // shared across channel slices -- confirm.
    args.image.scale_address = input->scale();
    args.output.address = output_address;
    args.output.scale_address = out_scale_address;
    args.out_width = param.output->shape().width();
    args.out_height = param.output->shape().height();
    args.sub_conv_num = 1;
    param.splitParams().push_back(dwconv_param);
  }
}
} // namespace zynqmp
} // namespace paddle
......
......@@ -61,11 +61,6 @@ class DepthwiseConvPE : public PE {
float16* b_data = bias_.mutableData<float16>(FP16, shape);
if (param_.bias()->dataType() == FP32) {
float* new_bias_data = param_.bias()->data<float>();
// bias从float转换成float16
// for (int i = 0; i < channel; i++) {
// b_data[i] = float_to_half(new_bias_data[i]);
// }
// bias 按16对齐填充hw
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
float16 value = float_to_half(new_bias_data[j]);
......@@ -75,10 +70,8 @@ class DepthwiseConvPE : public PE {
bias_.flush();
} else {
float16* new_bias_data = param_.bias()->data<float16>();
// memcpy(b_data, new_bias_data, channel * sizeof(float16));
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
// float16 value = float_to_half(bias_data_float[j]);
b_data[i * length + j] = new_bias_data[j];
}
}
......@@ -92,12 +85,10 @@ class DepthwiseConvPE : public PE {
format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
} else {
// filter 全为1时,且channal为对齐时
// TODO(chonwhite) filter fall one and channel aligned case
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
// memcpy(filter_data, scale_data, channel * sizeof(float16));
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/conv_process.hpp"
namespace paddle {
namespace zynqmp {
class DepthwiseConvSplitPE : public PE {
public:
inline int gcd_(int a, int b) {
while (b) {
int temp = a;
a = b;
b = temp % b;
}
return a;
}
inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
output->setDataLocation(Device);
return true;
}
void apply() {
DepthwiseConvSplitParam& param = param_;
Tensor* input = param.input;
Tensor* output = param.output;
int channel = output->shape().channel();
dwconv_split_channel(param);
if (param.splitParams().size() > 1) {
SplitParam& split_param = splitPE_.param();
split_param.input = param_.input;
for (auto dwconv_param : param_.splitParams()) {
split_param.outputs.push_back(&dwconv_param->input);
}
splitPE_.init();
splitPE_.apply();
ConcatParam& concat_param = concatPE_.param();
for (auto dwconv_param : param_.splitParams()) {
concat_param.inputs.push_back(&dwconv_param->output);
}
concat_param.output = param_.output;
concatPE_.init();
concatPE_.apply();
}
}
bool dispatch() {
param_.input->syncToDevice();
if (param_.activeParam.type == TYPE_RELU) {
inplace_.relu_enable = true;
} else if (param_.activeParam.type == TYPE_RELU6) {
inplace_.relu6_enable = true;
} else if (param_.activeParam.type == TYPE_SIGMOID) {
inplace_.sigmoid_enable = true;
} else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
inplace_.leaky_relu_enable = true;
}
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
config_inplace(inplace_);
}
std::vector<BasicDWConvParam*>& params = param_.splitParams();
if (params.size() > 1) {
splitPE_.dispatch();
}
int ret = 0;
for (auto dwconv_param : params) {
ret |= compute_fpga_dwconv(dwconv_param->args);
}
if (params.size() > 1) {
concatPE_.dispatch();
}
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
config_inplace(inplace_);
}
return ret;
}
DepthwiseConvSplitParam& param() { return param_; }
private:
DepthwiseConvSplitParam param_;
ConcatPE concatPE_;
SplitPE splitPE_;
InplaceArgs inplace_ = {0};
};
} // namespace zynqmp
} // namespace paddle
......@@ -38,7 +38,7 @@ class FullyConnectedPE : public PE {
Tensor* input = param_.input;
convParam_.input = param_.input;
convParam_.output = param_.output;
// convParam_.relu = param_.relu;
convParam_.activeParam.type = param_.activeParam.type;
convParam_.groups = 1;
convParam_.strides = {1, 1};
......@@ -48,9 +48,6 @@ class FullyConnectedPE : public PE {
int num = param_.filter->shape().channel();
int chw = param_.filter->shape().num();
// if (num == 2) {
// return;
// }
int height = param_.input->shape().height();
int width = param_.input->shape().width();
......
......@@ -41,7 +41,9 @@ class InputPE : public PE {
src = &half_tensor;
}
output->mutableData<void>();
src->alignImage(output, true);
src->alignImage();
output->copyFrom(src);
// src->alignImage(output, true);
return true;
}
......
......@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle {
namespace zynqmp {
class NormPE : public PE {
public:
bool init() {
......@@ -106,21 +107,19 @@ class NormPE : public PE {
}
bool dispatch() {
cpuCompute();
// std::cout << "CPU normalize ---------------------" << std::endl;
// param_.input->syncToDevice();
// // param_.input->saveToFile("normalize_fpga_", true);
// config_norm_param(norm_param_args_);
// inplace_args_.normalize_enable = true;
// config_inplace(inplace_args_);
// perform_bypass(bypass_args_);
// inplace_args_.normalize_enable = false;
// config_inplace(inplace_args_);
// compute_norm(norm_args_);
// param_.output->saveToFile("normalize_fpga_", true);
// cpuCompute();
// std::cout << "FPGA normalize ---------------------" << std::endl;
param_.input->syncToDevice();
config_norm_param(norm_param_args_);
inplace_args_.normalize_enable = true;
config_inplace(inplace_args_);
perform_bypass(bypass_args_);
inplace_args_.normalize_enable = false;
config_inplace(inplace_args_);
compute_norm(norm_args_);
return true;
}
......@@ -135,5 +134,6 @@ class NormPE : public PE {
NormalizeArgs norm_args_ = {0};
};
} // namespace zynqmp
} // namespace paddle
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
......@@ -25,8 +26,6 @@ class OutputPE : public PE {
bool init() {
Tensor* output = param_.output;
output->setAligned(false);
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
return true;
}
......@@ -43,15 +42,7 @@ class OutputPE : public PE {
} else {
output->copyFrom(input);
}
//
output->syncToCPU();
if (DLEngine::get_instance().out_data == nullptr) {
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
}
memcpy(DLEngine::get_instance().out_data,
output->data<void>(),
output->shape().numel() * sizeof(float));
return true;
}
......
......@@ -50,14 +50,17 @@ class PoolingPE : public PE {
PoolingArgs args = {0};
args.mode = param_.type;
auto paddings = *param_.paddings;
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
if (param_.globalPooling) {
args.kernel_reciprocal = fp32_2_fp16(1.0f);
} else {
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
}
args.image.address = input->data<float16>();
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
args.image.pad_height = paddings[0];
args.image.pad_width = paddings[2];
args.image.pad_height = param_.paddings[0];
args.image.pad_width = param_.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->mutableData<float16>();
args.output.scale_address = output->scale();
......@@ -69,11 +72,8 @@ class PoolingPE : public PE {
args.out_width = output->shape().width();
param_.poolingArgs = args;
// use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
// && (k_width > 7 || k_height > 7);
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 255 || k_height > 255);
// use_cpu_ = param_.type == AVERAGE;
}
void compute() {
......@@ -86,13 +86,12 @@ class PoolingPE : public PE {
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
auto paddings = *param_.paddings;
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
int image_pad_h = paddings[0];
int image_pad_w = paddings[2];
int image_pad_h = param_.paddings[0];
int image_pad_w = param_.paddings[1];
int kernel_height = param_.kernelSize[1];
int kernel_width = param_.kernelSize[0];
int kernel_step_h = param_.strides[0];
......@@ -118,8 +117,7 @@ class PoolingPE : public PE {
for (int c = 0; c < image_channels; ++c) {
const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
float sum = 0;
// const int index =
// (hstart * image_width + wstart) * image_channels + c;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = (h * image_width + w) * image_channels + c;
......@@ -127,6 +125,7 @@ class PoolingPE : public PE {
sum += value;
}
}
float value = sum / kernel;
if (value > max) {
max = value;
......@@ -148,7 +147,6 @@ class PoolingPE : public PE {
Tensor float_input;
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
// float_input.saveToFile("pool_float.txt");
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
......@@ -167,7 +165,6 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
void cpu_compute() {
......@@ -197,18 +194,41 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
bool dispatch() {
if (use_cpu_) {
// cpu_compute();
compute();
// exit(-1);
return true;
}
param_.input->syncToDevice();
return compute_fpga_pool(param_.poolingArgs) == 0;
if (param_.globalPooling) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
inplace_.global_pool_en = true;
config_inplace(inplace_);
int kernel_height = param_.kernelSize[1];
int kernel_width = param_.kernelSize[0];
globalPoolArgs.global_pool_factor =
float_to_half(1.0f / (kernel_height * kernel_width));
config_global_pool(globalPoolArgs);
}
int ret = (compute_fpga_pool(param_.poolingArgs) == 0);
if (param_.globalPooling) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
inplace_.global_pool_en = false;
config_inplace(inplace_);
globalPoolArgs.global_pool_factor = float_to_half(0);
config_global_pool(globalPoolArgs);
}
return ret;
}
PoolingParam& param() { return param_; }
......@@ -216,6 +236,8 @@ class PoolingPE : public PE {
private:
PoolingParam param_;
bool use_cpu_;
InplaceArgs inplace_ = {0};
GlobalPoolArgs globalPoolArgs;
};
} // namespace zynqmp
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef pooling_process_hpp
#define pooling_process_hpp
#include <string.h>
#include <cmath>
#include <vector>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/tensor.hpp"
namespace paddle {
namespace zynqmp {
// Splits a pooling op whose per-row working set (width * channel *
// kernel_h) exceeds the hardware buffer budget into `num` equal
// channel-wise slices, appending one heap-allocated PoolingParam per slice
// to `splitParams`. Ownership of the new'd params (and, when num > 1, of
// their private input/output tensors) passes to the caller
// (PoolingSplitPE), which scatters/gathers the slices.
inline void pooling_split_channel(
    PoolingParam& param, // NOLINT
    std::vector<PoolingParam*>& splitParams) { // NOLINT
  Tensor* input = param.input;
  Tensor* output = param.output;
  input->syncToCPU();
  int h_kernel = param.kernelSize[0];
  int w_kernel = param.kernelSize[1];
  if (param.globalPooling) {
    // Global pooling covers the whole feature map.
    h_kernel = input->shape().height();
    w_kernel = input->shape().width();
  }
  int c = input->shape().channel();
  int w = input->shape().width();
  int wc_h_kernel = w * c * h_kernel;
  // Same hardware limit as the depthwise-conv split (shared line buffer);
  // TODO confirm unit.
  int dwconv_limit = 131072;
  int num = ceil(wc_h_kernel * 1.0f / dwconv_limit);
  // Grow num until it divides the channel count evenly (equal slices).
  while (input->shape().channel() % num != 0) {
    num++;
  }
  // Exact division after the loop above; ceil is effectively a no-op.
  int channel = ceil(input->shape().channel() * 1.0f / num);
  float16* output_address = nullptr;
  float16* input_address = nullptr;
  float* out_scale_address = nullptr;
  for (int i = 0; i < num; i++) {
    PoolingParam* pooling_param = new PoolingParam();
    // input && output;
    Shape in_shape(
        NCHW, {1, channel, input->shape().height(), input->shape().width()});
    Shape out_shape(
        NCHW, {1, channel, output->shape().height(), output->shape().width()});
    if (num == 1) {
      // Single slice: alias the original tensors directly (not owned).
      pooling_param->input = input;
      pooling_param->output = output;
      input_address = input->data<float16>();
      output_address = output->data<float16>();
      out_scale_address = output->scale();
    } else {
      // Private per-slice tensors, filled by SplitPE / drained by ConcatPE.
      pooling_param->input = new Tensor();
      pooling_param->output = new Tensor();
      input_address =
          pooling_param->input->mutableData<float16>(FP16, in_shape);
      output_address =
          pooling_param->output->mutableData<float16>(FP16, out_shape);
      out_scale_address = pooling_param->output->scale();
    }
    PoolingArgs& args = pooling_param->poolingArgs;
    args.mode = param.type;
    // Average-pool divisor; for global pooling the hardware's global-pool
    // unit supplies the factor instead, so 1.0 is programmed here.
    args.kernel_reciprocal = fp32_2_fp16(1.0f / (w_kernel * h_kernel));
    if (param.globalPooling) {
      args.kernel_reciprocal = fp32_2_fp16(1.0f);
    }
    args.image.address = input_address;
    args.image.channels = channel;
    args.image.height = input->shape().height();
    args.image.width = input->shape().width();
    // NOTE(review): paddings[0]/[1] used as h/w here; other pooling code in
    // this backend has used paddings[0]/[2] -- confirm the convention.
    args.image.pad_height = param.paddings[0];
    args.image.pad_width = param.paddings[1];
    // NOTE(review): every slice reuses the whole input's scale; assumed
    // shared across channel slices -- confirm.
    args.image.scale_address = input->scale();
    args.output.address = output_address;
    args.output.scale_address = out_scale_address;
    args.kernel.height = h_kernel;
    args.kernel.width = w_kernel;
    args.kernel.stride_h = param.strides[0];
    args.kernel.stride_w = param.strides[1];
    args.out_height = output->shape().height();
    args.out_width = output->shape().width();
    splitParams.push_back(pooling_param);
  }
}
} // namespace zynqmp
} // namespace paddle
#endif /* pooling_process_hpp */
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/concat_pe.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/pooling_process.hpp"
#include "lite/backends/fpga/KD/pes/scale_pe.hpp"
#include "lite/backends/fpga/KD/pes/split_pe.hpp"
namespace paddle {
namespace zynqmp {
// Pooling that exceeds the FPGA line-buffer limit, executed as several
// channel-wise slices prepared by pooling_split_channel(). Kernels larger
// than the hardware maximum (255) on a 1x1 output fall back to a CPU
// average pool.
class PoolingSplitPE : public PE {
 public:
  // Iterative Euclid gcd; helper for lcm_ (alignment computations).
  inline int gcd_(int a, int b) {
    while (b) {
      int temp = a;
      a = b;
      b = temp % b;
    }
    return a;
  }

  inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }

  bool init() {
    Tensor* output = param_.output;
    output->setAligned(true);
    output->setDataLocation(Device);
    return true;
  }

  void apply() {
    PoolingParam& param = param_;
    Tensor* input = param.input;
    Tensor* output = param.output;

    int k_height = 0;
    int k_width = 0;
    if (param_.globalPooling) {
      // Global pooling covers the whole feature map; materialize the
      // kernel size so the split logic below sees concrete dimensions.
      k_width = input->shape().width();
      k_height = input->shape().height();
      param_.kernelSize[0] = k_height;
      param_.kernelSize[1] = k_width;
    } else {
      // NOTE(review): [0]=h / [1]=w here, while dispatch() reads [1] as
      // height -- confirm the kernelSize index convention.
      k_height = param_.kernelSize[0];
      k_width = param_.kernelSize[1];
    }

    // Kernels beyond the hardware limit on a 1x1 output go to the CPU.
    use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
               (k_width > 255 || k_height > 255);
    if (use_cpu_) {
      return;
    }

    pooling_split_channel(param, splitParams_);

    if (splitParams_.size() > 1) {
      // Scatter the input channels into each slice's private input tensor.
      SplitParam& split_param = splitPE_.param();
      split_param.input = param_.input;
      for (auto pooling_param : splitParams_) {
        split_param.outputs.push_back(pooling_param->input);
      }
      splitPE_.init();
      splitPE_.apply();

      // Gather the slice outputs back into the final output tensor.
      ConcatParam& concat_param = concatPE_.param();
      for (auto pooling_param : splitParams_) {
        concat_param.inputs.push_back(pooling_param->output);
      }
      concat_param.output = param_.output;
      concatPE_.init();
      concatPE_.apply();
    }
  }

  // CPU fallback: average pooling with edge-clipped windows (the divisor is
  // the number of in-bounds elements, i.e. padding is exclusive). Also
  // derives the output quantization scale from the running max value.
  void compute() {
    Tensor* input = param_.input;
    Tensor* output = param_.output;
    input->syncToCPU();

    Tensor float_input;
    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
    float_input.copyFrom(input);
    float16* data_out = output->data<float16>();

    int image_height = input->shape().height();
    int image_width = input->shape().width();
    int image_channels = input->shape().channel();
    int image_pad_h = param_.paddings[0];
    int image_pad_w = param_.paddings[1];
    int kernel_height = param_.kernelSize[1];
    int kernel_width = param_.kernelSize[0];
    int kernel_step_h = param_.strides[0];
    int kernel_step_w = param_.strides[1];

    int pooled_height_ = output->shape().height();
    int pooled_width_ = output->shape().width();

    int kernel = kernel_height * kernel_width;
    float max = 0;

    for (int ph = 0; ph < pooled_height_; ++ph) {
      for (int pw = 0; pw < pooled_width_; ++pw) {
        int hstart = ph * kernel_step_h - image_pad_h;
        int wstart = pw * kernel_step_w - image_pad_w;
        int hend = std::min(hstart + kernel_height, image_height);
        int wend = std::min(wstart + kernel_width, image_width);
        hstart = std::max(hstart, 0);
        wstart = std::max(wstart, 0);
        // Divisor counts only the in-bounds window elements.
        kernel = (hend - hstart) * (wend - wstart);
        for (int c = 0; c < image_channels; ++c) {
          const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
          float sum = 0;
          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              const int index = (h * image_width + w) * image_channels + c;
              sum += image_addr[index];
            }
          }
          float value = sum / kernel;
          if (value > max) {
            max = value;
          }
          data_out[pool_index] = float_to_half(value);
        }
      }
    }
    output->scale()[0] = max / 127.0f;
    output->scale()[1] = 127.0f / max;
    output->flush();
  }

  bool dispatch() {
    Tensor* output = param_.output;
    param_.input->syncToDevice();
    if (use_cpu_) {
      compute();
      return true;
    }

    if (splitParams_.size() > 1) {
      splitPE_.dispatch();
    }

    int ret = 0;
    InplaceArgs inplace_args = {0};
    GlobalPoolArgs global_pool_args;

    if (param_.globalPooling) {
      // Route through the global-pool unit: enable it and program the
      // averaging factor 1/(kh*kw) before launching the slices.
      inplace_args.relu_enable = false;
      inplace_args.leaky_relu_enable = false;
      inplace_args.relu6_enable = false;
      inplace_args.sigmoid_enable = false;
      inplace_args.global_pool_en = true;
      config_inplace(inplace_args);
      int kernel_height = param_.kernelSize[1];
      int kernel_width = param_.kernelSize[0];
      global_pool_args.global_pool_factor =
          fp32_2_fp16(1.0f / (kernel_height * kernel_width));
      config_global_pool(global_pool_args);
    }

    for (auto pooling_param : splitParams_) {
      ret |= compute_fpga_pool(pooling_param->poolingArgs);
      // Propagate the (last) slice's output scale to the final output.
      float* scale_address = pooling_param->poolingArgs.output.scale_address;
      output->scale()[0] = scale_address[0];
      output->scale()[1] = scale_address[1];
    }

    if (param_.globalPooling) {
      // Restore pass-through state so subsequent ops are unaffected.
      inplace_args.relu_enable = false;
      inplace_args.leaky_relu_enable = false;
      inplace_args.relu6_enable = false;
      inplace_args.sigmoid_enable = false;
      inplace_args.global_pool_en = false;
      config_inplace(inplace_args);
      global_pool_args.global_pool_factor = fp32_2_fp16(1.0f);
      config_global_pool(global_pool_args);
    }

    if (splitParams_.size() > 1) {
      concatPE_.dispatch();
    }
    // The driver returns 0 on success; report success as true, matching
    // PoolingPE. (Returning the raw accumulated error code would yield
    // false on success.)
    return ret == 0;
  }

  ~PoolingSplitPE() {
    for (auto pooling_param : splitParams_) {
      if (splitParams_.size() > 1) {
        // Private tensors exist only when a real split happened; with a
        // single slice they alias param_.input/output and must not be
        // deleted.
        delete pooling_param->input;
        delete pooling_param->output;
      }
      // The PoolingParam itself is always heap-allocated by
      // pooling_split_channel() (the original leaked it for num == 1).
      delete pooling_param;
    }
    splitParams_.clear();
  }

  PoolingParam& param() { return param_; }

 private:
  PoolingParam param_;
  ConcatPE concatPE_;
  SplitPE splitPE_;
  std::vector<PoolingParam*> splitParams_;
  bool use_cpu_ = false;
};
} // namespace zynqmp
} // namespace paddle
......@@ -93,8 +93,8 @@ void PriorBoxPE::compute_prior_box() {
const float &step_h = param.stepH;
const float &offset = param.offset;
Tensor *output_boxes = this->cachedBoxes_;
Tensor *output_variances = this->cachedVariances_;
Tensor *output_boxes = this->cachedBoxes_.get();
Tensor *output_variances = this->cachedVariances_.get();
Tensor boxes;
Tensor variances;
......@@ -241,7 +241,6 @@ void PriorBoxPE::compute_prior_box() {
}
boxes.flush();
boxes.syncToCPU();
variances.flush();
output_boxes->copyFrom(&boxes);
output_variances->copyFrom(&variances);
......@@ -251,8 +250,8 @@ void PriorBoxPE::apply() {}
bool PriorBoxPE::dispatch() {
if (cachedBoxes_ == nullptr) {
cachedBoxes_ = new Tensor();
cachedVariances_ = new Tensor();
cachedBoxes_.reset(new Tensor());
cachedVariances_.reset(new Tensor());
cachedBoxes_->mutableData<float>(FP32, param_.outputBoxes->shape());
cachedVariances_->mutableData<float>(FP32, param_.outputVariances->shape());
cachedBoxes_->setDataLocation(CPU);
......@@ -260,12 +259,14 @@ bool PriorBoxPE::dispatch() {
compute_prior_box();
}
param_.outputBoxes->copyFrom(this->cachedBoxes_);
param_.outputBoxes->copyFrom(this->cachedBoxes_.get());
param_.outputVariances->copyFrom(this->cachedVariances_.get());
param_.outputVariances->copyFrom(this->cachedVariances_);
param_.outputBoxes->flush();
param_.outputBoxes->syncToCPU();
param_.outputVariances->flush();
param_.outputBoxes->setCached(true);
param_.outputVariances->setCached(true);
return true;
}
} // namespace zynqmp
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
......@@ -23,9 +24,11 @@ class PriorBoxPE : public PE {
public:
bool init() {
param_.outputBoxes->setAligned(false);
param_.outputVariances->setAligned(false);
param_.outputBoxes->setDataLocation(CPU);
param_.outputBoxes->setCacheable(true);
param_.outputVariances->setAligned(false);
param_.outputVariances->setDataLocation(CPU);
param_.outputVariances->setCacheable(true);
return true;
}
......@@ -37,8 +40,9 @@ class PriorBoxPE : public PE {
private:
PriorBoxParam param_;
Tensor* cachedBoxes_ = nullptr;
Tensor* cachedVariances_ = nullptr;
// TODO(chonwhite) use unique_ptr;
std::unique_ptr<Tensor> cachedBoxes_;
std::unique_ptr<Tensor> cachedVariances_;
void compute_prior_box();
};
......
......@@ -23,43 +23,27 @@ class ReluPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
output->setDataLocation(Device);
output->setAligned(param_.input->aligned());
output->setDataLocation(CPU);
return true;
}
void apply() {
Tensor* src = param_.input;
args_.input_data_type = DATA_TYPE_FP16;
args_.output_data_type = DATA_TYPE_FP16;
args_.input_layout_type = LAYOUT_HWC;
args_.output_layout_type = LAYOUT_HWC;
args_.image = {.address = src->data<void>(),
.scale_address = src->scale(),
.channels = (uint32_t)src->shape().channel(),
.width = (uint32_t)src->shape().width(),
.height = (uint32_t)src->shape().height(),
.pad_width = 0u,
.pad_height = 0u};
args_.output = {
.address = param_.output->data<void>(),
.scale_address = param_.output->scale(),
};
inplace_.relu_enable = false;
inplace_.power_enable = false;
inplace_.normalize_enable = false;
}
void apply() {}
bool dispatch() {
inplace_.relu_enable = true;
config_inplace(inplace_);
param_.input->syncToDevice();
param_.output->copyFrom(param_.input);
param_.output->invalidate();
inplace_.relu_enable = false;
config_inplace(inplace_);
param_.input->invalidate();
int16_t* input_data = param_.input->data<int16_t>();
float16* out_data = param_.output->data<float16>();
for (int i = 0; i < param_.input->shape().alignedElementCount(); i++) {
int16_t v = param_.input->data<float16>()[i];
if (v > 0) {
out_data[i] = input_data[i];
} else {
out_data[i] = zero;
}
}
param_.output->copyScaleFrom(param_.input);
param_.output->flush();
return true;
}
......@@ -67,8 +51,7 @@ class ReluPE : public PE {
private:
InputParam param_;
BypassArgs args_;
InplaceArgs inplace_;
float16 zero = float_to_half(0.0f);
};
} // namespace zynqmp
......
......@@ -73,9 +73,38 @@ class ResizePE : public PE {
scale[0] = max / 127.0;
scale[1] = 127.0 / max;
}
  // CPU nearest-neighbour upsampling by an integer factor: every input
  // pixel (all `channel` values at once, HWC layout) is replicated into a
  // factor x factor square of output pixels.
  // NOTE(review): assumes out_width is an exact integer multiple of
  // in_width and that height scales by the same factor -- confirm the op's
  // shape constraints.
  void cpu_compute() {
    Shape& in_shape = param_.input->shape();
    Shape& out_shape = param_.output->shape();
    int channel = in_shape.channel();
    int in_height = in_shape.height();
    int in_width = in_shape.width();
    int out_width = out_shape.width();
    int factor = out_shape.width() / in_shape.width();
    param_.input->syncToCPU();
    for (int h = 0; h < in_height; h++) {
      for (int w = 0; w < in_width; w++) {
        // Source pixel: HWC, so one pixel is `channel` contiguous halves.
        int src_index = in_width * channel * h + w * channel;
        float16* src = param_.input->data<float16>() + src_index;
        // Replicate into the factor x factor block of destination pixels.
        for (int v = 0; v < factor; v++) {
          for (int i = 0; i < factor; i++) {
            int dst_index = out_width * channel * h * factor +
                            out_width * channel * v + w * channel * factor +
                            channel * i;
            float16* dst = param_.output->data<float16>() + dst_index;
            memcpy(dst, src, channel * sizeof(float16));
          }
        }
      }
    }
    param_.output->flush();
    // Pure replication leaves the value range (and hence the quantization
    // scale) unchanged.
    param_.output->copyScaleFrom(param_.input);
  }
// Runs the resize. The FPGA op is kicked off first, but its result (and
// return status) is intentionally ignored: cpu_compute() always rewrites
// the full output buffer afterwards.
bool dispatch() {
  // Fix: the status was previously stored in an unused local `ret`;
  // the call is kept for its device-side effects only.
  compute_fpga_resize(args_);
  cpu_compute();
  return true;
}
......
......@@ -141,22 +141,22 @@ class ScalePE : public PE {
Tensor* output = param_.output;
Tensor float_input;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
input->syncToCPU();
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
float* scale_data = param_.scale->data<float>();
float16* scale_data = param_.scale->data<float16>();
int wh = input->shape().width() * input->shape().height();
float16* in_data = input->data<float16>();
float max = 0;
for (int i = 0; i < wh; i++) {
for (int c = 0; c < input->shape().channel(); c++) {
int index = i * input->shape().channel() + c;
float value = half_to_float(in_data[index]) * scale_data[c];
float x = image_addr[index];
float y = half_to_float(scale_data[c]);
float value = x * y;
data_out[index] = float_to_half(value);
if (value < 0) {
......@@ -180,7 +180,6 @@ class ScalePE : public PE {
param_.scale->shape().numel() * sizeof(float16));
dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
dw_param.quantizedFilter()->flush();
}
param_.input->syncToDevice();
......
......@@ -59,6 +59,7 @@ static void softmax(Tensor *X, Tensor *Y) {
int batch_size = X->shape().num();
int num_classes = dims[X->shape().dimSize() - 1];
int channels = X->shape().numel() / batch_size / num_classes;
float *x = X->data<float>();
float *y = Y->mutableData<float>();
......@@ -140,11 +141,11 @@ bool SoftmaxPE::init() {
bool SoftmaxPE::dispatch() {
Tensor *input = param_.input;
Tensor *output = param_.output;
input->syncToCPU();
Tensor float_input;
Tensor float_output;
float_input.mutableData<float>(DataType::FP32, input->shape());
input->syncToDevice();
float_input.copyFrom(input);
float *out_data =
......@@ -154,6 +155,7 @@ bool SoftmaxPE::dispatch() {
float_output.flush();
output->copyFrom(&float_output);
output->flush();
return true;
}
......
......@@ -105,7 +105,7 @@ class SplitPE : public PE {
in_stride,
out_stride[axis]);
input_offset += out_stride[axis];
// out->flush();
out->flush();
}
return true;
}
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
namespace paddle {
namespace zynqmp {
float sigmoid(float x) { return 1.0 / (1.0 + std::exp(-x)); }
// Decodes one raw YOLO prediction (4 consecutive floats at
// x[index .. index+3]) into a center-form box (cx, cy, w, h) scaled to the
// original image:
//   cx, cy : sigmoid offset inside grid cell (w, h), rescaled by image size
//   w,  h  : exp-scaled anchor dimensions, normalized by the network input
// `anchors` holds flat (w, h) pairs; `an_idx` selects the pair.
// `grid_size` is the number of cells per side; `input_size` the network
// input resolution in pixels.
inline void GetYoloBox(float* box,
                       const float* x,
                       const int* anchors,
                       int w,
                       int h,
                       int an_idx,
                       int grid_size,
                       int input_size,
                       int index,
                       int img_height,
                       int img_width) {
  box[0] = (w + sigmoid(x[index])) * img_width * 1.0f / grid_size;
  box[1] = (h + sigmoid(x[index + 1])) * img_height * 1.0f / grid_size;
  box[2] = std::exp(x[index + 2]) * anchors[2 * an_idx] * img_width * 1.0f /
           input_size;
  box[3] = std::exp(x[index + 3]) * anchors[2 * an_idx + 1] * img_height *
           1.0f / input_size;
}
// Flattened offset of one prediction entry in the raw YOLO output tensor,
// laid out as [batch][anchor][entry][hw]: `an_stride` is the span of one
// anchor, `stride` the span of one entry, `hw_idx` the spatial position.
inline int GetEntryIndex(int batch,
                         int an_idx,
                         int hw_idx,
                         int an_num,
                         int an_stride,
                         int stride,
                         int entry) {
  const int anchor_offset = (batch * an_num + an_idx) * an_stride;
  const int entry_offset = entry * stride;
  return anchor_offset + entry_offset + hw_idx;
}
// Converts a center-form box (cx, cy, w, h) in `box` into corner form
// (x_min, y_min, x_max, y_max) at boxes[box_idx..box_idx+3], clamping the
// corners to the valid image area [0, img_width-1] x [0, img_height-1].
inline void CalcDetectionBox(float* boxes,
                             float* box,
                             const int box_idx,
                             const int img_height,
                             const int img_width) {
  const float half_w = box[2] / 2;
  const float half_h = box[3] / 2;
  const float x_limit = static_cast<float>(img_width - 1);
  const float y_limit = static_cast<float>(img_height - 1);

  boxes[box_idx] = std::max(box[0] - half_w, 0.0f);
  boxes[box_idx + 1] = std::max(box[1] - half_h, 0.0f);
  boxes[box_idx + 2] = std::min(box[0] + half_w, x_limit);
  boxes[box_idx + 3] = std::min(box[1] + half_h, y_limit);
}
// Writes per-class detection scores: each class probability (sigmoid of
// the raw logit at input[label_idx + c]) multiplied by the objectness
// confidence `conf`, stored at scores[score_idx + c].
inline void CalcLabelScore(float* scores,
                           const float* input,
                           const int label_idx,
                           const int score_idx,
                           const int class_num,
                           const float conf) {
  float* dst = scores + score_idx;
  const float* logits = input + label_idx;
  for (int c = 0; c < class_num; ++c) {
    dst[c] = conf * sigmoid(logits[c]);
  }
}
// CPU implementation of the yolo_box op: decodes a raw YOLO feature map
// into detection boxes (corner form, clamped to the image) and per-class
// scores, skipping anchors whose objectness is below confThresh.
class YoloBoxPE : public PE {
 public:
  // Outputs are produced on the CPU in unaligned layout.
  bool init() {
    param_.outputBoxes->setAligned(false);
    param_.outputScores->setAligned(false);
    param_.outputBoxes->setDataLocation(CPU);
    param_.outputScores->setDataLocation(CPU);
    return true;
  }

  bool dispatch() {
    auto* input = param_.input;
    auto* imgsize = param_.imgSize;
    auto* boxes = param_.outputBoxes;
    auto* scores = param_.outputScores;
    auto anchors = param_.anchors;

    int class_num = param_.classNum;
    float conf_thresh = param_.confThresh;
    int downsample_ratio = param_.downsampleRatio;

    const int height = input->shape().height();
    const int width = input->shape().width();
    const int an_num = anchors.size() / 2;
    // Network input resolution implied by the feature-map size.
    int input_size = downsample_ratio * height;

    // Anchor (w, h) pairs copied into an INT32 tensor for GetYoloBox.
    Tensor anchors_;
    Shape anchors_shape(N, {an_num * 2});
    auto anchors_data = anchors_.mutableData<int32_t>(INT32, anchors_shape);
    std::copy(anchors.begin(), anchors.end(), anchors_data);

    input->syncToCPU();

    // Work on an FP32, unaligned copy of the input.
    Tensor input_float;
    input_float.setDataLocation(CPU);
    float* input_data = input_float.mutableData<float>(FP32, input->shape());
    input_float.setAligned(input->aligned());
    input_float.copyFrom(input);
    input_float.unalignImage();

    // imgsize holds {height, width} of the original image.
    int32_t* imgsize_data = imgsize->mutableData<int32_t>();

    Tensor boxes_float;
    Tensor scores_float;
    boxes_float.setDataLocation(CPU);
    float* boxes_float_data =
        boxes_float.mutableData<float>(FP32, boxes->shape());
    memset(boxes_float_data, 0, boxes->shape().numel() * sizeof(float));
    scores_float.setDataLocation(CPU);
    float* scores_float_data =
        scores_float.mutableData<float>(FP32, scores->shape());
    memset(scores_float_data, 0, scores->shape().numel() * sizeof(float));

    float box[4];
    int img_height = imgsize_data[0];
    int img_width = imgsize_data[1];

    int channel = input_float.shape().channel();
    for (int h = 0; h < height; h++) {
      for (int w = 0; w < width; w++) {
        for (int n = 0; n < an_num; n++) {
          // Each anchor occupies (5 + class_num) channels:
          // [x, y, w, h, objectness, class logits...].
          int obj_idx =
              channel * width * h + channel * w + n * (5 + class_num) + 4;
          float conf = sigmoid(input_data[obj_idx]);
          if (conf < conf_thresh) {
            continue;  // below-threshold anchors keep their zeroed outputs
          }

          int box_idx =
              channel * width * h + channel * w + n * (5 + class_num) + 0;
          GetYoloBox(box,
                     input_data,
                     anchors_data,
                     w,
                     h,
                     n,
                     height,
                     input_size,
                     box_idx,
                     img_height,
                     img_width);

          box_idx = h * an_num * 4 * width + an_num * 4 * w + n * 4;
          CalcDetectionBox(
              boxes_float_data, box, box_idx, img_height, img_width);

          int label_idx =
              channel * width * h + channel * w + n * (5 + class_num) + 5;
          int score_idx = h * an_num * class_num * width +
                          an_num * class_num * w + n * class_num;
          CalcLabelScore(scores_float_data,
                         input_data,
                         label_idx,
                         score_idx,
                         class_num,
                         conf);
        }
      }
    }
    boxes->copyFrom(&boxes_float);
    scores->copyFrom(&scores_float);
    input->setAligned(true);
    // BUG FIX: the function previously fell off the end of a bool-returning
    // function (undefined behavior).
    return true;
  }

  void apply() {}

  YoloBoxParam& param() { return param_; }

 private:
  YoloBoxParam param_;
};
} // namespace zynqmp
} // namespace paddle
......@@ -32,6 +32,10 @@ static struct N n_;
class Shape {
public:
std::function<int(Shape& s)> aligment_fuction = [](Shape& s) { // NOLINT
return s.layout_->alignedElementCount(s.dims_);
};
explicit Shape(std::vector<int> dims) { dims_ = dims; }
Shape(LayoutType type, std::vector<int> dims) {
......@@ -44,6 +48,10 @@ class Shape {
setLayoutType(src.layoutType_);
}
void setAligmentFunction(std::function<int(Shape& s)> f) { // NOLINT
aligment_fuction = f;
}
bool shouldAlign() {
return layout_->alignedElementCount(dims_) != layout_->elementCount(dims_);
}
......@@ -72,13 +80,11 @@ class Shape {
std::vector<int> dims() { return dims_; }
size_t memorySize(int cellSize) {
return layout_->alignedElementCount(dims_) * cellSize;
}
size_t memorySize(int cellSize) { return aligment_fuction(*this) * cellSize; }
int numel() { return layout_->elementCount(dims_); }
int alignedElementCount() { return layout_->alignedElementCount(dims_); }
int alignedElementCount() { return aligment_fuction(*this); }
void setLayoutType(LayoutType layout) {
this->layoutType_ = layout;
......
......@@ -38,6 +38,7 @@ enum DataType : int {
FP16 = 1,
INT8 = 2,
INT32 = 3,
INT64 = 4,
};
enum DataSyncStatus : int {
......@@ -58,6 +59,8 @@ inline int CellSize(DataType type) {
return sizeof(int32_t);
case INT8:
return sizeof(int8_t);
case INT64:
return sizeof(int64_t);
default:
return 0;
}
......@@ -66,17 +69,16 @@ inline int CellSize(DataType type) {
class PlaceHolder {
public:
PlaceHolder() {}
explicit PlaceHolder(size_t size) {
size_ = size;
data_ = fpga_malloc(size_);
memset(data_, 0, size);
fpga_flush(data_, size);
}
void* data() { return data_; }
void set_data(const void* ptr) { data_ = const_cast<void*>(ptr); }
size_t memorySize() { return size_; }
void set_size(size_t new_size) { size_ = new_size; }
~PlaceHolder() { fpga_free(data_); }
......@@ -99,7 +101,7 @@ class Tensor {
return nullptr;
}
void* ptr = reinterpret_cast<char*>(this->placeHolder_->data()) +
offset * CellSize(dataType_);
offset_ * CellSize(dataType_);
return reinterpret_cast<Dtype*>(ptr);
}
......@@ -116,7 +118,7 @@ class Tensor {
template <typename Dtype>
Dtype* mutableData() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
shape_->memorySize(CellSize(dataType_)) * mem_factor_ + 16;
if (placeHolder_ != nullptr) {
if (memorySize > placeHolder_->memorySize()) {
placeHolder_.reset(new PlaceHolder(memorySize));
......@@ -134,6 +136,10 @@ class Tensor {
return placeHolder_->memorySize();
}
void setMemScale(float mem_factor) { mem_factor_ = mem_factor; }
void setOffset(int offset) { offset_ = offset; }
void setDataType(DataType dataType) { this->dataType_ = dataType; }
DataType dataType() { return this->dataType_; }
......@@ -240,10 +246,6 @@ class Tensor {
}
}
void setMemScale(float scale_factor) {
this->mem_scale_factor_ = scale_factor;
}
void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
......@@ -254,7 +256,7 @@ class Tensor {
this->dataType_ = src->dataType_;
this->aligned_ = src->aligned_;
this->dateLocation_ = src->dateLocation_;
this->offset = offset;
this->offset_ = offset;
shape_ = new Shape(const_cast<Shape&>(shape));
}
......@@ -279,16 +281,13 @@ class Tensor {
.height = 1,
.pad_width = 0u,
.pad_height = 0u};
ImageOutputArgs output = {
args.output = {
.address = data<void>(), .scale_address = scale(),
};
args.output = output;
src->syncToDevice();
size_t aligned_remainder = src->shape().numel() % 16;
if (aligned_remainder > 0) {
size_t dtype_size =
src->dataType_ == FP32 ? sizeof(float) : sizeof(float16);
size_t dtype_size = CellSize(src->dataType_);
void* dst = src->data<char>() + src->shape().numel() * dtype_size;
memset(dst, 0, aligned_remainder * dtype_size);
fpga_flush(dst, aligned_remainder * dtype_size);
......@@ -299,14 +298,10 @@ class Tensor {
this->invalidate();
}
void flush() {
size_t memorySize = placeHolder_->memorySize();
fpga_flush(placeHolder_->data(), memorySize);
}
void flush() { fpga_flush(placeHolder_->data(), placeHolder_->memorySize()); }
void invalidate() {
size_t memorySize = placeHolder_->memorySize();
fpga_invalidate(placeHolder_->data(), memorySize);
fpga_invalidate(placeHolder_->data(), placeHolder_->memorySize());
}
void sync() {
......@@ -348,16 +343,17 @@ class Tensor {
}
}
void printScale(std::string type) { printScale(); }
std::string dimsFileName() {
return paddle::lite::to_string(shape_->num()) + "_" +
paddle::lite::to_string(shape_->channel()) + "_" +
paddle::lite::to_string(shape_->height()) + "_" +
paddle::lite::to_string(shape_->width()) + ".txt";
return std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width()) + ".txt";
}
void saveToFile() { std::string path = dimsFileName(); }
void saveToFile() {
std::string path = dimsFileName();
// saveToFile(path);
}
void saveToFile(std::string prefix, bool with_shape) {
std::string path = prefix;
......@@ -371,34 +367,61 @@ class Tensor {
void saveToFile(std::string path) {
syncToCPU();
invalidate();
std::ofstream ofs;
static int counter = 0;
std::string npath = paddle::lite::to_string(counter) + "_" + path;
std::string npath = std::to_string(counter) + "_" + path;
counter++;
save_file_with_name(npath);
}
// Dumps the tensor to a text file for debugging: a one-line header
// (data type, scale[0], tensor id) followed by one value per line.
// Aligned tensors are first copied into an unaligned temporary so the
// file reflects logical element order.
void save_file_with_name(std::string path) {
  invalidate();
  Tensor* t = this;
  Tensor unaligned;
  if (this->aligned_) {
    unaligned.dataType_ = this->dataType_;
    unaligned.aligned_ = this->aligned_;
    unaligned.mutableData<void>(dataType_, *shape_);
    unaligned.copyFrom(this);
    unaligned.unalignImage();
    unaligned.syncToCPU();
    t = &unaligned;  // read from the unaligned copy below
  }

  std::ofstream ofs;
  ofs.open(path);
  ofs << "type:" << dataType_ << " scale: " << scale()[0] << " id:" << id_
      << std::endl;

  for (int i = 0; i < shape_->numel(); i++) {
    float value = 0;
    switch (dataType_) {
      case FP16:
        value = half_to_float(t->data<float16>()[i]);
        break;
      case FP32:
        value = t->data<float>()[i];
        break;
      case INT8:
        value = t->data<int8_t>()[i];
        break;
      case INT32:
        // BUG FIX: read via `t` like the other cases; previously this read
        // from `this`, bypassing the unaligned copy for aligned tensors.
        value = t->data<int32_t>()[i];
        break;
      case INT64:
        value = t->data<int64_t>()[i];
        break;
      default:
        std::cout << "Unknown type!! \n";
        exit(-1);
    }
    ofs << value << std::endl;
  }
  ofs.close();
}
void releaseData() { placeHolder_.reset(); }
void readFromFile(std::string path) {
std::ifstream file_stream;
file_stream.open(path);
......@@ -408,48 +431,25 @@ class Tensor {
int num = shape_->numel();
invalidate();
float max = 0.0f;
if (dataType_ == FP16) {
float16* data = mutableData<float16>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = float_to_half(value);
}
} else {
float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = value;
}
float16* data = mutableData<float16>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = float_to_half(value);
}
flush();
placeHolder_->scale_[0] = max / 127.0f;
placeHolder_->scale_[1] = 127.0f / max;
}
// Streams a human-readable dump of the tensor (dims + every element) —
// debugging aid only. Note: only FP32 is read natively; every other
// data type is treated as FP16 and converted, so integer tensors would
// print garbage here.
friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) {
  os << "tensor:"
     << "\n";
  os << "dims: {";
  for (int i = 0; i < tensor.shape().dimSize(); ++i) {
    os << tensor.shape()[i] << " ";
  }
  os << "}\n";
  for (int i = 0; i < tensor.shape().numel(); i++) {
    float value = 0;
    if (tensor.dataType() == FP32) {
      value = tensor.data<float>()[i];
    } else {
      value = half_to_float(tensor.data<float16>()[i]);
    }
    os << value << " ";
  }
  os << "\n";
  return os;
}
void setCacheable(bool cacheable) { cacheable_ = cacheable; }
bool cacheable() { return cacheable_; }
void setCached(bool cached) { cached_ = cached; }
bool cached() { return cached_; }
~Tensor() {
if (shape_ != nullptr) {
......@@ -459,8 +459,10 @@ class Tensor {
}
private:
int offset = 0;
float mem_scale_factor_ = 1.0f;
bool cacheable_ = false;
bool cached_ = false;
int offset_ = 0;
float mem_factor_ = 1.0f;
std::shared_ptr<PlaceHolder> placeHolder_;
Shape* shape_ = nullptr;
DataType dataType_ = FP32;
......
......@@ -22,21 +22,17 @@ using value_type = int64_t;
value_type DDimLite::production() const {
value_type res = 1;
for (size_t i = 0; i < this->size(); i++) {
res *= (*this)[i];
for (size_t i = 0; i < data_.size(); i++) {
res *= data_[i];
}
return res;
}
value_type DDimLite::count(int start, int end) const {
if (start < 0) {
start = 0;
}
if (end > size()) {
end = size();
}
start = (std::max)(start, 0);
end = (std::min)(end, static_cast<int>(data_.size()));
if (end < start) {
end = start;
return 0;
}
value_type sum = 1;
for (auto i = start; i < end; ++i) {
......@@ -46,11 +42,13 @@ value_type DDimLite::count(int start, int end) const {
}
DDimLite DDimLite::Slice(int start, int end) const {
std::vector<value_type> vec;
start = (std::max)(start, 0);
end = (std::min)(end, static_cast<int>(data_.size()));
std::vector<value_type> new_dim(end - start);
for (int i = start; i < end; i++) {
vec.push_back((*this)[i]);
new_dim[i - start] = data_[i];
}
return DDimLite(vec);
return DDim(new_dim);
}
std::string DDimLite::repr() const {
......@@ -69,7 +67,7 @@ std::string DDimLite::repr() const {
}
void TensorLite::ShareDataWith(const TensorLite &other) {
buffer_ = other.buffer_;
buffer_ = other.buffer_; // TODO(chonwhite) delete buffer;
dims_ = other.dims_;
zynq_tensor_ = other.zynq_tensor_;
target_ = other.target_;
......@@ -78,30 +76,35 @@ void TensorLite::ShareDataWith(const TensorLite &other) {
throw - 1;
}
void *TensorLite::mutable_data(size_t memory_size) {
memory_size_ = memory_size;
buffer_->ResetLazy(target_, memory_size_);
// throw -1;
std::cout << memory_size << std::endl;
return buffer_->data();
}
void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
target_ = target;
return mutable_data(memory_size);
}
void TensorLite::CopyDataFrom(const TensorLite &other) {
dims_ = other.dims_;
target_ = other.target_;
lod_ = other.lod_;
auto dt = zynq_tensor_->dataType();
auto shape = other.zynq_tensor_->shape();
if (zynq_tensor_.get() == nullptr) {
zynq_tensor_.reset(new zynqmp::Tensor());
}
auto dt = zynq_tensor_->dataType();
Resize(other.dims());
auto shape = other.zynq_tensor_->shape();
zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
this->ZynqTensor()->copyFrom(other.ZynqTensor());
precision_ = other.precision_;
memcpy(this->ZynqTensor()->data<void>(),
other.ZynqTensor()->data<void>(),
other.ZynqTensor()->shape().numel() * sizeof(float));
}
void *TensorLite::mutable_data(size_t memory_size) {
memory_size_ = memory_size; // TODO(chonwhite) delete buffer;
buffer_->ResetLazy(target_, memory_size_);
return buffer_->data();
}
void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
target_ = target;
return mutable_data(memory_size);
}
} // namespace lite
......
......@@ -78,7 +78,11 @@ class DDimLite {
}
friend bool operator!=(const DDimLite &a, const DDimLite &b) {
return !(a == b);
if (a.size() != b.size()) return true;
for (size_t i = 0; i < a.size(); i++) {
if (a[i] != b[i]) return true;
}
return false;
}
private:
......@@ -93,7 +97,7 @@ class TensorLite {
TensorLite() : buffer_(std::make_shared<Buffer>()) {}
template <typename DType, typename DimT, TargetType Target>
void Assign(DType *data, const DimT &dim) {
void Assign(const DType *data, const DimT &dim) {
Resize(dim);
auto *dst = mutable_data<DType, void>(Target);
CopySync<Target>(
......@@ -107,10 +111,11 @@ class TensorLite {
// Typed read-only view of the underlying zynqmp tensor, shifted by this
// tensor's offset_ (pointer arithmetic in elements of R, not bytes —
// NOTE(review): confirm offset_ is meant to be an element count here).
template <typename T, typename R = T>
const R *data() const {
  return zynq_tensor_->data<R>() + offset_;
}
void Resize(const DDimLite &ddim) { dims_ = ddim; }
void Resize(const std::vector<int64_t> &x) { dims_ = DDimLite(x); }
void Resize(const std::vector<int64_t> &x) { dims_.ConstructFrom(x); }
const DDimLite &dims() const { return dims_; }
int64_t numel() const { return dims_.production(); }
......@@ -142,7 +147,16 @@ class TensorLite {
void *mutable_data(size_t memory_size);
void *mutable_data(TargetType target, size_t memory_size);
const void *raw_data() const { return buffer_->data(); }
const void *raw_data() const {
return buffer_->data();
} // TODO(chonwhite) delete buffer;
void clear() {
// zynq_tensor_->releaseData();
if (zynq_tensor_) {
memset(zynq_tensor_->data<void>(), 0, zynq_tensor_->memorySize());
}
}
size_t data_size() const { return this->dims().production(); }
......@@ -150,17 +164,19 @@ class TensorLite {
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); }
void clear() {
buffer_->Free();
offset_ = 0;
}
bool IsInitialized() const {
return buffer_->data();
} // TODO(chonwhite) delete buffer;
// Other share data to this.
void ShareDataWith(const TensorLite &other);
void CopyDataFrom(const TensorLite &other);
void ResetBuffer(std::shared_ptr<Buffer> buffer, size_t memory_size) {
// TODO(chonwhite) deal with buffer;
}
template <typename T>
TensorLite Slice(int64_t begin, int64_t end) const;
......@@ -169,7 +185,7 @@ class TensorLite {
TargetType target() const { return target_; }
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_.get(); }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
os << "Tensor:" << '\n';
......@@ -189,7 +205,7 @@ class TensorLite {
// set values of precision_ and persistable_ after updating it.
// If your tensor is just a temp tensor, such as activations,
// you can ignore these two attributes.
PrecisionType precision_{PrecisionType::kUnk};
PrecisionType precision_{PrecisionType::kFloat};
bool persistable_{false};
DDimLite dims_;
......@@ -198,12 +214,62 @@ class TensorLite {
size_t memory_size_{};
size_t offset_{0};
zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
std::shared_ptr<zynqmp::Tensor> zynq_tensor_;
template <typename T>
void mutable_data_internal();
};
// Maps a C++ scalar type to the zynqmp DataType enum. Unrecognized types
// (including float itself) resolve to FP32.
template <typename T>
zynqmp::DataType get_date_type() {
  if (typeid(T) == typeid(zynqmp::float16)) {
    return zynqmp::FP16;
  }
  if (typeid(T) == typeid(int) || typeid(T) == typeid(int32_t)) {
    return zynqmp::INT32;
  }
  if (typeid(T) == typeid(int8_t)) {
    return zynqmp::INT8;
  }
  if (typeid(T) == typeid(int64_t)) {
    return zynqmp::INT64;
  }
  return zynqmp::FP32;
}
// Maps a C++ scalar type to the lite PrecisionType enum. Unrecognized
// types resolve to PrecisionType::kUnk.
template <typename T>
PrecisionType get_precistion_type() {
  if (typeid(T) == typeid(float)) {
    return PrecisionType::kFloat;
  }
  if (typeid(T) == typeid(zynqmp::float16)) {
    return PrecisionType::kFP16;
  }
  if (typeid(T) == typeid(int) || typeid(T) == typeid(int32_t)) {
    return PrecisionType::kInt32;
  }
  if (typeid(T) == typeid(int8_t)) {
    return PrecisionType::kInt8;
  }
  if (typeid(T) == typeid(int64_t)) {
    return PrecisionType::kInt64;
  }
  return PrecisionType::kUnk;
}
template <typename T, typename R>
R *TensorLite::mutable_data() {
std::vector<int> v;
......@@ -229,14 +295,13 @@ R *TensorLite::mutable_data() {
break;
}
zynqmp::Shape input_shape(layout_type, v);
zynqmp::DataType data_type = get_date_type<T>();
precision_ = get_precistion_type<T>();
zynqmp::DataType data_type = zynqmp::FP32;
if (typeid(T) == typeid(float)) {
data_type = zynqmp::FP32;
}
if (typeid(T) == typeid(zynqmp::float16)) {
data_type = zynqmp::FP16;
if (zynq_tensor_.get() == nullptr) {
zynq_tensor_.reset(new zynqmp::Tensor());
}
return zynq_tensor_->mutableData<R>(data_type, input_shape);
}
......@@ -268,14 +333,13 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
memcpy(dst_data,
src_data + static_cast<size_t>(begin * base) * sizeof(T),
dst_dims.production() * sizeof(T));
dst.ZynqTensor()->saveToFile("_slice", true);
return dst;
}
}
template <typename T>
void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {
// TODO(chonwhite) delete this function;
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/program.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
// Debugging hook invoked around each instruction of a program run: logs
// op execution and optionally dumps named output tensors to text files.
class Monitor {
 public:
  static Monitor& get_instance() {
    static Monitor s_instance;
    return s_instance;
  }

  void inferStart() {}

  // Called before an instruction runs; logs the op type and kernel name.
  void preRun(Instruction& inst) {  // NOLINT
    auto op = const_cast<OpLite*>(inst.op());
    auto op_type = op->Type();
    VLOG(4) << "Running op:" << op_type << " on " << inst.kernel()->name();
  }

  // Called after an instruction runs; dumps each output tensor whose
  // (sanitized) name appears in `tensor_names`.
  void postRun(Instruction& inst) {  // NOLINT
    auto op = const_cast<OpLite*>(inst.op());
    auto op_info = op->op_info();

    // Tensors to dump; empty by default — edit locally when debugging.
    static const std::vector<std::string> tensor_names = {};
    // Capture by reference: the previous by-value capture copied the
    // vector on every postRun call.
    auto should_print = [&tensor_names](const std::string& name) -> bool {
      return std::find(tensor_names.begin(), tensor_names.end(), name) !=
             tensor_names.end();
    };

    for (auto name : op_info->output_names()) {
      VLOG(4) << "\n out_tensor:" << name;
      auto* var = op->scope()->FindVar(name);
      if (!var->IsType<lite::Tensor>()) {
        continue;
      }
      lite::Tensor* tensor =
          const_cast<lite::Tensor*>(&var->Get<lite::Tensor>());
      if (tensor->ZynqTensor() == nullptr) {
        continue;
      }
      // '/' is illegal in file names; replace the last one with '_'.
      std::string substr = "/";
      std::size_t found = name.rfind(substr);
      VLOG(4) << "\n out_tensor:::" << name << "," << found;
      if (found != std::string::npos) {
        name.replace(found, substr.length(), "_");
      }
      VLOG(4) << "\n out_tensor:::" << name;
      if (should_print(name)) {
        tensor->ZynqTensor()->saveToFile(name, true);
      }
    }
  }

  void inferEnd() {}

 private:
};
} // namespace lite
} // namespace paddle
......@@ -44,6 +44,7 @@ lite_cc_library(mir_passes
elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
fpga_kernel_place_correct_pass.cc
type_target_cast_pass.cc
type_layout_cast_pass.cc
type_precision_cast_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fpga_kernel_place_correct_pass.h"
#include <memory>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
// Pass entry point: delegates to CorrectArgumentPlace (declared in the
// header), which patches kernel/argument places for FPGA tool operators.
void KernelPlaceCorrectPass::Apply(const std::unique_ptr<SSAGraph> &graph) {
  CorrectArgumentPlace(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(kernel_place_correct_pass,
paddle::lite::mir::KernelPlaceCorrectPass)
.BindTargets({TARGET(kFPGA)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
namespace mir {
/*
* Correct the place of the variables in the SSAGrpah, it will inference the
* variables' place by the kernels outputs them.
*/
class KernelPlaceCorrectPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
private:
void CorrectArgumentPlace(SSAGraph* graph) {
auto& valid_places = graph->valid_places();
auto valid_places_has_target = [&](TargetType t) -> bool {
for (auto& p : valid_places) {
if (p.target == t) {
return true;
}
}
return false;
};
std::map<std::string, bool> lite_with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt();
// The IoCopyOp is a tool operator, it won't support the type inference.
// in fpga, we has io_copy+cali+layout tool ops, so we need type inference
// for
// tool operator
if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
VLOG(3) << "inst.op_type() == 'io_copy', continue";
if (inst.op_type() == "io_copy") continue;
}
// deal with inputs
VLOG(4) << "checking op " << inst.op_info()->Repr();
auto get_argname = [&](
const std::string& node_name,
const std::map<std::string, std::vector<std::string>>& argname_map)
-> std::string {
for (auto& ele : argname_map) {
auto it =
std::find(ele.second.begin(), ele.second.end(), node_name);
if (it != ele.second.end()) return ele.first;
}
return "";
};
auto in = x->inlinks.front();
if (!in) {
break;
}
auto out = x->outlinks.front();
auto p = in->AsArg().type->precision();
std::string node_name = out->AsArg().name;
std::string arg_name = get_argname(node_name, inst.op_info()->outputs());
auto op_type = inst.op_type();
if (op_type == "reshape" || op_type == "reshape2") {
for (auto* x_in : x->inlinks) {
std::string in_name =
get_argname(x_in->AsArg().name, inst.op_info()->inputs());
if (in_name == "X") {
in = x_in;
}
}
p = in->AsArg().type->precision();
if (p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kHost);
UpdateTensor(inst, in, out, TargetType::kHost);
}
}
if (inst.op_type() == "fetch") {
UpdateTarget(inst, TargetType::kFPGA);
}
if (inst.op_type() == "split" || inst.op_type() == "transpose" ||
inst.op_type() == "transpose2") {
if (p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kARM);
for (auto* x_out : x->outlinks) {
UpdateTensor(inst, in, x_out, TargetType::kARM);
}
}
}
if (inst.op_type() == "concat") {
if (p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kARM);
UpdateTensor(inst, in, out, TargetType::kARM);
}
}
if (inst.op_type() == "elementwise_mul") {
UpdateTarget(inst, TargetType::kFPGA);
for (auto* in : x->inlinks) {
std::string in_name =
get_argname(in->AsArg().name, inst.op_info()->inputs());
if (in_name == "Y") {
in = in;
p = in->AsArg().type->precision();
std::unique_ptr<KernelBase> best_match;
for (auto& k : inst.kernels()) {
auto kp = k->GetInputDeclType(in_name)->precision();
if (kp == p) {
best_match = std::move(k);
}
}
inst.kernels().clear();
inst.kernels().emplace_back(std::move(best_match));
break;
}
}
}
}
}
// Update me's kUnk fields by other's fields.
// Rebind the statement's candidate kernels to a single place whose target is
// `new_target`. Moving onto ARM or Host additionally forces the place to
// float precision and NCHW layout, since those backends consume plain
// float/NCHW tensors.
void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) {  // NOLINT
  auto place = inst.place();
  place.target = new_target;
  const bool needs_float_nchw =
      new_target == TargetType::kARM || new_target == TargetType::kHost;
  if (needs_float_nchw) {
    place.precision = PrecisionType::kFloat;
    place.layout = DataLayoutType::kNCHW;
  }
  std::vector<Place> places{place};
  inst.ResetKernels(places);
}
// Rewrite the `out` argument node's tensor type so that it agrees with the
// kernel input declaration looked up from `in` — or, when `new_target` is
// ARM/Host, with that backend's required float/NCHW form.
//
// Fix vs. original: the local `arg_name` (looked up from the op's outputs)
// was computed but never used; the dead lookup is removed. The identical
// ARM and Host branches are also merged.
void UpdateTensor(mir::Node::Stmt& inst,  // NOLINT
                  Node* in,
                  Node* out,
                  TargetType new_target = TargetType::kUnk) {
  // Reverse lookup: map a var node name back to its op argument name.
  auto get_argname = [&](
      const std::string& node_name,
      const std::map<std::string, std::vector<std::string>>& argname_map)
      -> std::string {
    for (auto& ele : argname_map) {
      auto it = std::find(ele.second.begin(), ele.second.end(), node_name);
      if (it != ele.second.end()) return ele.first;
    }
    return "";
  };
  std::string in_name =
      get_argname(in->AsArg().name, inst.op_info()->inputs());

  auto type = inst.picked_kernel().GetInputDeclType(in_name);
  auto tmp_ptype = in->AsArg().type->precision();
  auto tmp_target = type->target();
  auto tmp_layout = type->layout();

  // ARM and Host both consume float tensors in NCHW layout.
  if (new_target == TargetType::kARM || new_target == TargetType::kHost) {
    tmp_target = new_target;
    tmp_ptype = PrecisionType::kFloat;
    tmp_layout = DataLayoutType::kNCHW;
  }

  out->AsArg().type =
      LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout);
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -32,9 +32,12 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(graph.get());
#endif
#endif
fusion::FcFuser fuser2(false);
fuser2(graph.get());
#ifdef LITE_WITH_FPGA
fusion::FcFuser fpga_fuser(true);
fpga_fuser(graph.get());
#endif
}
} // namespace mir
......
......@@ -51,10 +51,17 @@ class IoCopyKernelPickPass : public StmtPass {
// directly.
if (TargetCompatibleTo(*outy, *out_arg_ty)) {
LOG(INFO) << "get a IOCopy kernel";
if (kernel->target() == TargetType::kFPGA) {
node.outlinks.front()->AsArg().type = LiteType::GetTensorTy(
kernel->target(), kernel->precision(), kernel->layout());
}
auto x = std::move(kernel);
kernels.clear();
kernels.emplace_back(std::move(x));
is_found = true;
break;
}
}
......
......@@ -148,6 +148,49 @@ class StaticKernelPickPass : public mir::StmtPass {
}
}
if (kernel.target() == TARGET(kFPGA)) {
VLOG(4) << "alias:" << kernel.alias();
/**
* we want to use fpga kernel as much as possible, so we give it a very
*high score,
* so this kernel can be picked, it may be not the best option, and we
*shall correct
* it in kernel_place_correct_pass
*
* 4000 is a arbitrary high score that can purpress all the other kernels.
**/
final_score = 4000;
for (size_t i = 0; i < in_names.size(); ++i) {
std::string tmp;
CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp));
if (in_types.count(in_names[i]) &&
in_types.at(in_names[i]) ==
kernel.GetInputDeclType(tmp)->precision()) {
final_score += 100; // multiple inputs pick the most matched one;
}
}
for (size_t i = 0; i < out_names.size(); ++i) {
std::string tmp;
CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp));
VLOG(4) << tmp << " == "
<< PrecisionToStr(kernel.GetOutputDeclType(tmp)->precision());
if (out_types.count(out_names[i]) > 0) {
VLOG(4) << "decType: "
<< PrecisionToStr(kernel.GetOutputDeclType(tmp)->precision());
VLOG(4) << "cout:" << out_types.count(out_names[i]) << " type_name: "
<< PrecisionToStr(out_types.at(out_names[i]));
}
if (out_types.count(out_names[i]) &&
out_types.at(out_names[i]) ==
kernel.GetOutputDeclType(tmp)->precision()) {
final_score += 100; // multiple outputs pick the most matched one;
}
}
}
VLOG(4) << "[score(final)]:" << final_score;
VLOG(2) << "-------- pick summary for " << instruct.op_type()
<< " --------";
......
......@@ -80,99 +80,100 @@ class Optimizer {
InitControlFlowOpUnusedInputsAndOutputsEliminatePass();
if (passes.empty() || passes.size() == 1) {
std::vector<std::string> passes_local{{
"lite_quant_dequant_fuse_pass", //
"weight_quantization_preprocess_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
"lite_conv_conv_fuse_pass", //
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
"lite_var_conv_2d_activation_fuse_pass", //
"lite_match_matrix_activation_fuse_pass", //
"lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", //
"lite_transpose_softmax_transpose_fuse_pass", //
"lite_interpolate_fuse_pass", //
"identity_scale_eliminate_pass", //
"lite_scales_fuse_pass", //
"lite_sequence_reverse_embedding_fuse_pass", //
"elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
std::vector<std::string> passes_local{
{"lite_quant_dequant_fuse_pass", //
"weight_quantization_preprocess_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
"lite_conv_conv_fuse_pass", //
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
"lite_var_conv_2d_activation_fuse_pass", //
"lite_match_matrix_activation_fuse_pass", //
"lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", //
"lite_transpose_softmax_transpose_fuse_pass", //
"lite_interpolate_fuse_pass", //
"identity_scale_eliminate_pass", //
"lite_scales_fuse_pass", //
"lite_sequence_reverse_embedding_fuse_pass", //
"elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM)
"lite_elementwise_activation_fuse_pass", //
"lite_elementwise_activation_fuse_pass", //
#endif
"identity_dropout_eliminate_pass",
"__xpu__resnet_fuse_pass",
"__xpu__resnet_d_fuse_pass",
"__xpu__resnet_cbam_fuse_pass",
"__xpu__conv2d_fuse_pass",
"__xpu__conv2d_link_previous_out_max_pass",
"__xpu__sfa_head_meanstd_fuse_pass",
"__xpu__sfa_head_moment_fuse_pass",
"__xpu__mmdnn_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer
// the output scale and
// fix the attribute
// 'enable_int8' for all
// of the quantized ops.
"npu_subgraph_pass",
"huawei_ascend_npu_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"mlu_subgraph_pass",
"control_flow_op_unused_inputs_and_outputs_eliminate_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
"variable_place_inference_pass", // inference arg/var's
"mlu_postprocess_pass",
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
// info
// (target/precision/layout/device)
"type_target_cast_pass", // add io_copy/io_copy_once if meet
// different targets when last and next
// node
"variable_place_inference_pass", //
"argument_type_display_pass", //
"io_copy_kernel_pick_pass", //
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_precision_cast_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_layout_cast_pass", // add layout/layout_once op if meet
// different layout when last and next node
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
"lite_reshape_fuse_pass",
"memory_optimize_pass" // you can comment this line when enable
// PRECISION_PROFILE
}};
"identity_dropout_eliminate_pass",
"__xpu__resnet_fuse_pass",
"__xpu__resnet_d_fuse_pass",
"__xpu__resnet_cbam_fuse_pass",
"__xpu__conv2d_fuse_pass",
"__xpu__conv2d_link_previous_out_max_pass",
"__xpu__sfa_head_meanstd_fuse_pass",
"__xpu__sfa_head_moment_fuse_pass",
"__xpu__mmdnn_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer
// the output scale and
// fix the attribute
// 'enable_int8' for all
// of the quantized ops.
"npu_subgraph_pass",
"huawei_ascend_npu_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"mlu_subgraph_pass",
"control_flow_op_unused_inputs_and_outputs_eliminate_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
"variable_place_inference_pass", // inference arg/var's
"mlu_postprocess_pass",
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
// info
// (target/precision/layout/device)
"type_target_cast_pass", // add io_copy/io_copy_once if meet
// different targets when last and next
// node
"variable_place_inference_pass", //
"argument_type_display_pass", //
"io_copy_kernel_pick_pass", //
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_precision_cast_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_layout_cast_pass", // add layout/layout_once op if meet
// different layout when last and next node
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
"lite_reshape_fuse_pass",
#if !(defined(LITE_WITH_FPGA) || defined(LITE_WITH_PRECISION_PROFILE))
"memory_optimize_pass"
#endif
}};
if (passes.size() == 1) {
// multi_stream_analysis_pass must be in the front of
......
......@@ -139,7 +139,9 @@ struct Instruction {
#ifdef LITE_WITH_PROFILE
void set_profiler(profile::Profiler* profiler) {
profiler_ = profiler;
#ifndef LITE_WITH_FPGA
if (op_->Type() != "feed" && op_->Type() != "fetch") {
#endif
profile::OpCharacter ch;
ch.op_lite = static_cast<void*>(const_cast<paddle::lite::OpLite*>(op()));
ch.target = kernel()->target();
......@@ -150,7 +152,9 @@ struct Instruction {
// append `ch.kernel_func_name` in StopTiming
profile_id_ = profiler->NewTimer(ch);
kernel_->SetProfiler(profiler_, profile_id_);
#ifndef LITE_WITH_FPGA
}
#endif
}
void SetProfileRuntimeOpInfo(paddle::lite::profile::OpCharacter* ch) {
......
......@@ -5,28 +5,36 @@ endif()
set(fpga_deps fpga_target_wrapper kernel_fpga)
# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(cast_compute_fpga FPGA basic SRCS cast_compute.cc DEPS ${fpga_deps})
add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
# add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(interpolate_compute_fpga FPGA basic SRCS interpolate_compute.cc DEPS ${fpga_deps})
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
# add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
# add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps})
add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps})
# add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
# add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps})
add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
add_kernel(split_compute_fpga FPGA basic SRCS split_compute.cc DEPS ${fpga_deps})
add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps})
add_kernel(io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS ${fpga_deps})
add_kernel(calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS ${fpga_deps})
......@@ -34,6 +42,8 @@ add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_dep
add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps})
add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps})
add_kernel(yolo_box_compute_fpga FPGA basic SRCS yolo_box_compute.cc DEPS ${fpga_deps})
# add_kernel(while_compute_fpga FPGA extra SRCS while_compute.cc DEPS ${fpga_deps})
# add_kernel(write_to_array_compute_fpga FPGA extra SRCS write_to_array_compute.cc DEPS ${fpga_deps})
......
......@@ -25,16 +25,38 @@ using float16 = zynqmp::float16;
void ReluCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto output_data = param.Out->mutable_data<float16>();
zynqmp::InputParam& input_param = pe_.param();
zynqmp::InputParam& relu_param = pe_.param();
input_param.input = param.X->ZynqTensor();
input_param.output = param.Out->ZynqTensor();
relu_param.input = param.X->ZynqTensor();
relu_param.output = param.Out->ZynqTensor();
pe_.init();
pe_.apply();
}
void ReluCompute::Run() { pe_.dispatch(); }
// CPU fallback for sigmoid on FP16 tensors: converts each half to float,
// applies 1 / (1 + e^-x), writes the half result back, and records the
// int8-style scale pair (max/127, 127/max) on the output tensor.
//
// Fixes vs. original: removes the unused `output_data` local (the
// mutable_data call is kept for its allocation side effect), qualifies
// `std::exp`, and guards the scale division against `max == 0` (empty
// tensor), which previously produced an inf scale.
// TODO(chonwhite) use fpga and arm implementation;
void SigmoidCompute::Run() {
  auto& param = this->Param<param_t>();
  param.Out->mutable_data<float16>();  // ensure the output buffer exists
  int numel = param.Out->numel();
  float16* in_data = param.X->ZynqTensor()->data<float16>();
  float16* out_data = param.Out->ZynqTensor()->data<float16>();
  param.X->ZynqTensor()->syncToCPU();

  float max = 0.0f;
  for (int i = 0; i < numel; i++) {
    float value = zynqmp::half_to_float(in_data[i]);
    value = 1.0f / (1.0f + std::exp(-value));
    out_data[i] = zynqmp::float_to_half(value);
    max = std::max(std::abs(value), max);
  }
  if (max > 0.0f) {
    param.Out->ZynqTensor()->scale()[0] = max / 127.0;
    param.Out->ZynqTensor()->scale()[1] = 127.0 / max;
  }
  param.Out->ZynqTensor()->flush();
}
} // namespace fpga
} // namespace kernels
} // namespace lite
......@@ -51,3 +73,19 @@ REGISTER_LITE_KERNEL(
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Register the CPU-fallback sigmoid kernel under the FPGA target with
// FP16 precision and NHWC layout on both the input and output tensors.
REGISTER_LITE_KERNEL(sigmoid,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::SigmoidCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
......@@ -14,6 +14,8 @@
#pragma once
#include <algorithm>
#include <map>
#include <string>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/relu_pe.hpp"
#include "lite/core/kernel.h"
......@@ -24,6 +26,13 @@ namespace lite {
namespace kernels {
namespace fpga {
// Lookup table from op-attribute activation names (e.g. "relu") to the
// zynqmp active types used by FPGA kernels that fuse activations; the empty
// string maps to TYPE_NONE.
// NOTE(review): a non-inline `static` map defined in a header gives every
// translation unit its own copy — confirm this duplication is intentional.
static std::map<std::string, zynqmp::ActiveType> activation_map = {
    {"relu", zynqmp::TYPE_RELU},
    {"relu6", zynqmp::TYPE_RELU6},
    {"leaky_relu", zynqmp::TYPE_LEAKY_RELU},
    {"sigmoid", zynqmp::TYPE_SIGMOID},
    {"", zynqmp::TYPE_NONE}};
class ReluCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
......@@ -40,6 +49,16 @@ class ReluCompute
zynqmp::Tensor output_;
};
// CPU-fallback sigmoid kernel registered for the FPGA target (FP16/NHWC);
// the element-wise computation lives in SigmoidCompute::Run.
class SigmoidCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::ActivationParam;

  void Run() override;

  virtual ~SigmoidCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
......
......@@ -12,14 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/activation_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/fpga/activation_compute.h"
namespace paddle {
namespace lite {
......@@ -39,7 +37,8 @@ void activation_compute_ref(const operators::ActivationParam& param) {
}
TEST(activation_fpga, retrive_op) {
auto activation = KernelRegistry::Global().Create("relu");
auto activation =
KernelRegistry::Global().Create<TARGET(kFPGA), PRECISION(kFP16)>("relu");
ASSERT_FALSE(activation.empty());
ASSERT_TRUE(activation.front());
}
......
......@@ -44,6 +44,16 @@ void CalibComputeFP16ToFp32::Run() {
return;
}
// Calib kernel for float -> int32.
// NOTE(review): `din` and `dout` are fetched but no element-wise conversion
// is performed — only the LoD is propagated, and the copyFrom call is
// commented out. Confirm whether the conversion is handled elsewhere in the
// FPGA runtime or this kernel is unfinished.
void CalibComputeFloat2Int::Run() {
  auto& param = this->Param<operators::CalibParam>();
  const auto* din = param.input->data<float>();
  auto* dout = param.output->mutable_data<int>();
  // param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
  auto out_lod = param.output->mutable_lod();
  *out_lod = param.input->lod();
  return;
}
} // namespace fpga
} // namespace kernels
} // namespace lite
......@@ -58,10 +68,26 @@ REGISTER_LITE_KERNEL(calib,
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Register the float -> int32 calib variant.
// NOTE(review): registered under the kFPGA/kFP16/kNHWC kernel key but binds
// ARM float NCHW input and ARM int32 NCHW output — confirm the mismatch
// between the kernel key and the bound tensor types is intended.
REGISTER_LITE_KERNEL(calib,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::CalibComputeFloat2Int,
                     float_2_int_fpga)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kARM),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kARM),
                                       PRECISION(kInt32),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
......@@ -70,7 +96,7 @@ REGISTER_LITE_KERNEL(calib,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::CalibComputeFP16ToFp32,
fp16_to_fp32_fpga)
float_to_int_fpga)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
......@@ -45,6 +45,18 @@ class CalibComputeFP16ToFp32
private:
};
// Calib kernel class converting host float tensors to int32; see the
// float_2_int_fpga registration in the matching .cc for its bound types.
class CalibComputeFloat2Int
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::CalibParam;

  void Run() override;

  ~CalibComputeFloat2Int() override{};

 private:
};
} // namespace fpga
} // namespace kernels
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/cast_compute.h"
#include <algorithm>
// #include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
// Element-wise conversion helper: casts one value from `in_type` to
// `out_type` with static_cast semantics (truncation for float -> int).
template <class in_type, class out_type>
out_type TransOp(in_type in) {
  const out_type converted = static_cast<out_type>(in);
  return converted;
}
void CastCompute::PrepareForRun() {}
void CastCompute::Run() {
auto& ctx = this->ctx_->template As<ARMContext>();
auto& param = this->Param<operators::CastParam>();
auto input_dims = param.X->dims();
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the cast kernel for the FPGA target.
// NOTE(review): the kernel key is kFP16/kNHWC but the "X" input is declared
// kFloat — confirm the precision mismatch is intended.
REGISTER_LITE_KERNEL(
    cast, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::CastCompute, def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNHWC))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include "lite/backends/arm/math/type_trans.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
// Cast kernel class for the FPGA target. Run() is currently a stub that
// performs no conversion (see cast_compute.cc).
class CastCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::CastParam;

  void PrepareForRun() override;

  void Run() override;

  ~CastCompute() {}

 private:
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -25,12 +25,46 @@ namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
using lite_api::ActivationType;
void ConvCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
param.output->mutable_data<float16>();
int pad_h = (*param.paddings)[0];
int pad_w = (*param.paddings)[2];
zynqmp::ActiveType active_type = zynqmp::TYPE_NONE;
float leaky_relu_factor = 0;
switch (param.activation_param.active_type) {
case ActivationType::kIndentity:
active_type = zynqmp::TYPE_NONE;
break;
case ActivationType::kRelu:
active_type = zynqmp::TYPE_RELU;
break;
case ActivationType::kRelu6:
active_type = zynqmp::TYPE_RELU6;
break;
case ActivationType::kPRelu:
case ActivationType::kLeakyRelu:
active_type = zynqmp::TYPE_LEAKY_RELU;
leaky_relu_factor = param.activation_param.Leaky_relu_alpha;
break;
case ActivationType::kSigmoid:
active_type = zynqmp::TYPE_SIGMOID;
break;
case ActivationType::kTanh:
case ActivationType::kSwish:
case ActivationType::kExp:
case ActivationType::kAbs:
case ActivationType::kHardSwish:
case ActivationType::kReciprocal:
default:
throw("not supported activation");
break;
}
// ====================================================
if (param.x->ZynqTensor()->shape().channel() != 1 &&
param.groups == param.x->ZynqTensor()->shape().channel()) {
......@@ -45,12 +79,13 @@ void ConvCompute::PrepareForRun() {
conv_param.paddings = std::vector<int>({pad_h, pad_w});
conv_param.dilations = *param.dilations;
fill_scale_bias_const(&conv_param);
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
if (param.fuse_relu) {
conv_param.activeParam.type = zynqmp::TYPE_RELU;
if (param.bias != nullptr) {
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
}
conv_param.activeParam.type = active_type;
conv_param.activeParam.leaky_relu_factor = leaky_relu_factor;
dw_conv_pe_.init();
dw_conv_pe_.apply();
} else {
......@@ -68,15 +103,8 @@ void ConvCompute::PrepareForRun() {
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
}
if (param.fuse_relu) {
conv_param.activeParam.type = zynqmp::TYPE_RELU;
}
// conv_param.filter->saveToFile("conv_filter_", true);
// if (param.bias != nullptr) {
// std::cout << "param.bias != nullptr" << std::endl;
// conv_param.bias()->saveToFile("conv_bias_", true);
// }
conv_param.activeParam.type = active_type;
conv_param.activeParam.leaky_relu_factor = leaky_relu_factor;
conv_pe_.init();
conv_pe_.apply();
......@@ -93,9 +121,7 @@ void ConvCompute::Run() {
Debugger::get_instance().registerOutput("dwconv", dwconv_param.output);
#endif
} else {
// zynqmp::ConvParam& conv_param = conv_pe_.param();
conv_pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ConvParam& conv_param = conv_pe_.param();
Debugger::get_instance().registerOutput("conv", conv_param.output);
......
......@@ -35,8 +35,6 @@ class ConvCompute
void Run() override;
~ConvCompute() {}
private:
zynqmp::ConvPE conv_pe_;
zynqmp::DepthwiseConvPE dw_conv_pe_;
......
......@@ -16,6 +16,7 @@
#include <string>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/kernels/fpga/activation_compute.h"
namespace paddle {
namespace lite {
......@@ -29,11 +30,9 @@ void ElementwiseAddCompute::PrepareForRun() {
auto& param = Param<operators::ElementwiseParam>();
param.Out->mutable_data<float16>();
ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
ew_param.output = param.Out->ZynqTensor();
ew_param.axis = param.axis;
ew_param.activeParam.type = zynqmp::TYPE_NONE;
pe_.init();
......@@ -50,14 +49,17 @@ void ElementwiseAddCompute::Run() {
void ElementwiseAddActivationCompute::PrepareForRun() {
zynqmp::ElementwiseAddParam& ew_param = pe_.param();
auto& param = Param<operators::FusionElementwiseActivationParam>();
if (param.act_type != "relu") {
if (activation_map.count(param.act_type)) {
ew_param.activeParam.type = activation_map[param.act_type];
} else {
LOG(FATAL) << "unsupported Activation type: " << param.act_type;
}
param.Out->mutable_data<float16>();
ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
ew_param.output = param.Out->ZynqTensor();
ew_param.axis = param.axis;
ew_param.activeParam.type = zynqmp::TYPE_RELU;
pe_.init();
pe_.apply();
}
......@@ -76,25 +78,33 @@ void ElementwiseMulCompute::PrepareForRun() {
scale_param.input = param.X->ZynqTensor();
scale_param.output = param.Out->ZynqTensor();
scale_param.activeParam.type = zynqmp::TYPE_NONE;
int channel = scale_param.input->shape().channel();
zynqmp::Tensor* scale = new zynqmp::Tensor();
zynqmp::Tensor* bias = new zynqmp::Tensor();
scale_param.scale = scale;
scale_param.bias = bias;
scale_param.scale = &scale_;
scale_param.bias = &bias_;
zynqmp::Shape shape(zynqmp::N, {channel});
float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
float scale_value = param.Y->data<float>()[0];
zynqmp::float16* scale_data =
scale_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
zynqmp::float16* bias_data =
bias_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
zynqmp::float16 scale_value = 0;
if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) {
scale_value = zynqmp::float_to_half(param.Y->data<float>()[0]);
} else {
scale_value = param.Y->data<zynqmp::float16>()[0];
}
for (int i = 0; i < channel; ++i) {
for (int i = 0; i < channel; i++) {
if (param.Y->dims().production() != 1) {
scale_value = param.Y->ZynqTensor()->data<float>()[i];
if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) {
scale_value = zynqmp::float_to_half(param.Y->data<float>()[i]);
} else {
scale_value = param.Y->data<zynqmp::float16>()[i];
}
}
scale_data[i] = scale_value;
bias_data[i] = 0;
bias_data[i] = zero_;
}
pe_.init();
......@@ -102,10 +112,18 @@ void ElementwiseMulCompute::PrepareForRun() {
}
void ElementwiseMulCompute::Run() {
auto& param = Param<operators::ElementwiseParam>();
if (!param.Y->persistable()) {
// TODO(chonwhite) alignment;
param.Y->ZynqTensor()->invalidate();
scale_.copyFrom(param.Y->ZynqTensor());
scale_.flush();
}
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ScaleParam& scale_param = pe_.param();
Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
#endif
}
......@@ -161,7 +179,27 @@ REGISTER_LITE_KERNEL(elementwise_mul,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::ElementwiseMulCompute,
def)
ew_mul_fpga)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_mul,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::ElementwiseMulCompute,
ew_mul_y_arm)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
......@@ -61,6 +61,9 @@ class ElementwiseMulCompute
private:
zynqmp::ScalePE pe_;
zynqmp::Tensor scale_;
zynqmp::Tensor bias_;
zynqmp::float16 zero_ = zynqmp::float_to_half(0.0f);
};
} // namespace fpga
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#include "lite/kernels/fpga/fc_compute.h"
#include "lite/kernels/fpga/activation_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
......@@ -35,6 +37,11 @@ void FcCompute::PrepareForRun() {
fc_param.output = param.output->ZynqTensor();
fc_param.filter = param.w->ZynqTensor();
fc_param.bias = param.bias->ZynqTensor();
fc_param.bias->flush();
if (activation_map.count(param.activation_type)) {
fc_param.activeParam.type = activation_map[param.activation_type];
}
pe_.init();
pe_.apply();
......@@ -42,6 +49,7 @@ void FcCompute::PrepareForRun() {
void FcCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::FullyConnectedParam& fc_param = pe_.param();
Debugger::get_instance().registerOutput("fc", fc_param.output);
......
......@@ -12,17 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/fc_compute.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/fpga/fc_compute.h"
namespace paddle {
namespace lite {
......@@ -78,7 +76,8 @@ void FillData(T* a,
}
TEST(fc_fpga, retrive_op) {
auto fc = KernelRegistry::Global().Create("fc");
auto fc =
KernelRegistry::Global().Create<TARGET(kFPGA), PRECISION(kFP16)>("fc");
ASSERT_FALSE(fc.empty());
ASSERT_TRUE(fc.front());
}
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/kernels/fpga/feed_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
......@@ -28,7 +29,14 @@ void FeedCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
Tensor& x = param.feed_list->at(param.col);
param.out->Resize(x.dims());
param.out->mutable_data<float16>();
auto in_type = x.ZynqTensor()->dataType();
if (in_type == zynqmp::FP32 || in_type == zynqmp::FP16) {
param.out->mutable_data<float16>();
}
if (in_type == zynqmp::INT32) {
param.out->mutable_data<int32_t>();
}
// ====================================================
zynqmp::InputParam& feed_param = pe_.param();
feed_param.input = x.ZynqTensor();
......@@ -40,8 +48,8 @@ void FeedCompute::PrepareForRun() {
void FeedCompute::Run() {
auto& param = this->Param<param_t>();
Tensor& x = param.feed_list->at(param.col);
pe_.param().input = x.ZynqTensor();
pe_.dispatch();
auto out_lod = param.out->mutable_lod();
*out_lod = x.lod();
......@@ -61,7 +69,7 @@ REGISTER_LITE_KERNEL(
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......@@ -73,7 +81,13 @@ REGISTER_LITE_KERNEL(feed,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FeedCompute,
def_host)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
feed_int32)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt32),
DATALAYOUT(kNCHW))})
.Finalize();
......@@ -12,7 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include "lite/backends/fpga/KD/pes/input_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
......@@ -26,6 +32,34 @@ class FeedCompute
public:
using param_t = operators::FeedParam;
// Maps the host-side input type of `feed` to the FPGA-side output type:
// float host tensors are promoted to FP16/NHWC for the FPGA, any other
// precision keeps its precision/layout; only the target becomes kFPGA.
std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
  std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
  *res = [](const std::map<std::string, const Type*>& inputs,
            const std::string& out) -> const Type* {
    CHECK(!inputs.empty());
    // NOTE(review): the original body printed the type pointer and then
    // called exit(-1) here -- leftover debug code that aborted the whole
    // process whenever this handler ran. It has been removed so the real
    // inference logic below is reachable.
    auto* type = inputs.at("Input");  // TODO(review): confirm the key; the
                                      // kernel's declared input slot is "X".
    CHECK(type->target() == TARGET(kHost));
    auto in_place = type->place();
    auto target = TARGET(kFPGA);
    auto precision = in_place.precision;
    auto layout = in_place.layout;
    if (in_place.precision == PRECISION(kFloat)) {
      precision = PRECISION(kFP16);
      layout = DATALAYOUT(kNHWC);
    }
    auto* out_type =
        Type::Get(type->id(), target, precision, layout, in_place.device);
    return out_type;
  };
  return res;
}
void PrepareForRun() override;
void Run() override;
......
......@@ -23,17 +23,36 @@ namespace fpga {
using float16 = zynqmp::float16;
// Resizes `out` to `input`'s dims and allocates a host buffer whose element
// type mirrors the zynq tensor's data type: FP16/FP32 both map to float,
// INT32 to int32_t, INT64 to int64_t; any other type allocates nothing.
void resize_output(const Tensor* input, Tensor& out) {  // NOLINT
  out.Resize(input->dims());
  const auto dtype = input->ZynqTensor()->dataType();
  if (dtype == zynqmp::FP16 || dtype == zynqmp::FP32) {
    out.mutable_data<float>();
  } else if (dtype == zynqmp::INT32) {
    out.mutable_data<int32_t>();
  } else if (dtype == zynqmp::INT64) {
    out.mutable_data<int64_t>();
  }
}
void FetchCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
// ====================================================
zynqmp::OutputParam& fetch_param = pe_.param();
auto fetch_list = param.fetch_list;
if (fetch_list->size() <= static_cast<size_t>(param.col)) {
fetch_list->resize(param.col + 1);
}
Tensor& out = param.fetch_list->at(param.col);
out.Resize(param.input->dims());
out.mutable_data<float>();
resize_output(param.input, out);
fetch_param.input = param.input->ZynqTensor();
fetch_param.output = out.ZynqTensor();
......@@ -48,13 +67,16 @@ void FetchCompute::Run() {
if (fetch_list->size() <= static_cast<size_t>(param.col)) {
fetch_list->resize(param.col + 1);
}
Tensor& out = param.fetch_list->at(param.col);
out.Resize(param.input->dims());
resize_output(param.input, out);
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::OutputParam& fetch_param = pe_.param();
Debugger::get_instance().registerOutput("fetch", fetch_param.output);
Debugger::get_instance().setEnable(true);
#endif
}
......@@ -63,19 +85,6 @@ void FetchCompute::Run() {
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(fetch,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FetchCompute,
fpga_host)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
REGISTER_LITE_KERNEL(fetch,
kFPGA,
kFP16,
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/interpolate_compute.h"
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
// Intentional no-op: bilinear interpolation has no FPGA implementation here;
// the kernel is registered but performs no computation when dispatched.
void BilinearInterpCompute::Run() {}
// Nearest-neighbour resize of one NHWC fp16 image: for every output pixel,
// copies the c-channel vector of the nearest source pixel.
// `scale_x`/`scale_y` are kept for interface compatibility only; the
// effective scales are recomputed from the in/out dimensions below.
// `with_align` selects align-corners mapping (round-to-nearest) versus the
// plain truncating mapping.
void nearest_interp(const float16* src,
                    int w_in,
                    int h_in,
                    int c,
                    float16* dst,
                    int w_out,
                    int h_out,
                    float scale_x,
                    float scale_y,
                    bool with_align) {
  float scale_w_new = (with_align)
                          ? (static_cast<float>(w_in - 1) / (w_out - 1))
                          : (static_cast<float>(w_in) / (w_out));
  float scale_h_new = (with_align)
                          ? (static_cast<float>(h_in - 1) / (h_out - 1))
                          : (static_cast<float>(h_in) / (h_out));
  // Align-corners rounds to the nearest source index; otherwise truncate.
  const float round_off = with_align ? 0.5f : 0.0f;
  for (int h = 0; h < h_out; ++h) {
    // Fixed: the non-aligned path used `dst + h * w_out` (missing `* c`),
    // which advanced rows by the wrong stride and corrupted every output
    // with more than one channel.
    float16* dst_p = dst + h * w_out * c;
    int near_y = static_cast<int>(scale_h_new * h + round_off);
    for (int w = 0; w < w_out; ++w) {
      int near_x = static_cast<int>(scale_w_new * w + round_off);
      const float16* src_n = src + (near_y * w_in + near_x) * c;
      memcpy(dst_p, src_n, c * sizeof(float16));
      dst_p += c;
    }
  }
}
void NearestInterpCompute::PrepareForRun() {
auto& param = Param<operators::InterpolateParam>();
lite::Tensor* X = param.X;
lite::Tensor* OutSize = param.OutSize;
lite::Tensor* Out = param.Out;
Out->mutable_data<float16>();
zynqmp::ResizeParam& norm_param = pe_.param();
norm_param.input = X->ZynqTensor();
norm_param.output = Out->ZynqTensor();
pe_.init();
pe_.apply();
}
// Reads the first int32 element of each shape tensor and collects them into
// a plain vector (one extent per tensor).
inline std::vector<int> get_new_shape(
    std::vector<const lite::Tensor*> list_new_shape_tensor) {
  std::vector<int> vec_new_shape;
  vec_new_shape.reserve(list_new_shape_tensor.size());
  for (const auto* tensor : list_new_shape_tensor) {
    vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
  }
  return vec_new_shape;
}
// Copies the entire contents of `new_data_tensor` into a std::vector<T>.
// Assumes the tensor's data is host-accessible as T.
template <typename T>
inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
  const auto* new_data = new_data_tensor->data<T>();
  // NOTE: an unused local `lite::Tensor cpu_starts_tensor` was declared here
  // in the original; it served no purpose and has been removed.
  return std::vector<T>(new_data,
                        new_data + new_data_tensor->dims().production());
}
// CPU nearest-neighbour interpolation over an fp16 tensor laid out NHWC in
// memory (logical dims are NCHW: {n, c, h, w}).
// Output size resolution priority: SizeTensor > Scale tensor / `scale`
// scalar > OutSize tensor. `interpolate_type` is currently unused.
void interpolate(lite::Tensor* X,
                 lite::Tensor* OutSize,
                 std::vector<const lite::Tensor*> SizeTensor,
                 lite::Tensor* Scale,
                 lite::Tensor* Out,
                 int out_height,
                 int out_width,
                 float scale,
                 bool with_align,
                 std::string interpolate_type) {
  int in_h = X->dims()[2];
  int in_w = X->dims()[3];
  if (SizeTensor.size() > 0) {
    auto new_size = get_new_shape(SizeTensor);
    out_height = new_size[0];
    out_width = new_size[1];
  } else {
    auto scale_tensor = Scale;
    if (scale_tensor != nullptr) {
      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
      scale = scale_data[0];
    }
    if (scale > 0) {
      out_height = static_cast<int>(in_h * scale);
      out_width = static_cast<int>(in_w * scale);
    }
    auto out_size = OutSize;
    if (out_size != nullptr) {
      auto out_size_data = get_new_data_from_tensor<int>(out_size);
      out_height = out_size_data[0];
      out_width = out_size_data[1];
    }
  }
  float height_scale = scale;
  float width_scale = scale;
  if (out_width > 0 && out_height > 0) {
    // Fixed: the original cast AFTER dividing two integers
    // (static_cast<float>(out_height / dims[2])), truncating the ratio to a
    // whole number whenever out/in was not an exact multiple.
    height_scale = static_cast<float>(out_height) / static_cast<float>(in_h);
    width_scale = static_cast<float>(out_width) / static_cast<float>(in_w);
  }
  int num_cout = X->dims()[0];
  int c_cout = X->dims()[1];
  Out->Resize({num_cout, c_cout, out_height, out_width});

  float16* dout = Out->mutable_data<float16>();
  const float16* din = X->data<float16>();
  int out_num = Out->dims()[0];
  int out_c = Out->dims()[1];
  int out_h = Out->dims()[2];
  int out_w = Out->dims()[3];
  // Per-image strides. Fixed: the memory is NHWC (nearest_interp addresses
  // pixels as (y * w + x) * c), so one batch element spans h * w * c values;
  // the original advanced by h * w only, touching the wrong memory for every
  // batch index > 0.
  int spatial_in = in_h * in_w * out_c;
  int spatial_out = out_h * out_w * out_c;
  for (int i = 0; i < out_num; ++i) {
    nearest_interp(din + spatial_in * i,
                   in_w,
                   in_h,
                   out_c,
                   dout + spatial_out * i,
                   out_w,
                   out_h,
                   1.f / width_scale,
                   1.f / height_scale,
                   with_align);
  }
}
void NearestInterpCompute::Run() {
auto& param = Param<operators::InterpolateParam>();
lite::Tensor* X = param.X;
lite::Tensor* OutSize = param.OutSize;
auto SizeTensor = param.SizeTensor;
auto Scale = param.Scale;
lite::Tensor* Out = param.Out;
float scale = param.scale;
int out_w = param.out_w;
int out_h = param.out_h;
bool align_corners = param.align_corners;
std::string interp_method = "";
X->ZynqTensor()->syncToCPU();
interpolate(X,
OutSize,
SizeTensor,
Scale,
Out,
out_h,
out_w,
scale,
align_corners,
interp_method);
Out->ZynqTensor()->flush();
Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor());
}
} /* namespace fpga */
} /* namespace kernels */
} /* namespace lite */
} /* namespace paddle */
// Registers the (no-op) bilinear_interp kernel for the FPGA target: FP16
// NHWC tensors on the FPGA for X/Out, size/scale side-inputs on ARM.
REGISTER_LITE_KERNEL(bilinear_interp,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::BilinearInterpCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Registers the nearest_interp kernel (CPU fallback implementation above)
// with the same tensor placement as bilinear_interp.
REGISTER_LITE_KERNEL(nearest_interp,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::NearestInterpCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
// FPGA kernel shell for bilinear_interp. Run() is an empty stub in the
// corresponding .cc file -- no bilinear computation is performed yet.
class BilinearInterpCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void Run() override;
virtual ~BilinearInterpCompute() = default;
};
// FPGA kernel for nearest_interp. PrepareForRun() wires up a zynqmp ResizePE,
// while Run() (see the .cc file) performs the interpolation on the CPU after
// syncing the input tensor to host memory.
class NearestInterpCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~NearestInterpCompute() = default;
private:
// Processing element used to configure the resize on the FPGA side.
zynqmp::ResizePE pe_;
};
} /* namespace fpga */
} /* namespace kernels */
} /* namespace lite */
} /* namespace paddle */
......@@ -31,26 +31,67 @@ void PoolCompute::PrepareForRun() {
auto& param = Param<operators::PoolParam>();
param.output->mutable_data<float16>();
zynqmp::PoolingParam& pool_param = pe_.param();
pool_param.input = param.x->ZynqTensor();
pool_param.output = param.output->ZynqTensor();
pool_param.activeParam.type = zynqmp::TYPE_RELU;
pool_param.type = param.pooling_type == "max" ? zynqmp::PoolingType::MAX
: zynqmp::PoolingType::AVERAGE;
pool_param.globalPooling = param.global_pooling;
pool_param.kernelSize = param.ksize;
pool_param.strides = param.strides;
int pad_h = (*param.paddings)[0];
int pad_w = (*param.paddings)[2];
pool_param.paddings = std::vector<int>({pad_h, pad_w});
pe_.init();
pe_.apply();
int h_kernel = param.ksize[0];
int w_kernel = param.ksize[1];
if (param.global_pooling) {
h_kernel = param.x->ZynqTensor()->shape().height();
w_kernel = param.x->ZynqTensor()->shape().width();
}
int c = param.x->ZynqTensor()->shape().channel();
int w = param.x->ZynqTensor()->shape().width();
int wc_h_kernel = w * c * h_kernel;
int dwconv_limit = 131072;
int num = ceil(wc_h_kernel * 1.0f / dwconv_limit);
split_num_ = num;
if (num == 1) {
zynqmp::PoolingParam& pool_param = pe_.param();
pool_param.input = param.x->ZynqTensor();
pool_param.output = param.output->ZynqTensor();
pool_param.type = param.pooling_type == "max"
? zynqmp::PoolingType::MAX
: zynqmp::PoolingType::AVERAGE;
pool_param.globalPooling = param.global_pooling;
pool_param.kernelSize = param.ksize;
pool_param.strides = param.strides;
int pad_h = (*param.paddings)[0];
int pad_w = (*param.paddings)[2];
pool_param.paddings = std::vector<int>({pad_h, pad_w});
pe_.init();
pe_.apply();
} else {
zynqmp::PoolingParam& pool_param = split_pe_.param();
pool_param.input = param.x->ZynqTensor();
pool_param.output = param.output->ZynqTensor();
pool_param.type = param.pooling_type == "max"
? zynqmp::PoolingType::MAX
: zynqmp::PoolingType::AVERAGE;
pool_param.globalPooling = param.global_pooling;
pool_param.kernelSize = param.ksize;
pool_param.strides = param.strides;
int pad_h = (*param.paddings)[0];
int pad_w = (*param.paddings)[2];
pool_param.paddings = std::vector<int>({pad_h, pad_w});
split_pe_.init();
split_pe_.apply();
}
}
void PoolCompute::Run() {
pe_.dispatch();
if (split_num_ == 1) {
zynqmp::PoolingParam& pool_param = pe_.param();
pe_.dispatch();
} else {
split_pe_.dispatch();
zynqmp::PoolingParam& pool_param = split_pe_.param();
}
#ifdef FPGA_PRINT_TENSOR
zynqmp::PoolingParam& pool_param = pe_.param();
Debugger::get_instance().registerOutput("pooling", pool_param.output);
......
......@@ -16,6 +16,7 @@
#include <algorithm>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/pooling_pe.hpp"
#include "lite/backends/fpga/KD/pes/pooling_split_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/operators/pool_op.h"
......@@ -36,6 +37,8 @@ class PoolCompute
private:
zynqmp::PoolingPE pe_;
zynqmp::PoolingSplitPE split_pe_;
int split_num_ = 1;
};
} // namespace fpga
......
......@@ -12,15 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/pooling_compute.h"
#include <gtest/gtest.h>
#include <limits>
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/core/op_registry.h"
#include "lite/kernels/fpga/pooling_compute.h"
namespace paddle {
namespace lite {
......@@ -278,7 +277,8 @@ TEST(pool_fpga, compute) {
}
TEST(pool_fpga, retrive_op) {
auto pool = KernelRegistry::Global().Create("pool2d");
auto pool = KernelRegistry::Global().Create<TARGET(kFPGA), PRECISION(kFP16)>(
"pool2d");
ASSERT_FALSE(pool.empty());
ASSERT_TRUE(pool.front());
}
......
......@@ -64,7 +64,7 @@ void PriorBoxCompute::PrepareForRun() {
float offset = param.offset;
std::vector<float> aspect_ratios_vec;
ExpandAspectRatios(aspect_ratio, is_flip, &aspect_ratios_vec);
size_t prior_num = aspect_ratios_vec.size() * min_size.size();
int prior_num = aspect_ratios_vec.size() * min_size.size();
prior_num += max_size.size();
std::vector<std::string> order = param.order;
bool min_max_aspect_ratios_order = param.min_max_aspect_ratios_order;
......@@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() {
param.boxes->mutable_data<float>();
param.variances->mutable_data<float>();
zynqmp::PriorBoxParam& priobox_param = pe_.param();
priobox_param.input = param.input->ZynqTensor();
priobox_param.image = param.image->ZynqTensor();
......
......@@ -25,14 +25,24 @@ namespace fpga {
class ReshapeCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~ReshapeCompute() = default;
};
class FlattenCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void Run() override;
virtual ~FlattenCompute() = default;
};
class ReshapeComputeFpgaToHost
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~ReshapeComputeFpgaToHost() = default;
......
......@@ -29,8 +29,8 @@ void ScaleCompute::PrepareForRun() {
scale_param.output = param.output->ZynqTensor();
int channel = scale_param.input->shape().channel();
zynqmp::Tensor* scale = new zynqmp::Tensor();
zynqmp::Tensor* bias = new zynqmp::Tensor();
zynqmp::Tensor* scale = &scale_;
zynqmp::Tensor* bias = &bias_;
zynqmp::Shape shape(zynqmp::N, {channel});
float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
......
......@@ -37,6 +37,8 @@ class ScaleCompute
private:
zynqmp::ScalePE pe_;
zynqmp::Tensor scale_;
zynqmp::Tensor bias_;
};
} // namespace fpga
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -19,7 +19,11 @@ cmake .. \
-DLITE_WITH_OPENMP=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DWITH_TESTING=OFF \
-DARM_TARGET_OS=armlinux
-DARM_TARGET_OS=armlinux \
-DLITE_BUILD_EXTRA=ON \
-DLITE_WITH_PYTHON=OFF \
-DLITE_WITH_PROFILE=OFF \
-DLITE_WITH_LOG=OFF
make -j8
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册