Commit b8923be2 authored by chonwhite

updated backends/fpga and kernels/fpga files

Parent 97882cfa
./lite/tools/build.sh \
--arm_os=armlinux \
--arm_abi=armv8 \
--arm_lang=gcc \
test
File mode changed from 100644 to 100755
#pragma once
#include "paddle_lite_factory_helper.h"
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fill_constant_batch_size_like, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu_clipped, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prelu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(sigmoid, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(tanh, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(swish, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu6, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(log, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out);
USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nchw2nhwc);
USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nhwc2nchw);
USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nchw2nhwc);
USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nhwc2nchw);
USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nchw2nhwc);
USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nhwc2nchw);
USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nchw2nhwc);
USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nhwc2nchw);
USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_sub_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def);
USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def);
\ No newline at end of file
#pragma once
#include "paddle_lite_factory_helper.h"
USE_LITE_OP(feed);
USE_LITE_OP(mul);
USE_LITE_OP(search_fc);
USE_LITE_OP(conv2d);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(layout);
USE_LITE_OP(split);
USE_LITE_OP(stack);
USE_LITE_OP(scale);
USE_LITE_OP(multiclass_nms);
USE_LITE_OP(fusion_elementwise_sub_activation);
USE_LITE_OP(fusion_elementwise_add_activation);
USE_LITE_OP(fusion_elementwise_mul_activation);
USE_LITE_OP(fusion_elementwise_max_activation);
USE_LITE_OP(fusion_elementwise_div_activation);
USE_LITE_OP(unsqueeze);
USE_LITE_OP(unsqueeze2);
USE_LITE_OP(sequence_topk_avg_pooling);
USE_LITE_OP(transpose);
USE_LITE_OP(transpose2);
USE_LITE_OP(dropout);
USE_LITE_OP(pool2d);
USE_LITE_OP(elementwise_sub);
USE_LITE_OP(elementwise_add);
USE_LITE_OP(elementwise_mul);
USE_LITE_OP(elementwise_max);
USE_LITE_OP(elementwise_div);
USE_LITE_OP(io_copy_once);
USE_LITE_OP(batch_norm);
USE_LITE_OP(reshape);
USE_LITE_OP(reshape2);
USE_LITE_OP(expand);
USE_LITE_OP(matmul);
USE_LITE_OP(concat);
USE_LITE_OP(slice);
USE_LITE_OP(graph_op);
USE_LITE_OP(pad2d);
USE_LITE_OP(squeeze);
USE_LITE_OP(squeeze2);
USE_LITE_OP(arg_max);
USE_LITE_OP(fill_constant);
USE_LITE_OP(fill_constant_batch_size_like);
USE_LITE_OP(square);
USE_LITE_OP(relu);
USE_LITE_OP(leaky_relu);
USE_LITE_OP(relu_clipped);
USE_LITE_OP(prelu);
USE_LITE_OP(sigmoid);
USE_LITE_OP(tanh);
USE_LITE_OP(swish);
USE_LITE_OP(relu6);
USE_LITE_OP(log);
USE_LITE_OP(exp);
USE_LITE_OP(floor);
USE_LITE_OP(hard_sigmoid);
USE_LITE_OP(sqrt);
USE_LITE_OP(rsqrt);
USE_LITE_OP(softsign);
USE_LITE_OP(range);
USE_LITE_OP(box_coder);
USE_LITE_OP(calib);
USE_LITE_OP(mean);
USE_LITE_OP(conv2d_transpose);
USE_LITE_OP(fetch);
USE_LITE_OP(prior_box);
USE_LITE_OP(fake_quantize_moving_average_abs_max);
USE_LITE_OP(io_copy);
USE_LITE_OP(nearest_interp);
USE_LITE_OP(bilinear_interp);
USE_LITE_OP(softmax);
USE_LITE_OP(reduce_mean);
USE_LITE_OP(affine_channel);
USE_LITE_OP(cast);
USE_LITE_OP(fake_dequantize_max_abs);
USE_LITE_OP(shuffle_channel);
USE_LITE_OP(yolo_box);
USE_LITE_OP(fc);
\ No newline at end of file
This diff is collapsed.
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
// #include "lite/backends/fpga/lite_tensor.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
class Debugger {
public:
static Debugger& get_instance() {
static Debugger s_instance;
return s_instance;
}
void registerOutput(std::string op_type, Tensor* tensor) {
// tensor->printScale();
// tensor->saveToFile(op_type, true);
}
private:
std::unordered_map<std::string, bool> op_config;
Debugger() {
op_config["concat"] = true;
op_config["conv_add_bn"] = true;
op_config["conv_add_bn_relu"] = true;
op_config["conv_add"] = true;
op_config["conv_add_relu"] = true;
op_config["conv_bn"] = true;
op_config["conv_bn_relu"] = true;
op_config["crop"] = true;
}
};
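A minimal usage sketch (hypothetical kernel code; note that registerOutput is
currently a no-op because its body is commented out):

  // Inside a kernel's Run(), assuming `output` points at the kernel's result.
  paddle::lite::Debugger::get_instance().registerOutput("conv_add_bn", output);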
inline void chw_to_hwc(Tensor* t, float* dst) {
int num = t->dims()[0];
int channel = t->dims()[1];
int height = 1;
int width = 1;
  if (t->dims().size() > 2) {
    height = t->dims()[2];
  }
  if (t->dims().size() > 3) {
    width = t->dims()[3];
  }
const float* chw_data = t->data<float>();
float* hwc_data = dst;
int chw = channel * height * width;
int wc = width * channel;
int index = 0;
for (int n = 0; n < num; n++) {
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
index++;
}
}
}
}
}
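As a worked example, a 1x2x2x2 NCHW tensor with channel 0 = {0,1,2,3} and
channel 1 = {4,5,6,7} comes out in HWC order as {0,4,1,5,2,6,3,7}. The same
index math on raw arrays, as a self-contained sketch:

  // Same loop structure as chw_to_hwc above, minus the lite::Tensor plumbing.
  float chw_data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float hwc_data[8];
  int channel = 2, height = 2, width = 2, index = 0;
  for (int c = 0; c < channel; c++)
    for (int h = 0; h < height; h++)
      for (int w = 0; w < width; w++)
        hwc_data[(h * width + w) * channel + c] = chw_data[index++];
  // hwc_data is now {0, 4, 1, 5, 2, 6, 3, 7}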
inline void read_from_file(lite::Tensor* t, const std::string& path) {
std::ifstream file_stream;
file_stream.open(path);
if (!file_stream) {
return;
}
float* data = t->mutable_data<float>();
int num = t->numel();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
}
inline void save_float(float* data, const std::string& name, int len) {
  // Saving is disabled by this early return; drop it to dump values to
  // numbered "arm_XXX<name>" text files for debugging.
  return;
static int counter = 0;
std::string old_string = std::to_string(counter);
std::string new_string = std::string(3 - old_string.length(), '0') + old_string;
std::string file = "arm_" + new_string + name;
counter++;
std::cout << "-------------------------- saving file: --------------------------" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
// float* data = dst;
for (int i = 0; i < len; i++) {
float value = data[i];
ofs << value << std::endl;
}
ofs.close();
}
inline void save_tensor(lite::Tensor* t, const std::string& name, bool convert = true) {
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
if (convert) {
chw_to_hwc(t, dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
inline void save_tensor(const lite::Tensor* t, const std::string& name, bool convert = true) {
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
if (convert) {
chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
}  // namespace lite
}  // namespace paddle
\ No newline at end of file
......@@ -13,14 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/dl_engine.hpp"
namespace paddle {
namespace zynqmp {
DLEngine::DLEngine() {
open_device();
  int ret = get_device_info(info_);
  filter::set_filter_capacity(info_.filter_cap);
filter::set_colunm(info_.colunm);
std::cout << " version:" << info_.version;
std::cout << " device_type:" << info_.device_type;
std::cout << " filter_cap:" << info_.filter_cap;
std::cout << " colunm:" << info_.colunm << std::endl;
}
} // namespace zynqmp
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -29,8 +28,16 @@ class DLEngine {
return s_instance;
}
DeviceInfo& deviceInfo();
  // Temporarily hard-coded to true; the original device_type check is kept
  // for reference:
  // bool isZU3() { return info_.device_type / 100 == 3; }
  bool isZU3() { return true; }
float* out_data = nullptr;
private:
DLEngine();
DeviceInfo info_;
};
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/fpga_cv.hpp"
using paddle::zynqmp::float16;
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height) {
paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0};
paddle::zynqmp::config_inplace(inplace_args);
paddle::zynqmp::ImageInputArgs input_args = {nullptr};
input_args.address = nullptr;
input_args.scale_address = nullptr;
float16* input_image_address =
reinterpret_cast<float16*>(paddle::zynqmp::fpga_malloc(
input_width * input_height * input_channel * sizeof(float16)));
  for (int i = 0; i < input_width * input_height * input_channel; i++) {
    input_image_address[i] = float16(1.0 * input[i]);
  }
paddle::zynqmp::ResizeArgs resize_args = {0};
resize_args.input_width = input_width;
resize_args.input_height = input_height;
resize_args.image_channel = input_channel;
resize_args.output_width = output_width;
resize_args.output_height = output_height;
float height_ratio = static_cast<float>(input_height) /
static_cast<float>(resize_args.output_height);
float width_ratio = static_cast<float>(input_width) /
static_cast<float>(resize_args.output_width);
resize_args.height_ratio = *reinterpret_cast<uint32_t*>(&height_ratio);
resize_args.width_ratio = *reinterpret_cast<uint32_t*>(&width_ratio);
int output_size =
resize_args.output_width * resize_args.output_height * input_channel;
float16* fpga_output = reinterpret_cast<float16*>(
paddle::zynqmp::fpga_malloc(output_size * sizeof(float16)));
resize_args.input_image_address = input_image_address;
resize_args.output_image_address = fpga_output;
memset(fpga_output, 0, output_size * sizeof(float16));
paddle::zynqmp::fpga_flush(
input_image_address,
input_width * input_height * input_channel * sizeof(float16));
paddle::zynqmp::fpga_flush(resize_args.output_image_address,
output_size * sizeof(float16));
int ret = paddle::zynqmp::compute_fpga_resize(resize_args);
if (ret == 0) {
paddle::zynqmp::fpga_invalidate(resize_args.output_image_address,
output_size * sizeof(float16));
}
  for (int i = 0; i < output_size; i++) {
    output[i] = fpga_output[i];
  }
  // Release the staging buffers allocated above.
  paddle::zynqmp::fpga_free(input_image_address);
  paddle::zynqmp::fpga_free(fpga_output);
}
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height);
......@@ -22,6 +22,7 @@ namespace paddle {
namespace zynqmp {
enum LayoutType {
None,
N,
NC,
NCHW,
......@@ -39,6 +40,19 @@ class Layout {
virtual int elementCount(const std::vector<int>& dims) = 0;
};
struct None : Layout {
int numIndex() { return -1; }
int channelIndex() { return -1; }
int heightIndex() { return -1; }
int widthIndex() { return -1; }
int alignedElementCount(const std::vector<int>& dims) {
return 16;
}
virtual int elementCount(const std::vector<int>& dims) {
return 1;
}
};
struct NCHW : Layout {
int numIndex() { return 0; }
int channelIndex() { return 1; }
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <memory.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/bias_scale.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -24,74 +25,72 @@ namespace bias_scale {
void align_element(float **data_in, int num_per_div_before_alignment, int num) {
int copynum = 0;
float *ptr_unaligned = *data_in;
int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment = align_to_x(num_per_div_before_alignment,
BS_NUM_ALIGNMENT);
int num_element = 2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned = (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, num_element * sizeof(float));
for (int i = 0; i < div_num; i++) {
if (i == div_num - 1) {
copynum = (num_per_div_after_alignment * div_num > num)
? (num % num_per_div_after_alignment)
: (num_per_div_before_alignment);
} else {
copynum = num_per_div_before_alignment;
}
memcpy(ptr_aligned + i * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i,
copynum * sizeof(float));
memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i + num,
copynum * sizeof(float));
}
fpga_free(ptr_unaligned);
*data_in = ptr_aligned;
}
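A worked example of the alignment arithmetic, assuming BS_NUM_ALIGNMENT = 8:
for num = 10 and num_per_div_before_alignment = 6, div_num = 2 and
num_per_div_after_alignment = 8, so num_element = 2 * 2 * 8 = 32 floats; the
first 16 hold the zero-padded per-division biases and the next 16 hold the
scales, which the interleave() function below then merges into alternating
groups of four.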
size_t interleave(float **data_in, int num_after_alignment) {
float *ptr_uninterleaved = *data_in;
float *ptr_interleaved =
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
int num = num_after_alignment / 4;
for (int i = 0; i < num; i++) {
memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
4 * sizeof(float));
memcpy(ptr_interleaved + 8 * i + 4,
ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float));
}
fpga_free(ptr_uninterleaved);
*data_in = ptr_interleaved;
return 2 * num_after_alignment * sizeof(float);
}
void format_bias_scale_array(float **bias_scale_array,
                             int element_num_per_division, int num) {
align_element(bias_scale_array, element_num_per_division, num);
int div_num = (num + element_num_per_division - 1) / element_num_per_division;
int element_num_after_division =
      align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
  size_t mem =
      interleave(bias_scale_array, div_num * element_num_after_division);
  fpga_flush(*bias_scale_array, mem);
}
void format_bias_array(float **bias_array, int num) {
float *ptr_unaligned = *bias_array;
int num_before_align = num;
int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT);
int16_t *ptr_aligned =
      (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t));  // NOLINT
memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
  for (int i = 0; i < num_before_align; i++) {
    ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
  }
*bias_array = (float *)ptr_aligned; // NOLINT
fpga_free(ptr_unaligned);
......
......@@ -19,10 +19,9 @@ namespace zynqmp {
namespace bias_scale {
void align_element(float** data_in, int num_per_div_before_alignment, int num);
size_t interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
                             int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
} // namespace bias_scale
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define PADDLE_LITE_ZU5
#define FPGA_PRINT_MODE
#define PADDLE_LITE_PROFILE
......@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/llapi/filter.h"
#include <memory.h>
#include <algorithm>
#include <fstream>
#include <string>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -23,11 +25,48 @@ namespace zynqmp {
namespace filter {
static int FILTER_SIZE = 2048;
static int COLUMN = 4;
// Debug dump helpers; bodies are intentionally commented out.
void saveToFile(std::string name, void* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
// int8_t* data = (int8_t*)data_in;
// for (int i = 0; i < size; i++) {
// float value = data[i];
// ofs << value << std::endl;
// }
// ofs.close();
}
void saveFloatToFile(std::string name, float* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
// for (int i = 0; i < size; i++) {
// float value = data_in[i];
// ofs << value << std::endl;
// }
// ofs.close();
}
void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; }
// Note: "colunm" (sic) is the spelling used throughout the FPGA backend API.
void set_colunm(uint32_t column) { COLUMN = column; }
// Replaces the FILTER_NUM_ALIGNMENT #define from zynqmp_api.h.
int get_filter_num_alignment() {
return COLUMN * 4;
}
int calc_division_capacity(int chw) {
  int filter_num_alignment = get_filter_num_alignment();
  int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment;
return n < FILTER_SIZE ? n : FILTER_SIZE;
}
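A worked example, assuming the defaults above (FILTER_SIZE = 2048, COLUMN = 4,
so get_filter_num_alignment() returns 16): a 3x3x3 filter has chw = 27,
(27 + 15) / 16 = 2, and 2048 / 2 * 16 = 16384, which the final comparison caps
at 2048.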
......@@ -52,28 +91,24 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
}
}
void convert_to_hwc(int8_t* chw_data, int8_t* hwc_data, int num, int channel,
                    int height, int width) {
  int chw = channel * height * width;
  int wc = width * channel;
  int index = 0;
  for (int n = 0; n < num; n++) {
    for (int c = 0; c < channel; c++) {
      for (int h = 0; h < height; h++) {
        for (int w = 0; w < width; w++) {
          hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
          index++;
        }
      }
    }
  }
}
float find_max(float* data_in, int data_size) {
float max = 0.0;
for (int i = 0; i < data_size; ++i) {
float value = data_in[i];
......@@ -83,166 +118,178 @@ float find_max(float *data_in, int data_size) {
return max;
}
int8_t float_to_int8(float fdata) {
if (fdata < 0.0) {
fdata -= 0.5;
} else {
fdata += 0.5;
}
  return (int8_t)fdata;
}
void quantize(float* src, int8_t* dst, int len, float max) {
  float fix_range = 127;
  float scale = fix_range / max;
  for (int i = 0; i < len; i++) {
    dst[i] = float_to_int8(src[i] * scale);
  }
}
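For instance, with max = 2.0 the scale is 127 / 2.0 = 63.5, so an input value
of 1.0 becomes float_to_int8(63.5) = 64 and -1.0 becomes -64.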
bool should_align_chw(int chw) {
  int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  return align_chw != chw;
}
void align_chw(int8_t* src, int8_t* dst, int num, int chw) {
int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
memset(dst, 0, num * aligned_chw);
for (int j = 0; j < num; j++) {
memcpy((dst + j * aligned_chw), (src + j * chw), chw);
}
}
void align_num(int8_t* src, int8_t* dst, int num_per_div_before_alignment,
               int num, int align_chw) {
  int filter_num_alignment = get_filter_num_alignment();
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, filter_num_alignment);
  int div_num =
      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
  int num_element = div_num * num_per_div_after_alignment * align_chw;
  memset(dst, 0, num_element * sizeof(int8_t));
  int i = 0;
  for (i = 0; i < div_num - 1; i++) {
    memcpy(dst + num_per_div_after_alignment * align_chw * i,
           src + num_per_div_before_alignment * align_chw * i,
           num_per_div_before_alignment * align_chw);
  }
  memcpy(dst + num_per_div_after_alignment * align_chw * i,
         src + num_per_div_before_alignment * align_chw * i,
         (num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
}
void reorder(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
  int index = 0;
  int new_index = 0;
  int filter_num_alignment = get_filter_num_alignment();
  int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  for (index = 0; index < num_after_alignment; index++) {
    new_index = index / filter_num_alignment * filter_num_alignment +
                (index % (filter_num_alignment / 2) / 4 * 8) +
                (index % (filter_num_alignment / 2) % 4) +
                (index / (filter_num_alignment / 2) % 2 * 4);
    memcpy((dst + index * chw_align), (src + new_index * chw_align), chw_align);
  }
}
void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
  int interleave_per_num = 16;
  int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  int interleave_num = chw_align * 2 / interleave_per_num;
  for (int i = 0; i < num_after_alignment; i += 2) {
    for (int j = 0, k = 0; j < interleave_num; j += 2, k++) {
      memcpy(dst + i * chw_align + interleave_per_num * j,
             src + i * chw_align + interleave_per_num * k, interleave_per_num);
      memcpy(dst + i * chw_align + interleave_per_num * (j + 1),
             src + (i + 1) * chw_align + interleave_per_num * k,
             interleave_per_num);
    }
  }
}
int8_t* format_filter(float* data_in, int& mem_size_a, int num, int channel,
                      int height, int width, int group_num, float max,
                      std::vector<float>& filter_max) {
  int data_size = channel * height * width * num;
  int chw = channel * height * width;

  int division_capacity = calc_division_capacity(chw);
  int filter_num_alignment = get_filter_num_alignment();
  int num_per_div_before_alignment =
      calc_num_per_div(num, group_num, division_capacity);
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, filter_num_alignment);
  int div_num =
      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
  int residual = num % num_per_div_before_alignment;
  int num_after_alignment = num_per_div_after_alignment *
                                ((residual == 0) ? div_num : (div_num - 1)) +
                            align_to_x(residual, filter_num_alignment);

  int8_t* quantized_data =
      reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
  for (int n = 0; n < num; n++) {
    float* filter_start = data_in + n * chw;
    int8_t* quantized_start = quantized_data + n * chw;
    // Per-filter scales (quantize with find_max(filter_start, chw)) are
    // disabled; the global max is used for every filter.
    quantize(filter_start, quantized_start, chw, max);
    filter_max.push_back(max);
  }

  int8_t* hwc_data =
      reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
  convert_to_hwc(quantized_data, hwc_data, num, channel, height, width);
  fpga_free(quantized_data);

  int8_t* temp_data = hwc_data;
  int chw_aligned = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  if (should_align_chw(chw)) {
    int8_t* hwc_aligned_data = reinterpret_cast<int8_t*>(
        fpga_malloc(num * chw_aligned * sizeof(int8_t)));
    align_chw(hwc_data, hwc_aligned_data, num, chw);
    temp_data = hwc_aligned_data;
    fpga_free(hwc_data);
  }
  if (num_after_alignment != num) {
    int num_element = div_num * num_per_div_after_alignment * chw_aligned;
    int8_t* num_aligned_data =
        reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
    align_num(temp_data, num_aligned_data, num_per_div_before_alignment, num,
              chw_aligned);
    fpga_free(temp_data);
    temp_data = num_aligned_data;
  }

  int8_t* aligned_data = reinterpret_cast<int8_t*>(
      fpga_malloc(num_after_alignment * chw_aligned));
  reorder(temp_data, aligned_data, num_after_alignment, chw);
  fpga_free(temp_data);

  int8_t* interleaved_data = reinterpret_cast<int8_t*>(
      fpga_malloc(num_after_alignment * chw_aligned));
  interleave(aligned_data, interleaved_data, num_after_alignment, chw);
  fpga_free(aligned_data);

  fpga_flush(interleaved_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
                                   num_after_alignment * sizeof(char));
  mem_size_a = num_after_alignment * chw_aligned;
  return interleaved_data;
}
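A hedged caller sketch (raw_filter, the dims, group_num and max are assumed to
come from the conv op being prepared):

  std::vector<float> filter_max;
  int mem_size = 0;
  int8_t* fpga_filter = format_filter(raw_filter, mem_size, num, channel,
                                      height, width, group_num, max,
                                      filter_max);
  // fpga_filter now holds the quantized, HWC-converted, aligned, reordered
  // and interleaved weights; mem_size is num_after_alignment * chw_aligned
  // bytes, and filter_max carries one scale entry per output filter.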
void convert_to_hwn(int16_t** data_in, int num, int height, int width) {
  int16_t* tmp = *data_in;
  int16_t* data_tmp =
      (int16_t*)fpga_malloc(height * width * num * sizeof(int16_t));  // NOLINT
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
......@@ -254,16 +301,16 @@ void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
fpga_free(tmp);
}
size_t align_element_n(int16_t** data_in, int num, int height, int width) {
int unalign_n = num;
int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
int num_element = height * width * align_n;
if (unalign_n != align_n) {
    int16_t* tmp = *data_in;
    int16_t* data_tmp =
        (int16_t*)fpga_malloc(num_element * sizeof(int16_t));  // NOLINT
memset(data_tmp, 0, num_element * sizeof(int16_t));
for (int h = 0; h < height; h++) {
......@@ -276,17 +323,33 @@ size_t align_element_n(int16_t **data_in, int num, int height, int width) {
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
return num_element * sizeof(int16_t);
}
// Alternative fp16 conversion that writes into a caller-provided buffer
// (currently unused; quantize_to_fp16 below allocates its own).
void to_fp16(float* src, float16* dst, int num, int height, int width,
             float* scale_ptr) {
int size = num * height * width;
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int index = n * height * width + h * width + w;
float value = src[index] * scale_val;
dst[index] = float_to_half(value);
}
}
}
fpga_flush(dst, size * sizeof(int16_t));
}
void quantize_to_fp16(float** data_in, int num, int height, int width,
float* scale_ptr) {
float* tmp = *data_in;
int size = num * height * width;
  float16* tmp_data = (float16*)fpga_malloc(size * sizeof(float16));  // NOLINT
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
......@@ -298,18 +361,23 @@ void quantize_to_fp16(
}
}
fpga_flush(tmp_data, size * sizeof(int16_t));
  *data_in = (float*)tmp_data;  // NOLINT
fpga_free(tmp);
}
size_t format_dwconv_filter(float** data_in, int num, int height, int width,
                            float* scale_ptr) {
  quantize_to_fp16(data_in, num, height, width, scale_ptr);
  int16_t** quantize_data = (int16_t**)data_in;  // NOLINT
convert_to_hwn(quantize_data, num, height, width);
size_t size = align_element_n(quantize_data, num, height, width);
  fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
                                 height * width * sizeof(int16_t));
return size;
}
} // namespace filter
......
......@@ -18,40 +18,31 @@ limitations under the License. */
#include <cstdlib>
#include <cwchar>
#include <vector>
namespace paddle {
namespace zynqmp {
namespace filter {
void set_filter_capacity(uint32_t cap);
void set_colunm(uint32_t column);
int get_filter_num_alignment();
int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
float find_max(float* data_in, int data_size);
int8_t* format_filter(float* data_in, int& mem_size, int num, int channel,
                      int height, int width, int group_num, float max,
                      std::vector<float>& filter_max);
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
size_t align_element_n(int16_t** data_in, int num, int height, int width);
void quantize_to_fp16(float** data_in, int num, int height, int width,
                      float* scale_ptr);

size_t format_dwconv_filter(float** data_in, int num, int height, int width,
                            float* scale_ptr);
} // namespace filter
} // namespace zynqmp
......
......@@ -23,13 +23,12 @@ limitations under the License. */
#include <map>
#include <utility>
#include "lite/backends/fpga/KD/llapi/config.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
namespace paddle {
namespace zynqmp {
#define PADDLE_MOBILE_OS_LINUX
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
......@@ -39,21 +38,19 @@ static size_t memory_size_max = 0;
static size_t memory_size = 0;
static inline int do_ioctl(uint64_t req, const void *arg) {
#ifdef PADDLE_MOBILE_OS_LINUX
  return ioctl(fd, req, arg);
#else
  return -1;
#endif
}
int open_device() {
if (fd == -1) {
fd = open(device_path, O_RDWR);
}
return fd;
}
......@@ -66,13 +63,19 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef PADDLE_MOBILE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
  if (ptr == MAP_FAILED) {  // mmap64 signals failure with MAP_FAILED, not NULL
std::cout << "not enough memory !";
exit(-1);
}
memory_map.insert(std::make_pair(ptr, size));
memory_size += size;
if (memory_size > memory_size_max) {
......@@ -90,6 +93,8 @@ size_t fpga_get_memory_size_max() { return memory_size_max; }
size_t fpga_diagnose_memory(int detailed) {
size_t total = 0;
auto iter = memory_map.begin(); // std::map<void *, size_t>::iterator
while (iter != memory_map.end()) {
total += iter->second;
......@@ -108,7 +113,7 @@ void fpga_free(void *ptr) {
memory_size -= size;
#ifdef PADDLE_MOBILE_OS_LINUX
munmap(ptr, size);
#else
......@@ -126,6 +131,9 @@ int fpga_flush(void *address, size_t size) {
}
int fpga_invalidate(void *address, size_t size) {
struct MemoryCacheArgs args;
args.address = address;
args.size = size;
......@@ -150,18 +158,85 @@ void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int fpga_reset() {
struct FpgaResetArgs args;
return do_ioctl(IOCTL_FPGA_RESET, &args);
}
int ioctl_conv(const struct ConvArgs &args) {
  // Verbose ConvArgs field logging (under ENABLE_DEBUG) omitted for brevity.
  return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int compute_fpga_conv_basic(const struct ConvArgs &args) {
  // Verbose ConvArgs field logging (under ENABLE_DEBUG) omitted for brevity.
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int compute_fpga_conv(const struct SplitConvArgs &args) {
int split_num = args.split_num;
int ret = -1;
for (int i = 0; i < split_num; i++) {
ret = compute_fpga_conv_basic(args.conv_arg[i]);
}
......@@ -181,11 +256,15 @@ int compute_fpga_ewadd(const struct EWAddArgs &args) {
}
int get_device_info(const struct DeviceInfo &args) {
  int ret = do_ioctl(IOCTL_DEVICE_INFO, &args);
return ret;
}
int perform_bypass(const struct BypassArgs &args) {
int ret = -1;
int size = args.image.channels * args.image.width * args.image.height;
int max_size = 1 << 21;
......@@ -206,6 +285,7 @@ int perform_bypass(const struct BypassArgs &args) {
bypassArgs.image.height = 1;
bypassArgs.output.scale_address = scales;
float scale = 0;
for (int i = 0; i < count; ++i) {
bypassArgs.image.channels = max_size;
......@@ -213,7 +293,7 @@ int perform_bypass(const struct BypassArgs &args) {
reinterpret_cast<char *>(input_address + i * max_size * type_size);
bypassArgs.output.address =
reinterpret_cast<char *>(output_address + i * max_size * out_type_size);
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
if (ret != 0) {
......@@ -222,13 +302,18 @@ int perform_bypass(const struct BypassArgs &args) {
}
int remainder = size - max_size * count;
if (remainder > 0) {
bypassArgs.image.channels = remainder;
bypassArgs.image.address =
reinterpret_cast<char *>(input_address + count * max_size * type_size);
bypassArgs.output.address = reinterpret_cast<char *>(
output_address + count * max_size * out_type_size);
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
}
args.output.scale_address[0] = scale;
args.output.scale_address[1] = 1.0f / scale;
return ret;
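As a worked example of the chunking: for channels * width * height = 5,000,000
elements and max_size = 1 << 21 = 2,097,152, the loop issues two full-size
bypass calls (count = 2) and the remainder branch covers the final 805,696
elements; the reported output scale is the maximum over all chunks, stored
together with its reciprocal.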
......@@ -279,10 +364,21 @@ int compute_fpga_dwconv(const struct DWconvArgs &args) {
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
// float *in_scale = (float *)args.image.scale_address;
// std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
#endif
return do_ioctl(IOCTL_CONFIG_DWCONV, &args);
}
int config_activation(const struct ActiveParamterArgs& args) {
return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
}
// int config_power(const struct PowerArgs& args) {
// return do_ioctl(IOCTL_CONFIG_POWER, &args);
// }
int config_inplace(const struct InplaceArgs &args) {
return do_ioctl(IOCTL_CONFIG_INPLACE, &args);
}
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#ifndef PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#define PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#include <stdint.h>
#include <cstddef>
#include <iostream>
......@@ -40,6 +43,13 @@ enum DLayoutType {
LAYOUT_HWC = 0,
};
enum ActiveType {
TYPE_RELU = 0,
TYPE_RELU6 = 1,
TYPE_LEAK_RELU = 2,
TYPE_SIGMOID = 3,
};
struct VersionArgs {
void* buffer;
};
......@@ -48,7 +58,7 @@ struct DeviceInfo {
uint32_t filter_cap;
uint32_t version;
uint16_t device_type;
uint32_t colunm;
uint32_t reserved1;
uint32_t reserved2;
uint32_t reserved3;
......@@ -108,6 +118,7 @@ struct ConvArgs {
void* filter_scale_address;
uint32_t filter_num;
uint32_t group_num;
uint32_t dilation;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
......@@ -199,9 +210,16 @@ struct NormalizeParameterArgs {
uint32_t hight_width;
};
struct ActiveParamterArgs {
ActiveType type;
uint16_t leaky_relu_factor;
};
struct InplaceArgs {
bool leaky_relu_enable;
bool relu_enable;
bool sigmoid_enable;
bool relu6_enable;
bool power_enable;
bool normalize_enable;
};
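A hedged usage sketch, mirroring how ConvPE::dispatch() drives these flags
around a conv call:

  paddle::zynqmp::InplaceArgs inplace = {0};
  inplace.relu_enable = true;
  paddle::zynqmp::config_inplace(inplace);  // fuse ReLU into subsequent convs
  // ... compute_fpga_conv_basic(...) ...
  inplace.relu_enable = false;
  paddle::zynqmp::config_inplace(inplace);  // restore pass-through behavior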
......@@ -216,7 +234,9 @@ struct FpgaRegReadArgs {
uint64_t value;
};
struct FpgaResetArgs {
  uint32_t val;
};
#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4)
......@@ -248,6 +268,8 @@ struct FpgaResetArgs {};
_IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs)
#define IOCTL_CONFIG_NORMALIZE_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs)
#define IOCTL_CONFIG_ACTIVATION_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 43, struct ActiveParamterArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs)
#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs)
......@@ -331,6 +353,7 @@ int compute_fpga_scale(const struct ScaleArgs& args);
int compute_fpga_concat(const struct ConcatArgs& args);
int compute_fpga_resize(const struct ResizeArgs& args);
int config_activation(const struct ActiveParamterArgs& args);
int config_power(const struct PowerArgs& args);
int compute_fpga_dwconv(const struct DWconvArgs& args);
int config_norm_param(const struct NormalizeParameterArgs& args);
......@@ -341,7 +364,11 @@ int config_inplace(const struct InplaceArgs& args);
int flush_cache(void* addr, int size);
int invalidate_cache(void* addr, int size);
int fpga_reset();
int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
} // namespace zynqmp
} // namespace paddle
#endif // PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
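A small round-trip sketch for the conversion helpers declared above (the
result is approximate, within fp16 precision):

  int16_t half = paddle::zynqmp::fp32_2_fp16(0.1f);
  float back = paddle::zynqmp::fp16_2_fp32(half);  // ~0.1f after fp16 rounding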
File mode changed from 100644 to 100755
......@@ -26,6 +26,7 @@ namespace zynqmp {
struct ReLUParam {
public:
bool enabled = false;
float leaky_relu_factor = 0.0f;
};
struct PEParam {
......@@ -98,6 +99,24 @@ struct DepthwiseConvParam : ConvParam {
Tensor* quantizedFilter_ = new Tensor();
};
struct GRUParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* h0 = nullptr;
Tensor* weight = nullptr;
Tensor* bias = nullptr;
Tensor* batch_gate = nullptr;
Tensor* batch_reset_hidden_prev = nullptr;
Tensor* batch_hidden = nullptr;
Tensor* hidden = nullptr;
std::string gate_activation = "sigmoid";
  std::string activation = "tanh";
bool is_reverse = false;
bool origin_mode = false;
};
enum PoolingType : int {
MAX = 0,
AVERAGE = 1,
......@@ -133,6 +152,12 @@ struct ElementwiseAddParam : PEParam {
EWAddArgs ewargs;
};
struct ElementwiseMulParam : PEParam {
public:
std::vector<Tensor*> inputs;
Tensor* output = nullptr;
};
struct FullyConnectedParam : PEParam {
public:
Tensor* input = nullptr;
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -49,7 +49,118 @@ class ConvPE : public PE {
concatPE_.init();
concatPE_.apply();
}
    if (DLEngine::get_instance().isZU3() &&
        param_.input->shape().dimSize() == 4 &&
        param_.input->shape().width() == 1 &&
        param_.input->shape().channel() >= 2048) {
      // NOTE: the original condition tested width() twice (== 1 and >= 2048),
      // which can never hold; channel() is the likely intent.
      use_cpu_ = true;
    }
if (param_.filter->shape().width() == 1 &&
param_.filter->shape().height() == 1) {
use_cpu_ = true;
}
    if (!use_cpu_) {
      // param_.filter->releaseData();
    }
}
void cpu_conv_hwc() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
Tensor float_input;
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
float* out = float_output.mutableData<float>(FP32, output->shape());
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
int image_pad_h = param_.paddings[0];
int image_pad_w = param_.paddings[1];
int kernel_height = param_.filter->shape().height();
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
int pooled_height_ = output->shape().height();
int pooled_width_ = out_width;
int filter_chw = image_channels * kernel_height * kernel_width;
float max = 0;
for (int ph = 0; ph < pooled_height_; ph++) {
for (int pw = 0; pw < pooled_width_; pw++) {
int hstart = ph * kernel_step_h - image_pad_h;
int wstart = pw * kernel_step_w - image_pad_w;
int hend = std::min(hstart + kernel_height, (int) image_height);
int wend = std::min(wstart + kernel_width, (int) image_width);
hstart = std::max(hstart, (int) 0);
wstart = std::max(wstart, (int) 0);
for (int oc = 0; oc < out_channel; oc++) {
float sum = 0.0f;
const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
for (int c = 0; c < image_channels; c++) {
for (int h = hstart; h < hend; h++) {
int hi = 0;
if (ph == 0) {
hi = h - hstart + image_pad_h;
                } else {
hi = h - hstart;
}
for (int w = wstart; w < wend; w++) {
int wi = 0;
if (pw == 0) {
wi = w - wstart + image_pad_w;
                  } else {
wi = w - wstart;
}
const int index = (h * image_width + w) * image_channels + c;
int weight_index = oc * filter_chw + kernel_width *
kernel_height * c + kernel_width * hi + wi;
float value = image_addr[index] * filter_data[weight_index];
sum += value;
}
}
}
          if (param_.relu.enabled && sum < 0) {
            // ReLU clamps negatives to zero (cpu_compute below does the same;
            // the original `sum = -sum` here looks like a typo).
            sum = 0;
          }
if (sum > max) {
max = sum;
}
out[pool_index] = sum;
}
}
}
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
}
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
......@@ -59,79 +170,161 @@ class ConvPE : public PE {
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
float* out = float_output.mutableData<float>(FP32, output->shape());
float* bias_data = param_.bias()->data<float>();
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
    float max = 0;
    int out_index = 0;
    for (int i = 0; i < out_channel; i++) {
      float* filter_ptr = filter_data + i * in_channel;
      for (int h = 0; h < output->shape().height(); h++) {
        for (int w = 0; w < output->shape().width(); w++) {
          float sum = 0;
          for (int j = 0; j < in_channel; j++) {
            int image_index = h * out_width * in_channel + w * in_channel + j;
            sum += image_addr[image_index] * filter_ptr[j];
          }
          sum += bias_data[i];
          if (param_.relu.enabled && sum < 0) {
            sum = 0;
          }
          if (sum > max) {
            max = sum;
          }
          out_index = h * out_width * out_channel + w * out_channel + i;
          out[out_index] = sum;
        }
      }
    }
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
}
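The running max doubles as the quantization range here: scale()[0] = max / 127
maps int8 values back to floats and scale()[1] = 127 / max is its reciprocal,
matching the (scale, inverse scale) pair the FPGA ops write (see
perform_bypass above).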
bool dispatch() {
if (use_cpu_) {
cpu_compute();
return true;
}
inplace_.leaky_relu_enable =
(param_.relu.leaky_relu_factor != 0) ? true : false;
inplace_.relu_enable =
inplace_.leaky_relu_enable ? false : param_.relu.enabled;
inplace_.power_enable = false;
inplace_.normalize_enable = false;
    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
activeParamterArgs.type = TYPE_LEAK_RELU;
activeParamterArgs.leaky_relu_factor =
fp32_2_fp16(param_.relu.leaky_relu_factor);
config_activation(activeParamterArgs);
}
}
std::vector<BasicConvParam*>& params = param_.splitParams();
int ret = 0;
for (auto conv_param : params) {
ret |= compute_fpga_conv_basic(conv_param->args);
}
    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
      // Remember the leaky flag before clearing it; the original code tested
      // it after zeroing, so the activation reset below never fired.
      bool was_leaky_relu = inplace_.leaky_relu_enable;
      inplace_.relu_enable = false;
      inplace_.leaky_relu_enable = false;
      config_inplace(inplace_);
      if (was_leaky_relu) {
        activeParamterArgs.type = TYPE_LEAK_RELU;
        activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
        config_activation(activeParamterArgs);
      }
    }
size_t size = params.size();
if (split_axis == 0 && ret == 0 && size > 1) {
concatPE_.dispatch();
}
if (split_axis == 1 && ret == 0 && size > 1) {
// for (int n = 0; n < size - 1; n++) {
ElementwiseAddParam& add_param = addPE_.param();
add_param.inputs = {&params[0]->output, &params[1]->output};
add_param.output = param_.output;
addPE_.init();
addPE_.apply();
addPE_.dispatch();
// }
}
return ret == 0;
}
ConvParam& param() { return param_; }
private:
bool use_cpu_ = false;
ConvParam param_;
ConcatPE concatPE_;
ElementwiseAddPE addPE_;
int split_axis = 0;
InplaceArgs inplace_ = {0};
ActiveParamterArgs activeParamterArgs;
};
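// Usage sketch (hypothetical tensors; the PE lifecycle used across this
// backend is init() -> apply() -> dispatch()):
//   ConvPE pe;
//   ConvParam& p = pe.param();
//   p.input = &input;  p.output = &output;  p.filter = &filter;
//   pe.init();
//   pe.apply();     // splits the filter and formats weights and scale/bias
//   pe.dispatch();  // runs on the FPGA, or falls back to cpu_compute()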
} // namespace zynqmp
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#ifndef conv_process_hpp
#define conv_process_hpp
#include <string.h>
#include <cmath>
#include <vector>
......@@ -45,7 +48,10 @@ inline int get_split_num(Tensor* filter) {
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
  int filter_num_alignment = filter::get_filter_num_alignment();
  int aligned_num = align_to_x(num, filter_num_alignment);
  return filter::calc_split_num(aligned_num, div_capacity);
}
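// Worked example (hypothetical numbers): for a 3x3x512 filter, chw = 4608;
// if calc_division_capacity(4608) returned 1024 filters per pass and the
// aligned filter count were 2048, calc_split_num would report 2 passes.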
inline void fill_scale_bias_const(ConvParam* param_) {
......@@ -79,8 +85,7 @@ inline void combine_bn_params(BatchnormParam* bn, ConvParam* param_) {
}
}
inline void combine_add_bn_params(BatchnormParam* bn, Tensor* bias,
                                  ConvParam* param_) {
int channel = param_->output->shape().channel();
Shape sb_shape(N, {channel});
......@@ -112,59 +117,102 @@ inline void combine_add_bn_params(BatchnormParam* bn,
param_->bias()->setDataLocation(CPU);
}
inline void format_scale_bias(Tensor* scale, Tensor* bias, Tensor* filter,
                              Tensor* scale_bias, int group) {
  float* scale_data = nullptr;
  float* bias_data = nullptr;
  if (scale != nullptr) {
    scale_data = scale->data<float>();
  }
  if (bias != nullptr) {
    bias_data = bias->data<float>();
  }
  int channel = filter->shape().num();
  int scale_bias_len = align_to_x(channel / group, BS_NUM_ALIGNMENT) * group;

  int c_per_group = channel / group;
  int aligned_c_per_group = align_to_x(channel / group, BS_NUM_ALIGNMENT);

  Shape bias_scale_shape(N, {2 * scale_bias_len});
  float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape);
  float* temp_data = reinterpret_cast<float*>(
      fpga_malloc(2 * scale_bias_len * sizeof(float)));
  memset(temp_data, 0, 2 * scale_bias_len * sizeof(float));

  std::vector<float> scales;
  if (scale_data != nullptr) {
    for (int i = 0; i < channel; ++i) {
      scales.push_back(scale_data[i]);
    }
    for (int i = 0; i < scale_bias_len - channel; i++) {
      scales.push_back(1);
    }
  } else {
    for (int i = 0; i < scale_bias_len; i++) {
      scales.push_back(1);
    }
  }

  // default to identity scale and zero bias
  for (int i = 0; i < scale_bias_len; ++i) {
    temp_data[i + scale_bias_len] = 1;
    temp_data[i] = 0;
  }

  // pack [bias | scale] per group, padded to the alignment
  for (int g = 0; g < group; g++) {
    for (int c = 0; c < c_per_group; c++) {
      int src_index = g * c_per_group + c;
      int dst_index = g * aligned_c_per_group + c;
      temp_data[dst_index + scale_bias_len] = scales[src_index];
      temp_data[dst_index] = bias_data == nullptr ? 0 : bias_data[src_index];
    }
  }

  bias_scale::format_bias_scale_array(&temp_data, scale_bias_len / group,
                                      scale_bias_len);
  memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float));
  fpga_free(temp_data);
}
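// Packed layout produced above: temp_data[0 .. scale_bias_len) holds biases
// and temp_data[scale_bias_len .. 2 * scale_bias_len) holds scales, each
// group padded to BS_NUM_ALIGNMENT entries. E.g. (hypothetical) channel = 6,
// group = 2 and BS_NUM_ALIGNMENT = 8 give scale_bias_len = 16, with the
// three real channels of each group starting at offsets 0 and 8.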
inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group,
                          std::vector<float>& scales) {
  float max_value = find_max(*filter);
  Shape& filter_shape = filter->shape();

  int mem_size;
  std::vector<float> max_values;
  int8_t* quantized_data = filter::format_filter(
      filter->data<float>(), mem_size, filter_shape.num(),
      filter_shape.channel(), filter_shape.height(), filter_shape.width(),
      group, max_value, max_values);

  float mem_factor = mem_size * 1.0f / filter->shape().numel();
  quantized_filter->setMemScale(mem_factor);
  quantized_filter->setAligned(true);
  int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
  quantized_filter->scale()[0] = max_value / 127.0f;
  quantized_filter->scale()[1] = 127.0f / max_value;

  memcpy(src, quantized_data, mem_size);
  quantized_filter->flush();

  for (size_t i = 0; i < max_values.size(); i++) {
    scales.push_back(max_values[i] / max_value);
  }
}
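// The scales returned above are each filter's own maximum relative to the
// global maximum; split_filter_num() multiplies them back into the
// per-channel scale so filters quantized against a smaller local range stay
// numerically consistent.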
inline void format_dw_filter(Tensor* filter, Tensor* quantized_filter,
                             float* scale) {
int num = filter->shape().num();
int height = filter->shape().height();
......@@ -207,10 +255,18 @@ inline void split_filter_num(const ConvParam& c_param) {
Tensor* out = param.output;
Tensor* filter = param.filter;
auto channel = out->shape().channel();
  int filter_num_per_div = get_filter_num_per_div(filter, param.groups);

  auto chw = filter->shape().channel() * filter->shape().height() *
             filter->shape().width();
  auto num = filter->shape().num();
  int div_capacity = filter::calc_division_capacity(chw);
  int filter_num_alignment = filter::get_filter_num_alignment();
  int aligned_num =
      align_to_x(num / param.groups, filter_num_alignment) * param.groups;
  int split_num = filter::calc_split_num(aligned_num, div_capacity);
Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
......@@ -224,23 +280,20 @@ inline void split_filter_num(const ConvParam& c_param) {
ConvArgs& args = conv_param->args;
    if (split_num == 1) {
      out_address = out->data<float16>();
      out_scale_address = out->scale();
    }
    filter_num = i == split_num - 1
                     ? channel - (split_num - 1) * filter_num_per_div  // NOLINT
                     : filter_num_per_div;
    if (split_num != 1) {
      Shape shape(NHWC, {1, out_shape.height(), out_shape.width(), filter_num});
      out_address = conv_param->output.mutableData<float16>(FP16, shape);
      out_scale_address = conv_param->output.scale();
    }
    Shape f_shape(NCHW, {filter_num, filter->shape().channel(),
                         filter->shape().height(), filter->shape().width()});
Tensor new_filter;
float* new_filter_data = new_filter.mutableData<float>(FP32, f_shape);
......@@ -251,9 +304,16 @@ inline void split_filter_num(const ConvParam& c_param) {
filter->data<float>() + i * filter_num_per_div * filter_hwc,
filter_num * filter_hwc * sizeof(float));
new_filter.flush();
    conv_param->filter.mutableData<float>(FP32, f_shape);

    if (param.groups != 1) {
      // TODO: derive from filter::get_filter_num_alignment() instead of 32
      int mem_factor = 32 / filter_num_per_div;
      conv_param->filter.setMemScale(mem_factor);
    }

    std::vector<float> filter_scales;  // per-filter max / global max
    format_filter(&new_filter, &(conv_param->filter), param.groups,
                  filter_scales);
conv_param->filter.setDataType(INT8);
int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
Tensor scale;
......@@ -264,23 +324,30 @@ inline void split_filter_num(const ConvParam& c_param) {
Shape s_shape(N, {filter_num});
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
std::cout << "v size: " << v.size() << std::endl;
for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
// scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
}
for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
}
Shape sb_shape(N, {sb_num});
    format_scale_bias(&scale, &bias, &conv_param->filter,
                      &conv_param->scaleBias, param.groups);
    conv_param->scaleBias.flush();
    float* bs_data = conv_param->scaleBias.data<float>();
args.group_num = param.groups;
args.relu_enabled = param.relu.enabled;
args.sb_address = conv_param->scaleBias.data<float>();
args.sb_address = bs_data;
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
args.kernel.height = new_filter.shape().height();
......@@ -294,17 +361,13 @@ inline void split_filter_num(const ConvParam& c_param) {
args.image.channels = input->shape().channel();
args.image.width = input->shape().width();
args.image.height = input->shape().height();
    args.image.pad_width = param.paddings[1];
    args.image.pad_height = param.paddings[0];
    // TODO: assumes dilations[0] == dilations[1]
    args.dilation = param.dilations[0];
    args.output.address = out_address;
    args.output.scale_address = out_scale_address;
param.splitParams().push_back(conv_param);
}
}
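// split_filter_num() splits a convolution whose filter count exceeds one
// pass's capacity into several BasicConvParams along the output-channel
// axis; ConvPE::dispatch() concatenates the partial outputs (split_axis 0).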
......@@ -317,7 +380,7 @@ inline void split_channel(const ConvParam& c_param) {
int num = ceil(input->shape().channel() * 1.0f / 2047);
int channel = input->shape().channel() / num;
std::cout << "channel::" << channel << "num::" << num << std::endl;
Shape bs_shape(N, {channel});
for (int i = 0; i < num; i++) {
......@@ -331,6 +394,7 @@ inline void split_channel(const ConvParam& c_param) {
// filter transformation;
Shape f_shape(NCHW, {param.filter->shape().num(), channel, 1, 1});
Tensor new_filter;
float* dst = new_filter.mutableData<float>(FP32, f_shape);
......@@ -341,7 +405,8 @@ inline void split_channel(const ConvParam& c_param) {
src += param.filter->shape().channel();
}
new_filter.flush();
    std::vector<float> scales;
    format_filter(&new_filter, &(conv_param->filter), param.groups, scales);
Tensor bias;
Tensor scale;
......@@ -354,12 +419,11 @@ inline void split_channel(const ConvParam& c_param) {
}
scale.flush();
bias.flush();
    format_scale_bias(&scale, &bias, &conv_param->filter,
                      &conv_param->scaleBias, param.groups);
    conv_param->scaleBias.flush();
ConvArgs& args = conv_param->args;
args.group_num = param.groups;
......@@ -379,18 +443,12 @@ inline void split_channel(const ConvParam& c_param) {
args.image.channels = conv_param->input.shape().channel();
args.image.width = conv_param->input.shape().width();
args.image.height = conv_param->input.shape().height();
    args.image.pad_width = param.paddings[1];
    args.image.pad_height = param.paddings[0];
    // TODO: assumes dilations[0] == dilations[1]
    args.dilation = param.dilations[0];
    args.output.address = conv_param->output.mutableData<void>();
    args.output.scale_address = conv_param->output.scale();
param.splitParams().push_back(conv_param);
}
}
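// split_channel() splits along the input-channel axis instead (used when the
// channel count exceeds the 2047 hardware limit); the partial results are
// combined by elementwise addition (split_axis 1 in ConvPE::dispatch()).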
......@@ -407,6 +465,7 @@ inline int fill_split_arg(const ConvParam& c_param) {
split_filter_num(c_param);
return 0;
}
// split_filter_num(c_param);
}
inline bool compute_conv(const ConvParam& c_conv_params) {
......@@ -418,11 +477,11 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
}
size_t size = params.size();
......@@ -431,3 +490,5 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
} // namespace zynqmp
} // namespace paddle
#endif /* conv_process_hpp */
File mode changed from 100644 to 100755
......@@ -36,19 +36,36 @@ class DepthwiseConvPE : public PE {
Tensor* input = param.input;
Tensor* output = param.output;
int channel = output->shape().channel();
    float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
    if (param_.bias()->dataType() == FP32) {
      float* new_bias_data = param_.bias()->data<float>();
      // convert the bias from float to float16
      for (int i = 0; i < channel; i++) {
        b_data[i] = float_to_half(new_bias_data[i]);
      }
    } else {
      float16* new_bias_data = param_.bias()->data<float16>();
      memcpy(b_data, new_bias_data, channel * sizeof(float16));
    }
    bias_.flush();
    if (param_.scale()->dataType() == FP32) {
      float* new_scale_data = param_.scale()->data<float>();
      Tensor* quantized_filter = param.quantizedFilter();
      quantized_filter->mutableData<float16>(FP16, param.filter->shape());
      format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
    } else {
      // TODO: only valid when the filter is all ones and the channel count
      // is already aligned
      float16* scale_data = param_.scale()->data<float16>();
      float16* filter_data = param.quantizedFilter()->mutableData<float16>(
          FP16, param.filter->shape());
      memcpy(filter_data, scale_data,
             param.filter->shape().numel() * sizeof(float16));
      param.quantizedFilter()->flush();
    }
DWconvArgs args = {0};
args.bias_address = b_data;
......@@ -61,21 +78,14 @@ class DepthwiseConvPE : public PE {
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
    args.image.pad_width = param.paddings[0];
    args.image.pad_height = param.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->data<void>();
args.output.scale_address = output->scale();
args.out_width = param.output->shape().width();
args.out_height = param.output->shape().height();
args.sub_conv_num = 1;
param.args = args;
inplace_.relu_enable = param_.relu.enabled;
......@@ -106,4 +116,4 @@ class DepthwiseConvPE : public PE {
};
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
namespace paddle {
namespace zynqmp {
class ElementwiseMulPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
output->setDataLocation(Device);
return true;
}
void apply() {
Tensor* input = param_.inputs[0];
Tensor* output = param_.output;
int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
Shape s(N, {wc_aligned});
float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
memset(bias_data, 0, wc_aligned * sizeof(float16));
ScaleArgs& args = args_;
args.scale_address = param_.inputs[1]->data<void>();
args.bias_address = bias_tensor.data<void>();
args.wc_alignment = wc_aligned;
args.channel_alignment = wc_aligned;
args.image.address = input->data<void>();
args.image.scale_address = input->scale();
args.image.channels = wc_aligned;
args.image.height = 1;
args.image.width = 1;
args.image.pad_width = 0;
args.image.pad_height = 0;
args.output.address = output->data<void>();
args.output.scale_address = output->scale();
}
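  // apply() maps the elementwise multiply onto the FPGA scale unit: the
  // first input is treated as a single 1x1 "image" with wc_aligned channels,
  // the second input supplies the per-element scale, and the bias vector is
  // zeroed, so the unit computes output = input0 * input1.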
void updateInput(Tensor* t, int index) {
if (index == 0) {
      args_.scale_address = t->data<void>();  // TODO: replace inputs?
}
}
  bool dispatch() {
    // propagate the FPGA return code instead of discarding it
    return compute_fpga_scale(args_) == 0;
  }
ElementwiseMulParam& param() { return param_; }
private:
ElementwiseMulParam param_;
ScaleArgs args_ = {0};
Tensor bias_tensor;
};
} // namespace zynqmp
} // namespace paddle
......@@ -37,7 +37,16 @@ class FullyConnectedPE : public PE {
ConvParam& convParam_ = convPE_.param();
Tensor* input = param_.input;
convParam_.input = param_.input;
num_ = param_.input->shape().num();
convParam_.output = param_.output;
convParam_.groups = 1;
convParam_.strides = {1, 1};
convParam_.paddings = {0, 0};
......@@ -63,9 +72,11 @@ class FullyConnectedPE : public PE {
new_filter_data[i * chw + j] = scale;
}
}
// conv_filter->copyFrom(param_.filter);
conv_filter->flush();
convParam_.filter = conv_filter;
// convParam_.filter = param_.filter;
Shape sb_shape(N, {channel});
float* scale_data = convParam_.scale()->mutableData<float>(FP32, sb_shape);
......@@ -82,13 +93,31 @@ class FullyConnectedPE : public PE {
convPE_.apply();
}
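  // apply() appears to lower the fully-connected layer to a 1x1 convolution
  // over the flattened input, so dispatch() can reuse ConvPE end to end.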
  bool dispatch() { return convPE_.dispatch(); }
FullyConnectedParam& param() { return param_; }
private:
FullyConnectedParam param_;
ConvPE convPE_;
Tensor tempOut_;
int num_ = 1;
};
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp"
#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp"
#include "lite/backends/fpga/KD/pes/relu_pe.hpp"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/arm/math/sgemm.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/api/paddle_place.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace zynqmp {
struct GRUTensors {
Tensor* gate;
Tensor* pre_output;
Tensor* output;
Tensor* reset_output;
};
class GRUPE : public PE {
public:
bool init() {
// Tensor* output = param_.output;
// output->setAligned(true);
// output->setDataLocation(Device);
return true;
}
void apply() {
auto hidden = param_.hidden;
// auto hidden_dims = hidden->dims();
int frame_size = hidden->shape().channel();
zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
float16* prev_hidden_data = prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
// set previous hidden data to 0;
memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
// copy 2/3 weight from param.weight;
zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memset(weight_data, 0, weight_shape.numel() * sizeof(float));
weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memcpy(weight_data, param_.weight->data<float>(), weight_shape.numel() * sizeof(float));
Shape gate_shape(zynqmp::NC, {1, frame_size * 2});
gate_ping_.mutableData<void>(FP32, gate_shape);
gate_pong_.mutableData<void>(FP16, gate_shape);
zynqmp::FullyConnectedParam& pre_out_param = pre_out_pe_.param();
pre_out_param.input = &prev_hidden_;
pre_out_param.output = &gate_pong_;
pre_out_param.filter = &weight_;
pre_out_param.bias = &gate_ping_;
pre_out_pe_.init();
pre_out_pe_.apply();
// // ============= C
// ElementwiseAddParam& bias_add_param = bias_ew_pe_.param();
// bias_add_param.inputs = {&pre_output_, &pre_input_};
// bias_add_param.output = &pre_input_;
// bias_ew_pe_.init();
// bias_ew_pe_.apply();
// // ====================
// Shape state_weight_shape(NC,{frame_size, frame_size});
// float* state_weight_data = state_weight_.mutableData<float>(FP32, state_weight_shape);
// memcpy(state_weight_data, weight_data + 2 * frame_size * frame_size,
// state_weight_shape.numel() * sizeof(float));
// FullyConnectedParam& reset_out_param = reset_out_pe_.param();
// reset_out_param.input = &prev_hidden;
// reset_out_param.output = &gate_ping;
// reset_out_param.filter = &state_weight_;
// // ============== unit reset;
// update_gate_.mutableData<void>(FP16, pre_input_shape);
// InputParam& relu_param = update_relu_pe_.param();
// relu_param.input = &tempTensor;
// relu_param.output = &update_gate_;
// update_relu_pe_.init();
// update_relu_pe_.apply();
reset_gate_.mutableData<void>(FP16, hidden_shape);
prev_hidden_.mutableData<void>(FP16, hidden_shape);
reset_hidden_.mutableData<void>(FP16, hidden_shape);
// InputParam& reset_param = reset_relu_pe_.param();
// reset_param.input = &tempTensor;
// reset_param.output = &reset_gate_;
// reset_relu_pe_.init();
// reset_relu_pe_.apply();
// float16* prev_data = prev_.mutableData<float16>(FP16, pre_input_shape);
// memset(prev_data, 0, (pre_input_shape.numel() + 32) * sizeof(float16)); // TODO
// reset_hidden_prev_.mutableData<float16>(FP16, pre_input_shape);
ElementwiseMulParam& mul_param = mul_pe_.param();
mul_param.inputs = {&reset_gate_, &prev_hidden_};
mul_param.output = &reset_hidden_;
mul_pe_.init();
mul_pe_.apply();
// ==============
}
bool dispatch() {
return true;
}
void gru_unit_reset_act(const lite_api::ActivationType active_gate, GRUTensors& value,
int frame_size, int batch_size) {
int stride_update = 3 * frame_size;
int stride_cell_state = 3 * frame_size;
int stride_hidden_prev = frame_size;
int stride_hidden = frame_size;
// Tensor* gate = value.gate;
// value.gate->saveToFile("value_input.txt");
float* update_gate_data = gate_ping_.data<float>();
float* reset_gate_data = update_gate_data + frame_size;
for (int b = 0; b < batch_size; b++) {
// memcpy(tempTensor.data<void>(), reset_gate_data, gate->shape().numel() * sizeof(float));
// tempTensor.flush();
Tensor tmp;
Shape s(NC, {1, frame_size}); //TODO
float* tmp_data = tmp.mutableData<float>(FP32, s);
for (int i = 0; i < frame_size; i++) {
// f(x) = x / (1 + abs(x))?
update_gate_data[i] = lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(update_gate_data[i]);
reset_gate_data[i] = lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(reset_gate_data[i]);
}
memcpy(tmp_data, reset_gate_data, frame_size * sizeof(float));
tmp.flush();
reset_gate_.copyFrom(&tmp);
// reset_gate_.copyFrom(&tempTensor);
Tensor* hidden_prev = value.pre_output;
if (hidden_prev) {
// memcpy(prev_data, )
// TODO change to pre_out;
prev_hidden_.copyFrom(value.pre_output);
}
// // 4.0 reset_date * hidden_prev;
// // reset_hidden_prev[i] = reset_gate[i] * prev;
mul_pe_.dispatch();
update_gate_data += stride_update;
reset_gate_data += stride_update;
// reset_hidden_prev += stride_hidden;// TODO
}
}
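  // Reset phase computed above, per batch row, in the usual GRU notation:
  //   u_t = sigmoid(gate[0 .. frame_size))              (update gate)
  //   r_t = sigmoid(gate[frame_size .. 2 * frame_size)) (reset gate)
  //   reset_hidden = r_t * h_{t-1}                      (elementwise, mul_pe_)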
void gru_unit_out_act(const lite_api::ActivationType active_node, bool origin_mode,
GRUTensors& value, int frame_size, int batch_size) {
// int stride_update = 3 * frame_size;
// int stride_cell_state = 3 * frame_size;
// int stride_hidden_prev = frame_size;
// int stride_hidden = frame_size;
// Tensor* hidden = value.output_value;
// float* hidden_prev = nullptr;
// if (hidden) {
// hidden_prev = hidden->data<float>();
// }
// float* cell_state = value.gate->data<float>() + 2 * frame_size;
// float* updata_gate = value.gate->data<float>();
// // float* reset_gate_data = update_gate_data + frame_size;
// float prev = 0.0f;
// for (int b = 0; b < batch_size; ++b) {
// if (origin_mode) {
// // for (int i = 0; i < frame_size; i++) {
// // float prev = 0;
// // if (hidden_prev) {
// // prev = hidden_prev[i];
// // }
// // cell_state[i] = lite::arm::math::active_f32<kSigmoid>(cell_state[i]);
// // hidden[i] =
// // cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] * prev;
// // }
// } else {
// for (int i = 0; i < frame_size; ++i) {
// cell_state[i] = lite::arm::math::active_f32<lite_api::ActivationType::kRelu>(cell_state[i]);
// if (hidden_prev) {
// prev = hidden_prev[i];
// }
// float hidden_value =
// prev * (1.f - updata_gate[i]) + updata_gate[i] * cell_state[i];
// hidden_prev[i] = hidden_value;
// std::cout << "hidden_value::" << hidden_value << std::endl;
// }
// }
// updata_gate += stride_update;
// cell_state += stride_cell_state;
// hidden_prev += frame_size;
// }
}
void copy_input(GRUTensors& value) {
float max = find_max(*(value.gate));
gate_ping_.mutableData<void>(FP32, value.gate->shape());
gate_ping_.copyFrom(value.gate);
// TODO update input pointer?
// gate_.readFromFile("input/in.txt");
// // pre_input_.saveToFile("pppp_in.txt");
// gate_.scale()[0] = max / 127;
// gate_.scale()[1] = 127 / max;
// gate_.printScale("pre_input_");
// gate_.saveToFile("pre_input_.txt");
// pre_out_pe_.dispatch();
// pre_output_.saveToFile("pp_out.txt");
}
void GRUCOmpute(GRUTensors& value,
int frame_size,
int batch_size,
const lite_api::ActivationType active_node,
const lite_api::ActivationType active_gate,
bool origin_mode) {
copy_input(value);
if (value.pre_output) {
// copy by batch;
pre_out_pe_.dispatch();
gate_ping_.copyFrom(&gate_pong_);
}
gru_unit_reset_act(active_gate, value, frame_size, batch_size);
// if (value.pre_output) {
// // state weight;
// reset_out_pe_.dispatch();
// }
// gru_unit_out_act(active_node, origin_mode, value, frame_size, batch_size);
}
GRUParam& param() { return param_; }
// Tensor* preOutput() {
// return &pre_output_;
// }
// Tensor* gate() {
// return &gate_;
// }
Tensor* updateGate() {
return &update_gate_;
}
Tensor* resetGate() {
return &reset_gate_;
}
private:
GRUParam param_;
zynqmp::Tensor gate_ping_;
zynqmp::Tensor gate_pong_;
zynqmp::Tensor bias_;
zynqmp::Tensor weight_;
zynqmp::Tensor state_weight_;
// =================================
zynqmp::Tensor update_gate_;
zynqmp::Tensor reset_gate_;
zynqmp::Tensor cell_state_;
zynqmp::Tensor prev_hidden_;
zynqmp::Tensor reset_hidden_;
Tensor tempTensor;
// =================================
ReluPE update_relu_pe_;
ReluPE reset_relu_pe_;
zynqmp::ElementwiseMulPE mul_pe_;
zynqmp::FullyConnectedPE pre_out_pe_;
zynqmp::FullyConnectedPE reset_out_pe_;
zynqmp::ElementwiseAddPE bias_ew_pe_;
};
} // namespace zynqmp
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/backends/arm/math/gru_utils.h"
namespace paddle {
namespace lite {
namespace fpga {
// inline void gru_unit_reset_act(lite_api::ActivationType act_type,
// GRUMetaValue<float> value,
// int frame_size,
// int batch_size) {
// auto updata_gate = value.gate_value;
// auto reset_gate = value.gate_value + frame_size;
// auto hidden_prev = value.prev_out_value;
// auto reset_hidden_prev = value.reset_output_value;
// int stride_update = 3 * frame_size;
// int stride_reset = 3 * frame_size;
// int stride_hidden_prev = frame_size;
// int stride_reset_hidden_prev = frame_size;
// if (act_type == kRelu) {
// }
// }
// void gru_compute(arm::math::GRUMetaValue<float> value,
// int frame_size,
// int batch_size,
// const lite_api::ActivationType active_node,
// const lite_api::ActivationType active_gate,
// bool origin_mode) {
// std::cout << " =================== gru gru_compute =================== \n";
// // exit(-1);
// // sgemm(bool is_transA,
// // bool is_transB,
// // int M,
// // int N,
// // int K,
// // float alpha,
// // const float* A,
// // int lda,
// // const float* B,
// // int ldb,
// // float beta,
// // float* C,
// // int ldc,
// // const float* bias,
// // bool is_bias,
// // bool is_relu,
// // ARMContext* ctx);
// // sgemm for fc;
// // lite::arm::math::sgemm(false,
// // false,
// // m_,// batch;
// // n_,// filter num;
// // k_,// input_channel;
// // 1.f,
// // i_data,// input data;
// // k_,
// // w_data,// weight data;
// // n_,
// // 0.f,//beta;
// // o_data,// out data;
// // n_,
// // b_data,// bias;
// // false,
// // false,
// // &ctx);
// // C := alpha*op( A )*op( B ) + beta*C,
// if (value.prev_out_value) {
// // sgemm(false, // is_transA
// // false, // is_transB
// // batch_size, // M specifies the number of rows of the matrix
// // frame_size * 2, // N specifies the number of columns of the matrix
// // frame_size, // K
// // 1.f, // alpha
// // value.prev_out_value, // float* A,
// // frame_size, // lda
// // value.gate_weight, // float* B,
// // frame_size * 2, // ldb
// // 1.f, // beta
// // value.gate_value, // C*
// // frame_size * 3, // ldc
// // nullptr, // bias
// // false, // is_bias
// // false, // is_relu
// // ctx); // context
// }
// // gru_unit_reset_act(active_gate, value, frame_size, batch_size);
// if (value.prev_out_value) {
// // sgemm(false,
// // false,
// // batch_size,
// // frame_size,
// // frame_size,
// // 1.f,
// // value.reset_output_value,
// // frame_size,
// // value.state_weight,
// // frame_size,
// // 1.f,
// // value.gate_value + frame_size * 2,
// // frame_size * 3,
// // nullptr,
// // false,
// // false,
// // ctx);
// }
// // gru_unit_out_act(active_node, origin_mode, value, frame_size, batch_size);
// }
}  // namespace fpga
}  // namespace lite
}  // namespace paddle
\ No newline at end of file
File mode changed from 100644 to 100755
......@@ -25,6 +25,8 @@ class OutputPE : public PE {
bool init() {
Tensor* output = param_.output;
output->setAligned(false);
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
return true;
}
......@@ -41,6 +43,15 @@ class OutputPE : public PE {
} else {
output->copyFrom(input);
}
output->syncToCPU();
if (DLEngine::get_instance().out_data == nullptr) {
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
}
memcpy(DLEngine::get_instance().out_data,
output->data<void>(),
output->shape().numel() * sizeof(float));
return true;
}
......
......@@ -35,24 +35,26 @@ class PoolingPE : public PE {
Tensor* input = param_.input;
Tensor* output = param_.output;
    uint32_t k_height = param_.kernelSize[0];
    uint32_t k_width = param_.kernelSize[1];
if (param_.globalPooling) {
k_width = input->shape().width();
k_height = input->shape().height();
param_.kernelSize[0] = k_height;
param_.kernelSize[1] = k_width;
}
PoolingArgs args = {0};
args.mode = param_.type;
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
args.image.address = input->data<float16>();
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
    args.image.pad_height = param_.paddings[0];
    args.image.pad_width = param_.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->mutableData<float16>();
args.output.scale_address = output->scale();
......@@ -66,6 +68,9 @@ class PoolingPE : public PE {
    // always take the CPU path for average pooling
    use_cpu_ = param_.type == AVERAGE;
}
void compute() {
......@@ -74,16 +79,16 @@ class PoolingPE : public PE {
input->syncToCPU();
Tensor float_input;
// Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
    int image_pad_h = param_.paddings[0];
    int image_pad_w = param_.paddings[1];
    // kernelSize is {height, width} after apply()
    int kernel_height = param_.kernelSize[0];
    int kernel_width = param_.kernelSize[1];
int kernel_step_h = param_.strides[0];
......@@ -109,6 +114,8 @@ class PoolingPE : public PE {
for (int c = 0; c < image_channels; ++c) {
const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
float sum = 0;
// const int index =
// (hstart * image_width + wstart) * image_channels + c;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = (h * image_width + w) * image_channels + c;
......@@ -129,7 +136,7 @@ class PoolingPE : public PE {
output->flush();
}
  void cpu_compute1() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
......@@ -137,6 +144,7 @@ class PoolingPE : public PE {
Tensor float_input;
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
// float_input.saveToFile("pool_float.txt");
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
......@@ -154,13 +162,44 @@ class PoolingPE : public PE {
}
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
std::cout << "pool scale:" << scale_max / 127.0f << std::endl;
output->flush();
// exit(-1);
}
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
Tensor float_input;
float* float_input_data = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0;
for (int i = 0; i < output->shape().channel(); i++) {
float sum = 0;
for (int j = 0; j < kernel_hw; j++) {
sum += float_input_data[i * kernel_hw + j];
}
float value = sum / kernel_hw;
data_out[i] = float_to_half(value);
scale_max = std::max(scale_max, std::abs(value));
}
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
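  // Note: this cpu_compute() assumes global average pooling: each output
  // channel is the mean of its kernel_hw contiguous input elements, which
  // only matches the layout when the kernel covers the whole feature map.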
bool dispatch() {
if (use_cpu_) {
      compute();
return true;
}
param_.input->syncToDevice();
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -17,8 +17,16 @@ limitations under the License. */
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include <algorithm>
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/tensor.hpp"
#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
namespace paddle {
namespace zynqmp {
class ScalePE : public PE {
public:
inline int gcd(int a, int b) {
......@@ -38,10 +46,87 @@ class ScalePE : public PE {
return true;
}
void apply() {
Tensor* input = param_.input;
Tensor* output = param_.output;
    Shape& input_shape = input->shape();
DepthwiseConvParam& dw_param = dw_pe_.param();
int channel = input_shape.channel();
int repeat = 1;
int alignment = 16;
......@@ -51,70 +136,162 @@ class ScalePE : public PE {
int c_lcm = lcm(channel, alignment);
repeat = c_lcm / (channel);
}
    // TODO: FPGA limits (H > 2047, W > 1023, or W*C > 65536) require the
    // CPU implementation instead
    Shape shape(N, {channel * repeat});

    float* filter_data = filter.mutableData<float>(FP32, shape);
    std::fill_n(filter_data, input->shape().channel(), 1.0f);

    Tensor* scale = dw_param.scale();
    float16* scale_data = scale->mutableData<float16>(FP16, shape);

    Tensor* bias = dw_param.bias();
    float16* bias_data = bias->mutableData<float16>(FP16, shape);
    std::fill_n(bias_data, input->shape().channel(), 0);
    if (param_.scale->dataType() == FP32) {
      if (param_.bias != nullptr) {
        float* bias_data_float = param_.bias->data<float>();
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = float_to_half(bias_data_float[j]);
          }
        }
      } else {
        float16 zero = float_to_half(0.0f);
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = zero;
          }
        }
      }
      float* scale_data_float = param_.scale->data<float>();
      for (int i = 0; i < repeat; i++) {
        for (int j = 0; j < length; j++) {
          scale_data[i * length + j] = float_to_half(scale_data_float[j]);
        }
      }
    } else {
      if (param_.bias != nullptr) {
        float16* bias_data_half = param_.bias->data<float16>();
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = bias_data_half[j];
          }
        }
      } else {
        float16 zero = float_to_half(0.0f);
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = zero;
          }
        }
      }
      float16* scale_data_half = param_.scale->data<float16>();
      for (int i = 0; i < repeat; i++) {
        for (int j = 0; j < length; j++) {
          scale_data[i * length + j] = scale_data_half[j];
        }
      }
    }
// if (param_.bias != nullptr) {
// memcpy(bias_data, param_.bias->data<float>(), input->shape().channel() * sizeof(float));
// }
dw_param.input = param_.input;
dw_param.output = param_.output;
dw_param.filter = &filter;
dw_param.strides = {1, 1};
dw_param.paddings = {0, 0};
dw_param.kernelSize = {1, 1};
dw_param.dilations = {1, 1};
dw_pe_.init();
dw_pe_.apply();
}
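  // apply() re-expresses the scale op as a 1x1 depthwise convolution: an
  // all-ones filter, the per-channel scale as the depthwise scale and the
  // (optional) bias as the depthwise bias, so dispatch() can reuse
  // DepthwiseConvPE, including its FP16 path.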
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
Tensor float_input;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
input->syncToCPU();
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
float* scale_data = param_.scale->data<float>();
int wh = input->shape().width() * input->shape().height();
float16* in_data = input->data<float16>();
float max = 0;
for (int i = 0; i < wh; i++) {
for (int c = 0; c < input->shape().channel(); c++) {
int index = i * input->shape().channel() + c;
float value = half_to_float(in_data[index]) * scale_data[c];
std::cout << "value:" << value << " = " << half_to_float(in_data[index]) << " x " << scale_data[c] << std::endl;
data_out[index] = float_to_half(value);
if (value < 0) {
value = -value;
}
if (value > max) {
max = value;
}
}
}
output->flush();
output->scale()[0] = max / 127.0f;
output->scale()[1] = 127.0f / max;
}
  bool dispatch() {
    if (param_.scale->dataType() == FP16) {
      DepthwiseConvParam& dw_param = dw_pe_.param();
      memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
             param_.scale->data<float16>(),
             param_.scale->shape().numel() * sizeof(float16));
      dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
      dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
      dw_param.quantizedFilter()->flush();
    }
    param_.input->syncToDevice();
    return dw_pe_.dispatch();
}
ScaleParam& param() { return param_; }
private:
ScaleParam param_;
Tensor filter;
DepthwiseConvPE dw_pe_;
};
} // namespace zynqmp
} // namespace paddle
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle {
namespace zynqmp {
static struct None none_;
static struct NCHW nchw_;
static struct NHWC nhwc_;
static struct NC nc_;
......@@ -82,6 +83,9 @@ class Shape {
void setLayoutType(LayoutType layout) {
this->layoutType_ = layout;
switch (layout) {
case None:
layout_ = &none_;
break;
case NCHW:
layout_ = &nchw_;
break;
......
......@@ -24,6 +24,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include <unistd.h>
// #include "lite/core/tensor.h"
#include "lite/backends/fpga/KD/dl_engine.hpp"
......@@ -117,7 +119,8 @@ class Tensor {
template <typename Dtype>
Dtype* mutableData() {
    size_t memorySize =
        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
if (placeHolder_ != nullptr) {
if (memorySize > placeHolder_->memorySize()) {
placeHolder_.reset(new PlaceHolder(memorySize));
......@@ -241,6 +244,10 @@ class Tensor {
}
}
void setMemScale(float scale_factor) {
this->mem_scale_factor_ = scale_factor;
}
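  // mem_scale_factor_ (set by format_filter and friends) lets aligned FPGA
  // layouts occupy more bytes than shape().memorySize() would suggest;
  // mutableData(), flush() and invalidate() all scale the nominal size by it.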
void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
......@@ -276,9 +283,11 @@ class Tensor {
.height = 1,
.pad_width = 0u,
.pad_height = 0u};
    ImageOutputArgs output = {
        .address = data<void>(), .scale_address = scale(),
    };
    args.output = output;
src->syncToDevice();
size_t aligned_remainder = src->shape().numel() % 16;
if (aligned_remainder > 0) {
......@@ -294,10 +303,14 @@ class Tensor {
this->invalidate();
}
  void flush() {
    size_t memorySize =
        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
    fpga_flush(placeHolder_->data(), memorySize);
  }
  void invalidate() {
    size_t memorySize =
        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
    fpga_invalidate(placeHolder_->data(), memorySize);
  }
void sync() {
......@@ -337,6 +350,17 @@ class Tensor {
if (placeHolder_ == nullptr) {
return;
}
std::cout << scale()[0] << " , " << scale()[1] << std::endl;
}
  void printScale(std::string type) {
    std::cout << type << " : "
              << std::to_string(shape_->num()) + "_" +
                     std::to_string(shape_->channel()) + "_" +
                     std::to_string(shape_->height()) + "_" +
                     std::to_string(shape_->width())
              << std::endl;
    printScale();
  }
std::string dimsFileName() {
......@@ -358,48 +382,39 @@ class Tensor {
saveToFile(path);
}
  void saveToFile(std::string path) {
    syncToCPU();
    invalidate();
    static int counter = 0;
    std::string npath = std::to_string(counter) + "_" + path;
    counter++;
    save_file_with_name(npath);
  }
  void save_file_with_name(std::string path) {
    return;  // dumping tensors is disabled; remove this early return to enable
    invalidate();
    std::ofstream ofs;
    ofs.open(path);
ofs << "dataType: " << dataType_ << std::endl;
ofs << "scale: " << scale()[0] << " , " << scale()[1] << std::endl;
for (int i = 0; i < shape_->numel(); i++) {
float value = 0;
if (dataType_ == FP32) {
value = data<float>()[i];
      } else if (dataType_ == FP16) {
value = half_to_float(data<float16>()[i]);
} else {
value = data<int8_t>()[i];
}
ofs << value << std::endl;
}
......@@ -415,18 +430,49 @@ class Tensor {
int num = shape_->numel();
invalidate();
float max = 0.0f;
if (dataType_ == FP16) {
float16* data = mutableData<float16>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = float_to_half(value);
}
} else {
float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = value;
}
}
flush();
placeHolder_->scale_[0] = max / 127.0f;
placeHolder_->scale_[1] = 127.0f / max;
}
friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) {
os << "tensor:"
<< "\n";
os << "dims: {";
for (int i = 0; i < tensor.shape().dimSize(); ++i) {
os << tensor.shape()[i] << " ";
}
os << "}\n";
for (int i = 0; i < tensor.shape().numel(); i++) {
float value = 0;
if (tensor.dataType() == FP32) {
value = tensor.data<float>()[i];
} else {
value = half_to_float(tensor.data<float16>()[i]);
}
os << value << " ";
}
os << "\n";
return os;
}
~Tensor() {
if (shape_ != nullptr) {
delete shape_;
......@@ -436,6 +482,7 @@ class Tensor {
private:
int offset = 0;
float mem_scale_factor_ = 1.0f;
std::shared_ptr<PlaceHolder> placeHolder_;
Shape* shape_ = nullptr;
DataType dataType_ = FP32;
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -92,13 +92,28 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
}
void TensorLite::CopyDataFrom(const TensorLite &other) {
  dims_ = other.dims_;
  target_ = other.target_;
  lod_ = other.lod_;

  auto shape = other.zynq_tensor_->shape();
  Resize(other.dims());
  zynq_tensor_->mutableData<void>(other.zynq_tensor_->dataType(), shape);
  this->ZynqTensor()->copyFrom(other.ZynqTensor());
}
// template <typename T>
......
......@@ -106,7 +106,7 @@ class TensorLite {
// For other devices, T and R may be the same type.
template <typename T, typename R = T>
const R *data() const {
return zynq_tensor_->data<R>();
return zynq_tensor_->data<R>() + offset_;
}
void Resize(const DDimLite &ddim) { dims_ = ddim; }
......@@ -125,6 +125,7 @@ class TensorLite {
bool persistable() const { return persistable_; }
void set_persistable(bool persistable) { persistable_ = persistable; }
// T is the data type and R is the return type
// For OpenCL, the return type can be cl::Buffer
// and the data type can be float/int8_t.
......@@ -147,6 +148,8 @@ class TensorLite {
size_t memory_size() const { return zynq_tensor_->memorySize(); }
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); }
// Other share data to this.
......@@ -157,8 +160,14 @@ class TensorLite {
template <typename T>
TensorLite Slice(int64_t begin, int64_t end) const;
template <typename T>
void Slice(TensorLite& dst, int64_t begin, int64_t end) const;
TargetType target() const { return target_; }
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
......@@ -173,16 +182,21 @@ class TensorLite {
private:
TargetType target_{TargetType::kHost};
// precision_ and persistable_ are only used for persistable vars.
// If your tensor wants to be saved and loaded correctly, you must
// set values of precision_ and persistable_ after updating it.
// If your tensor is just a temp tensor, such as activations,
// you can ignore these two attributes.
PrecisionType precision_{PrecisionType::kUnk};
bool persistable_{false};
DDimLite dims_;
std::shared_ptr<Buffer> buffer_;
LoD lod_;
size_t memory_size_{};
size_t offset_{0};
zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
template <typename T>
......@@ -197,6 +211,9 @@ R *TensorLite::mutable_data() {
}
zynqmp::LayoutType layout_type = zynqmp::NCHW;
switch (v.size()) {
case 0:
layout_type = zynqmp::None;
break;
case 1:
layout_type = zynqmp::N;
break;
......@@ -228,24 +245,79 @@ R *TensorLite::mutable_data(TargetType target) {
return mutable_data<T>();
}
template <typename T>
TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
if (dims_[0] == 1) {
return *this;
}
int64_t base = numel() / dims_[0];
TensorLite dst;
dst.target_ = target_;
auto dst_dims = dims_;
dst_dims[0] = end - begin;
dst.Resize(dst_dims);
void* dst_data = dst.mutable_data<T>();
const T* src_data = data<T>();
memcpy(dst_data, src_data + begin * base, dst_dims.production() * sizeof(T));
return dst;
}
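// A minimal usage sketch (illustrative shapes; assumes the usual DDimLite
// initializer-list construction):
//   TensorLite t;
//   t.Resize(DDimLite({4, 3, 2, 2}));
//   t.mutable_data<float>();
//   TensorLite rows = t.Slice<float>(1, 3);  // copies rows 1..2 -> {2, 3, 2, 2}
// The memcpy offset is begin * base in elements, not bytes, because
// src_data is already a typed pointer.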
template <typename T>
void TensorLite::Slice(TensorLite& dst, int64_t begin, int64_t end) const {
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
dst.target_ = target_;
auto dst_dims = dims_;
dst_dims[0] = end - begin;
dst.Resize(dst_dims);
void* dst_data = dst.mutable_data<T>();
int64_t base = numel() / dims_[0];
const T* src_data = data<T>();
memcpy(dst_data, src_data + begin * base, dst_dims.production() * sizeof(T));
}
template <typename TensorT>
bool TensorCompareWith(const TensorT &a, const TensorT &b) {
if (a.dims() != b.dims()) return false;
if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
return true;
}
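// TensorCompareWith() compares raw bytes, so it is only meaningful when both
// tensors share the same precision and data layout.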
} // namespace lite
} // namespace paddle
File mode changed from 100644 to 100755
......@@ -4,29 +4,40 @@ endif()
set(fpga_deps fpga_target_wrapper kernel_fpga)
add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_acivation_fpga SRCS activation_compute_test.cc DEPS ${lite_kernel_deps} activation_compute_fpga ${fpga_deps})
# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
# add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
# add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
# add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
# add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps})
add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps})
add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_softmax_compute_fpga SRCS softmax_compute_test.cc DEPS ${lite_kernel_deps} softmax_compute_fpga ${fpga_deps})
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
# add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
# add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps})
add_kernel(io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS ${fpga_deps})
add_kernel(calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS ${fpga_deps})
add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_deps})
add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps})
add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps})
# add_kernel(while_compute_fpga FPGA extra SRCS while_compute.cc DEPS ${fpga_deps})
# add_kernel(write_to_array_compute_fpga FPGA extra SRCS write_to_array_compute.cc DEPS ${fpga_deps})
# lite_cc_test(test_acivation_fpga SRCS activation_compute_test.cc DEPS ${lite_kernel_deps} activation_compute_fpga ${fpga_deps})
lite_cc_test(test_conv_fpga SRCS conv_compute_test.cc DEPS ${lite_kernel_deps} conv_compute_fpga ${fpga_deps})
lite_cc_test(test_elementwise_fpga SRCS elementwise_compute_test.cc DEPS ${lite_kernel_deps} elementwise_compute_fpga ${fpga_deps})
lite_cc_test(test_fc_compute_fpga SRCS fc_compute_test.cc DEPS ${lite_kernel_deps} fc_compute_fpga ${fpga_deps})
lite_cc_test(test_pooling_compute_fpga SRCS pooling_compute_test.cc DEPS ${lite_kernel_deps} pooling_compute_fpga ${fpga_deps})
# lite_cc_test(test_softmax_compute_fpga SRCS softmax_compute_test.cc DEPS ${lite_kernel_deps} softmax_compute_fpga ${fpga_deps})
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/beam_search_decode_compute.h"
#include <algorithm>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
using LoDTensor = lite::Tensor;
using LoDTensorArray = std::vector<lite::Tensor>;
// All LoDs have two levels. The first is the source level, the second the
// sentence level. The source level describes how many prefixes (branches)
// each source sentence (beam) keeps; the sentence level describes how the
// candidates belong to those prefixes.
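// Example with made-up sizes: two source sentences keeping 2 and 3 prefixes
// respectively could yield
//   lod[kSourceLevel]   = {0, 2, 5}
//   lod[kSentenceLevel] = {0, 3, 5, 8, 9, 12}
// i.e. prefix 0 owns candidates [0, 3), prefix 1 owns [3, 5), and so on.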
const size_t kSourceLevel = 0;
const size_t kSentenceLevel = 1;
template <typename T>
struct Sentence {
std::vector<float> word_ids;
std::vector<T> scores;
};
template <typename T>
using SentenceVector = std::vector<Sentence<T>>;
template <typename T>
struct BeamSearchDecoder {
BeamSearchDecoder(size_t beam_size, int end_id)
: beam_size_(beam_size), end_id_(end_id) {}
/**
* Convert the result sentence_vector for each source sentence into two
* LoDTensors: one holds the word ids of all candidate sentences, the other
* holds their scores.
* Param:
* sentence_vector_list: sentence_vector for each source sentence.
* id_tensor: result LoDTensor holding sentence word ids.
* score_tensor: result LoDTensor holding sentence scores.
* reverse: whether the ids of each sentence in sentence_vector_list are
* stored in reverse order.
* sort_by_score: whether to sort the hypotheses of each sentence by score.
*/
void ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list,
LoDTensor* id_tensor,
LoDTensor* score_tensor,
bool reverse = true,
bool sort_by_score = true) const {
size_t src_num = sentence_vector_list.size();
CHECK_GT(src_num, 0) << "src_num should not be 0";
std::vector<uint64_t> source_level_lod = {0};
std::vector<uint64_t> sentence_level_lod = {0};
std::vector<float> id_data;
std::vector<T> score_data;
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
if (sort_by_score) {
sort(sentence_vector_list[src_idx].begin(),
sentence_vector_list[src_idx].end(),
[reverse](const Sentence<T>& a, const Sentence<T>& b) {
if (reverse)
return a.scores.front() > b.scores.front();
else
return a.scores.back() > b.scores.back();
});
}
for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
if (reverse) {
id_data.insert(id_data.end(),
sentence.word_ids.rbegin(),
sentence.word_ids.rend());
score_data.insert(score_data.end(),
sentence.scores.rbegin(),
sentence.scores.rend());
} else {
id_data.insert(id_data.end(),
sentence.word_ids.begin(),
sentence.word_ids.end());
score_data.insert(
score_data.end(), sentence.scores.begin(), sentence.scores.end());
}
sentence_level_lod.push_back(sentence_level_lod.back() +
sentence.word_ids.size());
}
source_level_lod.push_back(source_level_lod.back() +
sentence_vector_list[src_idx].size());
}
LoD lod;
lod.push_back(source_level_lod);
lod.push_back(sentence_level_lod);
*(id_tensor->mutable_lod()) = lod;
id_tensor->Resize({static_cast<int64_t>(id_data.size())});
auto id_ptr = id_tensor->mutable_data<float>();
TargetCopy(
TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(float));
*(score_tensor->mutable_lod()) = lod;
score_tensor->Resize({static_cast<int64_t>(score_data.size())});
auto score_ptr = score_tensor->mutable_data<T>();
TargetCopy(TARGET(kARM),
score_ptr,
score_data.data(),
score_data.size() * sizeof(T));
}
/**
* Gather the hypotheses for each source sentence by backtracking through the
* LoDTensorArray step_ids, whose LoDs preserve the paths in the search tree.
*/
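// Sketch of the traversal (sizes made up): for three steps, Backtrace walks
// step_ids[2] -> step_ids[0]; at each step the sentence-level LoD maps every
// kept candidate back to its prefix in the previous step, and word ids and
// scores are appended per hypothesis (reversed again on output, since
// reverse = true below).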
void Backtrace(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor) const {
CHECK(!step_ids.empty()) << "step num should be larger than 0";
CHECK_EQ(step_ids.size(), step_scores.size())
<< "step_ids and step_scores should be the same";
const size_t step_num = step_ids.size();
const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
std::vector<SentenceVector<T>> sentence_vector_list(
src_num, SentenceVector<T>(beam_size_));
std::vector<std::vector<size_t>> prefix_idx_vector_list(src_num);
for (int step_id = step_num - 1; step_id >= 0; --step_id) {
auto& cur_ids = step_ids.at(step_id);
auto& cur_scores = step_scores.at(step_id);
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
// for each source sentence
auto& sentence_vector = sentence_vector_list.at(src_idx);
auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx);
size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx];
size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
if (prefix_idx_vector.empty()) { // be finished and pruned at this step
// or the last time step
for (size_t prefix_idx = src_prefix_start;
prefix_idx < src_prefix_end;
++prefix_idx) {
size_t candidate_start =
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
size_t candidate_end =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1];
for (size_t candidate_idx = candidate_start;
candidate_idx < candidate_end;
++candidate_idx) {
prefix_idx_vector.push_back(prefix_idx);
size_t idx = prefix_idx_vector.size() - 1;
auto cur_id = cur_ids.data<float>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
}
} else { // use prefix_idx_vector to backtrace
size_t src_candidate_start =
cur_ids.lod().at(kSentenceLevel)[src_prefix_start];
size_t prefix_idx = src_prefix_start;
size_t candidate_num =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
auto candidate_idx = prefix_idx_vector.at(idx);
auto cur_id = cur_ids.data<float>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) {
// to skip redundant end tokens
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
while (src_candidate_start + candidate_num <=
candidate_idx) { // search the corresponding prefix
prefix_idx++;
candidate_num +=
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
}
prefix_idx_vector.at(idx) = prefix_idx;
}
}
}
}
ConvertSentenceVectorToLodTensor(
sentence_vector_list, id_tensor, score_tensor, true, true);
}
size_t beam_size_;
int end_id_;
};
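// Minimal usage sketch (tensor setup elided; mirrors what
// BeamSearchDecodeFunctor::apply() does below):
//   BeamSearchDecoder<float> decoder(/*beam_size=*/4, /*end_id=*/0);
//   decoder.Backtrace(step_ids, step_scores, &id_tensor, &score_tensor);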
struct BeamSearchDecodeFunctor {
BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor,
size_t beam_size,
int end_id)
: beam_size_(beam_size),
end_id_(end_id),
step_ids_(step_ids),
step_scores_(step_scores),
id_tensor_(id_tensor),
score_tensor_(score_tensor) {}
template <typename T>
void apply() const {
BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
beam_search_decoder.Backtrace(
step_ids_, step_scores_, id_tensor_, score_tensor_);
}
size_t beam_size_;
int end_id_;
const LoDTensorArray& step_ids_;
const LoDTensorArray& step_scores_;
LoDTensor* id_tensor_;
LoDTensor* score_tensor_;
};
template <>
void BeamSearchDecodeFunctor::apply<bool>() const {
LOG(FATAL) << "beam search decode op does not support bool!";
}
void BeamSearchDecodeCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
// inputs
auto ids = param.ids;
auto scores = param.scores;
// outputs
auto sentence_ids = param.sentence_ids;
auto sentence_scores = param.sentence_scores;
const size_t step_num = ids->size();
CHECK_GT(step_num, 0UL) << "beam search steps should be larger than 0";
const size_t source_num = ids->at(0).lod().at(0).size() - 1;
CHECK_GT(source_num, 0UL) << "source num should be larger than 0";
for (size_t i = 0; i < step_num; ++i) {
CHECK_EQ(ids->at(i).lod().size(), 2UL) << "Level of LodTensor should be 2";
}
//! fixme
// only support float score now
BeamSearchDecodeFunctor func(*ids,
*scores,
sentence_ids,
sentence_scores,
param.beam_size,
param.end_id);
func.apply<float>();
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(beam_search_decode,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::BeamSearchDecodeCompute,
def)
.BindInput("Ids", {LiteType::GetTensorListTy(TARGET(kARM))})
.BindInput("Scores", {LiteType::GetTensorListTy(TARGET(kARM))})
.BindOutput("SentenceIds", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("SentenceScores", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class BeamSearchDecodeCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::BeamSearchDecodeParam;
BeamSearchDecodeCompute() = default;
void Run() override;
virtual ~BeamSearchDecodeCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/box_coder_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/float16.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
void BoxCoderCompute::Run() {
auto& param = Param<param_t>();
param.proposals->mutable_data<float16>();
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(box_coder,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::BoxCoderCompute,
def)
.BindInput("PriorBox",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("PriorBoxVar",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("TargetBox",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("OutputBox",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class BoxCoderCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::BoxCoderParam;
void Run() override;
virtual ~BoxCoderCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -23,24 +23,45 @@ namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
// void CalibComputeFp32ToFP16::PrepareForRun() {
// }
void CalibComputeFp32ToFP16::Run() {
auto& param = this->Param<operators::CalibParam>();
param.output->mutable_data<float16>();
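// copyFrom() performs the fp32 -> fp16 conversion internally, replacing the
// explicit per-element float_to_half loop this kernel used previously.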
param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
auto out_lod = param.output->mutable_lod();
*out_lod = param.input->lod();
}
// void CalibComputeFP16ToFp32::PrepareForRun() {
// }
void CalibComputeFP16ToFp32::Run() {
auto& param = this->Param<operators::CalibParam>();
param.output->mutable_data<float>();
param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
auto out_lod = param.output->mutable_lod();
*out_lod = param.input->lod();
}
......
......@@ -26,6 +26,8 @@ class CalibComputeFp32ToFP16
public:
using param_t = operators::CalibParam;
// void PrepareForRun() override;
void Run() override;
~CalibComputeFp32ToFP16() override{};
......@@ -38,6 +40,8 @@ class CalibComputeFP16ToFp32
public:
using param_t = operators::CalibParam;
// void PrepareForRun() override;
void Run() override;
~CalibComputeFP16ToFp32() override{};
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/concat_compute.h"
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
void ConcatCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
param.output->mutable_data<float16>();
// ====================================================
zynqmp::ConcatParam& concat_param = pe_.param();
for (auto t : param.x) {
concat_param.inputs.push_back(t->ZynqTensor());
}
concat_param.output = param.output->ZynqTensor();
concat_param.axis = param.axis;
pe_.init();
pe_.apply();
}
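// PE lifecycle used throughout these kernels: param() is filled once,
// init()/apply() run at prepare time, and dispatch() executes the PE on
// every inference.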
void ConcatCompute::Run() {
pe_.dispatch();
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(concat,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::ConcatCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/operators/concat_op.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/concat_pe.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class ConcatCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::ConcatParam;
void PrepareForRun() override;
void Run() override;
virtual ~ConcatCompute() = default;
private:
zynqmp::ConcatPE pe_;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -25,37 +25,67 @@ using float16 = zynqmp::float16;
void ConvCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
param.output->mutable_data<float16>();
auto paddings = *param.paddings;
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
if (!pad_equal) {
LOG(FATAL) << "Unsupported asymmetric padding: " << paddings[0] << ", "
<< paddings[1] << ", " << paddings[2] << ", " << paddings[3];
}
// ====================================================
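// Dispatch rule: a convolution whose group count equals its (non-unit) input
// channel count is depthwise, so it is routed to the DepthwiseConvPE;
// everything else goes through the general ConvPE.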
if (param.x->ZynqTensor()->shape().channel() != 1 &&
param.groups == param.x->ZynqTensor()->shape().channel()) {
zynqmp::DepthwiseConvParam& conv_param = dw_conv_pe_.param();
conv_param.input = param.x->ZynqTensor();
conv_param.output = param.output->ZynqTensor();
conv_param.filter = param.filter->ZynqTensor();
conv_param.filter->setDataType(zynqmp::FP32);
conv_param.groups = param.groups;
conv_param.strides = param.strides;
conv_param.paddings = param.paddings;
conv_param.dilations = param.dilations;
fill_scale_bias_const(&conv_param);
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
conv_param.relu.enabled = param.fuse_relu;
dw_conv_pe_.init();
dw_conv_pe_.apply();
} else {
zynqmp::ConvParam& conv_param = conv_pe_.param();
conv_param.input = param.x->ZynqTensor();
conv_param.output = param.output->ZynqTensor();
conv_param.filter = param.filter->ZynqTensor();
conv_param.filter->setDataType(zynqmp::FP32);
conv_param.groups = param.groups;
conv_param.strides = param.strides;
conv_param.paddings = param.paddings;
conv_param.dilations = param.dilations;
fill_scale_bias_const(&conv_param);
if (param.bias != nullptr) {
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
}
conv_param.relu.enabled = param.fuse_relu;
conv_pe_.init();
conv_pe_.apply();
}
}
void ConvCompute::Run() {
auto& param = this->Param<param_t>();
if (param.x->ZynqTensor()->shape().channel() != 1 &&
param.groups == param.x->ZynqTensor()->shape().channel()) {
dw_conv_pe_.dispatch();
} else {
conv_pe_.dispatch();
}
}
} // namespace fpga
......
......@@ -14,11 +14,13 @@
#pragma once
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/conv_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/operators/conv_op.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/conv_pe.hpp"
#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
namespace paddle {
namespace lite {
namespace kernels {
......@@ -36,7 +38,8 @@ class ConvCompute
~ConvCompute() {}
private:
zynqmp::ConvPE conv_pe_;
zynqmp::DepthwiseConvPE dw_conv_pe_;
};
} // namespace fpga
......
......@@ -141,15 +141,13 @@ void conv_compute_ref(const operators::ConvParam& param) {
int group = param.groups;
int kernel_w = param.filter->dims()[2];
int kernel_h = param.filter->dims()[3];
auto paddings = *param.paddings;
auto dilations = *param.dilations;
int stride_w = param.strides[0];
int stride_h = param.strides[1];
int dila_w = dilations[0];
int dila_h = dilations[1];
int pad_w = paddings[2];
int pad_h = paddings[0];
bool flag_bias = (param.bias != nullptr);
bool flag_relu = param.fuse_relu;
......@@ -279,14 +277,10 @@ TEST(conv_fpga, compute) {
param.bias = &bias;
}
param.fuse_relu = flag_relu;
std::vector<int> paddings = {
padding, padding, padding, padding};
param.strides = std::vector<int>({stride, stride});
std::vector<int> dilations = {dilation, dilation};
param.paddings =
std::make_shared<std::vector<int>>(paddings);
param.dilations =
std::make_shared<std::vector<int>>(dilations);
param.groups = group;
conv.SetParam(param);
conv.Launch();
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class DensityPriorBoxCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::DensityPriorBoxParam;
void Run() override;
virtual ~DensityPriorBoxCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -16,6 +16,7 @@
#include <algorithm>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/scale_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
......@@ -50,6 +51,17 @@ class ElementwiseAddActivationCompute
zynqmp::ElementwiseAddPE pe_;
};
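// Design note: elementwise_mul is lowered onto the Scale PE below, since a
// per-channel multiply is expressible as scale * x with zero bias (an
// inference from the member type, not from upstream documentation).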
class ElementwiseMulCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~ElementwiseMulCompute() = default;
private:
zynqmp::ScalePE pe_;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -32,8 +32,6 @@ class FeedCompute
private:
zynqmp::InputPE pe_;
zynqmp::Tensor input_;
zynqmp::Tensor output_;
};
} // namespace fpga
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755