Commit 3f1e8f7d authored by Chon, committed by Houjiang Chen

FPGA refactor #1560 (#1561)

* FPGA refactor

* disable FPGAKD

* recover misdeleted code segment
Parent 54f13158
@@ -14,6 +14,7 @@ option(FPGA "build with FPGA support" OFF)
if(FPGA)
option(FPGAV1 "build with fpga v1 support" ON)
option(FPGAV2 "build with fpga v2 support" OFF)
option(FPGAKD "build with fpga KD support" OFF)
endif()
project(paddle-mobile)
@@ -97,8 +98,7 @@ else()
endif()
if(FPGA)
add_definitions(-DPADDLE_MOBILE_FPGA)
file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc)
file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/math/*.cc src/operators/kernel/fpga/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
@@ -110,6 +110,7 @@ if(FPGA)
list(APPEND PADDLE_MOBILE_h src/operators/math/softmax.h)
list(APPEND PADDLE_MOBILE_h src/operators/math/math_func_neon.h)
if(FPGAV1)
add_definitions(-DPADDLE_MOBILE_FPGA)
message("FPGA_V1 enabled")
add_definitions(-DPADDLE_MOBILE_FPGA_V1)
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp)
@@ -120,8 +121,18 @@ if(FPGA)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.cpp src/fpga/KD/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.h src/operators/kernel/fpga/KD/*.hpp
src/fpga/KD/*.h src/fpga/KD/*.hpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if(FPGAV2)
add_definitions(-DPADDLE_MOBILE_FPGA)
message("FPGA_V2 enabled")
add_definitions(-DPADDLE_MOBILE_FPGA_V2)
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp)
@@ -132,8 +143,46 @@ if(FPGA)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.cpp src/fpga/KD/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.h src/operators/kernel/fpga/KD/*.hpp
src/fpga/KD/*.h src/fpga/KD/*.hpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if(FPGAKD)
message("FPGAKD enabled")
add_definitions(-DPADDLE_MOBILE_FPGA_KD)
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/central-arm-func/*.h)
foreach(f ${_tmp_list})
list(APPEND PADDLE_MOBILE_H ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/central-arm-func/*.cpp)
foreach(f ${_tmp_list})
list(APPEND PADDLE_MOBILE_CC ${f})
endforeach()
endif()
else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
foreach(f ${_tmp_list})
@@ -183,6 +232,8 @@ if(FPGAV1)
set(NET "FPGA_NET_V1" CACHE STRING "select net type")
elseif(FPGAV2)
set(NET "FPGA_NET_V2" CACHE STRING "select net type")
elseif(FPGAKD)
set(NET "FPGA_OPS_KD" CACHE STRING "select net type")
else()
set(NET "default" CACHE STRING "select net type")
endif()
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef alignment_h
#define alignment_h
#include <stdio.h>
#include "llapi/zynqmp_api.h"
namespace paddle_mobile {
namespace zynqmp {
inline int align_image(int wc) { return align_to_x(wc, IMAGE_ALIGNMENT); }
} // namespace zynqmp
} // namespace paddle_mobile
#endif /* alignment_h */
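
A quick sanity check of the helper above: align_to_x(num, x) (from llapi/zynqmp_api.h) rounds num up to the next multiple of x, so align_image pads a width*channel row to the 16-element image alignment. A minimal standalone sketch; kImageAlignment and the asserts are illustrative, not part of the source:

// Standalone sketch of the round-up arithmetic used throughout the KD backend.
#include <cassert>

constexpr int kImageAlignment = 16;  // stands in for IMAGE_ALIGNMENT in llapi/zynqmp_api.h

constexpr int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
constexpr int align_image(int wc) { return align_to_x(wc, kImageAlignment); }

int main() {
  assert(align_image(16) == 16);  // already aligned
  assert(align_image(17) == 32);  // rounds up to the next multiple of 16
  assert(align_image(90) == 96);  // e.g. a 30-pixel-wide RGB row
  return 0;
}
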
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef Context_hpp
#define Context_hpp
#include <stdio.h>
#include "pe.hpp"
#include "pes/conv_pe.hpp"
#include "pes/depthwise_conv_pe.hpp"
#include "pes/fully_connected_pe.hpp"
#include "pes/input_pe.hpp"
#include "pes/output_pe.hpp"
#include "pes/pooling_pe.hpp"
#include "pes/softmax_pe.hpp"
namespace paddle_mobile {
namespace zynqmp {
class Context {
public:
template <typename Ptype>
Ptype& pe() {
if (pe_ == nullptr) {
pe_ = new Ptype();
}
return static_cast<Ptype&>(*pe_);
}
~Context() {
if (pe_ != nullptr) {
delete pe_;
}
}
private:
PE* pe_ = nullptr;
};
} // namespace zynqmp
} // namespace paddle_mobile
#endif /* Context_hpp */
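
Context lazily news a single PE on first request and returns it by reference afterwards. Note that the cast in pe<Ptype>() trusts the caller: asking one Context for two different PE types would static_cast the same object to the wrong type, so the intended pattern is one Context per kernel. A hedged usage sketch (the function name is illustrative):

// Sketch: one Context owns exactly one processing element.
using namespace paddle_mobile::zynqmp;

void context_example() {
  Context context;
  ConvPE& pe = context.pe<ConvPE>();    // first call constructs the ConvPE
  ConvPE& same = context.pe<ConvPE>();  // later calls return the same object (&pe == &same)
}  // the PE is deleted in ~Context()
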
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "dl_engine.hpp"
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
namespace paddle_mobile {
namespace zynqmp {
class DLEngine {
public:
static DLEngine& get_instance() {
static DLEngine s_instance;
return s_instance;
}
private:
DLEngine();
};
} // namespace zynqmp
} // namespace paddle_mobile
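
DLEngine is a Meyers singleton; its private constructor (defined in dl_engine.cpp above) is presumably where one-time device bring-up lives. Access is simply:

// Constructed on first use; thread-safe under C++11 static-local rules.
paddle_mobile::zynqmp::DLEngine& engine =
    paddle_mobile::zynqmp::DLEngine::get_instance();
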
This diff is collapsed.
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "fpga/KD/alignment.h"
namespace paddle_mobile {
namespace zynqmp {
enum LayoutType {
N,
NC,
NCHW,
NHWC,
NHW,
};
class Layout {
public:
virtual int numIndex() = 0;
virtual int channelIndex() { return -1; }
virtual int heightIndex() { return -1; }
virtual int widthIndex() { return -1; }
virtual int alignedElementCount(const std::vector<int>& dims) = 0;
virtual int elementCount(const std::vector<int>& dims) = 0;
};
struct NCHW : Layout {
int numIndex() { return 0; }
int channelIndex() { return 1; }
int heightIndex() { return 2; }
int widthIndex() { return 3; }
int alignedElementCount(const std::vector<int>& dims) {
return dims[0] * dims[2] * align_image(dims[1] * dims[3]);
}
virtual int elementCount(const std::vector<int>& dims) {
return dims[0] * dims[1] * dims[2] * dims[3];
}
};
struct NHWC : Layout {
int numIndex() { return 0; }
int heightIndex() { return 1; }
int widthIndex() { return 2; }
int channelIndex() { return 3; }
int alignedElementCount(const std::vector<int>& dims) {
return dims[0] * dims[1] * align_image(dims[2] * dims[3]);
}
virtual int elementCount(const std::vector<int>& dims) {
return dims[0] * dims[1] * dims[2] * dims[3];
}
};
struct NC : Layout {
int numIndex() { return 0; }
int channelIndex() { return 1; }
int alignedElementCount(const std::vector<int>& dims) {
return dims[0] * dims[1];
}
virtual int elementCount(const std::vector<int>& dims) {
return dims[0] * dims[1];
}
};
struct N : Layout {
int numIndex() { return 0; }
int alignedElementCount(const std::vector<int>& dims) { return dims[0]; }
virtual int elementCount(const std::vector<int>& dims) { return dims[0]; }
};
struct NHW : Layout {
int numIndex() { return 0; }
int heightIndex() { return 1; }
int widthIndex() { return 2; }
int alignedElementCount(const std::vector<int>& dims) {
// TODO(chonwhite) align it;
return dims[0] * dims[1] * dims[2];
}
virtual int elementCount(const std::vector<int>& dims) {
return dims[0] * dims[1] * dims[2];
}
};
} // namespace zynqmp
} // namespace paddle_mobile
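
The dims vector is ordered as the layout name, so for NHWC the per-row span that gets padded is W*C. A worked sketch (function name illustrative; note the elaborated struct form, since enum LayoutType declares the same identifiers):

// Worked example: raw vs. aligned element counts for a 1x16x30x3 NHWC tensor.
void layout_example() {
  std::vector<int> dims = {1, 16, 30, 3};  // N=1, H=16, W=30, C=3
  struct paddle_mobile::zynqmp::NHWC nhwc;
  int raw = nhwc.elementCount(dims);             // 1*16*30*3 = 1440
  int aligned = nhwc.alignedElementCount(dims);  // 1*16*align_image(30*3) = 16*96 = 1536
}
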
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory.h>
#include "fpga/KD/llapi/bias_scale.h"
#include "fpga/KD/llapi/zynqmp_api.h"
namespace paddle_mobile {
namespace zynqmp {
namespace bias_scale {
void align_element(float **data_in, int num_per_div_before_alignment, int num) {
int copynum = 0;
float *ptr_unaligned = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned =
(float *)fpga_malloc(num_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, num_element * sizeof(float));
for (int i = 0; i < div_num; i++) {
if (i == div_num - 1) {
copynum = (num_per_div_after_alignment * div_num > num)
? (num % num_per_div_after_alignment)
: (num_per_div_before_alignment);
} else {
copynum = num_per_div_before_alignment;
}
memcpy(ptr_aligned + i * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i,
copynum * sizeof(float));
memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i + num,
copynum * sizeof(float));
}
fpga_free(ptr_unaligned);
*data_in = ptr_aligned;
}
void interleave(float **data_in, int num_after_alignment) {
float *ptr_uninterleaved = *data_in;
float *ptr_interleaved =
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
int num = num_after_alignment / 4;
for (int i = 0; i < num; i++) {
memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
4 * sizeof(float));
memcpy(ptr_interleaved + 8 * i + 4,
ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float));
}
fpga_free(ptr_uninterleaved);
*data_in = ptr_interleaved;
}
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
align_element(bias_scale_array, element_num_per_division, num);
int div_num = (num + element_num_per_division - 1) / element_num_per_division;
int element_num_after_division =
align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
interleave(bias_scale_array, div_num * element_num_after_division);
fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
}
void format_bias_array(float **bias_array, int num) {
float *ptr_unaligned = *bias_array;
int num_before_align = num;
int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT);
int16_t *ptr_aligned =
(int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT
memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
for (int i = 0; i < num_before_align; i++) {
ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);  // convert each bias value to fp16
}
*bias_array = (float *)ptr_aligned; // NOLINT
fpga_free(ptr_unaligned);
}
} // namespace bias_scale
} // namespace zynqmp
} // namespace paddle_mobile
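
Concrete numbers make the bookkeeping above easier to follow. With num = 20 filters in a single division and BS_NUM_ALIGNMENT = 8, a hedged walk-through of format_bias_scale_array (arithmetic only; the real functions also move data through fpga_malloc):

using namespace paddle_mobile::zynqmp;

void bias_scale_numbers() {
  int num = 20;
  int num_per_div_before_alignment = 20;
  int div_num =
      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;  // 1
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);  // 24
  int num_element = 2 * div_num * num_per_div_after_alignment;  // 48 floats: biases, then scales
  // align_element zero-pads each block from 20 to 24 entries; interleave then emits
  // 4 bias values followed by 4 scale values, repeating, so hardware fetches both per group.
}
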
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle_mobile {
namespace zynqmp {
namespace bias_scale {
void align_element(float** data_in, int num_per_div_before_alignment, int num);
void interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
} // namespace bias_scale
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define PADDLE_MOBILE_ZU5
#define FPGA_PRINT_MODE
#define PADDLE_MOBILE_PROFILE
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/KD/llapi/filter.h"
#include <memory.h>
#include <algorithm>
#include "fpga/KD/float16.hpp"
#include "fpga/KD/llapi/zynqmp_api.h"
namespace paddle_mobile {
namespace zynqmp {
namespace filter {
int calc_division_capacity(int chw) {
int n = 2048 / ((chw + 15) / 16) * 32;
return n < 2048 ? n : 2048;
}
int calc_split_num(int num, int division_capacity) {
return (num + division_capacity - 1) / division_capacity;
}
int calc_division_number(int num, int group_num, int division_capacity) {
int split_num = calc_split_num(num, division_capacity);
// PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
// "Split number or group number should be 1");
return group_num * split_num;
}
int calc_num_per_div(int num, int group_num, int division_capacity) {
if (group_num == 1) {
if (num > division_capacity) {
return division_capacity;
} else {
return num;
}
} else {
return (num + group_num - 1) / group_num;
}
}
void convert_to_hwc(char **data_in, int num, int channel, int height,
int width) {
char *tmp = *data_in;
int chw = channel * height * width;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + n * chw + offset_height + w * channel + c) =
*((*data_in)++);
}
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
float find_max(float *data_in, int data_size) {
float max = 0.0;
for (int i = 0; i < data_size; ++i) {
float value = data_in[i];
float abs = value > 0 ? value : -value;
max = std::max(max, abs);
}
return max;
}
signed char float_to_int8(float fdata) {
if (fdata < 0.0) {
fdata -= 0.5;
} else {
fdata += 0.5;
}
return (signed char)fdata;
}
void quantize(float **data_in, int data_size, float max) {
float *tmp = *data_in;
float fix_range = 127;
float scale = fix_range / max;
signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
for (int i = 0; i < data_size; i++) {
tmp_data[i] = float_to_int8(
(*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale);
}
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
void align_element(char **data_in, int num, int chw) {
int j = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (align_chw != chw) {
char *tmp = *data_in;
char *data_tmp =
(char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT
memset(data_tmp, 0, num * align_chw);
for (j = 0; j < num; j++) {
memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw);
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void align_num(char **data_in, int num_per_div_before_alignment, int num,
int chw) {
int i = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(char));
for (i = 0; i < div_num - 1; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
(num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
*data_in = data_tmp;
fpga_free(tmp);
}
void reorder(char **data_in, int num_after_alignment, int chw) {
int index = 0;
int new_index = 0;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
for (index = 0; index < num_after_alignment; index++) {
new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
(index / 16 % 2 * 4);
memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align,
chw_align);
}
*data_in = data_tmp;
fpga_free(tmp);
}
size_t interleave(char **data_in, int num_after_alignment, int chw) {
int i = 0;
int j = 0;
int k = 0;
int interleave_per_num = 16;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
std::cout << "interleave size:" << chw_align * num_after_alignment
<< std::endl;
char *tmp = *data_in;
int interleave_num = chw_align * 2 / interleave_per_num;
for (i = 0; i < num_after_alignment; i += 2) {
for (j = 0, k = 0; j < interleave_num; j += 2, k++) {
memcpy(data_tmp + i * chw_align + interleave_per_num * j,
*data_in + i * chw_align + interleave_per_num * k,
interleave_per_num);
memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1),
*data_in + (i + 1) * chw_align + interleave_per_num * k,
interleave_per_num);
}
}
*data_in = data_tmp;
fpga_free(tmp);
return chw_align * num_after_alignment;
}
size_t format_filter(float **data_in, int num, int channel, int height,
int width, int group_num, float max) {
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw);
size_t mem_size = interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
return mem_size;
}
void convert_fc_filter(char **data_in, int num, int chw) {
char *tmp = *data_in;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) {
for (int c = 0; c < chw; c++) {
data_tmp[n * chw + c] = (*data_in)[num * c + n];
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void format_fc_filter(float **data_in, int num, int channel, int height,
int width, int group_num, float max) {
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_fc_filter(quantize_data, num, chw);
align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
}
void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
int16_t *tmp = *data_in;
int16_t *data_tmp =
(int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
*(data_tmp + h * width * num + w * num + n) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_element_n(int16_t **data_in, int num, int height, int width) {
int unalign_n = num;
int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
if (unalign_n == align_n) {
return;
} else {
int16_t *tmp = *data_in;
int num_element = height * width * align_n;
int16_t *data_tmp =
(int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(int16_t));
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int offset_unalign = h * width * unalign_n + w * unalign_n;
int offset_align = h * width * align_n + w * align_n;
for (int n = 0; n < unalign_n; n++) {
data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);  // the buffer came from fpga_malloc, so pair it with fpga_free
}
}
void quantize_to_fp16(float **data_in, int num, int height, int width,
float *scale_ptr) {
float *tmp = *data_in;
int size = num * height * width;
float16 *tmp_data = (float16 *)fpga_malloc(size * sizeof(float16)); // NOLINT
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int index = n * height * width + h * width + w;
float value = tmp[index] * scale_val;
tmp_data[index] = float_to_half(value);
}
}
}
fpga_flush(tmp_data, size * sizeof(int16_t));
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
void format_dwconv_filter(float **data_in, int num, int height, int width,
float *scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t **quantize_data = (int16_t **)data_in; // NOLINT
convert_to_hwn(quantize_data, num, height, width);
align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
height * width * sizeof(int16_t));
}
} // namespace filter
} // namespace zynqmp
} // namespace paddle_mobile
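
The int8 path in quantize()/format_filter() is symmetric quantization: scale = 127/max, with float_to_int8 rounding half away from zero. A self-contained sketch of just that step (to_int8 mirrors float_to_int8 above; the asserts are illustrative):

// Sketch of the symmetric int8 quantization used for filters.
#include <cassert>

signed char to_int8(float fdata) {
  if (fdata < 0.0f) fdata -= 0.5f; else fdata += 0.5f;  // round half away from zero
  return static_cast<signed char>(fdata);
}

int main() {
  float max = 2.0f;            // from find_max() over the filter weights
  float scale = 127.0f / max;  // fix_range / max
  assert(to_int8(2.0f * scale) == 127);    // the extreme values map to +/-127
  assert(to_int8(-2.0f * scale) == -127);
  assert(to_int8(0.001f) == 0);
  return 0;
}
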
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
#include <cstdlib>
#include <cwchar>
namespace paddle_mobile {
namespace zynqmp {
namespace filter {
int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
void convert_to_hwc(char** data_in, int num, int channel, int height,
int width);
float find_max(float* data_in, int data_size);
void quantize(float** data_in, int data_size, float max);
void align_element(char** data_in, int num, int chw);
void align_num(char** data_in, int num_per_div_before_alignment, int num,
int chw);
void reorder(char** data_in, int num_after_alignment, int chw);
size_t interleave(char** data_in, int num_after_alignment, int chw);
size_t format_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max);
void convert_fc_filter(char** data_in, int num, int chw);
void format_fc_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max);
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
void align_element_n(int16_t** data_in, int num, int height, int width);
void quantize_to_fp16(float** data_in, int num, int height, int width,
float* scale_ptr);
void format_dwconv_filter(float** data_in, int num, int height, int width,
float* scale_ptr);
} // namespace filter
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory.h>
#include <algorithm>
#include "fpga/KD/llapi/image.h"
#include "fpga/KD/llapi/zynqmp_api.h"
namespace paddle_mobile {
namespace zynqmp {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + offset_height + w * channel + c) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_element_conv(float **data_in, int height, int cw) {
int h = 0;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT
memset(data_tmp, 0, height * align_cw * sizeof(float));
for (h = 0; h < height; h++) {
memcpy((void *)(data_tmp + h * align_cw), // NOLINT
(void *)(*data_in + h * cw), // NOLINT
cw * sizeof(float));
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void format_image(float **data_in, int channel, int height, int width) {
// convert_to_hwc(data_in, channel, height, width);
align_element_conv(data_in, height, channel * width);
fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
sizeof(float));
}
void concat_images(int16_t **images_in, float **scales_in, void *image_out,
float *scale_out, int image_num, uint32_t *channel_num,
int height, int width) {
int i = 0;
int j = 0;
int k = 0;
int each_out_line_channel = 0;
int align_each_out_area_cw = 0;
int align_each_in_area_cw = 0;
int align_each_out_area_cw_differ = 0;
int tmp_channel = 0;
scale_out[0] = 0.0;
scale_out[1] = 0.0;
for (i = 0; i < image_num; i++) {
each_out_line_channel += channel_num[i];
scale_out[0] = std::max(*scale_out, scales_in[i][0]);
// fpga_invalidate(images_in[i],
// height *
// align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) *
// sizeof(int16_t));
}
scale_out[1] = 1 / scale_out[0];
align_each_out_area_cw =
align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT);
align_each_out_area_cw_differ =
align_each_out_area_cw - each_out_line_channel * width;
for (k = 0; k < height; k++) {
for (j = 0; j < width; j++) {
for (i = 0; i < image_num; i++) {
align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy((int16_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int16_t));
tmp_channel += channel_num[i];
}
}
}
fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
}
void split_image(int16_t *image_in, const float *scale_in, void **images_out,
float **scales_out, int image_num,
const uint32_t *channel_nums, int height, int width) {
int total_channel = 0;
for (int i = 0; i < image_num; i++) {
scales_out[i][0] = scale_in[0];
scales_out[i][1] = scale_in[1];
total_channel += channel_nums[i];
}
int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT);
fpga_invalidate(image_in, element_num * sizeof(int16_t));
int src_offset = 0;
int des_offset = 0;
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) +
w * total_channel;
for (int i = 0; i < image_num; i++) {
des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
w * channel_nums[i];
memcpy(reinterpret_cast<int16_t *>(images_out[i]) + des_offset,  // cast first: des_offset counts int16_t elements
image_in + src_offset, channel_nums[i] * sizeof(int16_t));
src_offset += channel_nums[i];
}
}
}
for (int i = 0; i < image_num; i++) {
element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT);
fpga_flush(images_out[i], element_num * sizeof(int16_t));
}
}
} // namespace image
} // namespace zynqmp
} // namespace paddle_mobile
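
Since the convert_to_hwc call in format_image is commented out, callers are expected to hand in HWC data already; format_image then only zero-pads each channel*width row up to IMAGE_ALIGNMENT and flushes. A quick footprint sketch (function name illustrative):

using namespace paddle_mobile::zynqmp;

void image_footprint_example() {
  int channel = 3, height = 224, width = 100;
  int cw = channel * width;                        // 300 floats per row
  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);  // 304: each row gains 4 zero floats
  // fpga_flush then covers height * align_cw * sizeof(float) bytes (224 * 304 * 4).
}
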
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle_mobile {
namespace zynqmp {
namespace image {
void convert_to_hwc(float** data_in, int channel, int height, int width);
void align_element_conv(float** data_in, int height, int cw);
void format_image(float** data_in, int channel, int height, int width);
// Concat featuremaps along channel direction
void concat_images(int16_t** images_in, float** scales_in, void* image_out,
float* scale_out, int image_num, uint32_t* channel_num,
int height, int width);
// Split featuremap along channel direction
void split_image(int16_t* image_in, const float* scale_in, void** images_out,
float** scales_out, int image_num,
const uint32_t* channel_nums, int height, int width);
} // namespace image
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <cstring>
#include <map>
#include "fpga/KD/llapi/config.h"
#include "fpga/KD/llapi/zynqmp_api.h"
namespace paddle_mobile {
namespace zynqmp {
#define PADDLE_MOBILE_OS_LINUX
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
static std::map<void *, size_t> memory_map;
static size_t memory_size_max = 0;
static size_t memory_size = 0;
static inline int do_ioctl(uint64_t req, const void *arg) {
#ifdef PADDLE_MOBILE_OS_LINUX
return ioctl(fd, req, arg);
#else
return -1;
#endif
}
int open_device() {
std::cout << "open_device" << std::endl;
if (fd == -1) {
fd = open(device_path, O_RDWR);
}
std::cout << "open_device fd:" << fd << std::endl;
return fd;
}
void close_device() { close(fd); }
void reset_device() {
FpgaResetArgs args;
do_ioctl(IOCTL_FPGA_RESET, &args);
}
// memory management;
void *fpga_malloc(size_t size) {
// std::cout << "fpga malloc: 0x" << std::hex << size << std::dec << " (" <<
// size << ") - ";
#ifdef ENABLE_DEBUG
// std::cout << "fpga_malloc:" << size << std::endl;
#endif
#ifdef PADDLE_MOBILE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
if (ptr == MAP_FAILED) {  // mmap64 signals failure with MAP_FAILED, not NULL
std::cout << "not enough memory !";
exit(-1);
}
// std::cout << std::hex << ptr << std::dec << std::endl;
memory_map.insert(std::make_pair(ptr, size));
memory_size += size;
if (memory_size > memory_size_max) {
memory_size_max = memory_size;
}
return ptr;
#else
return malloc(size);
#endif
}
size_t fpga_get_memory_size(void *ptr) { return memory_map[ptr]; }
size_t fpga_get_memory_size_max() { return memory_size_max; }
size_t fpga_diagnose_memory(int detailed) {
size_t total = 0;
// size_t size = 0;
// int i = 0;
auto iter = memory_map.begin(); // std::map<void *, size_t>::iterator
while (iter != memory_map.end()) {
total += iter->second;
iter++;
}
return total;
}
void fpga_free(void *ptr) {
size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) {
size = iter->second;
memory_map.erase(iter);
}
memory_size -= size;
#ifdef PADDLE_MOBILE_OS_LINUX
munmap(ptr, size);
#else
free(ptr);
#endif
}
void fpga_copy(void *dst, const void *src, int size) { memcpy(dst, src, size); }
int fpga_flush(void *address, size_t size) {
struct MemoryCacheArgs args;
args.address = address;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
}
int fpga_invalidate(void *address, size_t size) {
// std::cout <<
// "=================================================================================="
// << std::endl;
struct MemoryCacheArgs args;
args.address = address;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
}
int invalidate_cache(void *addr, int size) {
struct MemoryCacheArgs args;
args.address = addr;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
}
int flush_cache(void *addr, int size) {
struct MemoryCacheArgs args;
args.address = addr;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
}
void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int ioctl_conv(const struct ConvArgs &args) {
#ifdef ENABLE_DEBUG
// std::cout << "======Compute Basic Conv======";
// std::cout << " relu_enabled:" << args.relu_enabled
// << " sb_address:" << args.sb_address
// << " filter_address:" << args.filter_address
// << " filter_num:" << args.filter_num
// << " group_num:" << args.group_num;
// std::cout << " image_address:" << args.image.address
// << " image_scale_address:" << args.image.scale_address
// << " image_channels:" << args.image.channels
// << " image_height:" << args.image.height
// << " image_width:" << args.image.width
// << " pad_height:" << args.image.pad_height
// << " pad_width:" << args.image.pad_width;
// std::cout << " kernel_height:" << args.kernel.height
// << " kernel_width:" << args.kernel.width
// << " stride_h:" << args.kernel.stride_h
// << " stride_w:" << args.kernel.stride_w;
// std::cout << " out_address:" << args.output.address
// << " out_scale_address:" << args.output.scale_address;
//
// float* in_scale = (float*)args.image.scale_address;
// std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
#endif
return do_ioctl(IOCTL_CONFIG_CONV, &args);
// return 0;
}
int compute_fpga_conv_basic(const struct ConvArgs &args) {
#ifdef ENABLE_DEBUG
// std::cout << "======Compute Basic Conv======";
// std::cout << " relu_enabled:" << args.relu_enabled
// << " sb_address:" << args.sb_address
// << " filter_address:" << args.filter_address
// << " filter_num:" << args.filter_num
// << " group_num:" << args.group_num;
// std::cout << " image_address:" << args.image.address
// << " image_scale_address:" << args.image.scale_address
// << " image_channels:" << args.image.channels
// << " image_height:" << args.image.height
// << " image_width:" << args.image.width
// << " pad_height:" << args.image.pad_height
// << " pad_width:" << args.image.pad_width;
// std::cout << " kernel_height:" << args.kernel.height
// << " kernel_width:" << args.kernel.width
// << " stride_h:" << args.kernel.stride_h
// << " stride_w:" << args.kernel.stride_w;
// std::cout << " out_address:" << args.output.address
// << " out_scale_address:" << args.output.scale_address;
// float *in_scale = (float *)args.image.scale_address;
// std::cout << " scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
// float *filter_scale = (float *)args.filter_scale_address;
// std::cout << " filter scale:" << filter_scale[0] << "," <<
// filter_scale[1] << std::endl;
#endif
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int compute_fpga_conv(const struct SplitConvArgs &args) {
// return do_ioctl(IOCTL_CONFIG_CONV, &args);
int split_num = args.split_num;
int ret = -1;
for (int i = 0; i < split_num; i++) {
// ComputeBasicConv(args.conv_args[i]);
ret = compute_fpga_conv_basic(args.conv_arg[i]);
}
if (split_num > 1) {
std::cout << "Split num > 1 !!!!!!!!!!!!!!!!!!" << std::endl;
exit(-1);
}
return ret;
}
int compute_fpga_pool(const struct PoolingArgs &args) {
return do_ioctl(IOCTL_CONFIG_POOLING, &args);
}
int compute_fpga_ewadd(const struct EWAddArgs &args) {
return do_ioctl(IOCTL_CONFIG_EW, &args);
}
int perform_bypass(const struct BypassArgs &args) {
int size = args.image.channels * args.image.width * args.image.height;
int max_size = 1 << 21;
float times = 1.0 * size / max_size;
int count = static_cast<int>(times);
void *input_address = args.image.address;
int type_size =
args.input_data_type == DATA_TYPE_FP32 ? sizeof(float) : sizeof(int16_t);
void *output_address = args.output.address;
int out_type_size =
args.output_data_type == DATA_TYPE_FP32 ? sizeof(float) : sizeof(int16_t);
struct BypassArgs bypassArgs = args;
bypassArgs.image.width = 1;
bypassArgs.image.height = 1;
// std::cout << "times:" << times << " count:" << count << std::endl;
for (int i = 0; i < count; ++i) {
bypassArgs.image.channels = max_size;
bypassArgs.image.address =  // cast before offsetting: arithmetic on void* is non-standard
reinterpret_cast<char *>(input_address) + i * max_size * type_size;
bypassArgs.output.address =
reinterpret_cast<char *>(output_address) + i * max_size * out_type_size;
int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
if (ret != 0) {
return ret;
}
// std::cout << "@:" << i << " ret:" << ret << std::endl;
}
int remainder = size - max_size * count;
// std::cout << "remainder:" << remainder << std::endl;
bypassArgs.image.channels = remainder;
bypassArgs.image.address =
reinterpret_cast<char *>(input_address) + count * max_size * type_size;
bypassArgs.output.address =
reinterpret_cast<char *>(output_address) + count * max_size * out_type_size;
return do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
}
int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
int compute_fpga_scale(const struct ScaleArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Scale======";
std::cout << "scale_address:" << args.scale_address << std::endl;
std::cout << "bias_address:" << args.bias_address << std::endl;
std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_SCALE, &args);
}
int compute_fpga_dwconv(const struct DWconvArgs &args) {
std::cout << "======Compute DWConv======";
std::cout << " relu_enabled:" << args.relu_enabled
<< " filter_address:" << args.filter_address;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
// float *in_scale = (float *)args.image.scale_address;
// std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
return do_ioctl(IOCTL_CONFIG_DWCONV, &args);
}
// int config_power(const struct PowerArgs& args) {
// return do_ioctl(IOCTL_CONFIG_POWER, &args);
// }
int config_inplace(const struct InplaceArgs &args) {
return do_ioctl(IOCTL_CONFIG_INPLACE, &args);
}
// uint64_t vaddr_to_paddr(void *address) {
// return 0;
// }
int16_t fp32_2_fp16(float fp32_num) {
uint32_t tmp = 0;
memcpy(&tmp, &fp32_num, sizeof(tmp));  // copy exactly 4 bytes; *(unsigned long *) would read 8 on LP64
auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
(((tmp & 0x7f800000) >> 13) - (112 << 10)));
if (tmp & 0x1000) {
t++; // roundoff
}
return t;
}
float fp16_2_fp32(int16_t fp16_num) {
if (0 == fp16_num) {
return 0;
}
int frac = (fp16_num & 0x3ff);
int exp = ((fp16_num & 0x7c00) >> 10) + 112;
int s = fp16_num & 0x8000;
int tmp = 0;
float fp32_num = 0;
tmp = s << 16 | exp << 23 | frac << 13;
fp32_num = *(float *)&tmp; // NOLINT
return fp32_num;
}
} // namespace zynqmp
} // namespace paddle_mobile
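
fp32_2_fp16/fp16_2_fp32 are a plain IEEE binary16 conversion: rebias the exponent by 112 (127 - 15), shift the mantissa by 13 bits, and round on the highest dropped bit. A round-trip sketch (function name illustrative):

void fp16_roundtrip_example() {
  float a = 1.5f;  // 1-bit mantissa, well within fp16's 10 bits
  int16_t h = paddle_mobile::zynqmp::fp32_2_fp16(a);  // 0x3E00
  float b = paddle_mobile::zynqmp::fp16_2_fp32(h);    // 1.5f again
  // No clamping: magnitudes beyond ~65504, NaN/Inf, and subnormals are outside this fast path.
}
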
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#define PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#include <stdint.h>
#include <cstddef>
#include <iostream>
#include <limits>
namespace paddle_mobile {
namespace zynqmp {
typedef int16_t half;
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT 8
#define BIAS_NUM_ALIGNMENT 16
enum DDataType {
DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0,
};
enum DLayoutType {
LAYOUT_CHW = 1,
LAYOUT_HWC = 0,
};
struct VersionArgs {
void* buffer;
};
struct MemoryCopyArgs {
void* src;
void* dest;
size_t size;
};
struct MemoryCacheArgs {
void* address;
size_t size;
};
struct MemoryBarrierArgs {};
struct BNArgs {
bool enabled;
void* bias_address;
void* scale_address;
};
/**
Conv and Pooling kernel
*/
struct KernelArgs {
uint32_t width;
uint32_t height;
uint32_t stride_w;
uint32_t stride_h;
};
struct ImageInputArgs {
void* address; // input featuremap virtual address
void* scale_address; // input scale address;
uint32_t channels;
uint32_t width; // featuremap width
uint32_t height;
uint32_t pad_width; // padding width;
uint32_t pad_height;
};
struct ImageOutputArgs {
void* address; // output result address;
float* scale_address; // output scale address;
};
struct ConvArgs {
bool relu_enabled;
void* sb_address; // scale and bias are interlaced;
void* filter_address;
void* filter_scale_address;
uint32_t filter_num;
uint32_t group_num;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
struct DWconvArgs {
bool relu_enabled;
void* bias_address;
void* filter_address;
struct KernelArgs kernel;
struct ImageInputArgs image;
struct ImageOutputArgs output;
uint16_t out_width;
uint16_t out_height;
uint16_t sub_conv_num;
};
struct PoolingArgs {
uint16_t mode;
uint16_t kernel_reciprocal;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
uint16_t out_width;
uint16_t out_height;
};
// elementwise add arguments
struct EWAddArgs {
bool relu_enabled;
uint32_t const0; // output0 = const0 x input0 + const1 x input1;
uint32_t const1;
struct ImageInputArgs image0;
struct ImageInputArgs image1;
struct ImageOutputArgs output;
};
struct BypassArgs {
enum DDataType input_data_type;
enum DDataType output_data_type;
enum DLayoutType input_layout_type;
enum DLayoutType output_layout_type;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
struct ScaleArgs {
void* scale_address;
void* bias_address;
uint32_t wc_alignment;
uint32_t channel_alignment;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
struct NormalizeArgs {
void* input_image_address;
void* output_image_address;
uint32_t image_width;
uint32_t image_height;
uint32_t image_channel;
uint32_t* output_scale_address;
};
struct PowerParameterArgs {
uint16_t shift;
uint16_t scale;
uint16_t power;
};
struct NormalizeParameterArgs {
uint32_t channel;
uint32_t hight_width;
};
struct InplaceArgs {
bool relu_enable;
bool power_enable;
bool normalize_enable;
};
struct FpgaRegWriteArgs {
uint64_t address; //
uint64_t value;
};
struct FpgaRegReadArgs {
uint64_t address;
uint64_t value;
};
struct FpgaResetArgs {};
#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4)
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
#define IOCTL_SEPARATOR_0 10
#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
#define IOCTL_MEMORY_BARRIER \
_IOW(IOCTL_FPGA_MAGIC, 14, struct MemoryBarrierArgs)
#define IOCTL_SEPARATOR_1 20
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
#define IOCTL_CONFIG_SCALE _IOW(IOCTL_FPGA_MAGIC, 25, struct ScaleArgs)
#define IOCTL_CONFIG_NORMALIZE _IOW(IOCTL_FPGA_MAGIC, 26, struct NormalizeArgs)
#define IOCTL_CONFIG_DWCONV _IOW(IOCTL_FPGA_MAGIC, 31, struct DWconvArgs)
#define IOCTL_CONFIG_INPLACE _IOW(IOCTL_FPGA_MAGIC, 40, struct InplaceArgs)
#define IOCTL_CONFIG_POWER_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs)
#define IOCTL_CONFIG_NORMALIZE_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs)
#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs)
//============================== API =============================
// struct DWconvArgs {
// bool relu_enabled;
// void* bias_address;
// void* filter_address;
// struct KernelArgs kernel;
// struct ImageInputArgs image;
// struct ImageOutputArgs output;
// };
struct DeconvArgs {
uint32_t sub_conv_num;
uint32_t group_num;
uint32_t filter_num;
uint32_t omit_size;
uint32_t sub_output_width;
uint32_t sub_output_height;
struct ImageOutputArgs output;
struct SplitConvArgs* split_conv_args;
};
struct SplitArgs {
uint32_t image_num;
int16_t* image_in;
float* scale_in;
void** images_out;
float** scales_out;
uint32_t* out_channel_nums;
uint32_t height;
uint32_t width;
};
struct ConcatArgs {
uint32_t image_num;
half** images_in;
float** scales_in;
void* image_out;
float* scale_out;
uint32_t* channel_num;
uint32_t height;
uint32_t width;
};
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg;
};
struct GroupConvArgs {
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct SplitConvArgs* conv_args;
struct ConcatArgs concat_arg;
};
inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
int open_device();
void close_device();
void reset_device();
void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
size_t fpga_get_memory_size(void* ptr);
size_t fpga_get_memory_size_max();
size_t fpga_diagnose_memory(int detailed);
void fpga_copy(void* dst, const void* src, int size);
int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size);
int perform_bypass(const struct BypassArgs& args);
int compute_fpga_conv_basic(const struct ConvArgs& args);
int compute_fpga_conv(const struct SplitConvArgs& args);
int compute_fpga_pool(const struct PoolingArgs& args);
int compute_fpga_ewadd(const struct EWAddArgs& args);
int compute_fpga_scale(const struct ScaleArgs& args);
int compute_fpga_concat(const struct ConcatArgs& args);
int config_power(const struct PowerArgs& args);
int compute_fpga_dwconv(const struct DWconvArgs& args);
// int config_relu(const struct ReluArgs& args);
int config_inplace(const struct InplaceArgs& args);
int flush_cache(void* addr, int size);
int invalidate_cache(void* addr, int size);
int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
} // namespace zynqmp
} // namespace paddle_mobile
#endif // PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
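
Taken together, the llapi discipline is: fpga_malloc maps driver memory, fpga_flush pushes CPU writes out before the hardware reads, the compute_* calls issue the matching ioctl, and fpga_invalidate drops stale CPU cache lines before results are read back. A hedged end-to-end sketch (args left zero-filled here; real callers format filters and scales first, and the function name is illustrative):

// Usage sketch of the low-level memory/cache protocol.
using namespace paddle_mobile::zynqmp;

void pooling_example() {
  open_device();  // opens /dev/fpgadrv0
  size_t bytes = 32 * 32 * 16 * sizeof(int16_t);
  void* in = fpga_malloc(bytes);
  void* out = fpga_malloc(bytes);
  // ... fill `in` on the CPU ...
  fpga_flush(in, bytes);        // make CPU writes visible to the FPGA
  PoolingArgs args = {};        // fill mode, kernel, image, output per the struct above
  compute_fpga_pool(args);      // IOCTL_CONFIG_POOLING
  fpga_invalidate(out, bytes);  // refresh the CPU view before reading results
  fpga_free(in);
  fpga_free(out);
  close_device();
}
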
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef PE_hpp
#define PE_hpp
#include <stdio.h>
#include <iostream>
#include "pe_params.hpp"
#include "tensor_util.hpp"
namespace paddle_mobile {
namespace zynqmp {
class PE {
public:
virtual bool init() { return false; }
virtual void apply() {}
virtual bool dispatch() {
std::cout << "pe dispatch \n";
return false;
}
virtual ~PE() {}
};
} // namespace zynqmp
} // namespace paddle_mobile
#endif /* PE_hpp */
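
Every KD operator below follows this three-phase contract: init() once, apply() to pre-format weights and build the hardware args, dispatch() per inference. A minimal subclass sketch (IdentityPE is illustrative, not part of the source):

// Minimal PE illustrating the init/apply/dispatch contract.
class IdentityPE : public paddle_mobile::zynqmp::PE {
 public:
  bool init() override { return true; }      // validate params, claim resources
  void apply() override {}                   // one-time weight formatting goes here
  bool dispatch() override { return true; }  // per-inference work; true on success
};
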
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef PEParams_hpp
#define PEParams_hpp
#include <stdio.h>
#include <vector>
#include "llapi/zynqmp_api.h"
#include "tensor.hpp"
namespace paddle_mobile {
namespace zynqmp {
struct PEParam {};
struct InputParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
};
struct OutputParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
};
struct ReLUParam : PEParam {
public:
bool enabled = false;
};
struct BatchnormParam : PEParam {
public:
Tensor* bias = nullptr;
Tensor* scale = nullptr;
Tensor* mean = nullptr;
Tensor* variance = nullptr;
float epsilon = 0;
};
struct BasicConvParam {
Tensor output;
Tensor filter;
Tensor scaleBias;
ConvArgs args;
};
struct ConvParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
Tensor* filter = nullptr;
BatchnormParam* batchnorm = nullptr;
ReLUParam relu;
int groups = 1;
std::vector<int> strides;
std::vector<int> paddings;
std::vector<int> kernelSize;
std::vector<int> dilations;
Tensor* scale() { return scale_; }
Tensor* bias() { return bias_; }
// Tensor* quantizedFilter() {
// return quantizedFilter_;
// }
std::vector<BasicConvParam*>& splitParams() { return splitParams_; }
protected:
std::vector<BasicConvParam*> splitParams_;
// Tensor* quantizedFilter_ = new Tensor();
Tensor* scale_ = new Tensor();
Tensor* bias_ = new Tensor();
};
struct DepthwiseConvParam : ConvParam {
public:
Tensor* quantizedFilter() { return quantizedFilter_; }
DWconvArgs args;
protected:
Tensor* quantizedFilter_ = new Tensor();
};
enum PoolingType : int {
MAX = 0,
AVERAGE = 1,
};
struct PoolingParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
PoolingType type = PoolingType::MAX;
bool globalPooling = false;
std::vector<int> kernelSize;
std::vector<int> strides;
std::vector<int> paddings;
PoolingArgs poolingArgs = {0};
};
struct ConcatParam : PEParam {
public:
std::vector<Tensor*> inputs;
Tensor* output = nullptr;
int axis = 0;
};
struct ElementwiseAddParam : PEParam {
public:
std::vector<Tensor*> inputs;
Tensor* output = nullptr;
int axis = 0;
ReLUParam relu;
EWAddArgs ewargs;
};
struct FullyConnectedParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* filter = nullptr;
Tensor* bias = nullptr;
Tensor* output = nullptr;
Tensor* quantizedFilter() { return quantizedFilter_; }
Tensor* biasScale() { return biasScale_; }
SplitConvArgs convArgs;
protected:
Tensor* quantizedFilter_ = new Tensor();
Tensor* biasScale_ = new Tensor();
};
struct SoftmaxParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
private:
Tensor* floatInput = nullptr;
};
struct NormParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
private:
Tensor* floatInput = nullptr;
};
} // namespace zynqmp
} // namespace paddle_mobile
#endif /* PEParams_hpp */
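Concrete PEs hand out their parameter struct by reference, so callers fill in the tensor pointers before init(). A hedged sketch using the InputPE defined later in this patch (feed_input and the two tensor arguments are illustrative names):

inline void feed_input(paddle_mobile::zynqmp::Tensor& cpu_tensor,
                       paddle_mobile::zynqmp::Tensor& fpga_tensor) {
  paddle_mobile::zynqmp::InputPE pe;
  paddle_mobile::zynqmp::InputParam& p = pe.param();
  p.input = &cpu_tensor;    // FP32 data coming from the framework
  p.output = &fpga_tensor;  // aligned FP16 image for the device
  pe.init();
  pe.dispatch();            // converts to FP16 and aligns the image
}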
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class ConcatPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
return true;
}
void apply() {}
bool dispatch() {
Tensor* output = param_.output;
Shape& output_shape = output->shape();
float16* out_data = param_.output->data<float16>();
int channel_sum = 0;
int out_channel = output_shape.channel();
float scale = 0;
for (int n = 0; n < param_.inputs.size(); n++) {
Tensor* input = param_.inputs[n];
input->invalidate();
scale = std::max(scale, input->scale()[0]);
Shape& input_shape = input->shape();
int wh = output_shape.width() * output_shape.height();
for (int j = 0; j < wh; j++) {
float16* src = input->data<float16>() + j * input_shape.channel();
memcpy(out_data + j * out_channel + channel_sum, src,
input_shape.channel() * sizeof(float16));
}
channel_sum += input_shape.channel();
}
output->scale()[0] = scale;
output->scale()[1] = 1.0f / scale;
std::cout << "conv scale::" << scale << std::endl;
output->flush();
return true;
}
ConcatParam& param() { return param_; }
private:
ConcatParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "../llapi/image.h"
#include "../pe.hpp"
#include "../pe_params.hpp"
#include "concat_pe.hpp"
#include "conv_pe.hpp"
#include "conv_process.hpp"
namespace paddle_mobile {
namespace zynqmp {
class ConvPE : public PE {
public:
bool init() {
std::cout << "Conv init" << std::endl;
return true;
}
void apply() {
// process scale and bias;
BatchnormParam* bn = param_.batchnorm;
int channel = param_.output->shape().channel();
Shape sb_shape(N, {channel});
float* new_scale_ptr = param_.scale()->mutableData<float>(FP32, sb_shape);
float* new_bias_ptr = param_.bias()->mutableData<float>(FP32, sb_shape);
if (bn != nullptr) {
float* bn_scale_ptr = bn->scale->data<float>();
float* bn_bias_ptr = bn->bias->data<float>();
float* bn_var_ptr = bn->variance->data<float>();
float* bn_mean_ptr = bn->mean->data<float>();
float epsilon = bn->epsilon;
for (int i = 0; i < channel; i++) {
float new_scale =
bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_scale_ptr[i] = new_scale;
new_bias_ptr[i] =
bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
}
} else {
for (int i = 0; i < channel; i++) {
new_scale_ptr[i] = 1.0f;
new_bias_ptr[i] = 0.0f;
}
}
fill_split_arg(param_);
if (param_.splitParams().size() > 1) {
ConcatParam& concat_param = concatPE_.param();
for (auto conv_param : param_.splitParams()) {
concat_param.inputs.push_back(&conv_param->output);
}
concat_param.output = param_.output;
concatPE_.init();
concatPE_.apply();
}
}
bool dispatch() {
std::vector<BasicConvParam*>& params = param_.splitParams();
int ret = 0;
for (auto conv_param : params) {
ret |= compute_fpga_conv_basic(conv_param->args);
}
size_t size = params.size();
if (ret == 0 && size > 1) {
concatPE_.dispatch();
}
return ret == 0;
}
ConvParam& param() { return param_; }
private:
ConvParam param_;
ConcatPE concatPE_;
};
} // namespace zynqmp
} // namespace paddle_mobile
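The scale/bias loop in ConvPE::apply() above is the standard fold of inference-time batch normalization into a per-channel affine term. With bn->scale = \gamma, bn->bias = \beta, running mean \mu and variance \sigma^2, the code computes

s_c = \frac{\gamma_c}{\sqrt{\sigma_c^2 + \varepsilon}}, \qquad b_c = \beta_c - \mu_c \, s_c

so applying s_c x + b_c after the convolution reproduces \gamma_c (x - \mu_c)/\sqrt{\sigma_c^2 + \varepsilon} + \beta_c exactly; without batch norm the loop degenerates to the identity (s_c = 1, b_c = 0).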
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef conv_process_hpp
#define conv_process_hpp
#include <string.h>
#include <cmath>
#include <vector>
#include "../float16.hpp"
#include "../llapi/bias_scale.h"
#include "../llapi/filter.h"
#include "../llapi/image.h"
#include "../tensor.hpp"
namespace paddle_mobile {
namespace zynqmp {
inline int get_aligned_filter_element_num(int chw) {
return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
}
inline int get_filter_num_per_div(Tensor* filter, int group_num) {
auto chw = filter->shape().channel() * filter->shape().height() *
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_num_per_div(num, group_num, div_capacity);
}
inline int get_split_num(Tensor* filter) {
auto chw = filter->shape().channel() * filter->shape().height() *
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_split_num(num, div_capacity);
}
inline void format_scale_bias(Tensor* scale, Tensor* bias, Tensor* filter,
Tensor* scale_bias, int group) {
float* scale_data = nullptr;
float* bias_data = nullptr;
if (scale != nullptr) {
scale_data = scale->data<float>();
}
if (bias != nullptr) {
bias_data = bias->data<float>();
}
int channel = filter->shape().num();
Shape bias_scale_shape(N, {2 * channel});
float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape);
for (int i = 0; i < channel; i++) {
float scale_value = scale_data == nullptr ? 1 : scale_data[i];
float bias_value = bias_data == nullptr ? 0 : bias_data[i];
bs_data[i + channel] = scale_value;
bs_data[i] = bias_value;
}
int element_num_per_div = get_filter_num_per_div(filter, group);
bias_scale::format_bias_scale_array(&bs_data, element_num_per_div, channel);
}
inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group) {
float max_value = find_max(*filter);
Shape& filter_shape = filter->shape();
quantized_filter->setAligned(true);
quantized_filter->mutableData<int8_t>(INT8, filter->shape());
quantized_filter->scale()[0] = max_value / 127.0f;
quantized_filter->scale()[1] = 127.0f / max_value;
auto memory_size = filter->shape().memorySize(sizeof(float));
auto new_data = reinterpret_cast<float*>(fpga_malloc(memory_size));
memcpy(new_data, filter->data<float>(), memory_size);
size_t mem_size = filter::format_filter(
&new_data, filter_shape.num(), filter_shape.channel(),
filter_shape.height(), filter_shape.width(), group, max_value);
int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
memcpy(src, new_data, mem_size);
fpga_free(new_data);
quantized_filter->flush();
}
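// Scalar sketch of the symmetric int8 mapping implied by the scales above
// (scale[0] = max/127 dequantizes, scale[1] = 127/max quantizes). The exact
// rounding/clipping policy and the hardware layout live in
// filter::format_filter, so treat this as an illustration, not the
// authoritative implementation.
inline int8_t quantize_weight_sketch(float v, float max_value) {
  float q = roundf(v * 127.0f / max_value);  // float -> int8 steps
  q = q > 127.0f ? 127.0f : (q < -127.0f ? -127.0f : q);
  return static_cast<int8_t>(q);  // recover with: v ≈ q * max_value / 127
}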
inline void format_dw_filter(Tensor* filter, Tensor* quantized_filter,
float* scale) {
int num = filter->shape().num();
int height = filter->shape().height();
int width = filter->shape().width();
auto memory_size = filter->shape().memorySize(sizeof(float));
auto new_data = (float*)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, filter->data<float>(), memory_size);
filter::format_dwconv_filter(&new_data, num, height, width, scale);
float16* src = quantized_filter->mutableData<float16>(FP16, filter->shape());
memcpy(src, new_data, quantized_filter->shape().memorySize(sizeof(float16)));
quantized_filter->flush();
fpga_free(new_data);
}
inline void format_fc_filter(Tensor* filter, Tensor* quantized_filter) {
float max_value = find_max(*filter);
Shape& filter_shape = filter->shape();
quantized_filter->setAligned(true);
quantized_filter->mutableData<int8_t>(INT8, filter->shape());
quantized_filter->scale()[0] = max_value / 127.0f;
quantized_filter->scale()[1] = 127.0f / max_value;
size_t memory_size = filter->shape().memorySize(sizeof(float));
auto new_data = (float*)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, filter->data<float>(), memory_size);
filter::format_fc_filter(&new_data, filter_shape.num(),
filter_shape.channel(), filter_shape.height(),
filter_shape.width(), 1, max_value);
int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
memcpy(src, new_data, quantized_filter->shape().memorySize(sizeof(int8_t)));
quantized_filter->flush();
fpga_free(new_data);
}
inline void fill_split_arg(const ConvParam& c_param) {
ConvParam& param = const_cast<ConvParam&>(c_param);
Tensor* input = param.input;
Tensor* out = param.output;
Tensor* filter = param.filter;
auto channel = out->shape().channel();
int split_num = param.groups == 1 ? get_split_num(param.filter) : 1;
int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
int filter_num = filter->shape().num();
float16* out_address = nullptr;
int8_t* filter_address = nullptr;
float* sb_address = nullptr;
float* out_scale_address = nullptr;
ConvArgs& args = conv_param->args;
if (split_num == 1) {
out_address = out->data<float16>();
out_scale_address = out->scale();
}
filter_num = i == split_num - 1
? channel - (split_num - 1) * filter_num_per_div // NOLINT
: filter_num_per_div;
if (split_num != 1) {
Shape shape(NHWC, {1, out_shape.height(), out_shape.width(), filter_num});
out_address = conv_param->output.mutableData<float16>(FP16, shape);
out_scale_address = conv_param->output.scale();
}
Shape f_shape(NCHW, {filter_num, filter->shape().channel(),
filter->shape().height(), filter->shape().width()});
Tensor new_filter;
float* new_filter_data = new_filter.mutableData<float>(FP32, f_shape);
int filter_hwc = filter->shape().height() * filter->shape().width() *
filter->shape().channel();
memcpy(new_filter_data,
filter->data<float>() + i * filter_num_per_div * filter_hwc,
filter_num * filter_hwc * sizeof(float));
new_filter.flush();
conv_param->filter.mutableData<float>(FP32, f_shape);
format_filter(&new_filter, &(conv_param->filter), param.groups);
filter_address = conv_param->filter.data<int8_t>();
std::cout << "filter scale:" << conv_param->filter.scale()[0] << std::endl;
args.filter_scale_address = conv_param->filter.scale();
int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
Tensor scale;
Tensor bias;
int channel_start = i * filter_num_per_div;
Shape s_shape(N, {filter_num});
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int j = 0; j < filter_num; j++) {
scale_data[j] = param.scale()->data<float>()[j + channel_start];
bias_data[j] = param.bias()->data<float>()[j + channel_start];
}
Shape sb_shape(N, {sb_num});
format_scale_bias(&scale, &bias, &conv_param->filter,
&conv_param->scaleBias, param.groups);
sb_address = conv_param->scaleBias.mutableData<float>(FP32, sb_shape);
args.group_num = param.groups;
args.relu_enabled = param.relu.enabled;
args.sb_address = sb_address;
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
args.kernel.height = new_filter.shape().height();
args.kernel.width = new_filter.shape().width();
args.filter_address = filter_address;
args.filter_num = filter_num;
args.image.address = input->data<void>();
args.image.scale_address = input->scale();
args.image.channels = input->shape().channel();
args.image.width = input->shape().width();
args.image.height = input->shape().height();
args.image.pad_width = param.paddings[0];
args.image.pad_height = param.paddings[1];
args.output.address = out_address;
args.output.scale_address = out_scale_address;
param.splitParams().push_back(conv_param);
}
}
inline void fill_split_arg(struct SplitConvArgs* arg, Tensor* input,
Tensor* out, Tensor* filter, bool relu_enabled,
int group_num, int stride_h, int stride_w,
int padding_h, int padding_w, float* bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>();
arg->group_num = (uint32_t)group_num;
arg->split_num = group_num == 1 ? get_split_num(filter) : 1;
arg->filter_num = filter->shape().num();
arg->output.address = out_ptr;
arg->output.scale_address = out->scale();
arg->conv_arg =
(ConvArgs*)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT
memset(arg->conv_arg, 0, arg->split_num * sizeof(struct ConvArgs));
arg->concat_arg.image_num = arg->split_num;
arg->concat_arg.image_out = out_ptr;
arg->concat_arg.scale_out = out->scale();
arg->concat_arg.height = out->shape().height();
arg->concat_arg.width = out->shape().width();
int n = arg->split_num;
arg->concat_arg.images_in = (half**)fpga_malloc(n * sizeof(int*)); // NOLINT
arg->concat_arg.scales_in =
(float**)fpga_malloc(n * sizeof(float*)); // NOLINT
arg->concat_arg.channel_num =
(uint32_t*)fpga_malloc(n * sizeof(uint32_t)); // NOLINT
auto channel = out->shape().channel();
int filter_num_per_div = get_filter_num_per_div(filter, group_num);
int element_num = get_aligned_filter_element_num(filter->shape().channel() *
filter->shape().height() *
filter->shape().width());
for (int i = 0; i < n; i++) {
arg->conv_arg[i].relu_enabled = relu_enabled;
arg->conv_arg[i].group_num = (uint32_t)group_num;
arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
arg->conv_arg[i].kernel.height = filter->shape().height();
arg->conv_arg[i].kernel.width = filter->shape().width();
arg->conv_arg[i].image.address = input_ptr;
arg->conv_arg[i].image.channels = input->shape().channel();
arg->conv_arg[i].image.height = input->shape().height();
arg->conv_arg[i].image.width = input->shape().width();
arg->conv_arg[i].image.scale_address = input->scale();
arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
arg->conv_arg[i].filter_scale_address = filter->scale();
arg->conv_arg[i].filter_num = (uint32_t)(
i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT
: filter_num_per_div);
size_t filter_size =
element_num *
align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) *
sizeof(int8_t);
auto filter_head =
&((int8_t*)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT
arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
fpga_flush(arg->conv_arg[i].filter_address, filter_size);
size_t bs_size = 2 *
align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
sizeof(float);
auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
fpga_flush(arg->conv_arg[i].sb_address, bs_size);
if (n > 1) {
arg->conv_arg[i].output.scale_address =
(float*)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->conv_arg[i].output.address = fpga_malloc(
out->shape().height() *
align_to_x(out->shape().width() * arg->conv_arg[i].filter_num,
IMAGE_ALIGNMENT) *
sizeof(half));
} else {
arg->conv_arg[i].output.scale_address = out->scale();
arg->conv_arg[i].output.address = out_ptr;
}
arg->concat_arg.images_in[i] =
(half*)arg->conv_arg[i].output.address; // NOLINT
arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
}
}
inline int do_concat(const struct ConcatArgs& args) {
image::concat_images(args.images_in, args.scales_in, args.image_out,
args.scale_out, args.image_num, args.channel_num,
args.height, args.width);
return 0;
}
inline bool compute_conv(const ConvParam& c_conv_params) {
ConvParam& conv_params = const_cast<ConvParam&>(c_conv_params);
std::vector<BasicConvParam*>& params = conv_params.splitParams();
int ret = 0;
for (auto conv_param : params) {
ret |= compute_fpga_conv_basic(conv_param->args);
}
size_t size = params.size();
if (ret == 0 && size > 1) {
Tensor* output = conv_params.output;
Tensor& img = params[0]->output;
// Debug: dump the first split's output values.
for (int i = 0; i < img.shape().numel(); i++) {
float value = half_to_float(img.data<float16>()[i]);
std::cout << "value:" << value << std::endl;
}
}
return ret == 0;
}
inline bool compute_conv(const SplitConvArgs& args) {
int ret = 0;
int split_num = args.split_num;
for (int i = 0; i < split_num; i++) {
ret |= compute_fpga_conv_basic(args.conv_arg[i]);
}
if (split_num > 1) {
do_concat(args.concat_arg);
}
return ret == 0;
}
} // namespace zynqmp
} // namespace paddle_mobile
#endif /* conv_process_hpp */
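Both fill_split_arg variants above use the same scheme: when one pass cannot cover all output channels (get_split_num > 1), the convolution is split along the output-channel axis and the partial images are concatenated channel-wise afterwards. The per-split channel count, extracted as a sketch:

// Every split takes filter_num_per_div output channels except the last,
// which takes the remainder (mirrors the ternaries in both variants above).
inline int split_channels(int i, int split_num, int total, int per_div) {
  return i == split_num - 1 ? total - (split_num - 1) * per_div : per_div;
}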
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../float16.hpp"
#include "../pe.hpp"
#include "../pe_params.hpp"
#include "conv_process.hpp"
namespace paddle_mobile {
namespace zynqmp {
class DepthwiseConvPE : public PE {
public:
bool init() {
std::cout << "DWConv init" << std::endl;
return true;
}
void apply() {
DepthwiseConvParam& param = param_;
Tensor* input = param.input;
Tensor* output = param.output;
int channel = output->shape().channel();
Tensor* new_scale = param.scale();
Tensor* new_bias = param.bias();
Shape shape(NC, {channel, 1});
float* new_scale_data = new_scale->mutableData<float>(FP32, shape);
float16* new_bias_data = new_bias->mutableData<float16>(FP16, shape);
BatchnormParam* batchnorm = param.batchnorm;
memset(new_scale_data, 0, new_scale->shape().memorySize(sizeof(float)));
memset(new_bias_data, 0, new_bias->shape().memorySize(sizeof(float16)));
if (batchnorm != nullptr) {
for (int i = 0; i < channel; i++) {
// TODO(chonwhite) combine;
}
} else {
float16 zero = float_to_half(0.0f);
for (int i = 0; i < channel; i++) {
new_bias_data[i] = zero;
new_scale_data[i] = 1.0f;
}
}
Tensor* quantized_filter = param.quantizedFilter();
quantized_filter->mutableData<float16>(FP16, param.filter->shape());
format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
DWconvArgs args = {0};
void* filter_address = quantized_filter->data<void>();
std::cout << "filter:" << filter_address << std::endl;
args.bias_address = new_bias_data;
args.filter_address = param.quantizedFilter()->data<void>();
args.kernel.width = param.kernelSize[0];
args.kernel.height = param.kernelSize[1];
args.kernel.stride_w = param.strides[0];
args.kernel.stride_h = param.strides[1];
args.image.address = input->data<void>();
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
args.image.pad_width = param.paddings[0];
args.image.pad_height = param.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->data<void>();
args.output.scale_address = output->scale();
args.out_width = param.output->shape().width();
args.out_height = param.output->shape().height();
args.sub_conv_num = 1;
param.args = args;
}
bool dispatch() { return compute_fpga_dwconv(param_.args) == 0; }
DepthwiseConvParam& param() { return param_; }
private:
DepthwiseConvParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class ElementwiseAddPE : public PE {
public:
bool init() { return true; }
void apply() {
Tensor* input0 = param_.inputs[0];
Tensor* input1 = param_.inputs[1];
Tensor* output = param_.output;
EWAddArgs args = {0};
args.const0 = 0x3c00; // fp16 1.0: coefficient for input0
args.const1 = 0x3c00; // fp16 1.0: coefficient for input1, i.e. out = x + y
args.image0.address = input0->data<float16>();
args.image0.channels = input0->shape().channel();
args.image0.scale_address = input0->scale();
args.image0.height = input0->shape().height();
args.image0.width = input0->shape().width();
args.image0.pad_height = 0;
args.image0.pad_width = 0;
args.image1.address = input1->data<float16>();
args.image1.channels = input1->shape().channel();
args.image1.scale_address = input1->scale();
args.image1.height = input1->shape().height();
args.image1.width = input1->shape().width();
args.image1.pad_height = 0;
args.image1.pad_width = 0;
args.output.scale_address = output->scale();
args.output.address = output->data<float16>();
param_.ewargs = args;
}
bool dispatch() {
InplaceArgs inplace_args = {0};
if (param_.relu.enabled) {
inplace_args.relu_enable = true;
config_inplace(inplace_args);
}
compute_fpga_ewadd(param_.ewargs);
if (param_.relu.enabled) {
inplace_args.relu_enable = false;
config_inplace(inplace_args);
}
return true;
}
ElementwiseAddParam& param() { return param_; }
private:
ElementwiseAddParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
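For reference, 0x3c00 is the IEEE-754 half-precision encoding of 1.0 (sign 0, exponent 01111, mantissa 0), so with both constants set the engine computes a plain x + y. A one-line sanity sketch using the float_to_half helper this patch already relies on:

#include <cassert>
inline void check_fp16_one() {
  assert(float_to_half(1.0f) == 0x3c00);  // fp16(1.0) == 0x3c00
}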
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "../pe.hpp"
#include "../pe_params.hpp"
#include "conv_process.hpp"
namespace paddle_mobile {
namespace zynqmp {
class FullyConnectedPE : public PE {
public:
bool init() { return true; }
void apply() {
Tensor* input = param_.input;
Tensor* output = param_.output;
convParam_.input = param_.input;
convParam_.output = param_.output;
// convParam_.relu = param_.relu;
convParam_.groups = 1;
convParam_.strides = {1, 1};
convParam_.paddings = {0, 0};
convParam_.kernelSize = {input->shape().width(), input->shape().height()};
convParam_.dilations = {1, 1};
int num = param_.filter->shape().channel();
int chw = param_.filter->shape().num();
int height = param_.input->shape().height();
int width = param_.input->shape().width();
int filter_channel = chw / height / width;
int channel = param_.output->shape().channel();
Shape shape(NCHW, {num, filter_channel, height, width});
Tensor* conv_filter = new Tensor();
float* new_filter_data = conv_filter->mutableData<float>(FP32, shape);
float* filter_data = param_.filter->data<float>();
// Transpose FC weights from [chw, num] into conv layout [num, chw].
for (int i = 0; i < num; i++) {
for (int j = 0; j < chw; j++) {
new_filter_data[i * chw + j] = filter_data[j * num + i];
}
}
conv_filter->flush();
convParam_.filter = conv_filter;
Shape sb_shape(N, {channel});
float* scale_data = convParam_.scale()->mutableData<float>(FP32, sb_shape);
float* bias_data = convParam_.bias()->mutableData<float>(FP32, sb_shape);
for (int i = 0; i < channel; i++) {
scale_data[i] = 1.0f;
bias_data[i] = param_.bias->data<float>()[i];
}
fill_split_arg(convParam_);
}
bool dispatch() {
int ret = 0;
std::vector<BasicConvParam*>& params = convParam_.splitParams();
for (auto conv_param : params) {
std::cout << "conv basic \n";
ret |= compute_fpga_conv_basic(conv_param->args);
}
return ret == 0;
}
FullyConnectedParam& param() { return param_; }
private:
FullyConnectedParam param_;
ConvParam convParam_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class InputPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
return true;
}
bool dispatch() {
std::cout << "InputPE dispatch \n";
Tensor* input = param_.input;
Tensor* output = param_.output;
Tensor* src = input;
Tensor half_tensor;
if (input->dataType() == DataType::FP32) {
half_tensor.mutableData<float16>(DataType::FP16, input->shape());
half_tensor.copyFrom(input);
src = &half_tensor;
}
output->mutableData<void>();
src->alignImage(output, true);
return true;
}
InputParam& param() { return param_; }
private:
InputParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/* NEON implementation of sin, cos, exp and log
*
* Inspired by Intel Approximate Math library, and based on the
* corresponding algorithms of the cephes math library
*/
/* Copyright (C) 2011 Julien Pommier
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* (this is the zlib license)
*/
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#pragma once
#include <arm_neon.h>
static const int32_t c_inv_mant_mask = ~0x7f800000u;
static const float c_cephes_SQRTHF = 0.707106781186547524;
static const float c_cephes_log_p0 = 7.0376836292E-2;
static const float c_cephes_log_p1 = -1.1514610310E-1;
static const float c_cephes_log_p2 = 1.1676998740E-1;
static const float c_cephes_log_p3 = -1.2420140846E-1;
static const float c_cephes_log_p4 = +1.4249322787E-1;
static const float c_cephes_log_p5 = -1.6668057665E-1;
static const float c_cephes_log_p6 = +2.0000714765E-1;
static const float c_cephes_log_p7 = -2.4999993993E-1;
static const float c_cephes_log_p8 = +3.3333331174E-1;
static const float c_cephes_log_q1 = -2.12194440e-4;
static const float c_cephes_log_q2 = 0.693359375;
/* natural logarithm computed for 4 simultaneous floats
* returns NaN for x <= 0
*/
static inline float32x4_t log_ps(float32x4_t x) {
float32x4_t one = vdupq_n_f32(1);
x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
int32x4_t ux = vreinterpretq_s32_f32(x);
int32x4_t emm0 = vshrq_n_s32(ux, 23);
/* keep only the fractional part */
ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
x = vreinterpretq_f32_s32(ux);
emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
float32x4_t e = vcvtq_f32_s32(emm0);
e = vaddq_f32(e, one);
/* part2:
* if( x < SQRTHF ) {
* e -= 1;
* x = x + x - 1.0;
* } else { x = x - 1.0; }
*/
uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
float32x4_t tmp =
vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
x = vsubq_f32(x, one);
e = vsubq_f32(
e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
x = vaddq_f32(x, tmp);
float32x4_t z = vmulq_f32(x, x);
float32x4_t y = vdupq_n_f32(c_cephes_log_p0);
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
y = vmulq_f32(y, x);
y = vmulq_f32(y, z);
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
y = vaddq_f32(y, tmp);
tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
y = vsubq_f32(y, tmp);
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
x = vaddq_f32(x, y);
x = vaddq_f32(x, tmp);
x = vreinterpretq_f32_u32(vorrq_u32(
vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
return x;
}
static const float c_exp_hi = 88.3762626647949f;
static const float c_exp_lo = -88.3762626647949f;
static const float c_cephes_LOG2EF = 1.44269504088896341;
static const float c_cephes_exp_C1 = 0.693359375;
static const float c_cephes_exp_C2 = -2.12194440e-4;
static const float c_cephes_exp_p0 = 1.9875691500E-4;
static const float c_cephes_exp_p1 = 1.3981999507E-3;
static const float c_cephes_exp_p2 = 8.3334519073E-3;
static const float c_cephes_exp_p3 = 4.1665795894E-2;
static const float c_cephes_exp_p4 = 1.6666665459E-1;
static const float c_cephes_exp_p5 = 5.0000001201E-1;
/* exp() computed for 4 floats at once */
static inline float32x4_t exp_ps(float32x4_t x) {
float32x4_t tmp, fx;
float32x4_t one = vdupq_n_f32(1);
x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
/* express exp(x) as exp(g + n*log(2)) */
fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
/* perform a floorf */
tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
/* if greater, subtract 1 */
uint32x4_t mask = vcgtq_f32(tmp, fx);
mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
x = vsubq_f32(x, tmp);
x = vsubq_f32(x, z);
static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1,
c_cephes_exp_p2, c_cephes_exp_p3,
c_cephes_exp_p4, c_cephes_exp_p5};
float32x4_t y = vld1q_dup_f32(cephes_exp_p + 0);
float32x4_t c1 = vld1q_dup_f32(cephes_exp_p + 1);
float32x4_t c2 = vld1q_dup_f32(cephes_exp_p + 2);
float32x4_t c3 = vld1q_dup_f32(cephes_exp_p + 3);
float32x4_t c4 = vld1q_dup_f32(cephes_exp_p + 4);
float32x4_t c5 = vld1q_dup_f32(cephes_exp_p + 5);
y = vmulq_f32(y, x);
z = vmulq_f32(x, x);
y = vaddq_f32(y, c1);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c2);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c3);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c4);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c5);
y = vmulq_f32(y, z);
y = vaddq_f32(y, x);
y = vaddq_f32(y, one);
/* build 2^n */
int32x4_t mm;
mm = vcvtq_s32_f32(fx);
mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
mm = vshlq_n_s32(mm, 23);
float32x4_t pow2n = vreinterpretq_f32_s32(mm);
y = vmulq_f32(y, pow2n);
return y;
}
static const float c_minus_cephes_DP1 = -0.78515625;
static const float c_minus_cephes_DP2 = -2.4187564849853515625e-4;
static const float c_minus_cephes_DP3 = -3.77489497744594108e-8;
static const float c_sincof_p0 = -1.9515295891E-4;
static const float c_sincof_p1 = 8.3321608736E-3;
static const float c_sincof_p2 = -1.6666654611E-1;
static const float c_coscof_p0 = 2.443315711809948E-005;
static const float c_coscof_p1 = -1.388731625493765E-003;
static const float c_coscof_p2 = 4.166664568298827E-002;
static const float c_cephes_FOPI = 1.27323954473516; // 4 / M_PI
/* evaluation of 4 sines & cosines at once.
*
* The code is the exact rewriting of the cephes sinf function.
* Precision is excellent as long as x < 8192 (I did not bother to
* take into account the special handling they have for greater values
* -- it does not return garbage for arguments over 8192, though, but
* the extra precision is missing).
*
* Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
* surprising but correct result.
*
* Note also that when you compute sin(x), cos(x) is available at
* almost no extra cost, so both sin_ps and cos_ps make use of
* sincos_ps.
*/
static inline void sincos_ps(float32x4_t x, float32x4_t *ysin,
float32x4_t *ycos) {
// any x
float32x4_t xmm1, xmm2, xmm3, y;
uint32x4_t emm2;
uint32x4_t sign_mask_sin, sign_mask_cos;
sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
x = vabsq_f32(x);
/* scale by 4/Pi */
y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
/* store the integer part of y in mm0 */
emm2 = vcvtq_u32_f32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
y = vcvtq_f32_u32(emm2);
/* get the polynomial selection mask
* there is one polynomial for 0 <= x <= Pi/4
* and another one for Pi/4 < x <= Pi/2
*
* Both branches will be computed.
*/
uint32x4_t poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));
/* The magic pass: "Extended precision modular arithmetic"
* x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
x = vaddq_f32(x, xmm1);
x = vaddq_f32(x, xmm2);
x = vaddq_f32(x, xmm3);
sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
/* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
* and the second polynomial (Pi/4 <= x <= Pi/2) in y2 */
float32x4_t z = vmulq_f32(x, x);
float32x4_t y1, y2;
y1 = vmulq_n_f32(z, c_coscof_p0);
y2 = vmulq_n_f32(z, c_sincof_p0);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, x);
y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
y2 = vaddq_f32(y2, x);
y1 = vaddq_f32(y1, vdupq_n_f32(1));
/* select the correct result from the two polynomials */
float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
*ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
*ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
}
static inline float32x4_t sin_ps(float32x4_t x) {
float32x4_t ysin, ycos;
sincos_ps(x, &ysin, &ycos);
return ysin;
}
static inline float32x4_t cos_ps(float32x4_t x) {
float32x4_t ysin, ycos;
sincos_ps(x, &ysin, &ycos);
return ycos;
}
static inline float32x4_t div_ps(float32x4_t a, float32x4_t b) {
float32x4_t reciprocal = vrecpeq_f32(b);
reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
return vmulq_f32(a, reciprocal);
}
static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) {
return exp_ps(vmulq_f32(b, log_ps(a)));
}
#endif // __ARM_NEON__
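Each routine above evaluates four lanes per call; a minimal usage sketch, assuming NEON is available:

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
// exp_ps evaluates four exponentials at once; out[i] ≈ expf(in[i]).
static inline void exp4(const float in[4], float out[4]) {
  vst1q_f32(out, exp_ps(vld1q_f32(in)));
}
#endif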
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class OutputPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(false);
return true;
}
bool dispatch() {
Tensor* input = param_.input;
Tensor* output = param_.output;
Tensor* src_tensor = input;
Tensor float_tensor;
input->invalidate();
float_tensor.mutableData<float>(DataType::FP32, input->shape());
if (input->dataType() == DataType::FP16) {
float_tensor.copyFrom(input);
src_tensor = &float_tensor;
}
src_tensor->unalignImage(output, true);
return true;
}
OutputParam& param() { return param_; }
private:
OutputParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class PoolingPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
return true;
}
void apply() {
Tensor* input = param_.input;
Tensor* output = param_.output;
uint32_t k_width = param_.kernelSize[0];
uint32_t k_height = param_.kernelSize[1];
if (param_.globalPooling) {
k_width = input->shape().width();
k_height = input->shape().height();
}
PoolingArgs args = {0};
args.mode = param_.type;
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
args.image.address = input->data<float16>();
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
args.image.pad_height = param_.paddings[0];
args.image.pad_width = param_.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->mutableData<float16>();
args.output.scale_address = output->scale();
args.kernel.height = k_height;
args.kernel.width = k_width;
args.kernel.stride_h = param_.strides[0];
args.kernel.stride_w = param_.strides[1];
args.out_height = output->shape().height();
args.out_width = output->shape().width();
param_.poolingArgs = args;
}
bool dispatch() { return compute_fpga_pool(param_.poolingArgs) == 0; }
PoolingParam& param() { return param_; }
private:
PoolingParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
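For AVERAGE pooling, the kernel_reciprocal precomputed above (1/(k_w k_h) in fp16) is presumably what lets the engine turn the window sum into a mean:

\text{out}_{n,c,x,y} = \frac{1}{k_w k_h} \sum_{(i,j) \in \text{window}(x,y)} \text{in}_{n,c,i,j}

with the window widened to the whole feature map when globalPooling is set.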
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "softmax_pe.hpp"
#include <vector>
namespace paddle_mobile {
namespace zynqmp {
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#ifndef __aarch64__
static inline float32_t vmaxvq_f32(const float32x4_t &r) {
float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));
return vget_lane_f32(vpmax_f32(v, v), 0);
}
static inline float32_t vaddvq_f32(const float32x4_t &r) {
float32x2_t v = vadd_f32(vget_high_f32(r), vget_low_f32(r));
return vget_lane_f32(vpadd_f32(v, v), 0);
}
#endif // __aarch64__
#endif // __ARM_NEON__
static float find_max(const float *input, const int num_classes) {
int remain = num_classes;
float max = -std::numeric_limits<float>::max();
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
int loop = num_classes >> 3;
remain = num_classes & 0x7;
float32x4_t __max = vdupq_n_f32(max);
for (int i = 0; i < loop; ++i, input += 8) {
float32x4_t x0 = vld1q_f32(input);
float32x4_t x1 = vld1q_f32(input + 4);
__max = vmaxq_f32(x0, __max);
__max = vmaxq_f32(x1, __max);
}
max = vmaxvq_f32(__max);
#endif
for (int i = 0; i < remain; ++i) {
max = std::max(max, input[i]);
}
return max;
}
static void softmax(Tensor *X, Tensor *Y) {
std::vector<int> dims = X->shape().dims();
int batch_size = X->shape().num();
int num_classes = dims[X->shape().dimSize() - 1];
int channels = X->shape().numel() / batch_size / num_classes;
float *x = X->data<float>();
float *y = Y->mutableData<float>();
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int channel = 0; channel < channels; ++channel) {
size_t offset = (batch * channels + channel) * num_classes;
const float *input = x + offset;
float *output = y + offset;
// find max
float max = find_max(input, num_classes);
// exp(x - max)
int remain = num_classes;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
int loop = num_classes >> 3;
remain = num_classes & 0x7;
float32x4_t __max = vdupq_n_f32(max);
for (int i = 0; i < loop; ++i, input += 8, output += 8) {
float32x4_t x0 = vld1q_f32(input);
float32x4_t x1 = vld1q_f32(input + 4);
x0 = vsubq_f32(x0, __max);
x1 = vsubq_f32(x1, __max);
x0 = exp_ps(x0);
x1 = exp_ps(x1);
vst1q_f32(output, x0);
vst1q_f32(output + 4, x1);
}
#endif // __ARM_NEON__
for (int i = 0; i < remain; ++i) {
output[i] = expf(input[i] - max);
}
// sum(exp(x - max))
float sum = 0.f;
output = y + offset;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
float32x4_t __sum = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i, output += 8) {
float32x4_t x0 = vld1q_f32(output);
float32x4_t x1 = vld1q_f32(output + 4);
__sum = vaddq_f32(x0, __sum);
__sum = vaddq_f32(x1, __sum);
}
sum += vaddvq_f32(__sum);
#endif // __ARM_NEON__
for (int i = 0; i < remain; ++i) {
sum += output[i];
}
// exp(x - max) / sum
float inv_sum = 1.f / sum;
output = y + offset;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
float32x4_t __inv_sum = vdupq_n_f32(inv_sum);
for (int i = 0; i < loop; ++i, output += 8) {
float32x4_t x0 = vld1q_f32(output);
float32x4_t x1 = vld1q_f32(output + 4);
x0 = vmulq_f32(x0, __inv_sum);
x1 = vmulq_f32(x1, __inv_sum);
vst1q_f32(output, x0);
vst1q_f32(output + 4, x1);
}
#endif
for (int i = 0; i < remain; ++i) {
output[i] *= inv_sum;
}
}
}
}
bool SoftmaxPE::init() {
Tensor *output = param_.output;
output->setAligned(false);
return true;
}
bool SoftmaxPE::dispatch() {
Tensor *input = param_.input;
Tensor *output = param_.output;
input->invalidate();
Tensor float_input;
Tensor float_output;
float_input.mutableData<float>(DataType::FP32, input->shape());
float_input.copyFrom(input);
float_input.unalignImage();
float_output.mutableData<float>(DataType::FP32, input->shape());
softmax(&float_input, &float_output);
float_output.flush();
output->copyFrom(&float_output);
return true;
}
SoftmaxParam &SoftmaxPE::param() { return param_; }
} // namespace zynqmp
} // namespace paddle_mobile
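The kernel above implements the numerically stable three-pass softmax: find the row maximum, exponentiate the shifted values, then normalize by their sum. Subtracting the maximum changes nothing mathematically but keeps expf within range:

\mathrm{softmax}(x)_i = \frac{e^{x_i - m}}{\sum_j e^{x_j - m}}, \qquad m = \max_j x_j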
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <algorithm>
#include <limits>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#include "fpga/KD/pes/math_func_neon.h"
#endif
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class SoftmaxPE : public PE {
public:
bool init();
bool dispatch();
SoftmaxParam& param();
private:
SoftmaxParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
#include <vector>
#include "fpga/KD/alignment.h"
#include "fpga/KD/layout.hpp"
namespace paddle_mobile {
namespace zynqmp {
static struct NCHW nchw_;
static struct NHWC nhwc_;
static struct NC nc_;
static struct NHW nhw_;
static struct N n_;
class Shape {
public:
explicit Shape(std::vector<int> dims) { dims_ = dims; }
Shape(LayoutType type, std::vector<int> dims) {
dims_ = dims;
setLayoutType(type);
}
Shape(const Shape& src) {
dims_ = src.dims_;
setLayoutType(src.layoutType_);
}
bool shouldAlign() {
return layout_->alignedElementCount(dims_) != layout_->elementCount(dims_);
}
int num() {
int index = layout_->numIndex();
return index == -1 ? 1 : dims_[index];
}
int channel() {
int index = layout_->channelIndex();
return index == -1 ? 1 : dims_[index];
}
int height() {
int index = layout_->heightIndex();
return index == -1 ? 1 : dims_[index];
}
int width() {
int index = layout_->widthIndex();
return index == -1 ? 1 : dims_[index];
}
int dimSize() { return dims_.size(); }
std::vector<int> dims() { return dims_; }
size_t memorySize(int cellSize) {
return layout_->alignedElementCount(dims_) * cellSize;
}
int numel() { return layout_->elementCount(dims_); }
void setLayoutType(LayoutType layout) {
this->layoutType_ = layout;
switch (layout) {
case NCHW:
layout_ = &nchw_;
break;
case NHWC:
layout_ = &nhwc_;
break;
case NC:
layout_ = &nc_;
break;
case NHW:
layout_ = &nhw_;
break;
case N:
layout_ = &n_;
break;
default:
break;
}
}
int operator[](int index) { return dims_[index]; }
private:
LayoutType layoutType_;
Layout* layout_ = &nhwc_;
std::vector<int> dims_;
};
} // namespace zynqmp
} // namespace paddle_mobile
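A short usage sketch of Shape (assuming the NHWC layout in layout.hpp, not shown in this patch, maps channel to the last axis; absent axes report index -1 and hence size 1):

inline void shape_demo() {
  paddle_mobile::zynqmp::Shape s(paddle_mobile::zynqmp::NHWC, {1, 224, 224, 3});
  int c = s.channel();             // 3: channel axis under NHWC
  int n = s.numel();               // 1 * 224 * 224 * 3 logical elements
  size_t bytes = s.memorySize(2);  // aligned element count * 2-byte fp16 cells
  (void)c; (void)n; (void)bytes;
}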
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "float16.hpp"
#include "llapi/zynqmp_api.h"
#include "shape.hpp"
namespace paddle_mobile {
namespace zynqmp {
enum DataType : int {
FP32 = 0,
FP16 = 1,
INT8 = 2,
};
typedef uint16_t float16;
inline int CellSize(DataType type) {
switch (type) {
case FP32:
return sizeof(float);
case FP16:
return sizeof(float16);
case INT8:
return sizeof(int8_t);
default:
return 0;
}
return 0;
}
class PlaceHolder {
public:
explicit PlaceHolder(size_t size) {
size_ = size;
data_ = fpga_malloc(size_);
}
void* data() { return data_; }
size_t memorySize() { return size_; }
~PlaceHolder() {
std::cout << "place holder dealloc";
fpga_free(data_);
}
private:
void* data_ = nullptr;
size_t size_ = 0;
};
class Tensor {
public:
int id() { return id_; }
template <typename Dtype>
Dtype* data() {
if (placeHolder_ == nullptr) {
return nullptr;
}
return reinterpret_cast<Dtype*>(this->placeHolder_->data());
}
template <typename Dtype>
Dtype* mutableData(DataType dataType, const Shape& shape) {
if (this->shape_ != nullptr) {
delete shape_;
}
this->shape_ = new Shape(shape);
this->dataType_ = dataType;
return mutableData<Dtype>();
}
template <typename Dtype>
Dtype* mutableData() {
size_t memorySize = shape_->memorySize(CellSize(dataType_));
if (placeHolder_ != nullptr) {
if (memorySize > placeHolder_->memorySize()) {
delete placeHolder_;
placeHolder_ = new PlaceHolder(memorySize);
}
} else {
placeHolder_ = new PlaceHolder(memorySize);
}
return reinterpret_cast<Dtype*>(placeHolder_->data());
}
void setDataType(DataType dataType) { this->dataType_ = dataType; }
DataType dataType() { return this->dataType_; }
Shape& shape() { return *shape_; }
bool aligned() { return this->aligned_; }
void setAligned(bool aligned) { this->aligned_ = aligned; }
float* scale() { return scale_; }
void alignImage(Tensor* dst = nullptr, bool copy = false) {
if (shape_->shouldAlign()) {
int cell_size = CellSize(this->dataType_);
char* dst_data = nullptr;
size_t mem_size = shape_->memorySize(cell_size);
if (dst == nullptr) {
dst_data = reinterpret_cast<char*>(fpga_malloc(mem_size));
} else {
dst_data = dst->data<char>();
}
int wc = shape_->width() * shape_->channel();
int wc_aligned = align_image(wc);
int remainder = wc_aligned - wc;
char* src_start = data<char>();
char* dst_start = dst_data;
for (int n = 0; n < shape_->num(); n++) {
for (int h = 0; h < shape_->height(); h++) {
memcpy(dst_start, src_start, wc * cell_size);
memset(dst_start + wc * cell_size, 0, remainder * cell_size); // zero the alignment padding
src_start += wc * cell_size;
dst_start += wc_aligned * cell_size;
}
}
if (dst == nullptr) {
memcpy(data<void>(), dst_data, mem_size);
flush();
fpga_free(dst_data);
} else {
dst->flush();
}
} else {
if (copy) {
dst->copyFrom(this);
} else {
// TODO(chonwhite) share data.
}
}
}
void unalignImage(Tensor* dst = nullptr, bool copy = false) {
if (shape_->shouldAlign()) {
// TODO(chonwhite): the unalign path (inverse of alignImage above, copying
// wc_aligned-strided rows back to contiguous wc rows) is not implemented yet.
} else {
if (copy) {
dst->copyFrom(this);
} else {
// TODO(chonwhite) share data.
}
}
}
void copyFrom(Tensor* src) {
BypassArgs args;
args.input_data_type =
src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.input_layout_type = LAYOUT_HWC;
args.output_layout_type = LAYOUT_HWC;
args.image = {.address = src->data<void>(),
.scale_address = src->scale(),
.channels = (uint32_t)src->shape().channel(),
.width = (uint32_t)src->shape().width(),
.height = (uint32_t)src->shape().height(),
.pad_width = 0u,
.pad_height = 0u};
args.output = {
.address = data<void>(),
.scale_address = scale(),
};
src->flush();
perform_bypass(args);
this->invalidate();
}
void flush() { fpga_flush(placeHolder_->data(), placeHolder_->memorySize()); }
void invalidate() {
fpga_invalidate(placeHolder_->data(), placeHolder_->memorySize());
}
void print() {
int count = shape_->numel();
for (int i = 0; i < count; i++) {
float value = dataType_ == FP32 ? data<float>()[i]
: half_to_float(data<float16>()[i]);
std::cout << value << '\n';
}
}
void saveToFile() {
std::string path = std::to_string(id_) + ".txt";
saveToFile(path);
}
void saveToFile(std::string path) {
std::ofstream ofs;
static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path;
counter++;
ofs.open(npath);
for (int i = 0; i < shape_->numel(); i++) {
float value = 0;
if (dataType_ == FP32) {
value = data<float>()[i];
} else {
value = half_to_float(data<float16>()[i]);
}
ofs << value << std::endl;
}
ofs.close();
}
private:
float scale_[2];
Shape* shape_ = nullptr;
DataType dataType_ = FP32;
bool aligned_ = false;
static int generateID() {
static int sID = 0;
int id = sID++;
return id;
}
int id_ = generateID();
PlaceHolder* placeHolder_ = nullptr;
};
} // namespace zynqmp
} // namespace paddle_mobile
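A typical round-trip with the Tensor above, as a sketch (assumes the FPGA allocator is initialized; flush/invalidate wrap the fpga_flush/fpga_invalidate cache-maintenance calls):

inline void tensor_demo() {
  using namespace paddle_mobile::zynqmp;
  Tensor t;
  Shape shape(NHWC, {1, 8, 8, 16});
  float* p = t.mutableData<float>(FP32, shape);  // allocates via fpga_malloc
  p[0] = 1.0f;
  t.flush();       // push CPU writes to device-visible memory
  t.invalidate();  // pull device writes back before reading on the CPU
}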
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <cmath>
#include "tensor_util.hpp"
namespace paddle_mobile {
namespace zynqmp {
float find_max(const Tensor& tensor) {
float max = 0;
Tensor& t = const_cast<Tensor&>(tensor);
float* data = t.data<float>();
for (int i = 0; i < t.shape().numel(); i++) {
max = std::max(std::fabs(data[i]), max); // the quantization scale must cover negative values too
}
return max;
}
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
#include "tensor.hpp"
namespace paddle_mobile {
namespace zynqmp {
float find_max(const Tensor& tensor);
} // namespace zynqmp
} // namespace paddle_mobile
......@@ -62,7 +62,8 @@ Executor<Device, T>::Executor(const Program<Device> &program,
use_optimize_ ? program_.optimizeProgram : program_.originProgram;
PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
"program_desc_ should not be nullptr");
#if !defined(PADDLE_MOBILE_FPGA) && !defined(PADDLE_MOBILE_CL)
#if !defined(PADDLE_MOBILE_FPGA) && !defined(PADDLE_MOBILE_FPGA_KD) && \
!defined(PADDLE_MOBILE_CL)
pass::MemoryOptPass()(program_desc_.get(), program_.scope.get());
#endif
// resize feed and fetch list
......@@ -230,20 +231,6 @@ void Executor<Device, T>::InitMemory() {
}
}
static void ClearNoPersistableTensorArray(const framework::ProgramDesc *program,
framework::Scope *scope) {
for (const auto &block : program->Blocks()) {
for (const auto &var_desc : block->Vars()) {
if (!var_desc->Persistable() &&
var_desc->Type() == VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY) {
auto var = scope->Var(var_desc->Name());
auto array = var->template GetMutable<framework::LoDTensorArray>();
array->resize(1);
}
}
}
}
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
char *origin_data = nullptr;
......@@ -281,6 +268,20 @@ void Executor<Device, T>::InitCombineMemory() {
LOG(kLOG_INFO) << "init combine memory finish";
}
static void ClearNoPersistableTensorArray(const framework::ProgramDesc *program,
framework::Scope *scope) {
for (const auto &block : program->Blocks()) {
for (const auto &var_desc : block->Vars()) {
if (!var_desc->Persistable() &&
var_desc->Type() == VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY) {
auto var = scope->Var(var_desc->Name());
auto array = var->template GetMutable<framework::LoDTensorArray>();
array->resize(1);
}
}
}
}
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
for (const auto &block : program_desc_->Blocks()) {
......
......@@ -26,6 +26,12 @@ limitations under the License. */
#include "framework/tensor_base.h"
#include "memory/t_malloc.h"
#ifdef PADDLE_MOBILE_FPGA_KD
#include "framework/zynqmp/ztensor.hpp"
#endif
#ifndef PADDLE_MOBILE_FPGA_KD
namespace paddle_mobile {
namespace framework {
......@@ -290,3 +296,5 @@ inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) {
} // namespace framework
} // namespace paddle_mobile
#endif
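Together with the #ifdef above it, this guard makes the header select a Tensor implementation at compile time; the net effect is equivalent to this sketch (condensed, not the literal patch text):

#ifdef PADDLE_MOBILE_FPGA_KD
#include "framework/zynqmp/ztensor.hpp"  // KD build: zynqmp-backed Tensor
#else
// ... the original framework::Tensor definition, compiled out for KD ...
#endif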
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
#include <cstring>
#include <fstream>
#include <memory>
#include <string>
#include <vector>
#include "common/enforce.h"
#include "framework/data_layout.h"
#include "framework/tensor_base.h"
#include "memory/t_malloc.h"
#ifdef PADDLE_MOBILE_FPGA_KD
#include "fpga/KD/tensor.hpp"
namespace paddle_mobile {
namespace framework {
class LoDTensor;
class Tensor : public TensorBase {
public:
Tensor() {}
template <typename T>
Tensor(std::vector<T> input, DDim ddim) {
PADDLE_MOBILE_ENFORCE(
input.size() == framework::product(ddim),
"input vector'length should be equal to tensor's length");
auto input_ptr = mutable_data<T>(ddim);
for (int i = 0; i < input.size(); ++i) {
input_ptr[i] = input[i];
}
}
Tensor(const Tensor &inTensor) {
this->dims_ = inTensor.dims_;
this->holder_ = inTensor.holder_;
this->offset_ = inTensor.offset_;
}
/*! Resize the dimensions of the memory block. */
inline Tensor &Resize(const DDim &dims) {
dims_ = dims;
// TODO(chonwhite) resize holder?
return *this;
}
/*! The internal of two tensors share the same memory block. */
inline Tensor &ShareDataWith(const Tensor &src) {
src.check_memory_size();
if (holder_.get() != src.holder_.get()) {
*this = src;
}
return *this;
}
/*! The internal of two tensors share the same memory block. */
inline Tensor &ShareHolderWith(const Tensor &src) {
src.check_memory_size();
if (holder_.get() != src.holder_.get()) {
holder_ = src.holder_;
}
return *this;
}
inline zynqmp::Tensor *zynqmpTensor() const {
PlaceholderImpl *holder = static_cast<PlaceholderImpl *>(holder_.get());
// mutable_data(holder->type());
return holder->tensor_;
}
inline void *mutable_data(const kTypeId_t type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must be >= 0.")
int64_t size = numel() * SizeOfType(type);
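// reallocate only when there is no holder yet or the existing one is too small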
if (holder_ == nullptr || holder_->size() < size + offset_) {
PlaceholderImpl *impl = nullptr;
if (holder_ == nullptr) {
std::cout << "holder null" << std::endl;
impl = new PlaceholderImpl(dims_, type);
holder_.reset(impl);
} else {
impl = static_cast<PlaceholderImpl *>(holder_.get());
std::cout << "holder reize" << std::endl;
// holder_->resize(size);
}
impl->resize(dims_, type);
offset_ = 0;
}
return reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
/**
* @brief Return a pointer to mutable memory block.
* @note If not exist, then allocation.
*/
template <typename T>
inline T *mutable_data() {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T *>(mutable_data(type_id<T>().hash_code()));
}
/**
* @brief Return a pointer to mutable memory block.
*
* @param[in] dims The dimensions of the memory block.
* @param[in] place The place of the memory block.
*
* @note If not exist, then allocation.
*/
template <typename T>
inline T *mutable_data(DDim dims) {
static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims);
return mutable_data<T>();
}
/**
* @brief Return a sub-tensor of the given tensor.
*
* @param[in] begin_idx The index of the start row(inclusive) to
* slice.
* The index number begins from 0.
* @param[in] end_idx The index of the end row(exclusive) to
* slice.
* The index number begins from 0.
*/
inline Tensor Slice(int begin_idx, int end_idx) const {
check_memory_size();
PADDLE_MOBILE_ENFORCE(begin_idx >= 0,
"The start row index must be greater than 0.")
PADDLE_MOBILE_ENFORCE(end_idx <= dims_[0],
"The end row index is out of bound.")
PADDLE_MOBILE_ENFORCE(
begin_idx < end_idx,
"The start row index must be lesser than the end row index")
if (dims_[0] == 1) {
return *this;
} else {
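// number of elements in one slice along dimension 0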
size_t base = numel() / dims_[0];
Tensor dst;
dst.holder_ = holder_;
DDim dst_dims = dims_;
dst_dims[0] = end_idx - begin_idx;
dst.Resize(dst_dims);
dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
return dst;
}
}
/*! Return a pointer to mutable memory block. */
template <typename T>
inline T *data() {
check_memory_size();
PADDLE_MOBILE_ENFORCE(
(std::is_same<T, void>::value ||
holder_->type() == type_id<T>().hash_code()),
"Tensor holds the wrong type, it holds %d, requested %d",
this->holder_->type(), type_id<T>().hash_code());
return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
/*! Return a pointer to constant memory block. */
template <typename T>
inline const T *data() const {
check_memory_size();
PADDLE_MOBILE_ENFORCE(
(std::is_same<T, void>::value ||
holder_->type() == type_id<T>().hash_code()),
"Tensor holds the wrong type, it holds %d, requested %d",
this->holder_->type(), type_id<T>().hash_code());
return reinterpret_cast<const T *>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
private:
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(DDim ddim, const kTypeId_t type) {
tensor_ = new zynqmp::Tensor();
type_ = type;
std::vector<int> v = framework::vectorize2int(ddim);
zynqmp::LayoutType layout_type = zynqmp::NCHW;
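// map tensor rank (1-4) onto the corresponding zynqmp layout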
switch (v.size()) {
case 1:
layout_type = zynqmp::N;
break;
case 2:
layout_type = zynqmp::NC;
break;
case 3:
layout_type = zynqmp::NHW;
break;
case 4:
layout_type = zynqmp::NCHW;
break;
}
zynqmp::Shape input_shape(layout_type, v);
// for (int i = 0; i < v.size(); i++) {
// std::cout << ":" << v[i] << std::endl;
// }
zynqmp::DataType dtype = type == _float ? zynqmp::FP32 : zynqmp::FP16;
tensor_->mutableData<float>(dtype, input_shape);
}
virtual size_t size() const { return size_; }
virtual void *ptr() const {
void *ptr = tensor_->data<void *>();
return ptr;
}
virtual kTypeId_t type() const { return type_; }
virtual void set_type(const kTypeId_t type) { type_ = type; }
virtual void resize(size_t size) {
if (size > capacity_) {
capacity_ = size;
// TODO(chonwhite) implement;
}
size_ = size;
}
void resize(DDim ddim, const kTypeId_t type) {
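// rebuild the underlying zynqmp tensor for the new shape; mirrors the constructor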
std::vector<int> v = framework::vectorize2int(ddim);
zynqmp::LayoutType layout_type = zynqmp::NCHW;
switch (v.size()) {
case 1:
layout_type = zynqmp::N;
break;
case 2:
layout_type = zynqmp::NC;
break;
case 3:
layout_type = zynqmp::NHW;
break;
case 4:
layout_type = zynqmp::NCHW;
break;
}
zynqmp::Shape input_shape(layout_type, v);
zynqmp::DataType dtype = type == _float ? zynqmp::FP32 : zynqmp::FP16;
tensor_->mutableData<float>(dtype, input_shape);
}
/*! the size of memory block. */
size_t size_ = 0;
size_t capacity_ = 0;
/* the current type of memory */
kTypeId_t type_;
zynqmp::Tensor *tensor_;
// zynqmp::Shape* shape_;
};
};
#ifdef PADDLE_MOBILE_DEBUG
inline Print &operator<<(Print &printer, const Tensor &tensor) {
printer << " dims: " << tensor.dims() << "\n";
int stride = tensor.numel() / 20;
stride = stride > 0 ? stride : 1;
return printer;
}
#endif
inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) {
Tensor res;
res.ShareDataWith(src);
res.Resize(flatten_to_2d(src.dims(), num_col_dims));
return res;
}
} // namespace framework
} // namespace paddle_mobile
#endif
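A minimal sketch of driving this KD-backed Tensor (framework::make_ddim is assumed to be available as in the rest of paddle-mobile; the dims are arbitrary):

framework::Tensor t;
t.Resize(framework::make_ddim({1, 3, 8, 8}));
float* out = t.mutable_data<float>();   // allocates the zynqmp-backed holder
zynqmp::Tensor* zt = t.zynqmpTensor();  // view the FPGA-side tensor
zt->flush();                            // sync CPU writes before FPGA use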
......@@ -24,6 +24,10 @@ limitations under the License. */
#include "fpga/V2/api.h"
#endif
#ifdef PADDLE_MOBILE_FPGA_KD
#include "fpga/KD/llapi/zynqmp_api.h"
#endif
namespace paddle_mobile {
namespace memory {
const int MALLOC_ALIGN = 64;
......@@ -43,7 +47,21 @@ void Free(void *ptr) {
}
}
#elif defined(PADDLE_MOBILE_FPGA_KD)
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
void *Alloc(size_t size) { return zynqmp::fpga_malloc(size); }
void Free(void *ptr) {
if (ptr) {
zynqmp::fpga_free(ptr);
}
}
#else
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
......@@ -57,11 +75,9 @@ void *Alloc(size_t size) {
void *r = reinterpret_cast<void *>(reinterpret_cast<size_t>(p + offset) &
(~(MALLOC_ALIGN - 1)));
static_cast<void **>(r)[-1] = p;
return r; // if necessary, you need initialize memory by yourself (developer)
return r;
}
// if you use this ptr again after Free
// you need to assign `ptr` as nullptr yourself (developer)
void Free(void *ptr) {
if (ptr) {
free(static_cast<void **>(ptr)[-1]);
......
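For reference, the aligned-allocation scheme in the CPU path above works by over-allocating by sizeof(void*) + MALLOC_ALIGN - 1, rounding the pointer to the alignment boundary, and stashing the raw malloc pointer in the slot just below the returned address so Free can recover it. A self-contained sketch of the same trick (not code from this patch):

#include <cstdlib>

static const size_t kMallocAlign = 64;

void* AlignedAlloc(size_t size) {
  size_t offset = sizeof(void*) + kMallocAlign - 1;
  char* p = static_cast<char*>(malloc(offset + size));
  if (p == nullptr) return nullptr;
  // p + offset rounded down to a 64-byte boundary still lies at least
  // sizeof(void*) past p, leaving room for the stash slot at r[-1]
  void* r = reinterpret_cast<void*>(reinterpret_cast<size_t>(p + offset) &
                                    (~(kMallocAlign - 1)));
  static_cast<void**>(r)[-1] = p;  // remember what malloc actually returned
  return r;
}

void AlignedFree(void* ptr) {
  if (ptr) free(static_cast<void**>(ptr)[-1]);
}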
......@@ -37,7 +37,7 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(elementwise_add, ops::ElementwiseAddOp);
#endif
......
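Each operator registration below receives the same one-line guard change so that kernels register in both the classic FPGA and the new KD builds; in isolation the pattern is (my_op and ops::MyOp are placeholder names):

#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(my_op, ops::MyOp);  // placeholder operator
#endif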
......@@ -38,7 +38,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_CL
......
......@@ -31,7 +31,7 @@ namespace ops = paddle_mobile::operators;
REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_CL
......
......@@ -55,7 +55,7 @@ REGISTER_FUSION_MATCHER(fusion_conv_add_bn_relu,
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif
#ifdef PADDLE_MOBILE_CL
......
......@@ -58,7 +58,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_add, ops::FusionConvAddOp);
#endif
#endif
......@@ -53,7 +53,7 @@ REGISTER_FUSION_MATCHER(fusion_conv_add_relu, ops::FusionConvAddReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#ifdef PADDLE_MOBILE_CL
......
......@@ -58,7 +58,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
......
......@@ -54,7 +54,7 @@ REGISTER_FUSION_MATCHER(fusion_conv_bn, ops::FusionConvBNMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn, ops::FusionConvBNOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_bn, ops::FusionConvBNOp);
#endif
......
......@@ -57,7 +57,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif
......
......@@ -36,7 +36,7 @@ REGISTER_FUSION_MATCHER(fusion_elementwise_add_relu,
// REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu,
// ops::FusionElementwiseAddReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_elementwise_add_relu,
ops::FusionElementwiseAddReluOp);
#endif
......
......@@ -63,7 +63,7 @@ REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp);
#endif
......
......@@ -60,7 +60,7 @@ REGISTER_FUSION_MATCHER(fusion_fc_relu, ops::FusionFcReluMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_fc_relu, ops::FusionFcReluOp);
#endif
......
(11 file diffs collapsed and not shown.)
......@@ -37,7 +37,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(pad2d, ops::Pad2DOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(pad2d, ops::Pad2DOp);
#endif
......
(4 file diffs collapsed and not shown.)