unify V1 & V2 style for FPGA track

bff417fd · zhangyang · 8b71275c · bff417fd · bff417fd · bff417fd
21 changed file
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -13,251 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "fpga/V1/api.h"
-#include <fcntl.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <algorithm>
-#include <map>
 #include "fpga/V1/bias_scale.h"
 #include "fpga/V1/filter.h"
 #include "fpga/V1/image.h"
-#define FPGA_TEST_MODE
-#define PADDLE_MOBILE_OS_LINUX

 namespace paddle_mobile {
 namespace fpga {

-static int fd = -1;
-static const char *device_path = "/dev/fpgadrv0";
-static std::map<void *, size_t> memory_map;
-
-static inline int do_ioctl(int req, const void *arg) {
-#ifdef PADDLE_MOBILE_OS_LINUX
-  int result = ioctl(fd, req, (uint64_t)arg);
-  PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly");
-  return result;
-#else
-  return -1;
-#endif
-}
-
-int open_device() {
-  if (fd == -1) {
-    fd = open(device_path, O_RDWR);
-  }
-  return fd;
-}
-
-// memory management;
-void *fpga_malloc(size_t size) {
-  static uint64_t counter = 0;
-
-#ifdef PADDLE_MOBILE_OS_LINUX
-  auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-#else
-  auto ptr = malloc(size);
-#endif
-  counter += size;
-  memory_map.insert(std::make_pair(ptr, size));
-  //  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
-  //       << counter << " bytes";
-  return ptr;
-}
-
-void fpga_free(void *ptr) {
-  static uint64_t counter = 0;
-  size_t size = 0;
-
-  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
-  if (iter != memory_map.end()) {
-    size = iter->second;
-    memory_map.erase(iter);
-#ifdef PADDLE_MOBILE_OS_LINUX
-    munmap(ptr, size);
-#else
-    free(ptr);
-#endif
-    counter += size;
-    //    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
-    //         << counter << " bytes";
-  } else {
-    DLOG << "Invalid pointer";
-  }
-}
-
-void fpga_copy(void *dest, const void *src, size_t num) {
-  memcpy(dest, src, num);
-}
-
-int fpga_flush(void *address, size_t size) {
-  struct MemoryCacheArgs args = {nullptr};
-  args.address = address;
-  args.size = size;
-  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
-}
-
-int fpga_invalidate(void *address, size_t size) {
-  struct MemoryCacheArgs args = {nullptr};
-  args.address = address;
-  args.size = size;
-  return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
-}
-
-half fp32_2_fp16(float fp32_num) {
-  unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
-  half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
-           (((tmp & 0x7f800000) >> 13) - (112 << 10));
-  if (tmp & 0x1000) {
-    t++;  // roundoff
-  }
-  return t;
-}
-
-float fp16_2_fp32(half fp16_num) {
-  int frac = (fp16_num & 0x3ff);
-  int exp = ((fp16_num & 0x7c00) >> 10) + 112;
-  int s = fp16_num & 0x8000;
-  int tmp = 0;
-  float fp32_num;
-  tmp = s << 16 | exp << 23 | frac << 13;
-  fp32_num = *(float *)&tmp;  // NOLINT
-  return fp32_num;
-}
-
-int ComputeBasicConv(const struct ConvArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "======Compute Basic Conv======";
-  DLOG << "   relu_enabled:" << args.relu_enabled
-       << "   sb_address:" << args.sb_address
-       << "   filter_address:" << args.filter_address
-       << "   filter_num:" << args.filter_num
-       << "   group_num:" << args.group_num;
-  DLOG << "   image_address:" << args.image.address
-       << "   image_scale_address:" << args.image.scale_address
-       << "   image_channels:" << args.image.channels
-       << "   image_height:" << args.image.height
-       << "   image_width:" << args.image.width
-       << "   pad_height:" << args.image.pad_height
-       << "   pad_width:" << args.image.pad_width;
-  DLOG << "   kernel_height:" << args.kernel.height
-       << "   kernel_width:" << args.kernel.width
-       << "   stride_h:" << args.kernel.stride_h
-       << "   stride_w:" << args.kernel.stride_w;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-  return do_ioctl(IOCTL_CONFIG_CONV, &args);
-}
-
-int ComputeFpgaConv(const struct SplitConvArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFPGAConv===========";
-  DLOG << "   filter_num:" << args.filter_num
-       << "   group_num:" << args.group_num
-       << "   split_num:" << args.split_num;
-#endif
-
-  int split_num = args.split_num;
-  for (int i = 0; i < split_num; i++) {
-    ComputeBasicConv(args.conv_args[i]);
-  }
-
-  if (split_num > 1) {
-    ComputeFPGAConcat(args.concat_arg);
-  }
-}
-
-int ComputeFpgaPool(const struct PoolingArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaPool===========";
-  DLOG << "   mode:" << args.mode
-       << "   kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
-  DLOG << "   image_address:" << args.image.address
-       << "   image_scale_address:" << args.image.scale_address
-       << "   image_channels:" << args.image.channels
-       << "   image_height:" << args.image.height
-       << "   image_width:" << args.image.width
-       << "   pad_height:" << args.image.pad_height
-       << "   pad_width:" << args.image.pad_width;
-  DLOG << "   kernel_height:" << args.kernel.height
-       << "   kernel_width:" << args.kernel.width
-       << "   stride_h:" << args.kernel.stride_h
-       << "   stride_w:" << args.kernel.stride_w;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-
-  return do_ioctl(IOCTL_CONFIG_POOLING, &args);
-}
-
-int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaEWAdd===========";
-  DLOG << "   relu_enabled:" << args.relu_enabled
-       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
-       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
-  DLOG << "   image0_address:" << args.image0.address
-       << "   image0_scale_address:" << args.image0.scale_address
-       << "   image0_channels:" << args.image0.channels
-       << "   image0_height:" << args.image0.height
-       << "   image0_width:" << args.image0.width
-       << "   pad0_height:" << args.image0.pad_height
-       << "   pad0_width:" << args.image0.pad_width;
-  DLOG << "   image1_address:" << args.image1.address
-       << "   image1_scale_address:" << args.image1.scale_address
-       << "   image1_channels:" << args.image1.channels
-       << "   image1_height:" << args.image1.height
-       << "   image1_width:" << args.image1.width
-       << "   pad1_height:" << args.image1.pad_height
-       << "   pad_width:" << args.image1.pad_width;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-
-  return do_ioctl(IOCTL_CONFIG_EW, &args);
-}
-int PerformBypass(const struct BypassArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaBypass===========";
-  DLOG << "   input_type:" << args.input_data_type
-       << "   output_type:" << args.output_data_type
-       << "   input_layout_type:" << args.input_layout_type
-       << "   output_layout_type:" << args.output_layout_type;
-  DLOG << "   image_address:" << args.image.address
-       << "   image_scale_address:" << args.image.scale_address
-       << "   image_channels:" << args.image.channels
-       << "   image_height:" << args.image.height
-       << "   image_width:" << args.image.width
-       << "   pad_height:" << args.image.pad_height
-       << "   pad_width:" << args.image.pad_width;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-
-  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
-}
-
-int ComputeFPGAConcat(const struct ConcatArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaConcat===========";
-  DLOG << "   Image_num: " << args.image_num
-       << "   out_address:" << args.image_out
-       << "   out_scale_address:" << args.scale_out;
-  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
-  for (int i = 0; i < args.image_num; i++) {
-    DLOG << "   " << i << "th:        ";
-    DLOG << "   channel_num:" << args.channel_num[i]
-         << "   image_address:" << args.images_in[i]
-         << "   image_scale_address:" << args.scales_in[i];
-  }
-#endif
-
-  image::concat_images(args.images_in, args.scales_in, args.image_out,
-                       args.scale_out, args.image_num, args.channel_num,
-                       args.height, args.width);
-  return 0;
-}
-
 int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); }

 void format_image(framework::Tensor *image_tensor) {
@@ -397,7 +159,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
-  arg->conv_args =
+  arg->conv_arg =
      (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT

  arg->concat_arg.image_num = arg->split_num;
@@ -420,44 +182,44 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);

  for (int i = 0; i < n; i++) {
-    arg->conv_args[i].relu_enabled = relu_enabled;
-    arg->conv_args[i].group_num = (uint32_t)group_num;
-    arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h;
-    arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w;
-    arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
-    arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
-    arg->conv_args[i].image.address = input_ptr;
-    arg->conv_args[i].image.channels = (uint32_t)input->dims()[1];
-    arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
-    arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
-    arg->conv_args[i].image.scale_address = input->scale;
-    arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
-    arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
-    arg->conv_args[i].filter_scale_address = filter->scale;
-    arg->conv_args[i].filter_address = &(
+    arg->conv_arg[i].relu_enabled = relu_enabled;
+    arg->conv_arg[i].group_num = (uint32_t)group_num;
+    arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
+    arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
+    arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
+    arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];
+    arg->conv_arg[i].image.address = input_ptr;
+    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
+    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
+    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
+    arg->conv_arg[i].image.scale_address = input->scale;
+    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
+    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
+    arg->conv_arg[i].filter_scale_address = filter->scale;
+    arg->conv_arg[i].filter_address = &(
        (int8_t *)filter_ptr)[i * element_num * filter_num_per_div];  // NOLINT
-    arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
-    arg->conv_args[i].filter_num = (uint32_t)(
+    arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
+    arg->conv_arg[i].filter_num = (uint32_t)(
        i == n - 1 ? channel - (n - 1) * filter_num_per_div  // NOLINT
                   : filter_num_per_div);

    if (n > 1) {
-      arg->conv_args[i].output.scale_address =
+      arg->conv_arg[i].output.scale_address =
          (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
-      arg->conv_args[i].output.address = fpga_malloc(
-          input->dims()[2] *
-          align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
-                     IMAGE_ALIGNMENT) *
-          sizeof(half));
+      arg->conv_arg[i].output.address =
+          fpga_malloc(input->dims()[2] *
+                      align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num,
+                                 IMAGE_ALIGNMENT) *
+                      sizeof(half));
    } else {
-      arg->conv_args[i].output.scale_address = out->scale;
-      arg->conv_args[i].output.address = out_ptr;
+      arg->conv_arg[i].output.scale_address = out->scale;
+      arg->conv_arg[i].output.address = out_ptr;
    }

    arg->concat_arg.images_in[i] =
-        (half *)arg->conv_args[i].output.address;  // NOLINT
-    arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
-    arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
+        (half *)arg->conv_arg[i].output.address;  // NOLINT
+    arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
+    arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
  }
 }


--- a/src/fpga/V1/api.h
+++ b/src/fpga/V1/api.h
@@ -14,178 +14,13 @@ limitations under the License. */

 #pragma once

-#include <stdint.h>
-#include <cstddef>
-#include <iostream>
-#include <limits>
+#include "fpga/common/fpga_common.h"
+#include "fpga/common/pe.h"
 #include "framework/tensor.h"

 namespace paddle_mobile {
 namespace fpga {

-enum DataType {
-  DATA_TYPE_FP32 = 1,
-  DATA_TYPE_FP16 = 0,
-};
-
-enum LayoutType {
-  LAYOUT_CHW = 1,
-  LAYOUT_HWC = 0,
-};
-
-struct VersionArgs {
-  void* buffer;
-};
-
-struct MemoryCopyArgs {
-  void* src;
-  void* dest;
-  size_t size;
-};
-
-struct KernelArgs {
-  uint32_t width;
-  uint32_t height;
-  uint32_t stride_w;
-  uint32_t stride_h;
-};
-
-struct ImageInputArgs {
-  void* address;         // input featuremap virtual address
-  float* scale_address;  // input scale address;
-  uint32_t channels;
-  uint32_t width;  // featuremap width
-  uint32_t height;
-  uint32_t pad_width;  // padding width;
-  uint32_t pad_height;
-};
-
-struct ImageOutputArgs {
-  void* address;         // output result address;
-  float* scale_address;  // output scale address;
-};
-
-struct ConvArgs {
-  bool relu_enabled;
-  void* sb_address;  // scale and bias are interlaced;
-  void* filter_address;
-  float* filter_scale_address;
-  uint32_t filter_num;
-  uint32_t group_num;
-
-  struct KernelArgs kernel;
-  struct ImageInputArgs image;  // input image;
-  struct ImageOutputArgs output;
-};
-
-struct ConcatArgs {
-  uint32_t image_num;
-  half** images_in;
-  float** scales_in;
-  void* image_out;
-  float* scale_out;
-  uint32_t* channel_num;
-  uint32_t height;
-  uint32_t width;
-};
-
-struct SplitConvArgs {
-  uint32_t split_num;
-  uint32_t group_num;
-  uint32_t filter_num;
-  struct ImageOutputArgs output;
-  struct ConvArgs* conv_args;
-  struct ConcatArgs concat_arg;
-};
-
-struct GroupConvArgs {
-  uint32_t group_num;
-  uint32_t filter_num;
-  struct ImageOutputArgs output;
-  struct SplitConvArgs* conv_args;
-  struct ConcatArgs concat_arg;
-};
-
-struct PoolingArgs {
-  int16_t mode;  // mode: 0:max, 1:avg
-  half kernel_reciprocal;
-  struct KernelArgs kernel;
-  struct ImageInputArgs image;  // input image;
-  struct ImageOutputArgs output;
-};
-
-struct EWAddArgs {
-  bool relu_enabled;
-
-  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
-  uint32_t const1;
-  struct ImageInputArgs image0;
-  struct ImageInputArgs image1;
-  struct ImageOutputArgs output;
-};
-
-struct BypassArgs {
-  enum DataType input_data_type;
-  enum DataType output_data_type;
-  enum LayoutType input_layout_type;
-  enum LayoutType output_layout_type;
-  struct ImageInputArgs image;
-  struct ImageOutputArgs output;
-};
-
-struct FpgaRegWriteArgs {
-  uint64_t address;  //
-  uint64_t value;
-};
-
-struct FpgaRegReadArgs {
-  uint64_t address;
-  uint64_t value;
-};
-
-struct MemoryCacheArgs {
-  void* address;
-  size_t size;
-};
-
-#define IOCTL_FPGA_MAGIC 'FPGA'
-
-#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
-
-#define IOCTL_SEPARATOR_0 10
-
-#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
-#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
-#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
-
-#define IOCTL_SEPARATOR_1 20
-
-#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
-#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
-#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
-#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
-#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
-#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
-
-//============================== API =============================
-
-int open_device();
-int close_device();
-
-void* fpga_malloc(size_t size);
-void fpga_free(void* ptr);
-void fpga_copy(void* dst, const void* src, size_t num);
-int fpga_flush(void* address, size_t size);
-int fpga_invalidate(void* address, size_t size);
-
-int PerformBypass(const struct BypassArgs& args);
-int ComputeFpgaConv(const struct SplitConvArgs& args);
-int ComputeFpgaPool(const struct PoolingArgs& args);
-int ComputeFpgaEWAdd(const struct EWAddArgs& args);
-int ComputeFPGAConcat(const struct ConcatArgs& args);
-
-static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
-
 int get_align_image_cw(int cw);
 void format_image(framework::Tensor* image_tensor);
 void format_fp16_ofm(framework::Tensor* ofm_tensor);  // only allocate memory
@@ -209,8 +44,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
                    bool relu_enabled, int group_num, int stride_h,
                    int stride_w, int padding_h, int padding_w, float* bs_ptr);

-half fp32_2_fp16(float fp32_num);
-float fp16_2_fp32(half fp16_num);
-
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/bias_scale.cpp
+++ b/src/fpga/V1/bias_scale.cpp
@@ -14,7 +14,7 @@ limitations under the License. */

 #include "fpga/V1/bias_scale.h"
 #include <memory.h>
-#include "fpga/V1/api.h"
+#include "fpga/common/fpga_common.h"

 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/V1/filter.cpp
+++ b/src/fpga/V1/filter.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "fpga/V1/filter.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/V1/api.h"
+#include "fpga/common/fpga_common.h"

 namespace paddle_mobile {
 namespace fpga {
@@ -31,20 +31,22 @@ int calc_split_num(int num, int division_capacity) {
 }

 int calc_division_number(int num, int group_num, int division_capacity) {
-  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
-                        "Filter number should be divisible by group number");
+  //  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
+  //                        "Filter number should be divisible by group
+  //                        number");
  int split_num = calc_split_num(num, division_capacity);
-  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
-                        "Split number or group number should be 1");
+  //  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
+  //                        "Split number or group number should be 1");
  return group_num * split_num;
 }

 int calc_num_per_div(int num, int group_num, int division_capacity) {
-  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
-                        "Filter number should be divisible by group number");
+  //  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
+  //                        "Filter number should be divisible by group
+  //                        number");
  int split_num = calc_split_num(num, division_capacity);
-  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
-                        "Split number or group number should be 1");
+  //  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
+  //                        "Split number or group number should be 1");
  if (group_num == 1) {
    if (num > division_capacity) {
      return division_capacity;

--- a/src/fpga/V1/image.cpp
+++ b/src/fpga/V1/image.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "fpga/V1/image.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/V1/api.h"
+#include "fpga/common/fpga_common.h"

 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/V1/pe.cpp
+++ b/src/fpga/V1/pe.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/common/pe.h"
+#include "fpga/V1/filter.h"
+#include "fpga/V1/image.h"
+#include "fpga/common/config.h"
+#include "fpga/common/driver.h"
+
+namespace paddle_mobile {
+namespace fpga {
+
+int ComputeFpgaConv(const struct SplitConvArgs &args) {
+  ComputeBasicConv(args.conv_arg[0]);
+}
+
+int ComputeBasicConv(const struct ConvArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "======Compute Basic Conv======";
+  DLOG << "   relu_enabled:" << args.relu_enabled
+       << "   sb_address:" << args.sb_address
+       << "   filter_address:" << args.filter_address
+       << "   filter_num:" << args.filter_num
+       << "   group_num:" << args.group_num;
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   kernel_height:" << args.kernel.height
+       << "   kernel_width:" << args.kernel.width
+       << "   stride_h:" << args.kernel.stride_h
+       << "   stride_w:" << args.kernel.stride_w;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+
+  return 0;
+}
+
+int ComputeFpgaPool(const struct PoolingArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaPool===========";
+  DLOG << "   mode:" << args.mode
+       << "   kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   kernel_height:" << args.kernel.height
+       << "   kernel_width:" << args.kernel.width
+       << "   stride_h:" << args.kernel.stride_h
+       << "   stride_w:" << args.kernel.stride_w;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+  return 0;
+}
+
+int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaEWAdd===========";
+  DLOG << "   relu_enabled:" << args.relu_enabled
+       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
+       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
+  DLOG << "   image0_address:" << args.image0.address
+       << "   image0_scale_address:" << args.image0.scale_address
+       << "   image0_channels:" << args.image0.channels
+       << "   image0_height:" << args.image0.height
+       << "   image0_width:" << args.image0.width
+       << "   pad0_height:" << args.image0.pad_height
+       << "   pad0_width:" << args.image0.pad_width;
+  DLOG << "   image1_address:" << args.image1.address
+       << "   image1_scale_address:" << args.image1.scale_address
+       << "   image1_channels:" << args.image1.channels
+       << "   image1_height:" << args.image1.height
+       << "   image1_width:" << args.image1.width
+       << "   pad1_height:" << args.image1.pad_height
+       << "   pad_width:" << args.image1.pad_width;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+  return 0;
+}
+
+int PerformBypass(const struct BypassArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaBypass===========";
+  DLOG << "   input_type:" << args.input_data_type
+       << "   output_type:" << args.output_data_type
+       << "   input_layout_type:" << args.input_layout_type
+       << "   output_layout_type:" << args.output_layout_type;
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+
+  return 0;
+}
+
+int ComputeFPGAConcat(const struct ConcatArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaConcat===========";
+  DLOG << "   Image_num: " << args.image_num
+       << "   out_address:" << args.image_out
+       << "   out_scale_address:" << args.scale_out
+       << "   out_channel:" << args.out_channel;
+  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
+  for (int i = 0; i < args.image_num; i++) {
+    DLOG << "   " << i << "th:        ";
+    DLOG << "   channel_num:" << args.channel_num[i]
+         << "   aligned_channel_num:" << args.aligned_channel_num[i]
+         << "   image_address:" << args.images_in[i]
+         << "   image_scale_address:" << args.scales_in[i];
+  }
+#endif
+
+  image::concat_images(args.images_in, args.scales_in, args.image_out,
+                       args.scale_out, args.image_num, args.channel_num,
+                       args.height, args.width);
+  return 0;
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -13,84 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "fpga/V2/api.h"
-#include <algorithm>
 #include "fpga/V2/bias_scale.h"
-#include "fpga/V2/config.h"
-#include "fpga/V2/driver/driver.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"

 namespace paddle_mobile {
 namespace fpga {

-static std::map<void *, size_t> memory_map;
-
-int open_device() {
-  int ret = driver::open_device_driver();
-  return ret;
-}
-
-int close_device() {
-  int ret = driver::close_device_driver();
-  return ret;
-}
-
-void *fpga_malloc(size_t size) {
-  static uint64_t counter = 0;
-#ifdef PADDLE_MOBILE_ZU5
-  auto ptr = driver::fpga_malloc_driver(size);
-#else
-  auto ptr = malloc(size);
-#endif
-  counter += size;
-  memory_map.insert(std::make_pair(ptr, size));
-  //  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
-  //       << counter << " bytes";
-  return ptr;
-}
-
-void fpga_free(void *ptr) {
-  static uint64_t counter = 0;
-  size_t size = 0;
-  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
-  if (iter != memory_map.end()) {
-    size = iter->second;
-    memory_map.erase(iter);
-#ifdef PADDLE_MOBILE_ZU5
-    driver::fpga_free_driver(ptr);
-#else
-    free(ptr);
-#endif
-    counter += size;
-    //    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
-    //         << counter << " bytes";
-  } else {
-    DLOG << "Invalid pointer";
-  }
-}
-void fpga_copy(void *dest, const void *src, size_t num) {
-#ifdef PADDLE_MOBILE_ZU5
-  driver::fpga_copy_driver(dest, src, num);
-#else
-  memcpy(dest, src, num);
-#endif
-}
-
-int fpga_flush(void *address, size_t size) {
-#ifdef PADDLE_MOBILE_ZU5
-  return driver::fpga_flush_driver(address, size);
-#else
-  return 0;
-#endif
-}
-int fpga_invalidate(void *address, size_t size) {
-#ifdef PADDLE_MOBILE_ZU5
-  return driver::fpga_invalidate_driver(address, size);
-#else
-  return 0;
-#endif
-}
-
 void format_image(framework::Tensor *image_tensor) {
  auto dims = image_tensor->dims();
  auto channel = dims[1], height = dims[2], width = dims[3];
@@ -284,8 +213,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
    arg->conv_arg[i].output.address = out_ptr;
    arg->conv_arg[i].output.scale_address = out->scale;

-    int num_after_alignment =
-        filter::calc_aligned_num((int)input->dims()[1], arg->filter_num);
+    int num_after_alignment = filter::calc_aligned_num(
+        (int)input->dims()[1], arg->filter_num);  // NOLINT
    arg->conv_arg[i].free_space =
        fpga_malloc(num_after_alignment * 2 * sizeof(half));
  }

--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -14,21 +14,13 @@ limitations under the License. */

 #pragma once

-#include "fpga/V2/driver/pe.h"
-#include "fpga/V2/fpga_common.h"
+#include "fpga/common/fpga_common.h"
+#include "fpga/common/pe.h"
 #include "framework/tensor.h"

 namespace paddle_mobile {
 namespace fpga {

-int open_device();
-int close_device();
-void* fpga_malloc(size_t size);
-void fpga_free(void* ptr);
-void fpga_copy(void* dest, const void* src, size_t num);
-int fpga_flush(void* address, size_t size);
-int fpga_invalidate(void* address, size_t size);
-
 float filter_find_max(framework::Tensor* filter_tensor);
 int get_aligned_channel_num(int channel_num);
 int get_aligned_filter_num(framework::Tensor* filter_tensor);

--- a/src/fpga/V2/bias_scale.cpp
+++ b/src/fpga/V2/bias_scale.cpp
@@ -14,7 +14,7 @@ limitations under the License. */

 #include "fpga/V2/bias_scale.h"
 #include <memory.h>
-#include "fpga/V2/api.h"
+#include "fpga/common/fpga_common.h"

 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "fpga/V2/filter.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/V2/api.h"
+#include "fpga/common/fpga_common.h"

 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "fpga/V2/image.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/V2/api.h"
+#include "fpga/common/fpga_common.h"

 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/V2/driver/pe.cpp
+++ b/src/fpga/V2/driver/pe.cpp
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "fpga/V2/driver/pe.h"
-#include "fpga/V2/config.h"
-#include "fpga/V2/driver/driver.h"
+#include "fpga/common/pe.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
+#include "fpga/common/config.h"
+#include "fpga/common/driver.h"

 namespace paddle_mobile {
 namespace fpga {
@@ -166,53 +166,53 @@ int PerformBypass(const struct BypassArgs &args) {
  return 0;
 #endif

-  uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
-  uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
-  uint64_t bp_enable;
-  int64_t length;
-  uint64_t pixels;
-
-  // fp32->fp16
-  if ((args.input_data_type) && (!args.output_data_type)) {
-    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
-    length = pixels * sizeof(float);
-    bp_enable = 0x8800000000000000 + length;
-  }
-  // fp16->fp32
-  else if ((!args.input_data_type) && (args.output_data_type)) {
-    pixels = filter::calc_aligned_channel((args.image.channels)) *
-             (args.image.width) * (args.image.height);
-    length = pixels * sizeof(short);
-    length = align_to_x((int)length, 64);  // NOLINT
-    bp_enable = 0x8a00000000000000 + length;
-  }
-  // fp16->fp16 findmax
-  else if ((!args.input_data_type) && (!args.output_data_type)) {
-    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
-    length = pixels * sizeof(short);
-    bp_enable = 0x8900000000000000 + length;
-  } else {
-    return -1;
-  }
-
-  // start bypass
-  driver::reg_writeq(ifm_src_paddr, MUL8(27));
-  driver::reg_writeq(ifm_dst_paddr, MUL8(28));
-  driver::reg_writeq(0, MUL8(0));
-  driver::reg_writeq(bp_enable, MUL8(0));
-  // poll
-  int ret = -1;
-  ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
-  if (ret != -1) {
-    // clear "irq"
-    driver::reg_readq(MUL8(63));
-  }
-  // get max value
-  if ((!args.input_data_type) && (!args.output_data_type)) {
-    float scale = Findfp16Max();
-    args.output.scale_address[0] = (float)(1.0 / scale);  // NOLINT
-    args.output.scale_address[1] = scale;
-  }
+  //  uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
+  //  uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
+  //  uint64_t bp_enable;
+  //  int64_t length;
+  //  uint64_t pixels;
+  //
+  //  // fp32->fp16
+  //  if ((args.input_data_type) && (!args.output_data_type)) {
+  //    pixels = (args.image.channels) * (args.image.width) *
+  //    (args.image.height); length = pixels * sizeof(float); bp_enable =
+  //    0x8800000000000000 + length;
+  //  }
+  //  // fp16->fp32
+  //  else if ((!args.input_data_type) && (args.output_data_type)) {
+  //    pixels = filter::calc_aligned_channel((args.image.channels)) *
+  //             (args.image.width) * (args.image.height);
+  //    length = pixels * sizeof(short);
+  //    length = align_to_x((int)length, 64);  // NOLINT
+  //    bp_enable = 0x8a00000000000000 + length;
+  //  }
+  //  // fp16->fp16 findmax
+  //  else if ((!args.input_data_type) && (!args.output_data_type)) {
+  //    pixels = (args.image.channels) * (args.image.width) *
+  //    (args.image.height); length = pixels * sizeof(short); bp_enable =
+  //    0x8900000000000000 + length;
+  //  } else {
+  //    return -1;
+  //  }
+  //
+  //  // start bypass
+  //  driver::reg_writeq(ifm_src_paddr, MUL8(27));
+  //  driver::reg_writeq(ifm_dst_paddr, MUL8(28));
+  //  driver::reg_writeq(0, MUL8(0));
+  //  driver::reg_writeq(bp_enable, MUL8(0));
+  //  // poll
+  //  int ret = -1;
+  //  ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
+  //  if (ret != -1) {
+  //    // clear "irq"
+  //    driver::reg_readq(MUL8(63));
+  //  }
+  //  // get max value
+  //  if ((!args.input_data_type) && (!args.output_data_type)) {
+  //    float scale = Findfp16Max();
+  //    args.output.scale_address[0] = (float)(1.0 / scale);  // NOLINT
+  //    args.output.scale_address[1] = scale;
+  //  }
  return ret;
 }


--- a/src/fpga/V2/driver/bitmap.cpp
+++ b/src/fpga/V2/driver/bitmap.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "fpga/V2/driver/bitmap.h"
+#include "fpga/common/bitmap.h"

 namespace fpga_bitmap {
 void bitmap_set(uint64_t *map, unsigned int start, int len) {

--- a/src/fpga/V2/driver/bitmap.h
+++ b/src/fpga/V2/driver/bitmap.h
--- a/src/fpga/V2/config.h
+++ b/src/fpga/V2/config.h
--- a/src/fpga/V2/driver/driver.cpp
+++ b/src/fpga/V2/driver/driver.cpp
@@ -28,8 +28,8 @@ limitations under the License. */
 #include <iostream>

 #include "common/enforce.h"
-#include "fpga/V2/driver/bitmap.h"
-#include "fpga/V2/driver/driver.h"
+#include "fpga/common/bitmap.h"
+#include "fpga/common/driver.h"

 namespace paddle_mobile {
 namespace fpga {
@@ -353,7 +353,7 @@ void fpga_free_driver(void *ptr) {
  }
 }

-static inline int do_ioctl(unsigned long req, const void *arg) {
+static inline int do_ioctl(int64_t req, const void *arg) {
  return ioctl(g_fpgainfo.fd_mem, req, arg);
 }

@@ -363,7 +363,7 @@ int fpga_flush_driver(void *address, size_t size) {

  p_addr = vaddr_to_paddr(address);

-  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
+  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);  // NOLINT
  args.size = size;

  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
@@ -375,7 +375,7 @@ int fpga_invalidate_driver(void *address, size_t size) {

  p_addr = vaddr_to_paddr(address);

-  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
+  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);  // NOLINT
  args.size = size;

  return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
@@ -389,7 +389,7 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) {
  for (i = 0; i < num; i++) {
    // DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
    // usleep(1);
-    *((int8_t *)dest + i) = *((int8_t *)src + i);
+    *((int8_t *)dest + i) = *((int8_t *)src + i);  // NOLINT
  }

  return;

--- a/src/fpga/V2/driver/driver.h
+++ b/src/fpga/V2/driver/driver.h
@@ -33,8 +33,6 @@ namespace driver {
 #define FPGA_MEM_PHY_ADDR 0x20000000
 #define FPGA_MEM_SIZE 0x20000000

-#define CPU_FREQ 1000000000
-
 #define FPGA_PAGE_SIZE (16UL * 1024UL)

 // PE related macros
@@ -105,17 +103,17 @@ extern struct FPGA_INFO g_fpgainfo;

 inline uint64_t reg_readq(uint32_t offset) {
  // DLOG << "offset : " << offset;
-  uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
-                                          offset);  // NOLINT
+  uint64_t value =
+      *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
+                             offset);                                // NOLINT

  return value;
 }

 inline void reg_writeq(uint64_t value, uint32_t offset) {
  // DLOG << "offset : " << offset << ", value : " << value;
-  *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
-                         offset) =  // NOLINT
-      value;
+  *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
+                         offset) = value;
 }

 int open_device_driver();

--- a/src/fpga/V2/fpga_common.cpp
+++ b/src/fpga/V2/fpga_common.cpp
@@ -12,7 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <fpga/V2/fpga_common.h>
+#include "fpga/common/fpga_common.h"
+#include <algorithm>
+#include <map>
+#include "fpga/common/config.h"
+#include "fpga/common/driver.h"
+
 namespace paddle_mobile {
 namespace fpga {

@@ -40,5 +45,73 @@ float fp16_2_fp32(int16_t fp16_num) {
  return fp32_num;
 }

+static std::map<void *, size_t> memory_map;
+
+int open_device() {
+  int ret = driver::open_device_driver();
+  return ret;
+}
+
+int close_device() {
+  int ret = driver::close_device_driver();
+  return ret;
+}
+
+void *fpga_malloc(size_t size) {
+  static uint64_t counter = 0;
+#ifdef PADDLE_MOBILE_ZU5
+  auto ptr = driver::fpga_malloc_driver(size);
+#else
+  auto ptr = malloc(size);
+#endif
+  counter += size;
+  memory_map.insert(std::make_pair(ptr, size));
+  //  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
+  //       << counter << " bytes";
+  return ptr;
+}
+
+void fpga_free(void *ptr) {
+  static uint64_t counter = 0;
+  size_t size = 0;
+  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
+  if (iter != memory_map.end()) {
+    size = iter->second;
+    memory_map.erase(iter);
+#ifdef PADDLE_MOBILE_ZU5
+    driver::fpga_free_driver(ptr);
+#else
+    free(ptr);
+#endif
+    counter += size;
+    //    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
+    //         << counter << " bytes";
+  } else {
+    DLOG << "Invalid pointer";
+  }
+}
+void fpga_copy(void *dest, const void *src, size_t num) {
+#ifdef PADDLE_MOBILE_ZU5
+  driver::fpga_copy_driver(dest, src, num);
+#else
+  memcpy(dest, src, num);
+#endif
+}
+
+int fpga_flush(void *address, size_t size) {
+#ifdef PADDLE_MOBILE_ZU5
+  return driver::fpga_flush_driver(address, size);
+#else
+  return 0;
+#endif
+}
+int fpga_invalidate(void *address, size_t size) {
+#ifdef PADDLE_MOBILE_ZU5
+  return driver::fpga_invalidate_driver(address, size);
+#else
+  return 0;
+#endif
+}
+
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V2/fpga_common.h
+++ b/src/fpga/V2/fpga_common.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <cstddef>
 #include <cstdint>

 namespace paddle_mobile {
@@ -117,9 +118,19 @@ struct BypassArgs {
 struct DeconvArgs {
  struct ConvArgs conv_arg;
 };
+
 static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
+
 int16_t fp32_2_fp16(float fp32_num);
 float fp16_2_fp32(int16_t fp16_num);

+int open_device();
+int close_device();
+void* fpga_malloc(size_t size);
+void fpga_free(void* ptr);
+void fpga_copy(void* dest, const void* src, size_t num);
+int fpga_flush(void* address, size_t size);
+int fpga_invalidate(void* address, size_t size);
+
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V2/driver/pe.h
+++ b/src/fpga/V2/driver/pe.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once

-#include "fpga/V2/fpga_common.h"
+#include "fpga/common/fpga_common.h"

 namespace paddle_mobile {
 namespace fpga {

--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -102,7 +102,6 @@ if (CON GREATER -1)
  set(MUL_OP ON)
  set(RESHAPE_OP ON)
  set(SOFTMAX_OP ON)
-
  set(FOUND_MATCH ON)
 endif()

@@ -120,7 +119,6 @@ if (CON GREATER -1)
  set(SOFTMAX_OP ON)
  set(FUSION_CONVBNRELU_OP ON)
  set(FUSION_CONVBN_OP ON)
-  set(FUSION_CONVADD_OP ON)
  set(FOUND_MATCH ON)
 endif()