diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index 04e51ab9b09fabc41fcd1cd73864bc285d183821..7c1f15f7c90e0b1ebc15a9ec8f3f6333ff173978 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -13,251 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "fpga/V1/api.h"
-#include <fcntl.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <algorithm>
-#include <map>
 #include "fpga/V1/bias_scale.h"
 #include "fpga/V1/filter.h"
 #include "fpga/V1/image.h"
-#define FPGA_TEST_MODE
-#define PADDLE_MOBILE_OS_LINUX
 
 namespace paddle_mobile {
 namespace fpga {
 
-static int fd = -1;
-static const char *device_path = "/dev/fpgadrv0";
-static std::map<void *, size_t> memory_map;
-
-static inline int do_ioctl(int req, const void *arg) {
-#ifdef PADDLE_MOBILE_OS_LINUX
-  int result = ioctl(fd, req, (uint64_t)arg);
-  PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly");
-  return result;
-#else
-  return -1;
-#endif
-}
-
-int open_device() {
-  if (fd == -1) {
-    fd = open(device_path, O_RDWR);
-  }
-  return fd;
-}
-
-// memory management;
-void *fpga_malloc(size_t size) {
-  static uint64_t counter = 0;
-
-#ifdef PADDLE_MOBILE_OS_LINUX
-  auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-#else
-  auto ptr = malloc(size);
-#endif
-  counter += size;
-  memory_map.insert(std::make_pair(ptr, size));
-  //  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
-  //       << counter << " bytes";
-  return ptr;
-}
-
-void fpga_free(void *ptr) {
-  static uint64_t counter = 0;
-  size_t size = 0;
-
-  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
-  if (iter != memory_map.end()) {
-    size = iter->second;
-    memory_map.erase(iter);
-#ifdef PADDLE_MOBILE_OS_LINUX
-    munmap(ptr, size);
-#else
-    free(ptr);
-#endif
-    counter += size;
-    //    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
-    //         << counter << " bytes";
-  } else {
-    DLOG << "Invalid pointer";
-  }
-}
-
-void fpga_copy(void *dest, const void *src, size_t num) {
-  memcpy(dest, src, num);
-}
-
-int fpga_flush(void *address, size_t size) {
-  struct MemoryCacheArgs args = {nullptr};
-  args.address = address;
-  args.size = size;
-  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
-}
-
-int fpga_invalidate(void *address, size_t size) {
-  struct MemoryCacheArgs args = {nullptr};
-  args.address = address;
-  args.size = size;
-  return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
-}
-
-half fp32_2_fp16(float fp32_num) {
-  unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
-  half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
-           (((tmp & 0x7f800000) >> 13) - (112 << 10));
-  if (tmp & 0x1000) {
-    t++;  // roundoff
-  }
-  return t;
-}
-
-float fp16_2_fp32(half fp16_num) {
-  int frac = (fp16_num & 0x3ff);
-  int exp = ((fp16_num & 0x7c00) >> 10) + 112;
-  int s = fp16_num & 0x8000;
-  int tmp = 0;
-  float fp32_num;
-  tmp = s << 16 | exp << 23 | frac << 13;
-  fp32_num = *(float *)&tmp;  // NOLINT
-  return fp32_num;
-}
-
-int ComputeBasicConv(const struct ConvArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "======Compute Basic Conv======";
-  DLOG << "   relu_enabled:" << args.relu_enabled
-       << "   sb_address:" << args.sb_address
-       << "   filter_address:" << args.filter_address
-       << "   filter_num:" << args.filter_num
-       << "   group_num:" << args.group_num;
-  DLOG << "   image_address:" << args.image.address
-       << "   image_scale_address:" << args.image.scale_address
-       << "   image_channels:" << args.image.channels
-       << "   image_height:" << args.image.height
-       << "   image_width:" << args.image.width
-       << "   pad_height:" << args.image.pad_height
-       << "   pad_width:" << args.image.pad_width;
-  DLOG << "   kernel_height:" << args.kernel.height
-       << "   kernel_width:" << args.kernel.width
-       << "   stride_h:" << args.kernel.stride_h
-       << "   stride_w:" << args.kernel.stride_w;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-  return do_ioctl(IOCTL_CONFIG_CONV, &args);
-}
-
-int ComputeFpgaConv(const struct SplitConvArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFPGAConv===========";
-  DLOG << "   filter_num:" << args.filter_num
-       << "   group_num:" << args.group_num
-       << "   split_num:" << args.split_num;
-#endif
-
-  int split_num = args.split_num;
-  for (int i = 0; i < split_num; i++) {
-    ComputeBasicConv(args.conv_args[i]);
-  }
-
-  if (split_num > 1) {
-    ComputeFPGAConcat(args.concat_arg);
-  }
-}
-
-int ComputeFpgaPool(const struct PoolingArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaPool===========";
-  DLOG << "   mode:" << args.mode
-       << "   kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
-  DLOG << "   image_address:" << args.image.address
-       << "   image_scale_address:" << args.image.scale_address
-       << "   image_channels:" << args.image.channels
-       << "   image_height:" << args.image.height
-       << "   image_width:" << args.image.width
-       << "   pad_height:" << args.image.pad_height
-       << "   pad_width:" << args.image.pad_width;
-  DLOG << "   kernel_height:" << args.kernel.height
-       << "   kernel_width:" << args.kernel.width
-       << "   stride_h:" << args.kernel.stride_h
-       << "   stride_w:" << args.kernel.stride_w;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-
-  return do_ioctl(IOCTL_CONFIG_POOLING, &args);
-}
-
-int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaEWAdd===========";
-  DLOG << "   relu_enabled:" << args.relu_enabled
-       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
-       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
-  DLOG << "   image0_address:" << args.image0.address
-       << "   image0_scale_address:" << args.image0.scale_address
-       << "   image0_channels:" << args.image0.channels
-       << "   image0_height:" << args.image0.height
-       << "   image0_width:" << args.image0.width
-       << "   pad0_height:" << args.image0.pad_height
-       << "   pad0_width:" << args.image0.pad_width;
-  DLOG << "   image1_address:" << args.image1.address
-       << "   image1_scale_address:" << args.image1.scale_address
-       << "   image1_channels:" << args.image1.channels
-       << "   image1_height:" << args.image1.height
-       << "   image1_width:" << args.image1.width
-       << "   pad1_height:" << args.image1.pad_height
-       << "   pad_width:" << args.image1.pad_width;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-
-  return do_ioctl(IOCTL_CONFIG_EW, &args);
-}
-int PerformBypass(const struct BypassArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaBypass===========";
-  DLOG << "   input_type:" << args.input_data_type
-       << "   output_type:" << args.output_data_type
-       << "   input_layout_type:" << args.input_layout_type
-       << "   output_layout_type:" << args.output_layout_type;
-  DLOG << "   image_address:" << args.image.address
-       << "   image_scale_address:" << args.image.scale_address
-       << "   image_channels:" << args.image.channels
-       << "   image_height:" << args.image.height
-       << "   image_width:" << args.image.width
-       << "   pad_height:" << args.image.pad_height
-       << "   pad_width:" << args.image.pad_width;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-
-  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
-}
-
-int ComputeFPGAConcat(const struct ConcatArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaConcat===========";
-  DLOG << "   Image_num: " << args.image_num
-       << "   out_address:" << args.image_out
-       << "   out_scale_address:" << args.scale_out;
-  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
-  for (int i = 0; i < args.image_num; i++) {
-    DLOG << "   " << i << "th:        ";
-    DLOG << "   channel_num:" << args.channel_num[i]
-         << "   image_address:" << args.images_in[i]
-         << "   image_scale_address:" << args.scales_in[i];
-  }
-#endif
-
-  image::concat_images(args.images_in, args.scales_in, args.image_out,
-                       args.scale_out, args.image_num, args.channel_num,
-                       args.height, args.width);
-  return 0;
-}
-
 int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); }
 
 void format_image(framework::Tensor *image_tensor) {
@@ -397,7 +159,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
   arg->filter_num = (uint32_t)filter->dims()[0];
   arg->output.address = out_ptr;
   arg->output.scale_address = out->scale;
-  arg->conv_args =
+  arg->conv_arg =
       (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
 
   arg->concat_arg.image_num = arg->split_num;
@@ -420,44 +182,44 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
       filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
 
   for (int i = 0; i < n; i++) {
-    arg->conv_args[i].relu_enabled = relu_enabled;
-    arg->conv_args[i].group_num = (uint32_t)group_num;
-    arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h;
-    arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w;
-    arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
-    arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
-    arg->conv_args[i].image.address = input_ptr;
-    arg->conv_args[i].image.channels = (uint32_t)input->dims()[1];
-    arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
-    arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
-    arg->conv_args[i].image.scale_address = input->scale;
-    arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
-    arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
-    arg->conv_args[i].filter_scale_address = filter->scale;
-    arg->conv_args[i].filter_address = &(
+    arg->conv_arg[i].relu_enabled = relu_enabled;
+    arg->conv_arg[i].group_num = (uint32_t)group_num;
+    arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
+    arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
+    arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
+    arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];
+    arg->conv_arg[i].image.address = input_ptr;
+    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
+    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
+    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
+    arg->conv_arg[i].image.scale_address = input->scale;
+    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
+    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
+    arg->conv_arg[i].filter_scale_address = filter->scale;
+    arg->conv_arg[i].filter_address = &(
         (int8_t *)filter_ptr)[i * element_num * filter_num_per_div];  // NOLINT
-    arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
-    arg->conv_args[i].filter_num = (uint32_t)(
+    arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
+    arg->conv_arg[i].filter_num = (uint32_t)(
         i == n - 1 ? channel - (n - 1) * filter_num_per_div  // NOLINT
                    : filter_num_per_div);
 
     if (n > 1) {
-      arg->conv_args[i].output.scale_address =
+      arg->conv_arg[i].output.scale_address =
           (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
-      arg->conv_args[i].output.address = fpga_malloc(
-          input->dims()[2] *
-          align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
-                     IMAGE_ALIGNMENT) *
-          sizeof(half));
+      arg->conv_arg[i].output.address =
+          fpga_malloc(input->dims()[2] *
+                      align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num,
+                                 IMAGE_ALIGNMENT) *
+                      sizeof(half));
     } else {
-      arg->conv_args[i].output.scale_address = out->scale;
-      arg->conv_args[i].output.address = out_ptr;
+      arg->conv_arg[i].output.scale_address = out->scale;
+      arg->conv_arg[i].output.address = out_ptr;
     }
 
     arg->concat_arg.images_in[i] =
-        (half *)arg->conv_args[i].output.address;  // NOLINT
-    arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
-    arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
+        (half *)arg->conv_arg[i].output.address;  // NOLINT
+    arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
+    arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
   }
 }
 
diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h
index f535975a35ecc3c454bbac597b31d8c3670cbf91..daa7902ab4a6cb72a77bba31f8cfe84c897f30a4 100644
--- a/src/fpga/V1/api.h
+++ b/src/fpga/V1/api.h
@@ -14,178 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include <stdint.h>
-#include <cstddef>
-#include <iostream>
-#include <limits>
+#include "fpga/common/fpga_common.h"
+#include "fpga/common/pe.h"
 #include "framework/tensor.h"
 
 namespace paddle_mobile {
 namespace fpga {
 
-enum DataType {
-  DATA_TYPE_FP32 = 1,
-  DATA_TYPE_FP16 = 0,
-};
-
-enum LayoutType {
-  LAYOUT_CHW = 1,
-  LAYOUT_HWC = 0,
-};
-
-struct VersionArgs {
-  void* buffer;
-};
-
-struct MemoryCopyArgs {
-  void* src;
-  void* dest;
-  size_t size;
-};
-
-struct KernelArgs {
-  uint32_t width;
-  uint32_t height;
-  uint32_t stride_w;
-  uint32_t stride_h;
-};
-
-struct ImageInputArgs {
-  void* address;         // input featuremap virtual address
-  float* scale_address;  // input scale address;
-  uint32_t channels;
-  uint32_t width;  // featuremap width
-  uint32_t height;
-  uint32_t pad_width;  // padding width;
-  uint32_t pad_height;
-};
-
-struct ImageOutputArgs {
-  void* address;         // output result address;
-  float* scale_address;  // output scale address;
-};
-
-struct ConvArgs {
-  bool relu_enabled;
-  void* sb_address;  // scale and bias are interlaced;
-  void* filter_address;
-  float* filter_scale_address;
-  uint32_t filter_num;
-  uint32_t group_num;
-
-  struct KernelArgs kernel;
-  struct ImageInputArgs image;  // input image;
-  struct ImageOutputArgs output;
-};
-
-struct ConcatArgs {
-  uint32_t image_num;
-  half** images_in;
-  float** scales_in;
-  void* image_out;
-  float* scale_out;
-  uint32_t* channel_num;
-  uint32_t height;
-  uint32_t width;
-};
-
-struct SplitConvArgs {
-  uint32_t split_num;
-  uint32_t group_num;
-  uint32_t filter_num;
-  struct ImageOutputArgs output;
-  struct ConvArgs* conv_args;
-  struct ConcatArgs concat_arg;
-};
-
-struct GroupConvArgs {
-  uint32_t group_num;
-  uint32_t filter_num;
-  struct ImageOutputArgs output;
-  struct SplitConvArgs* conv_args;
-  struct ConcatArgs concat_arg;
-};
-
-struct PoolingArgs {
-  int16_t mode;  // mode: 0:max, 1:avg
-  half kernel_reciprocal;
-  struct KernelArgs kernel;
-  struct ImageInputArgs image;  // input image;
-  struct ImageOutputArgs output;
-};
-
-struct EWAddArgs {
-  bool relu_enabled;
-
-  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
-  uint32_t const1;
-  struct ImageInputArgs image0;
-  struct ImageInputArgs image1;
-  struct ImageOutputArgs output;
-};
-
-struct BypassArgs {
-  enum DataType input_data_type;
-  enum DataType output_data_type;
-  enum LayoutType input_layout_type;
-  enum LayoutType output_layout_type;
-  struct ImageInputArgs image;
-  struct ImageOutputArgs output;
-};
-
-struct FpgaRegWriteArgs {
-  uint64_t address;  //
-  uint64_t value;
-};
-
-struct FpgaRegReadArgs {
-  uint64_t address;
-  uint64_t value;
-};
-
-struct MemoryCacheArgs {
-  void* address;
-  size_t size;
-};
-
-#define IOCTL_FPGA_MAGIC 'FPGA'
-
-#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
-
-#define IOCTL_SEPARATOR_0 10
-
-#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
-#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
-#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
-
-#define IOCTL_SEPARATOR_1 20
-
-#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
-#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
-#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
-#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
-#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
-#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
-
-//============================== API =============================
-
-int open_device();
-int close_device();
-
-void* fpga_malloc(size_t size);
-void fpga_free(void* ptr);
-void fpga_copy(void* dst, const void* src, size_t num);
-int fpga_flush(void* address, size_t size);
-int fpga_invalidate(void* address, size_t size);
-
-int PerformBypass(const struct BypassArgs& args);
-int ComputeFpgaConv(const struct SplitConvArgs& args);
-int ComputeFpgaPool(const struct PoolingArgs& args);
-int ComputeFpgaEWAdd(const struct EWAddArgs& args);
-int ComputeFPGAConcat(const struct ConcatArgs& args);
-
-static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
-
 int get_align_image_cw(int cw);
 void format_image(framework::Tensor* image_tensor);
 void format_fp16_ofm(framework::Tensor* ofm_tensor);  // only allocate memory
@@ -209,8 +44,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
                     bool relu_enabled, int group_num, int stride_h,
                     int stride_w, int padding_h, int padding_w, float* bs_ptr);
 
-half fp32_2_fp16(float fp32_num);
-float fp16_2_fp32(half fp16_num);
-
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V1/bias_scale.cpp b/src/fpga/V1/bias_scale.cpp
index 3c2c04dc1d7f76953b04a879fbcfa8377dd7ba8a..263a7494c5602c13208aa0d8899ce80d781aa11b 100644
--- a/src/fpga/V1/bias_scale.cpp
+++ b/src/fpga/V1/bias_scale.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "fpga/V1/bias_scale.h"
 #include <memory.h>
-#include "fpga/V1/api.h"
+#include "fpga/common/fpga_common.h"
 
 namespace paddle_mobile {
 namespace fpga {
diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp
index 3f4a3e2c876f0b54546f0e385d4a5e8bbfacdf3c..157ac90a60262cadacb648173cbc5ba6c01e674e 100644
--- a/src/fpga/V1/filter.cpp
+++ b/src/fpga/V1/filter.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "fpga/V1/filter.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/V1/api.h"
+#include "fpga/common/fpga_common.h"
 
 namespace paddle_mobile {
 namespace fpga {
@@ -31,20 +31,22 @@ int calc_split_num(int num, int division_capacity) {
 }
 
 int calc_division_number(int num, int group_num, int division_capacity) {
-  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
-                        "Filter number should be divisible by group number");
+  //  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
+  //                        "Filter number should be divisible by group
+  //                        number");
   int split_num = calc_split_num(num, division_capacity);
-  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
-                        "Split number or group number should be 1");
+  //  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
+  //                        "Split number or group number should be 1");
   return group_num * split_num;
 }
 
 int calc_num_per_div(int num, int group_num, int division_capacity) {
-  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
-                        "Filter number should be divisible by group number");
+  //  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
+  //                        "Filter number should be divisible by group
+  //                        number");
   int split_num = calc_split_num(num, division_capacity);
-  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
-                        "Split number or group number should be 1");
+  //  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
+  //                        "Split number or group number should be 1");
   if (group_num == 1) {
     if (num > division_capacity) {
       return division_capacity;
diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp
index 73be05c942d6a848db830148d25bc8b3e14b53e4..312af1d00b5f6dfa25f33ce93a25d55577b92818 100644
--- a/src/fpga/V1/image.cpp
+++ b/src/fpga/V1/image.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "fpga/V1/image.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/V1/api.h"
+#include "fpga/common/fpga_common.h"
 
 namespace paddle_mobile {
 namespace fpga {
diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9adea7e0962243d46fa6060b4deae6df371567c8
--- /dev/null
+++ b/src/fpga/V1/pe.cpp
@@ -0,0 +1,160 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/common/pe.h"
+#include "fpga/V1/filter.h"
+#include "fpga/V1/image.h"
+#include "fpga/common/config.h"
+#include "fpga/common/driver.h"
+
+namespace paddle_mobile {
+namespace fpga {
+
+int ComputeFpgaConv(const struct SplitConvArgs &args) {  // conv_arg[0] only
+  return ComputeBasicConv(args.conv_arg[0]);  // int function must return
+}
+
+int ComputeBasicConv(const struct ConvArgs &args) {  // log-only; returns 0
+#ifdef FPGA_PRINT_MODE  // dump the ConvArgs fields through DLOG
+  DLOG << "======Compute Basic Conv======";
+  DLOG << "   relu_enabled:" << args.relu_enabled
+       << "   sb_address:" << args.sb_address
+       << "   filter_address:" << args.filter_address
+       << "   filter_num:" << args.filter_num
+       << "   group_num:" << args.group_num;
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   kernel_height:" << args.kernel.height
+       << "   kernel_width:" << args.kernel.width
+       << "   stride_h:" << args.kernel.stride_h
+       << "   stride_w:" << args.kernel.stride_w;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif  // FPGA_PRINT_MODE
+
+#ifndef PADDLE_MOBILE_ZU5  // non-ZU5 builds stop here
+  return 0;
+#endif  // !PADDLE_MOBILE_ZU5
+
+  return 0;  // ZU5 path: same result, nothing else runs in this body
+}
+
+int ComputeFpgaPool(const struct PoolingArgs &args) {  // log-only; returns 0
+#ifdef FPGA_PRINT_MODE  // dump the PoolingArgs fields through DLOG
+  DLOG << "=============ComputeFpgaPool===========";
+  DLOG << "   mode:" << args.mode
+       << "   kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   kernel_height:" << args.kernel.height
+       << "   kernel_width:" << args.kernel.width
+       << "   stride_h:" << args.kernel.stride_h
+       << "   stride_w:" << args.kernel.stride_w;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif  // FPGA_PRINT_MODE
+#ifndef PADDLE_MOBILE_ZU5  // non-ZU5 builds stop here
+  return 0;
+#endif  // !PADDLE_MOBILE_ZU5
+  return 0;  // ZU5 path: same result, nothing else runs in this body
+}
+
+int ComputeFpgaEWAdd(const struct EWAddArgs &args) {  // log-only; returns 0
+#ifdef FPGA_PRINT_MODE  // dump the EWAddArgs fields through DLOG
+  DLOG << "=============ComputeFpgaEWAdd===========";
+  DLOG << "   relu_enabled:" << args.relu_enabled
+       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
+       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
+  DLOG << "   image0_address:" << args.image0.address
+       << "   image0_scale_address:" << args.image0.scale_address
+       << "   image0_channels:" << args.image0.channels
+       << "   image0_height:" << args.image0.height
+       << "   image0_width:" << args.image0.width
+       << "   pad0_height:" << args.image0.pad_height
+       << "   pad0_width:" << args.image0.pad_width;
+  DLOG << "   image1_address:" << args.image1.address
+       << "   image1_scale_address:" << args.image1.scale_address
+       << "   image1_channels:" << args.image1.channels
+       << "   image1_height:" << args.image1.height
+       << "   image1_width:" << args.image1.width
+       << "   pad1_height:" << args.image1.pad_height
+       << "   pad1_width:" << args.image1.pad_width;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif  // FPGA_PRINT_MODE
+#ifndef PADDLE_MOBILE_ZU5  // non-ZU5 builds stop here
+  return 0;
+#endif  // !PADDLE_MOBILE_ZU5
+  return 0;  // ZU5 path: same result, nothing else runs in this body
+}
+
+int PerformBypass(const struct BypassArgs &args) {  // log-only; returns 0
+#ifdef FPGA_PRINT_MODE  // dump the BypassArgs fields through DLOG
+  DLOG << "=============ComputeFpgaBypass===========";
+  DLOG << "   input_type:" << args.input_data_type
+       << "   output_type:" << args.output_data_type
+       << "   input_layout_type:" << args.input_layout_type
+       << "   output_layout_type:" << args.output_layout_type;
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif  // FPGA_PRINT_MODE
+#ifndef PADDLE_MOBILE_ZU5  // non-ZU5 builds stop here
+  return 0;
+#endif  // !PADDLE_MOBILE_ZU5
+
+  return 0;  // ZU5 path: same result, nothing else runs in this body
+}
+
+int ComputeFPGAConcat(const struct ConcatArgs &args) {  // log, then concat
+#ifdef FPGA_PRINT_MODE  // dump the ConcatArgs fields through DLOG
+  DLOG << "=============ComputeFpgaConcat===========";
+  DLOG << "   Image_num: " << args.image_num
+       << "   out_address:" << args.image_out
+       << "   out_scale_address:" << args.scale_out
+       << "   out_channel:" << args.out_channel;
+  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
+  for (int i = 0; i < args.image_num; i++) {  // one log entry per input image
+    DLOG << "   " << i << "th:        ";
+    DLOG << "   channel_num:" << args.channel_num[i]
+         << "   aligned_channel_num:" << args.aligned_channel_num[i]
+         << "   image_address:" << args.images_in[i]
+         << "   image_scale_address:" << args.scales_in[i];
+  }
+#endif  // FPGA_PRINT_MODE
+
+  image::concat_images(args.images_in, args.scales_in, args.image_out,
+                       args.scale_out, args.image_num, args.channel_num,
+                       args.height, args.width);
+  return 0;  // unconditional; nothing from concat_images is inspected here
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp
index 2f8a9f119e643b3836ef2c541e098f39ab3cbd17..5bfd34104600668ce63a9c7d684d4482d5d804fb 100644
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -13,84 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "fpga/V2/api.h"
-#include <algorithm>
 #include "fpga/V2/bias_scale.h"
-#include "fpga/V2/config.h"
-#include "fpga/V2/driver/driver.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
 
 namespace paddle_mobile {
 namespace fpga {
 
-static std::map<void *, size_t> memory_map;
-
-int open_device() {
-  int ret = driver::open_device_driver();
-  return ret;
-}
-
-int close_device() {
-  int ret = driver::close_device_driver();
-  return ret;
-}
-
-void *fpga_malloc(size_t size) {
-  static uint64_t counter = 0;
-#ifdef PADDLE_MOBILE_ZU5
-  auto ptr = driver::fpga_malloc_driver(size);
-#else
-  auto ptr = malloc(size);
-#endif
-  counter += size;
-  memory_map.insert(std::make_pair(ptr, size));
-  //  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
-  //       << counter << " bytes";
-  return ptr;
-}
-
-void fpga_free(void *ptr) {
-  static uint64_t counter = 0;
-  size_t size = 0;
-  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
-  if (iter != memory_map.end()) {
-    size = iter->second;
-    memory_map.erase(iter);
-#ifdef PADDLE_MOBILE_ZU5
-    driver::fpga_free_driver(ptr);
-#else
-    free(ptr);
-#endif
-    counter += size;
-    //    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
-    //         << counter << " bytes";
-  } else {
-    DLOG << "Invalid pointer";
-  }
-}
-void fpga_copy(void *dest, const void *src, size_t num) {
-#ifdef PADDLE_MOBILE_ZU5
-  driver::fpga_copy_driver(dest, src, num);
-#else
-  memcpy(dest, src, num);
-#endif
-}
-
-int fpga_flush(void *address, size_t size) {
-#ifdef PADDLE_MOBILE_ZU5
-  return driver::fpga_flush_driver(address, size);
-#else
-  return 0;
-#endif
-}
-int fpga_invalidate(void *address, size_t size) {
-#ifdef PADDLE_MOBILE_ZU5
-  return driver::fpga_invalidate_driver(address, size);
-#else
-  return 0;
-#endif
-}
-
 void format_image(framework::Tensor *image_tensor) {
   auto dims = image_tensor->dims();
   auto channel = dims[1], height = dims[2], width = dims[3];
@@ -284,8 +213,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
     arg->conv_arg[i].output.address = out_ptr;
     arg->conv_arg[i].output.scale_address = out->scale;
 
-    int num_after_alignment =
-        filter::calc_aligned_num((int)input->dims()[1], arg->filter_num);
+    int num_after_alignment = filter::calc_aligned_num(
+        (int)input->dims()[1], arg->filter_num);  // NOLINT
     arg->conv_arg[i].free_space =
         fpga_malloc(num_after_alignment * 2 * sizeof(half));
   }
diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h
index 1f4a203936b517d93e2d417b08a8b8456cc1fc93..1386810164d72ef849162b76a8b83fcf32082907 100644
--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -14,21 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "fpga/V2/driver/pe.h"
-#include "fpga/V2/fpga_common.h"
+#include "fpga/common/fpga_common.h"
+#include "fpga/common/pe.h"
 #include "framework/tensor.h"
 
 namespace paddle_mobile {
 namespace fpga {
 
-int open_device();
-int close_device();
-void* fpga_malloc(size_t size);
-void fpga_free(void* ptr);
-void fpga_copy(void* dest, const void* src, size_t num);
-int fpga_flush(void* address, size_t size);
-int fpga_invalidate(void* address, size_t size);
-
 float filter_find_max(framework::Tensor* filter_tensor);
 int get_aligned_channel_num(int channel_num);
 int get_aligned_filter_num(framework::Tensor* filter_tensor);
diff --git a/src/fpga/V2/bias_scale.cpp b/src/fpga/V2/bias_scale.cpp
index 3afd3f51bbb10e3bb2d66195fcc54d25c56e2393..c8f587da330c6e6e9e35969d58ae27f4366830d2 100644
--- a/src/fpga/V2/bias_scale.cpp
+++ b/src/fpga/V2/bias_scale.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "fpga/V2/bias_scale.h"
 #include <memory.h>
-#include "fpga/V2/api.h"
+#include "fpga/common/fpga_common.h"
 
 namespace paddle_mobile {
 namespace fpga {
diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp
index e72d97f3d024c2ee2e978fb40d8d6d243c203d75..b17ce4406bf1b6b4619d0e9e75d3f432dfa84fb1 100644
--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "fpga/V2/filter.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/V2/api.h"
+#include "fpga/common/fpga_common.h"
 
 namespace paddle_mobile {
 namespace fpga {
diff --git a/src/fpga/V2/fpga_common.cpp b/src/fpga/V2/fpga_common.cpp
deleted file mode 100644
index 01bca30a9ccf79232e1f28bbf77b1c030632f5bc..0000000000000000000000000000000000000000
--- a/src/fpga/V2/fpga_common.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fpga/V2/fpga_common.h>
-namespace paddle_mobile {
-namespace fpga {
-
-int16_t fp32_2_fp16(float fp32_num) {
-  unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
-  auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
-                     (((tmp & 0x7f800000) >> 13) - (112 << 10)));
-  if (tmp & 0x1000) {
-    t++;  // roundoff
-  }
-  return t;
-}
-
-float fp16_2_fp32(int16_t fp16_num) {
-  if (0 == fp16_num) {
-    return 0;
-  }
-  int frac = (fp16_num & 0x3ff);
-  int exp = ((fp16_num & 0x7c00) >> 10) + 112;
-  int s = fp16_num & 0x8000;
-  int tmp = 0;
-  float fp32_num;
-  tmp = s << 16 | exp << 23 | frac << 13;
-  fp32_num = *(float *)&tmp;  // NOLINT
-  return fp32_num;
-}
-
-}  // namespace fpga
-}  // namespace paddle_mobile
diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp
index 26829bfba65f2375b27251070b33b2bbe57d069b..3d1ed95df2a805c8c64f9184e0a720f5449d6181 100644
--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "fpga/V2/image.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/V2/api.h"
+#include "fpga/common/fpga_common.h"
 
 namespace paddle_mobile {
 namespace fpga {
diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/pe.cpp
similarity index 79%
rename from src/fpga/V2/driver/pe.cpp
rename to src/fpga/V2/pe.cpp
index 2e806bfb37c131fad1c011c960bc79aa1b121186..5a1114cd5e9917532a6bf086c868783518401007 100644
--- a/src/fpga/V2/driver/pe.cpp
+++ b/src/fpga/V2/pe.cpp
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "fpga/V2/driver/pe.h"
-#include "fpga/V2/config.h"
-#include "fpga/V2/driver/driver.h"
+#include "fpga/common/pe.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
+#include "fpga/common/config.h"
+#include "fpga/common/driver.h"
 
 namespace paddle_mobile {
 namespace fpga {
@@ -166,53 +166,55 @@ int PerformBypass(const struct BypassArgs &args) {
   return 0;
 #endif
 
-  uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
-  uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
-  uint64_t bp_enable;
-  int64_t length;
-  uint64_t pixels;
-
-  // fp32->fp16
-  if ((args.input_data_type) && (!args.output_data_type)) {
-    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
-    length = pixels * sizeof(float);
-    bp_enable = 0x8800000000000000 + length;
-  }
-  // fp16->fp32
-  else if ((!args.input_data_type) && (args.output_data_type)) {
-    pixels = filter::calc_aligned_channel((args.image.channels)) *
-             (args.image.width) * (args.image.height);
-    length = pixels * sizeof(short);
-    length = align_to_x((int)length, 64);  // NOLINT
-    bp_enable = 0x8a00000000000000 + length;
-  }
-  // fp16->fp16 findmax
-  else if ((!args.input_data_type) && (!args.output_data_type)) {
-    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
-    length = pixels * sizeof(short);
-    bp_enable = 0x8900000000000000 + length;
-  } else {
-    return -1;
-  }
-
-  // start bypass
-  driver::reg_writeq(ifm_src_paddr, MUL8(27));
-  driver::reg_writeq(ifm_dst_paddr, MUL8(28));
-  driver::reg_writeq(0, MUL8(0));
-  driver::reg_writeq(bp_enable, MUL8(0));
-  // poll
-  int ret = -1;
-  ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
-  if (ret != -1) {
-    // clear "irq"
-    driver::reg_readq(MUL8(63));
-  }
-  // get max value
-  if ((!args.input_data_type) && (!args.output_data_type)) {
-    float scale = Findfp16Max();
-    args.output.scale_address[0] = (float)(1.0 / scale);  // NOLINT
-    args.output.scale_address[1] = scale;
-  }
+  //  uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
+  //  uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
+  //  uint64_t bp_enable;
+  //  int64_t length;
+  //  uint64_t pixels;
+  //
+  //  // fp32->fp16
+  //  if ((args.input_data_type) && (!args.output_data_type)) {
+  //    pixels = (args.image.channels) * (args.image.width) *
+  //    (args.image.height); length = pixels * sizeof(float); bp_enable =
+  //    0x8800000000000000 + length;
+  //  }
+  //  // fp16->fp32
+  //  else if ((!args.input_data_type) && (args.output_data_type)) {
+  //    pixels = filter::calc_aligned_channel((args.image.channels)) *
+  //             (args.image.width) * (args.image.height);
+  //    length = pixels * sizeof(short);
+  //    length = align_to_x((int)length, 64);  // NOLINT
+  //    bp_enable = 0x8a00000000000000 + length;
+  //  }
+  //  // fp16->fp16 findmax
+  //  else if ((!args.input_data_type) && (!args.output_data_type)) {
+  //    pixels = (args.image.channels) * (args.image.width) *
+  //    (args.image.height); length = pixels * sizeof(short); bp_enable =
+  //    0x8900000000000000 + length;
+  //  } else {
+  //    return -1;
+  //  }
+  //
+  //  // start bypass
+  //  driver::reg_writeq(ifm_src_paddr, MUL8(27));
+  //  driver::reg_writeq(ifm_dst_paddr, MUL8(28));
+  //  driver::reg_writeq(0, MUL8(0));
+  //  driver::reg_writeq(bp_enable, MUL8(0));
+  //  // poll
+  //  int ret = -1;
+  //  ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
+  //  if (ret != -1) {
+  //    // clear "irq"
+  //    driver::reg_readq(MUL8(63));
+  //  }
+  //  // get max value
+  //  if ((!args.input_data_type) && (!args.output_data_type)) {
+  //    float scale = Findfp16Max();
+  //    args.output.scale_address[0] = (float)(1.0 / scale);  // NOLINT
+  //    args.output.scale_address[1] = scale;
+  //  }
-  return ret;
+  // The register sequence above is disabled and no longer declares `ret`,
+  // so return 0 like the early-return path above instead of an unknown name.
+  return 0;
 }
 
diff --git a/src/fpga/V2/driver/bitmap.cpp b/src/fpga/common/bitmap.cpp
similarity index 99%
rename from src/fpga/V2/driver/bitmap.cpp
rename to src/fpga/common/bitmap.cpp
index c612faa6aed11b683ff81fffdf6c57a6fed9536d..9742a4559927b0520b32eeabc757f5a0f4e3392a 100644
--- a/src/fpga/V2/driver/bitmap.cpp
+++ b/src/fpga/common/bitmap.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "fpga/V2/driver/bitmap.h"
+#include "fpga/common/bitmap.h"
 
 namespace fpga_bitmap {
 void bitmap_set(uint64_t *map, unsigned int start, int len) {
diff --git a/src/fpga/V2/driver/bitmap.h b/src/fpga/common/bitmap.h
similarity index 100%
rename from src/fpga/V2/driver/bitmap.h
rename to src/fpga/common/bitmap.h
diff --git a/src/fpga/V2/config.h b/src/fpga/common/config.h
similarity index 100%
rename from src/fpga/V2/config.h
rename to src/fpga/common/config.h
diff --git a/src/fpga/V2/driver/driver.cpp b/src/fpga/common/driver.cpp
similarity index 96%
rename from src/fpga/V2/driver/driver.cpp
rename to src/fpga/common/driver.cpp
index d7e71782676fd350f938847c03e9736ff0adb64a..8c59ac14fb11282b29a837152194d873bd65d87d 100644
--- a/src/fpga/V2/driver/driver.cpp
+++ b/src/fpga/common/driver.cpp
@@ -28,8 +28,8 @@ limitations under the License. */
 #include <iostream>
 
 #include "common/enforce.h"
-#include "fpga/V2/driver/bitmap.h"
-#include "fpga/V2/driver/driver.h"
+#include "fpga/common/bitmap.h"
+#include "fpga/common/driver.h"
 
 namespace paddle_mobile {
 namespace fpga {
@@ -353,7 +353,7 @@ void fpga_free_driver(void *ptr) {
   }
 }
 
-static inline int do_ioctl(unsigned long req, const void *arg) {
+static inline int do_ioctl(int64_t req, const void *arg) {
   return ioctl(g_fpgainfo.fd_mem, req, arg);
 }
 
@@ -363,7 +363,7 @@ int fpga_flush_driver(void *address, size_t size) {
 
   p_addr = vaddr_to_paddr(address);
 
-  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
+  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);  // NOLINT
   args.size = size;
 
   return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
@@ -375,7 +375,7 @@ int fpga_invalidate_driver(void *address, size_t size) {
 
   p_addr = vaddr_to_paddr(address);
 
-  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
+  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);  // NOLINT
   args.size = size;
 
   return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
@@ -389,7 +389,7 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) {
   for (i = 0; i < num; i++) {
     // DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
     // usleep(1);
-    *((int8_t *)dest + i) = *((int8_t *)src + i);
+    *((int8_t *)dest + i) = *((int8_t *)src + i);  // NOLINT
   }
 
   return;
diff --git a/src/fpga/V2/driver/driver.h b/src/fpga/common/driver.h
similarity index 91%
rename from src/fpga/V2/driver/driver.h
rename to src/fpga/common/driver.h
index ecb40c7bafb061eeb2b5850b0bfb3090f6f31840..2dad07ec5206a7ca64449aa38ebe0603d72b71e3 100644
--- a/src/fpga/V2/driver/driver.h
+++ b/src/fpga/common/driver.h
@@ -33,8 +33,6 @@ namespace driver {
 #define FPGA_MEM_PHY_ADDR 0x20000000
 #define FPGA_MEM_SIZE 0x20000000
 
-#define CPU_FREQ 1000000000
-
 #define FPGA_PAGE_SIZE (16UL * 1024UL)
 
 // PE related macros
@@ -105,17 +103,17 @@ extern struct FPGA_INFO g_fpgainfo;
 
 inline uint64_t reg_readq(uint32_t offset) {
   // DLOG << "offset : " << offset;
-  uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
-                                          offset);  // NOLINT
+  uint64_t value =
+      *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
+                             offset);                                // NOLINT
 
   return value;
 }
 
 inline void reg_writeq(uint64_t value, uint32_t offset) {
   // DLOG << "offset : " << offset << ", value : " << value;
-  *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
-                         offset) =  // NOLINT
-      value;
+  *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
+                         offset) = value;
 }
 
 int open_device_driver();
diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9c7ae838fa4216d121cf38a11ef4897043b9a0dd
--- /dev/null
+++ b/src/fpga/common/fpga_common.cpp
@@ -0,0 +1,146 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/common/fpga_common.h"
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include "fpga/common/config.h"
+#include "fpga/common/driver.h"
+
+namespace paddle_mobile {
+namespace fpga {
+
+// Converts an IEEE-754 single to half-precision bits by re-biasing the
+// exponent (127 -> 15) and dropping the low 13 mantissa bits, rounding up
+// when the highest dropped bit is set. Subnormals, infinities and NaNs are
+// not special-cased.
+int16_t fp32_2_fp16(float fp32_num) {
+  // Copy exactly 32 bits; dereferencing the float through a wider integer
+  // pointer would read past the object on LP64 targets.
+  uint32_t tmp = 0;
+  std::memcpy(&tmp, &fp32_num, sizeof(tmp));
+  auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
+                     (((tmp & 0x7f800000) >> 13) - (112 << 10)));
+  if (tmp & 0x1000) {
+    t++;  // roundoff
+  }
+  return t;
+}
+
+// Expands half-precision bits to an IEEE-754 single: the exponent is
+// re-biased (15 -> 127) and the mantissa is shifted into the top of the
+// 23-bit field. An all-zero input returns 0; other special encodings
+// (negative zero, subnormal, infinity, NaN) are not special-cased.
+float fp16_2_fp32(int16_t fp16_num) {
+  if (0 == fp16_num) {
+    return 0;
+  }
+  // Work on unsigned bits so the sign can be moved to bit 31 without
+  // shifting into the sign bit of a signed int.
+  const uint32_t half_bits = static_cast<uint16_t>(fp16_num);
+  const uint32_t frac = half_bits & 0x3ff;
+  const uint32_t exp = ((half_bits & 0x7c00) >> 10) + 112;
+  const uint32_t sign = half_bits & 0x8000;
+  const uint32_t tmp = (sign << 16) | (exp << 23) | (frac << 13);
+  float fp32_num;
+  std::memcpy(&fp32_num, &tmp, sizeof(fp32_num));
+  return fp32_num;
+}
+
+// Sizes of live buffers handed out by fpga_malloc, keyed by address.
+static std::map<void *, size_t> memory_map;
+
+// Forwards to driver::open_device_driver() and returns its result.
+int open_device() { return driver::open_device_driver(); }
+
+// Forwards to driver::close_device_driver() and returns its result.
+int close_device() { return driver::close_device_driver(); }
+
+// Allocates `size` bytes (driver allocator on PADDLE_MOBILE_ZU5 builds,
+// std::malloc otherwise) and records the buffer so fpga_free can release it.
+// Returns nullptr, without recording anything, if the allocator does.
+void *fpga_malloc(size_t size) {
+  static uint64_t counter = 0;
+#ifdef PADDLE_MOBILE_ZU5
+  auto ptr = driver::fpga_malloc_driver(size);
+#else
+  auto ptr = std::malloc(size);
+#endif
+  if (ptr == nullptr) {
+    DLOG << "fpga_malloc failed for " << size << " bytes";
+    return ptr;
+  }
+  counter += size;
+  memory_map.insert(std::make_pair(ptr, size));
+  //  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
+  //       << counter << " bytes";
+  return ptr;
+}
+
+// Releases a buffer previously returned by fpga_malloc. Pointers that are
+// not currently recorded (including nullptr) are logged and left untouched.
+void fpga_free(void *ptr) {
+  static uint64_t counter = 0;
+  size_t size = 0;
+  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
+  if (iter != memory_map.end()) {
+    size = iter->second;
+    memory_map.erase(iter);
+#ifdef PADDLE_MOBILE_ZU5
+    driver::fpga_free_driver(ptr);
+#else
+    std::free(ptr);
+#endif
+    counter += size;
+    //    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
+    //         << counter << " bytes";
+  } else {
+    DLOG << "Invalid pointer";
+  }
+}
+
+// Copies `num` bytes from `src` to `dest` (driver copy routine on
+// PADDLE_MOBILE_ZU5 builds, std::memcpy otherwise).
+void fpga_copy(void *dest, const void *src, size_t num) {
+#ifdef PADDLE_MOBILE_ZU5
+  driver::fpga_copy_driver(dest, src, num);
+#else
+  std::memcpy(dest, src, num);
+#endif
+}
+
+// Forwards to driver::fpga_flush_driver() on PADDLE_MOBILE_ZU5 builds and
+// returns its result; otherwise does nothing and returns 0.
+int fpga_flush(void *address, size_t size) {
+#ifdef PADDLE_MOBILE_ZU5
+  return driver::fpga_flush_driver(address, size);
+#else
+  return 0;
+#endif
+}
+
+// Forwards to driver::fpga_invalidate_driver() on PADDLE_MOBILE_ZU5 builds
+// and returns its result; otherwise does nothing and returns 0.
+int fpga_invalidate(void *address, size_t size) {
+#ifdef PADDLE_MOBILE_ZU5
+  return driver::fpga_invalidate_driver(address, size);
+#else
+  return 0;
+#endif
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/fpga_common.h b/src/fpga/common/fpga_common.h
similarity index 91%
rename from src/fpga/V2/fpga_common.h
rename to src/fpga/common/fpga_common.h
index 1862d843503ee8faf58caf038202e198ca079905..430014ef654ec2f00eeb2548012e4ae716f4aa8b 100644
--- a/src/fpga/V2/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <cstddef>
 #include <cstdint>
 
 namespace paddle_mobile {
@@ -117,9 +118,19 @@ struct BypassArgs {
 struct DeconvArgs {
   struct ConvArgs conv_arg;
 };
+
 static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
+
 int16_t fp32_2_fp16(float fp32_num);
 float fp16_2_fp32(int16_t fp16_num);
 
+int open_device();
+int close_device();
+void* fpga_malloc(size_t size);
+void fpga_free(void* ptr);
+void fpga_copy(void* dest, const void* src, size_t num);
+int fpga_flush(void* address, size_t size);
+int fpga_invalidate(void* address, size_t size);
+
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/driver/pe.h b/src/fpga/common/pe.h
similarity index 96%
rename from src/fpga/V2/driver/pe.h
rename to src/fpga/common/pe.h
index 4903bf4c33f6b5d5899c56eeaada8c7a21d1a875..0da13b8396b7f6a7960dfbb36337f3b38c7ac865 100644
--- a/src/fpga/V2/driver/pe.h
+++ b/src/fpga/common/pe.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
-#include "fpga/V2/fpga_common.h"
+#include "fpga/common/fpga_common.h"
 
 namespace paddle_mobile {
 namespace fpga {
diff --git a/tools/op.cmake b/tools/op.cmake
index db2e801e45a0a5ef7ab1ae11de8325e34a718ec1..3c70f1754fbdddd9594cb25731979f17137f66d4 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -102,7 +102,6 @@ if (CON GREATER -1)
   set(MUL_OP ON)
   set(RESHAPE_OP ON)
   set(SOFTMAX_OP ON)
-
   set(FOUND_MATCH ON)
 endif()
 
@@ -120,7 +119,6 @@ if (CON GREATER -1)
   set(SOFTMAX_OP ON)
   set(FUSION_CONVBNRELU_OP ON)
   set(FUSION_CONVBN_OP ON)
-  set(FUSION_CONVADD_OP ON)
   set(FOUND_MATCH ON)
 endif()