Merge pull request #1269 from zhangyang0701/develop

Update FPGA V2 close #1268

Merge pull request #1269 from zhangyang0701/develop
Update FPGA V2 close #1268
afd4cffe · qnqinan · GitHub · 54b6fd3f · bfadcc5e · afd4cffe
23 changed files
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -13,46 +13,30 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "fpga/V2/api.h"
-#include <fcntl.h>
-#include <sys/ioctl.h>
 #include <algorithm>
-#include <map>
 #include "fpga/V2/bias_scale.h"
+#include "fpga/V2/config.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
-#define FPGA_TEST_MODE
-// #define PADDLE_MOBILE_OS_LINUX
 namespace paddle_mobile {
 namespace fpga {
-static int fd = -1;
-static const char *device_path = "/dev/fpgadrv0";
 static std::map<void *, size_t> memory_map;
-static inline int do_ioctl(int req, const void *arg) {
+int open_device() {
-#ifdef PADDLE_MOBILE_OS_LINUX
+  int ret = open_device_driver();
-  int result = ioctl(fd, req, (uint64_t)arg);
+  return ret;
-  PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly");
-  return result;
-#else
-  return -1;
-#endif
 }
-int open_device() {
+int close_device() {
-  if (fd == -1) {
+  int ret = close_device_driver();
-    fd = open(device_path, O_RDWR);
+  return ret;
-  }
-  return fd;
 }
-// memory management;
 void *fpga_malloc(size_t size) {
  static uint64_t counter = 0;
+#ifdef PADDLE_MOBILE_ZU5
-#ifdef PADDLE_MOBILE_OS_LINUX
+  auto ptr = fpga_malloc_driver(size);
-  auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 #else
  auto ptr = malloc(size);
 #endif
@@ -66,13 +50,12 @@ void *fpga_malloc(size_t size) {
 void fpga_free(void *ptr) {
  static uint64_t counter = 0;
  size_t size = 0;
  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
  if (iter != memory_map.end()) {
    size = iter->second;
    memory_map.erase(iter);
-#ifdef PADDLE_MOBILE_OS_LINUX
+#ifdef PADDLE_MOBILE_ZU5
-    munmap(ptr, size);
+    fpga_free_driver(ptr);
 #else
    free(ptr);
 #endif
@@ -84,24 +67,6 @@ void fpga_free(void *ptr) {
  }
 }
-void fpga_copy(void *dest, const void *src, size_t num) {
-  memcpy(dest, src, num);
-}
-int fpga_flush(void *address, size_t size) {
-  struct MemoryCacheArgs args = {nullptr};
-  args.address = address;
-  args.size = size;
-  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
-}
-int fpga_invalidate(void *address, size_t size) {
-  struct MemoryCacheArgs args = {nullptr};
-  args.address = address;
-  args.size = size;
-  return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
-}
 half fp32_2_fp16(float fp32_num) {
  unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
  auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
@@ -123,136 +88,13 @@ float fp16_2_fp32(half fp16_num) {
  return fp32_num;
 }
-int ComputeBasicConv(const struct ConvArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "======Compute Basic Conv======";
-  DLOG << "   relu_enabled:" << args.relu_enabled
-       << "   sb_address:" << args.sb_address
-       << "   filter_address:" << args.filter_address
-       << "   filter_num:" << args.filter_num
-       << "   group_num:" << args.group_num;
-  DLOG << "   image_address:" << args.image.address
-       << "   image_scale_address:" << args.image.scale_address
-       << "   image_channels:" << args.image.channels
-       << "   image_height:" << args.image.height
-       << "   image_width:" << args.image.width
-       << "   pad_height:" << args.image.pad_height
-       << "   pad_width:" << args.image.pad_width;
-  DLOG << "   kernel_height:" << args.kernel.height
-       << "   kernel_width:" << args.kernel.width
-       << "   stride_h:" << args.kernel.stride_h
-       << "   stride_w:" << args.kernel.stride_w;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-  return do_ioctl(IOCTL_CONFIG_CONV, &args);
-}
-int ComputeFpgaConv(const struct SplitConvArgs &args) {
-  ComputeBasicConv(args.conv_args[0]);
-}
-int ComputeFpgaPool(const struct PoolingArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaPool===========";
-  DLOG << "   mode:" << args.mode
-       << "   kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
-  DLOG << "   image_address:" << args.image.address
-       << "   image_scale_address:" << args.image.scale_address
-       << "   image_channels:" << args.image.channels
-       << "   image_height:" << args.image.height
-       << "   image_width:" << args.image.width
-       << "   pad_height:" << args.image.pad_height
-       << "   pad_width:" << args.image.pad_width;
-  DLOG << "   kernel_height:" << args.kernel.height
-       << "   kernel_width:" << args.kernel.width
-       << "   stride_h:" << args.kernel.stride_h
-       << "   stride_w:" << args.kernel.stride_w;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-  return do_ioctl(IOCTL_CONFIG_POOLING, &args);
-}
-int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaEWAdd===========";
-  DLOG << "   relu_enabled:" << args.relu_enabled
-       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
-       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
-  DLOG << "   image0_address:" << args.image0.address
-       << "   image0_scale_address:" << args.image0.scale_address
-       << "   image0_channels:" << args.image0.channels
-       << "   image0_height:" << args.image0.height
-       << "   image0_width:" << args.image0.width
-       << "   pad0_height:" << args.image0.pad_height
-       << "   pad0_width:" << args.image0.pad_width;
-  DLOG << "   image1_address:" << args.image1.address
-       << "   image1_scale_address:" << args.image1.scale_address
-       << "   image1_channels:" << args.image1.channels
-       << "   image1_height:" << args.image1.height
-       << "   image1_width:" << args.image1.width
-       << "   pad1_height:" << args.image1.pad_height
-       << "   pad_width:" << args.image1.pad_width;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-  return do_ioctl(IOCTL_CONFIG_EW, &args);
-}
-int PerformBypass(const struct BypassArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaBypass===========";
-  DLOG << "   input_type:" << args.input_data_type
-       << "   output_type:" << args.output_data_type
-       << "   input_layout_type:" << args.input_layout_type
-       << "   output_layout_type:" << args.output_layout_type;
-  DLOG << "   image_address:" << args.image.address
-       << "   image_scale_address:" << args.image.scale_address
-       << "   image_channels:" << args.image.channels
-       << "   image_height:" << args.image.height
-       << "   image_width:" << args.image.width
-       << "   pad_height:" << args.image.pad_height
-       << "   pad_width:" << args.image.pad_width;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
-#endif
-  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
-}
-int ComputeFPGAConcat(const struct ConcatArgs &args) {
-#ifdef FPGA_TEST_MODE
-  DLOG << "=============ComputeFpgaConcat===========";
-  DLOG << "   Image_num: " << args.image_num
-       << "   out_address:" << args.image_out
-       << "   out_scale_address:" << args.scale_out
-       << "   out_channel:" << args.out_channel;
-  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
-  for (int i = 0; i < args.image_num; i++) {
-    DLOG << "   " << i << "th:        ";
-    DLOG << "   channel_num:" << args.channel_num[i]
-         << "   aligned_channel_num:" << args.aligned_channel_num[i]
-         << "   image_address:" << args.images_in[i]
-         << "   image_scale_address:" << args.scales_in[i];
-  }
-#endif
-  image::concat_images(args.images_in, args.scales_in, args.image_out,
-                       args.scale_out, args.image_num, args.channel_num,
-                       args.height, args.width, args.aligned_channel_num,
-                       args.out_channel);
-  return 0;
-}
 void format_image(framework::Tensor *image_tensor) {
  auto dims = image_tensor->dims();
  auto channel = dims[1], height = dims[2], width = dims[3];
  auto data_ptr = image_tensor->data<float>();
  size_t memory_size = channel * height * width * sizeof(float);
  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
-  fpga_copy(new_data, data_ptr, memory_size);
+  memcpy(new_data, data_ptr, memory_size);
  int aligned_channel = filter::calc_aligned_channel((int)channel);  // NOLINT
  image::format_image(&new_data, (int)channel, (int)height,          // NOLINT
                      (int)width,                                    // NOLINT
@@ -265,7 +107,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
  size_t memory_size = 0;
  if (dims.size() == 4) {
    auto height = dims[2], width = dims[3];
-    memory_size = height * width * aligned_channel * sizeof(half);
+    memory_size = (height + 1) / 2 * 2 * width * aligned_channel * sizeof(half);
  } else if (dims.size() == 2) {
    memory_size = aligned_channel * sizeof(half);
  } else {
@@ -319,7 +161,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
  auto data_ptr = filter_tensor->data<float>();
  size_t memory_size = num * channel * height * width * sizeof(float);
  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
-  fpga_copy(new_data, data_ptr, memory_size);
+  memcpy(new_data, data_ptr, memory_size);
  filter::format_filter(&new_data, (int)num, (int)channel,  // NOLINT
                        (int)height,                        // NOLINT
                        (int)width, group_num, max_value);  // NOLINT
@@ -334,7 +176,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  auto data_ptr = filter_tensor->data<float>();
  size_t memory_size = num * channel * height * width * sizeof(float);
  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
-  fpga_copy(new_data, data_ptr, memory_size);
+  memcpy(new_data, data_ptr, memory_size);
  filter::format_fc_filter(&new_data, (int)num, (int)channel,  // NOLINT
                           (int)height,                        // NOLINT
                           (int)width, 1, max_value);          // NOLINT

--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -18,6 +18,8 @@ limitations under the License. */
 #include <cstddef>
 #include <iostream>
 #include <limits>
+#include "fpga/V2/driver/driver.h"
+#include "fpga/V2/driver/pe.h"
 #include "framework/tensor.h"
 namespace paddle_mobile {
@@ -33,16 +35,6 @@ enum LayoutType {
  LAYOUT_HWC = 0,
 };
-struct VersionArgs {
-  void* buffer;
-};
-struct MemoryCopyArgs {
-  void* src;
-  void* dest;
-  size_t size;
-};
 struct KernelArgs {
  uint32_t width;
  uint32_t height;
@@ -128,56 +120,10 @@ struct BypassArgs {
  struct ImageOutputArgs output;
 };
-struct FpgaRegWriteArgs {
-  uint64_t address;  //
-  uint64_t value;
-};
-struct FpgaRegReadArgs {
-  uint64_t address;
-  uint64_t value;
-};
-struct MemoryCacheArgs {
-  void* address;
-  size_t size;
-};
-#define IOCTL_FPGA_MAGIC 'FPGA'
-#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
-#define IOCTL_SEPARATOR_0 10
-#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
-#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
-#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
-#define IOCTL_SEPARATOR_1 20
-#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
-#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
-#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
-#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
-#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
-#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
-//============================== API =============================
 int open_device();
 int close_device();
 void* fpga_malloc(size_t size);
 void fpga_free(void* ptr);
-void fpga_copy(void* dst, const void* src, size_t num);
-int fpga_flush(void* address, size_t size);
-int fpga_invalidate(void* address, size_t size);
-int PerformBypass(const struct BypassArgs& args);
-int ComputeFpgaConv(const struct SplitConvArgs& args);
-int ComputeFpgaPool(const struct PoolingArgs& args);
-int ComputeFpgaEWAdd(const struct EWAddArgs& args);
-int ComputeFPGAConcat(const struct ConcatArgs& args);
 static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

--- a/src/fpga/V2/bias_scale.cpp
+++ b/src/fpga/V2/bias_scale.cpp
@@ -39,7 +39,6 @@ void align_element(float **data_in, int num, int num_after_alignment) {
 void format_bias_scale_array(float **data_in, int num,
                             int num_after_alignment) {
  align_element(data_in, num, num_after_alignment);
-  fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float));
 }
 }  // namespace bias_scale

--- a/src/fpga/V2/config.h
+++ b/src/fpga/V2/config.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+// Build-time switches for the FPGA V2 backend.
+// PADDLE_MOBILE_ZU5: when defined, fpga_malloc()/fpga_free() in api.cpp use
+// fpga_malloc_driver()/fpga_free_driver() instead of malloc()/free().
+#define PADDLE_MOBILE_ZU5
+// FPGA_PRINT_MODE: when defined, the PE routines in driver/pe.cpp log their
+// arguments through DLOG.
+#define FPGA_PRINT_MODE
--- a/src/fpga/V2/driver/bitmap.cpp
+++ b/src/fpga/V2/driver/bitmap.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "fpga/V2/driver/bitmap.h"
+namespace fpga_bitmap {
+// Sets `len` consecutive bits in `map`, starting at bit index `start`.
+// The range may cross 64-bit word boundaries.
+void bitmap_set(uint64_t *map, unsigned int start, int len) {
+  // First word touched by the range.
+  uint64_t *p = map + BIT_WORD(start);
+  // One past the last bit index of the range.
+  const unsigned int size = start + len;
+  // Number of bits from `start` up to the end of its word.
+  int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+  uint64_t mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+  // Handle every word that the range covers up to its top bit.
+  while (len - bits_to_set >= 0) {
+    *p |= mask_to_set;
+    len -= bits_to_set;
+    bits_to_set = BITS_PER_LONG;
+    mask_to_set = ~0UL;
+    p++;
+  }
+  // The range ends inside the current word: trim the mask at `size`.
+  if (len) {
+    mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+    *p |= mask_to_set;
+  }
+}
+// Clears `len` consecutive bits in `map`, starting at bit index `start`.
+// The range may cross 64-bit word boundaries.
+void bitmap_clear(uint64_t *map, unsigned int start, int len) {
+  // First word touched by the range.
+  uint64_t *p = map + BIT_WORD(start);
+  // One past the last bit index of the range.
+  const unsigned int size = start + len;
+  // Number of bits from `start` up to the end of its word.
+  int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+  uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+  // Handle every word that the range covers up to its top bit.
+  while (len - bits_to_clear >= 0) {
+    *p &= ~mask_to_clear;
+    len -= bits_to_clear;
+    bits_to_clear = BITS_PER_LONG;
+    mask_to_clear = ~0UL;
+    p++;
+  }
+  // The range ends inside the current word: trim the mask at `size`.
+  if (len) {
+    mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+    *p &= ~mask_to_clear;
+  }
+}
+// Returns the zero-based index of the least significant set bit of `data`.
+// Returns 0 when `data` is 0; _find_next_bit() only passes non-zero words.
+static uint64_t ffs(uint64_t data) {
+  // Scan every bit of the word. The previous bound was sizeof(data), i.e. 8,
+  // so only the lowest byte was examined, and `1 << i` was an int shift that
+  // cannot address bits 31..63.
+  const uint64_t bit_count = sizeof(data) * 8;
+  for (uint64_t i = 0; i < bit_count; i++) {
+    if (data & (static_cast<uint64_t>(1) << i)) {
+      return i;
+    }
+  }
+  return 0;
+}
+// Finds the first bit at index >= `start` whose value, after XOR with
+// `invert`, is 1. Pass invert = 0 to search for set bits and invert = ~0 to
+// search for clear bits.
+// Returns the bit index, or `nbits` when no such bit exists below `nbits`.
+static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits,
+                               uint64_t start, uint64_t invert) {
+  if (!nbits || start >= nbits) return nbits;
+  uint64_t tmp = addr[start / BITS_PER_LONG] ^ invert;
+  /* Handle 1st word: ignore the bits below `start`. */
+  tmp &= BITMAP_FIRST_WORD_MASK(start);
+  // Move `start` to the first bit of its word. This is computed directly
+  // because the result must be the word base, not the offset inside the word.
+  start -= start % BITS_PER_LONG;
+  while (!tmp) {
+    start += BITS_PER_LONG;
+    if (start >= nbits) return nbits;
+    tmp = addr[start / BITS_PER_LONG] ^ invert;
+  }
+  // `tmp` is non-zero here, so ffs() yields a valid bit offset.
+  const uint64_t found = start + ffs(tmp);
+  return found < nbits ? found : nbits;
+}
+// Returns the index of the first clear bit at or after `offset`, or `size`
+// when every bit in [offset, size) is set.
+uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size,
+                            uint64_t offset) {
+  // Inverting every word turns the search for a 0 into a search for a 1.
+  const uint64_t invert_all = ~0UL;
+  return _find_next_bit(addr, size, offset, invert_all);
+}
+// Returns the index of the first set bit at or after `offset`, or `size`
+// when every bit in [offset, size) is clear.
+uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) {
+  const uint64_t invert_none = 0UL;
+  return _find_next_bit(addr, size, offset, invert_none);
+}
+// Searches `map` (holding `size` bits) for a run of `nr` clear bits that
+// begins at or after `start`. The candidate start index is aligned with
+// `align_mask` after adding `align_offset`.
+// Returns the first index of the run. When no run fits, the returned value
+// is greater than `size`.
+uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size,
+                                        uint64_t start, unsigned int nr,
+                                        uint64_t align_mask,
+                                        uint64_t align_offset) {
+  for (;;) {
+    uint64_t candidate = find_next_zero_bit(map, size, start);
+    /* Align allocation */
+    candidate =
+        __ALIGN_MASK(candidate + align_offset, align_mask) - align_offset;
+    const uint64_t run_end = candidate + nr;
+    if (run_end > size) return run_end;
+    // Look for a set bit inside the candidate run.
+    const uint64_t blocker = find_next_bit(map, run_end, candidate);
+    if (blocker >= run_end) {
+      return candidate;
+    }
+    // The run is interrupted: retry right after the blocking bit.
+    start = blocker + 1;
+  }
+}
+// Same search as bitmap_find_next_zero_area_off() with an alignment offset
+// of zero.
+uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
+                                    uint64_t start, unsigned int nr,
+                                    uint64_t align_mask) {
+  const uint64_t no_align_offset = 0;
+  return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask,
+                                        no_align_offset);
+}
+}  // namespace fpga_bitmap
--- a/src/fpga/V2/driver/bitmap.h
+++ b/src/fpga/V2/driver/bitmap.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <stdint.h>
+#include <stdio.h>
+// Number of bits in one bitmap word (the bitmap is an array of uint64_t).
+#define BITS_PER_LONG 64
+// Index of the word that holds bit `nr`.
+#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
+// Mask selecting bit (start % 64) and every higher bit of a word.
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
+// Mask selecting the bits of the last word that lie below bit index `nbits`.
+#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
+// Rounds `x` up to the next multiple of (mask + 1); mask must be 2^k - 1.
+#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
+#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
+// Rounds `x` down to a multiple of `y`; `y` must be a power of two.
+// The low bits have to be cleared (note the `~`); without it the macro
+// returned x % y instead.
+#define round_down(x, y) ((x) & ~((y)-1))
+namespace fpga_bitmap {
+// Sets `len` bits of `map`, starting at bit index `start`.
+void bitmap_set(uint64_t *map, unsigned int start, int len);
+// Clears `len` bits of `map`, starting at bit index `start`.
+void bitmap_clear(uint64_t *map, unsigned int start, int len);
+// Finds a run of `nr` clear bits at or after `start` in a bitmap of `size`
+// bits, aligned according to `align_mask`. Returns the first index of the
+// run, or a value greater than `size` when no run fits.
+uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
+                                    uint64_t start, unsigned int nr,
+                                    uint64_t align_mask);
+}  // namespace fpga_bitmap
--- a/src/fpga/V2/driver/driver.cpp
+++ b/src/fpga/V2/driver/driver.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include "common/enforce.h"
+#include "fpga/V2/driver/bitmap.h"
+#include "fpga/V2/driver/driver.h"
+namespace paddle_mobile {
+namespace fpga {
+// Process-wide driver state (device descriptors, register mapping, allocator
+// bookkeeping). Its fields are assigned in open_device_driver().
+struct FPGA_INFO g_fpgainfo;
+// Opens the driver device node unless a descriptor is already stored.
+// Returns the stored descriptor, which is -1 when open() failed.
+int open_drvdevice() {
+  if (g_fpgainfo.fd_drv != -1) {
+    return g_fpgainfo.fd_drv;
+  }
+  g_fpgainfo.fd_drv = open(g_fpgainfo.drvdevice_path, O_RDWR);
+  return g_fpgainfo.fd_drv;
+}
+// Opens the memory device node (O_RDWR | O_DSYNC) unless a descriptor is
+// already stored. Returns the stored descriptor, which is -1 when open()
+// failed.
+int open_memdevice() {
+  if (g_fpgainfo.fd_mem != -1) {
+    return g_fpgainfo.fd_mem;
+  }
+  g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
+  return g_fpgainfo.fd_mem;
+}
+// Placeholder for resetting the programmable logic. The register write is
+// commented out, so at present this only sleeps for 100 ms.
+void pl_reset() {
+  // DLOG << "PL RESET";
+  // reg_writeq(0x5a, REG_FPGA_RESET);
+  usleep(100 * 1000);
+}
+// Initialises one processing-element descriptor and registers it.
+//   pe_data   - owning container; `pe` is stored in pe_data->pes[pe_idx] and
+//               pe_data->pe_num is incremented.
+//   pe        - descriptor to initialise; it is zeroed first.
+//   type_name - label copied into pe->type_name (truncated by snprintf to
+//               fit MAX_TYPE_NAME_LENTH bytes including the terminator).
+//   pe_idx    - slot in pe_data->pes; not range-checked here.
+void setup_pe(struct pe_data_s *pe_data, struct fpga_pe *pe,
+              char const *type_name, int pe_idx) {
+  memset(pe, 0, sizeof(struct fpga_pe));
+  pe->outer = pe_data;
+  snprintf(pe->type_name, MAX_TYPE_NAME_LENTH, "%s", type_name);
+  pe->status = IDLE;
+  pe->interrupt_cnt = 0;
+  pe_data->pes[pe_idx] = pe;
+  pe_data->pe_num++;
+}
+// Allocates the PE bookkeeping structure, registers the CONV, POOLING, EW and
+// BYPASS processing elements and publishes it in g_fpgainfo.pe_data.
+// On allocation failure it logs and returns without touching
+// g_fpgainfo.pe_data.
+void pl_init() {
+  struct pe_data_s *pe_data = nullptr;
+  pl_reset();
+  pe_data = (struct pe_data_s *)malloc(sizeof(struct pe_data_s));
+  if (pe_data == nullptr) {
+    DLOG << "pe_data malloc error!";
+    return;
+  }
+  memset(pe_data, 0, sizeof(struct pe_data_s));
+  pthread_mutex_init(&pe_data->mutex, 0);
+  // Each call stores the embedded descriptor in pes[] and bumps pe_num.
+  setup_pe(pe_data, &pe_data->pe_conv, "CONV", PE_IDX_CONV);
+  setup_pe(pe_data, &pe_data->pe_pooling, "POOLING", PE_IDX_POOLING);
+  setup_pe(pe_data, &pe_data->pe_ew, "EW", PE_IDX_EW);
+  setup_pe(pe_data, &pe_data->pe_bypass, "BYPASS", PE_IDX_BYPASS);
+  g_fpgainfo.pe_data = pe_data;
+}
+// Releases the PE bookkeeping structure created by pl_init().
+// Safe to call when pl_init() failed or when it has already been destroyed:
+// g_fpgainfo.pe_data is null in both cases and nothing is done.
+void pl_destroy() {
+  struct pe_data_s *pe_data = g_fpgainfo.pe_data;
+  if (pe_data == nullptr) {
+    return;
+  }
+  // Drop the global reference first so that no dangling pointer remains.
+  g_fpgainfo.pe_data = nullptr;
+  pthread_mutex_destroy(&pe_data->mutex);
+  free(pe_data);
+}
+// Unlocks the PE mutex held since the matching pl_stop().
+void pl_start() { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); }
+// Locks the PE mutex; pl_start() unlocks it again.
+void pl_stop() { pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); }
+// Resets the programmable logic and returns every registered PE to IDLE.
+// The PE mutex is held for the whole sequence and released exactly once.
+// Previously pl_start() was called twice, which unlocked an already unlocked
+// mutex and reset the PE state without holding the lock.
+void pl_reinit() {
+  struct pe_data_s *pe_data = g_fpgainfo.pe_data;
+  pl_stop();
+  pl_reset();
+  for (int i = 0; i < pe_data->pe_num; i++) {
+    struct fpga_pe *pe = pe_data->pes[i];
+    pe->status = IDLE;
+    pe->interrupt_cnt = 0;
+  }
+  pl_start();
+}
+// Stub: always reports 0; no hardware status is read here.
+int pl_get_status() { return 0; }
+/* Polls register `reg` until it reads `val` or the timeout expires.
+ *   reg  - register offset handed to reg_readq().
+ *   val  - value to wait for.
+ *   time - timeout in microseconds.
+ * Returns 0 when the value was observed and -1 on timeout. */
+int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
+  /* TODO: the accuracy of this timeout still needs to be confirmed. */
+  // Widen before multiplying: time * CPU_FREQ does not fit in an int.
+  const int64_t timeout = static_cast<int64_t>(time) * CPU_FREQ / 1000000;
+  for (int64_t i = 0; i < timeout; i++) {
+    if (val == reg_readq(reg)) {
+      return 0;
+    }
+  }
+  // Loop budget exhausted without a match. The old `i <= timeout` test was
+  // always true, so a timeout was never reported.
+  return -1;
+}
+/* Memory management */
+// Reserves enough FPGA pages to hold `size` bytes.
+//   memory - allocator state (bitmap, per-allocation page counts, mutex).
+//   size   - requested size in bytes; rounded up to whole FPGA pages.
+//   addr   - receives the physical start address on success.
+// Returns 0 on success, -EINVAL for a zero-size request and -ENOMEM when no
+// sufficiently large free run exists.
+int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
+  const uint64_t pages = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
+  // A zero-page request could match index == page_num and write past nr[].
+  if (pages == 0) {
+    return -EINVAL;
+  }
+  // Reject before narrowing to unsigned int so that the count is not
+  // silently truncated.
+  if (pages > memory->page_num) {
+    return -ENOMEM;
+  }
+  const unsigned int nr = (unsigned int)pages;  // NOLINT
+  int ret = 0;
+  pthread_mutex_lock(&memory->mutex);
+  const uint64_t pos = fpga_bitmap::bitmap_find_next_zero_area(
+      memory->bitmap, memory->page_num, 0, nr, 0);
+  // The search reports failure by returning an index beyond the bitmap.
+  if (pos + nr <= memory->page_num) {
+    fpga_bitmap::bitmap_set(memory->bitmap, (unsigned int)pos, nr);  // NOLINT
+    // Remember the run length at its first page.
+    memory->nr[pos] = nr;
+    *addr = memory->mem_start + pos * FPGA_PAGE_SIZE;
+  } else {
+    ret = -ENOMEM;
+  }
+  pthread_mutex_unlock(&memory->mutex);
+  return ret;
+}
+// Marks every page of the allocator as free by clearing the whole bitmap
+// under the allocator mutex. `memory` must not be null.
+void memory_release(struct fpga_memory *memory) {
+  pthread_mutex_lock(&memory->mutex);
+  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
+  pthread_mutex_unlock(&memory->mutex);
+}
+// Allocates the page bitmap and the per-page run-length table for `memory`
+// and fills in its page counts and address range.
+//   memory      - allocator structure to populate.
+//   memory_size - managed size in bytes; divided by FPGA_PAGE_SIZE.
+// Returns 0 on success or -EFAULT when an allocation fails.
+int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
+  int rc = 0;
+  uint64_t *bitmap = nullptr;
+  unsigned int *nr = nullptr;
+  // Creating more than one memory instance is not allowed, so no mutual
+  // exclusion is needed while the memory structure is being created.
+  // pthread_mutex_lock(&memory->mutex);
+  memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE);
+  // Number of 64-bit words needed to hold page_num bits.
+  memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG);
+  // The bitmap is not zeroed here; init_fpga_memory() clears it.
+  bitmap =
+      (uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long);  // NOLINT
+  if (!bitmap) {
+    rc = -EFAULT;
+    return rc;
+  }
+  memory->bitmap = bitmap;
+  nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int));
+  if (!nr) {
+    rc = -EFAULT;
+    // memory->bitmap still holds the freed pointer after this point.
+    free(bitmap);
+    return rc;
+  }
+  memory->nr = nr;
+  memory->mem_start = FPGA_MEM_PHY_ADDR;
+  // NOTE(review): mem_end is set to the region size, not to
+  // mem_start + size - confirm which one is intended.
+  memory->mem_end = FPGA_MEM_SIZE;
+  // pthread_mutex_unlock(memory->mutex);
+  return rc;
+}
+// Allocates and populates an FPGA memory allocator.
+//   memory_info - receives the new allocator, or nullptr on failure so that
+//                 the caller never ends up holding a freed pointer.
+// Returns 0 on success or a negative errno value on failure.
+int create_fpga_memory(struct fpga_memory **memory_info) {
+  struct fpga_memory *memory =
+      (struct fpga_memory *)malloc(sizeof(struct fpga_memory));  // NOLINT
+  if (memory == nullptr) {
+    *memory_info = nullptr;
+    return -EFAULT;
+  }
+  pthread_mutex_init(&memory->mutex, nullptr);
+  const int rc = create_fpga_memory_inner(memory, FPGA_MEM_SIZE);
+  if (rc) {
+    // Undo the partial construction, including the mutex.
+    pthread_mutex_destroy(&memory->mutex);
+    free(memory);
+    memory = nullptr;
+  }
+  *memory_info = memory;
+  return rc;
+}
+// Puts the allocator bitmap into its initial state: all pages free except
+// page 0, which is reserved.
+// Returns 0 on success or -EFAULT when `memory` is null.
+int init_fpga_memory(struct fpga_memory *memory) {
+  if (memory == nullptr) {
+    return -EFAULT;
+  }
+  // spin_lock_init(&memory->spin);
+  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
+  // NOTE reserve fpga page 0.
+  fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);
+  return 0;
+}
+// Frees the run-length table, the bitmap and the allocator structure
+// itself. A null `memory` is ignored.
+void destroy_fpga_memory(struct fpga_memory *memory) {
+  if (memory == nullptr) {
+    return;
+  }
+  free(memory->nr);
+  free(memory->bitmap);
+  free(memory);
+}
+// Creates and initialises the global allocator g_fpgainfo.memory_info.
+// Returns 0 on success or the error code of the failing step; when the
+// initialisation fails, the allocator is destroyed again.
+int fpga_memory_add() {
+  const int create_rc = create_fpga_memory(&g_fpgainfo.memory_info);
+  if (create_rc != 0) {
+    return create_rc;
+  }
+  const int init_rc = init_fpga_memory(g_fpgainfo.memory_info);
+  if (init_rc == 0) {
+    return 0;
+  }
+  destroy_fpga_memory(g_fpgainfo.memory_info);
+  return init_rc;
+}
+// Looks up the physical address recorded for a virtual address returned by
+// fpga_malloc_driver(). Logs "Invalid pointer" and returns 0 when the
+// address is unknown.
+uint64_t vaddr_to_paddr(void *address) {
+  const auto entry = g_fpgainfo.fpga_vaddr2paddr_map.find(address);
+  if (entry == g_fpgainfo.fpga_vaddr2paddr_map.end()) {
+    DLOG << "Invalid pointer";
+    return 0;
+  }
+  return entry->second;
+}
+// Maps `size` bytes of the FPGA register space (offset FPGA_REG_PHY_ADDR of
+// the driver device) into the process and records the mapping size so that
+// fpga_free_driver() can unmap it.
+// NOTE(review): the result of mmap64() is not checked (the enforce below is
+// commented out), so a failed mapping is recorded and returned as is.
+void *fpga_reg_malloc(size_t size) {
+  void *ret = nullptr;
+  ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+               g_fpgainfo.fd_drv, FPGA_REG_PHY_ADDR);
+  // PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
+  g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size));
+  return ret;
+}
+// Reserves FPGA pages for `size` bytes and maps them into the process.
+// Returns the virtual address of the mapping, or nullptr when no FPGA pages
+// could be reserved. The physical address and the size are recorded for
+// vaddr_to_paddr() and fpga_free_driver().
+void *fpga_malloc_driver(size_t size) {
+  uint64_t phy_addr = 0;
+  const int rc = memory_request(g_fpgainfo.memory_info, size, &phy_addr);
+  PADDLE_MOBILE_ENFORCE(rc == 0, "FPGA memory request failed");
+  if (rc != 0) {
+    // Never map physical address 0 on behalf of a failed reservation.
+    return nullptr;
+  }
+  void *ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                     g_fpgainfo.fd_mem, phy_addr);
+  PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
+  // Assign instead of insert(): a virtual address may be handed out again
+  // by mmap, and insert() would silently keep the stale entry.
+  g_fpgainfo.fpga_vaddr2paddr_map[ret] = phy_addr;
+  g_fpgainfo.fpga_addr2size_map[ret] = size;
+  return ret;
+}
+// Unmaps a mapping created by fpga_malloc_driver() or fpga_reg_malloc().
+// For fpga_malloc_driver() mappings it also forgets the recorded physical
+// address and returns the pages to the allocator bitmap. Unknown pointers
+// are logged and ignored.
+void fpga_free_driver(void *ptr) {
+  auto size_iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
+  if (size_iter == g_fpgainfo.fpga_addr2size_map.end()) {
+    DLOG << "Invalid pointer";
+    return;
+  }
+  const size_t size = size_iter->second;
+  g_fpgainfo.fpga_addr2size_map.erase(size_iter);
+  munmap(ptr, size);
+
+  // Register mappings have no physical-address entry; nothing more to do.
+  auto paddr_iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr);
+  if (paddr_iter == g_fpgainfo.fpga_vaddr2paddr_map.end()) {
+    return;
+  }
+  const uint64_t paddr = paddr_iter->second;
+  g_fpgainfo.fpga_vaddr2paddr_map.erase(paddr_iter);
+
+  struct fpga_memory *memory = g_fpgainfo.memory_info;
+  if (memory == nullptr || paddr < memory->mem_start) {
+    return;
+  }
+  const uint64_t pos = (paddr - memory->mem_start) / FPGA_PAGE_SIZE;
+  if (pos >= memory->page_num) {
+    return;
+  }
+  // Give the pages back; memory_request() stored the run length at nr[pos].
+  pthread_mutex_lock(&memory->mutex);
+  fpga_bitmap::bitmap_clear(memory->bitmap, (unsigned int)pos,  // NOLINT
+                            (int)memory->nr[pos]);              // NOLINT
+  memory->nr[pos] = 0;
+  pthread_mutex_unlock(&memory->mutex);
+}
+// Initialises the global driver state: opens both device nodes, maps the
+// register space, creates the FPGA memory allocator and sets up the PE
+// bookkeeping.
+// Returns a negative value when a device node could not be opened or the
+// allocator could not be created. On success the value is the bitwise OR of
+// the two descriptors, as before.
+int open_device_driver() {
+  g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR;
+  g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR;
+  g_fpgainfo.FpgaRegVirAddr = nullptr;
+  g_fpgainfo.pe_data = nullptr;
+  g_fpgainfo.drvdevice_path = "/dev/fpgadrv0";
+  g_fpgainfo.memdevice_path = "/dev/fpgamem0";
+  g_fpgainfo.fd_drv = -1;
+  g_fpgainfo.fd_mem = -1;
+  // -1 | x == -1, so a failed open of either node yields -1 here.
+  int ret = open_drvdevice();
+  ret |= open_memdevice();
+  g_fpgainfo.FpgaRegVirAddr =
+      (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE);  // NOLINT
+  // Report an allocator failure instead of dropping it; without the
+  // allocator every later fpga_malloc_driver() call would be invalid.
+  const int memory_rc = fpga_memory_add();
+  pl_init();
+  if (memory_rc != 0) {
+    return memory_rc;
+  }
+  return ret;
+}
+// Tears down everything set up by open_device_driver(): the PE bookkeeping,
+// the register mapping, the memory allocator and both device descriptors.
+// The global pointers and descriptors are reset so that no stale state
+// remains. Always returns 0.
+int close_device_driver() {
+  pl_destroy();
+  g_fpgainfo.pe_data = nullptr;
+  fpga_free_driver(g_fpgainfo.FpgaRegVirAddr);
+  g_fpgainfo.FpgaRegVirAddr = nullptr;
+  memory_release(g_fpgainfo.memory_info);
+  destroy_fpga_memory(g_fpgainfo.memory_info);
+  g_fpgainfo.memory_info = nullptr;
+  // The descriptors were never closed before and leaked on every
+  // open/close cycle.
+  if (g_fpgainfo.fd_mem != -1) {
+    ::close(g_fpgainfo.fd_mem);
+    g_fpgainfo.fd_mem = -1;
+  }
+  if (g_fpgainfo.fd_drv != -1) {
+    ::close(g_fpgainfo.fd_drv);
+    g_fpgainfo.fd_drv = -1;
+  }
+  return 0;
+}
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/driver/driver.h
+++ b/src/fpga/V2/driver/driver.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <cstring>
+#include <map>
+#include "common/log.h"
+namespace paddle_mobile {
+namespace fpga {
+// Integer division of n by d, rounded up.
+#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
+// Offset and size used when mapping the register space of the driver device.
+#define FPGA_REG_PHY_ADDR 0xa0000000
+#define FPGA_REG_SIZE 0x1000
+// Physical start address and size of the memory region handed to the page
+// allocator.
+#define FPGA_MEM_PHY_ADDR 0x20000000
+#define FPGA_MEM_SIZE 0x20000000
+// Used by fpga_regpoll() to turn a timeout in microseconds into a loop count.
+#define CPU_FREQ 1000000000
+// Allocation granularity of the FPGA memory allocator, in bytes.
+#define FPGA_PAGE_SIZE (16UL * 1024UL)
+// PE related macros
+// Capacity of pe_data_s::pes.
+const int MAX_NUM_PES = 6;
+// Maximum length of fpga_pe::type_name, excluding the terminator.
+const size_t MAX_TYPE_NAME_LENTH = 8;
+// Slots of the individual processing elements in pe_data_s::pes.
+const int PE_IDX_CONV = 0;
+const int PE_IDX_POOLING = 1;
+const int PE_IDX_EW = 2;
+const int PE_IDX_BYPASS = 3;
+// State of a processing element.
+enum pe_status { IDLE = 0, BUSY = 1 };
+// Descriptor of one processing element (PE).
+struct fpga_pe {
+  // Short label such as "CONV"; filled in by setup_pe().
+  char type_name[MAX_TYPE_NAME_LENTH + 1];
+  // Container this PE belongs to.
+  struct pe_data_s *outer;
+  // NOTE(review): the comment below mentions -1=fail, but pe_status only
+  // defines IDLE and BUSY.
+  pe_status status;  // 0=idle 1=busy -1=fail
+  // Reset to 0 by setup_pe() and pl_reinit().
+  uint64_t interrupt_cnt;
+};
+// Bookkeeping for all processing elements; allocated by pl_init().
+struct pe_data_s {
+  // Locked by pl_stop() and unlocked by pl_start().
+  pthread_mutex_t mutex;
+  // Embedded descriptors, one per PE type.
+  struct fpga_pe pe_conv;
+  struct fpga_pe pe_pooling;
+  struct fpga_pe pe_ew;
+  struct fpga_pe pe_bypass;
+  // Pointers to the descriptors above, indexed by the PE_IDX_* constants.
+  struct fpga_pe *pes[MAX_NUM_PES];
+  // Number of descriptors registered through setup_pe().
+  int pe_num;
+};
+// State of the page-based FPGA memory allocator.
+struct fpga_memory {
+  // Guards bitmap and nr.
+  pthread_mutex_t mutex;
+  // One bit per page; a set bit marks the page as in use.
+  uint64_t *bitmap;
+  // nr[p] holds the page count of the allocation that starts at page p.
+  unsigned int *nr;
+  // Number of pages managed.
+  unsigned int page_num;
+  // Number of 64-bit words in bitmap.
+  unsigned int page_num_long;
+  // Physical address of page 0.
+  uint64_t mem_start;
+  // NOTE(review): assigned FPGA_MEM_SIZE in create_fpga_memory_inner() -
+  // confirm whether this is meant to be a size or an end address.
+  uint64_t mem_end;
+};
+// Global driver state; the single instance is g_fpgainfo.
+struct FPGA_INFO {
+  // Copies of FPGA_REG_PHY_ADDR and FPGA_MEM_PHY_ADDR.
+  uint64_t FpgaRegPhyAddr;
+  uint64_t FpgaMemPhyAddr;
+  pthread_t poll_pid;
+  // Virtual address of the mapped register space; used by reg_readq() and
+  // reg_writeq().
+  void *FpgaRegVirAddr;
+  // PE bookkeeping created by pl_init().
+  struct pe_data_s *pe_data;
+  // Virtual address -> mapping size, for munmap().
+  std::map<void *, size_t> fpga_addr2size_map;
+  // Virtual address -> physical address, for vaddr_to_paddr().
+  std::map<void *, uint64_t> fpga_vaddr2paddr_map;
+  // Paths of the driver and memory device nodes.
+  const char *drvdevice_path;
+  const char *memdevice_path;
+  // Page allocator for the FPGA memory region.
+  struct fpga_memory *memory_info;
+  // Descriptors of the driver and memory device nodes; -1 when not open.
+  int fd_drv;
+  int fd_mem;
+};
+extern struct FPGA_INFO g_fpgainfo;
+// Reads the 64-bit register located `offset` bytes into the mapped register
+// space.
+inline uint64_t reg_readq(uint32_t offset) {
+  // DLOG << "offset : " << offset;
+  // volatile: the register can change between reads, so the load must not
+  // be cached or hoisted out of a polling loop such as fpga_regpoll().
+  volatile uint64_t *reg =
+      (volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
+                            offset);
+  return *reg;
+}
+// Writes `value` to the 64-bit register located `offset` bytes into the
+// mapped register space.
+inline void reg_writeq(uint64_t value, uint32_t offset) {
+  // DLOG << "offset : " << offset << ", value : " << value;
+  // volatile: the store has a hardware side effect and must not be elided
+  // or merged with neighbouring stores.
+  volatile uint64_t *reg =
+      (volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
+                            offset);
+  *reg = value;
+}
+// Device life cycle.
+int open_device_driver();
+int close_device_driver();
+// Mapped FPGA memory; release with fpga_free_driver().
+void *fpga_malloc_driver(size_t size);
+void fpga_free_driver(void *ptr);
+/* Helpers for the PE routines. */
+// Physical address recorded for a pointer from fpga_malloc_driver(), or 0.
+uint64_t vaddr_to_paddr(void *address);
+// Polls register `reg` for `val`; `time` is the timeout in microseconds.
+int fpga_regpoll(uint64_t reg, uint64_t val, int time);
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/driver/pe.cpp
+++ b/src/fpga/V2/driver/pe.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "fpga/V2/driver/pe.h"
+#include "fpga/V2/config.h"
+#include "fpga/V2/driver/driver.h"
+#include "fpga/V2/filter.h"
+#include "fpga/V2/image.h"
+namespace paddle_mobile {
+namespace fpga {
+// Scales a register index by 8 to obtain the byte offset handed to
+// reg_readq()/reg_writeq(). The argument is parenthesized so that a compound
+// expression such as MUL8(base + i) scales the whole index, not just its last
+// term.
+#define MUL8(x) ((x) * 8)
+// Value PerformBypass() passes to fpga_regpoll() when waiting on register 48.
+#define BYPASS_DONE 1
+// Reads registers 49 through 52 in order. Each 64-bit value is treated as four
+// 16-bit lanes, of which only the low 7 bits are kept. The largest of the 16
+// resulting values is converted with fp16_2_fp32() and divided by 127.
+float Findfp16Max() {
+  uint16_t largest = 0;
+  for (int reg = 49; reg <= 52; ++reg) {
+    const uint64_t packed = reg_readq(MUL8(reg));
+    // Lanes sit at bit offsets 0, 16, 32 and 48 of the register value.
+    for (int shift = 0; shift < 64; shift += 16) {
+      const uint16_t lane =
+          static_cast<uint16_t>(0x0000007f & (packed >> shift));
+      if (largest < lane) {
+        largest = lane;
+      }
+    }
+  }
+  return fp16_2_fp32(largest) / 127.0f;
+}
+// Dispatches the first entry of args.conv_args to ComputeBasicConv() and
+// returns that call's status. Only conv_args[0] is processed here.
+int ComputeFpgaConv(const struct SplitConvArgs &args) {
+  // The status must be returned explicitly: flowing off the end of a
+  // value-returning function is undefined behaviour.
+  return ComputeBasicConv(args.conv_args[0]);
+}
+// Basic convolution entry point. When FPGA_PRINT_MODE is defined the fields of
+// `args` are written out through DLOG. As written, the function performs no
+// register access and returns 0 on every build configuration.
+int ComputeBasicConv(const struct ConvArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "======Compute Basic Conv======";
+  DLOG << "   relu_enabled:" << args.relu_enabled
+       << "   sb_address:" << args.sb_address
+       << "   filter_address:" << args.filter_address
+       << "   filter_num:" << args.filter_num
+       << "   group_num:" << args.group_num;
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   kernel_height:" << args.kernel.height
+       << "   kernel_width:" << args.kernel.width
+       << "   stride_h:" << args.kernel.stride_h
+       << "   stride_w:" << args.kernel.stride_w;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+  // Builds without PADDLE_MOBILE_ZU5 return here.
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+  // Builds with PADDLE_MOBILE_ZU5 reach this point and also return 0.
+  return 0;
+}
+// Pooling entry point. When FPGA_PRINT_MODE is defined the fields of `args`
+// are written out through DLOG (kernel_reciprocal is converted with
+// fp16_2_fp32 for printing). As written, the function performs no register
+// access and returns 0 on every build configuration.
+int ComputeFpgaPool(const struct PoolingArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaPool===========";
+  DLOG << "   mode:" << args.mode
+       << "   kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   kernel_height:" << args.kernel.height
+       << "   kernel_width:" << args.kernel.width
+       << "   stride_h:" << args.kernel.stride_h
+       << "   stride_w:" << args.kernel.stride_w;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+  // Builds without PADDLE_MOBILE_ZU5 return here.
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+  // Builds with PADDLE_MOBILE_ZU5 reach this point and also return 0.
+  return 0;
+}
+// Element-wise add entry point. When FPGA_PRINT_MODE is defined the fields of
+// `args` are written out through DLOG (const0/const1 are converted with
+// fp16_2_fp32 for printing). As written, the function performs no register
+// access and returns 0 on every build configuration.
+int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaEWAdd===========";
+  DLOG << "   relu_enabled:" << args.relu_enabled
+       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
+       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
+  DLOG << "   image0_address:" << args.image0.address
+       << "   image0_scale_address:" << args.image0.scale_address
+       << "   image0_channels:" << args.image0.channels
+       << "   image0_height:" << args.image0.height
+       << "   image0_width:" << args.image0.width
+       << "   pad0_height:" << args.image0.pad_height
+       << "   pad0_width:" << args.image0.pad_width;
+  // The last label is "pad1_width" so it matches the pad0_*/pad1_* naming of
+  // the neighbouring fields and cannot be mistaken for a shared value.
+  DLOG << "   image1_address:" << args.image1.address
+       << "   image1_scale_address:" << args.image1.scale_address
+       << "   image1_channels:" << args.image1.channels
+       << "   image1_height:" << args.image1.height
+       << "   image1_width:" << args.image1.width
+       << "   pad1_height:" << args.image1.pad_height
+       << "   pad1_width:" << args.image1.pad_width;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+  // Builds without PADDLE_MOBILE_ZU5 return here.
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+  // Builds with PADDLE_MOBILE_ZU5 reach this point and also return 0.
+  return 0;
+}
+// Runs a bypass transfer from args.image.address to args.output.address.
+// The (input_data_type, output_data_type) pair selects the mode:
+//   nonzero / zero    -> fp32 to fp16
+//   zero    / nonzero -> fp16 to fp32
+//   zero    / zero    -> fp16 to fp16; afterwards the two output scale entries
+//                        are refreshed from Findfp16Max()
+//   nonzero / nonzero -> rejected
+// Returns 0 when PADDLE_MOBILE_ZU5 is not defined, -1 for a rejected mode or
+// when fpga_regpoll() reports -1, otherwise the value fpga_regpoll() returned.
+// When the poll fails the output scale is left untouched, because the max
+// registers would not describe this transfer.
+int PerformBypass(const struct BypassArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaBypass===========";
+  DLOG << "   input_type:" << args.input_data_type
+       << "   output_type:" << args.output_data_type
+       << "   input_layout_type:" << args.input_layout_type
+       << "   output_layout_type:" << args.output_layout_type;
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+  uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
+  uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
+  uint64_t bp_enable;
+  int64_t length;
+  uint64_t pixels;
+  // fp32->fp16
+  if ((args.input_data_type) && (!args.output_data_type)) {
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(float);
+    bp_enable = 0x8800000000000000 + length;
+  }
+  // fp16->fp32
+  else if ((!args.input_data_type) && (args.output_data_type)) {
+    pixels = filter::calc_aligned_channel((args.image.channels)) *
+             (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);
+    length = align_to_x((int)length, 64);  // NOLINT
+    bp_enable = 0x8a00000000000000 + length;
+  }
+  // fp16->fp16 findmax
+  else if ((!args.input_data_type) && (!args.output_data_type)) {
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);
+    bp_enable = 0x8900000000000000 + length;
+  } else {
+    return -1;
+  }
+  // start bypass: source and destination addresses first, then register 0 is
+  // cleared and loaded with the mode/length word
+  reg_writeq(ifm_src_paddr, MUL8(27));
+  reg_writeq(ifm_dst_paddr, MUL8(28));
+  reg_writeq(0, MUL8(0));
+  reg_writeq(bp_enable, MUL8(0));
+  // poll
+  int ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
+  if (ret == -1) {
+    // The poll failed, so neither the "irq" clear nor the scale update below
+    // applies; report the failure to the caller as-is.
+    return ret;
+  }
+  // clear "irq"
+  reg_readq(MUL8(63));
+  // get max value
+  if ((!args.input_data_type) && (!args.output_data_type)) {
+    float scale = Findfp16Max();
+    args.output.scale_address[0] = (float)(1.0 / scale);  // NOLINT
+    args.output.scale_address[1] = scale;
+  }
+  return ret;
+}
+// Concatenation entry point. When FPGA_PRINT_MODE is defined the output
+// description and each of the args.image_num inputs are written out through
+// DLOG. The actual work is delegated to image::concat_images(); this function
+// always returns 0.
+int ComputeFPGAConcat(const struct ConcatArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaConcat===========";
+  DLOG << "   Image_num: " << args.image_num
+       << "   out_address:" << args.image_out
+       << "   out_scale_address:" << args.scale_out
+       << "   out_channel:" << args.out_channel;
+  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
+  for (int i = 0; i < args.image_num; i++) {
+    DLOG << "   " << i << "th:        ";
+    DLOG << "   channel_num:" << args.channel_num[i]
+         << "   aligned_channel_num:" << args.aligned_channel_num[i]
+         << "   image_address:" << args.images_in[i]
+         << "   image_scale_address:" << args.scales_in[i];
+  }
+#endif
+  // Not guarded by PADDLE_MOBILE_ZU5: the call is made on every build.
+  image::concat_images(args.images_in, args.scales_in, args.image_out,
+                       args.scale_out, args.image_num, args.channel_num,
+                       args.height, args.width, args.aligned_channel_num,
+                       args.out_channel);
+  return 0;
+}
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/driver/pe.h
+++ b/src/fpga/V2/driver/pe.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "fpga/V2/api.h"
+namespace paddle_mobile {
+namespace fpga {
+// Processing-element entry points implemented in pe.cpp. Each takes its
+// argument struct by const reference and returns an int status.
+// PerformBypass() returns -1 for an unsupported input/output type pair.
+int PerformBypass(const struct BypassArgs& args);
+int ComputeBasicConv(const struct ConvArgs& args);
+int ComputeFpgaPool(const struct PoolingArgs& args);
+int ComputeFpgaEWAdd(const struct EWAddArgs& args);
+// ComputeFpgaConv() forwards args.conv_args[0] to ComputeBasicConv().
+int ComputeFpgaConv(const struct SplitConvArgs& args);
+// ComputeFPGAConcat() delegates to image::concat_images() and returns 0.
+int ComputeFPGAConcat(const struct ConcatArgs& args);
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
@@ -94,7 +94,6 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
  convert_to_hwc(data_in, num, channel, height, width);
  align_filter(data_in, num, channel, height, width);
  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
-  fpga_flush(*data_in, pixel_num * sizeof(float));
 }
 void convert_fc_filter(float **data_in, int num, int chw) {
@@ -114,8 +113,6 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
  int chw = channel * height * width;
  convert_fc_filter(data_in, num, chw);
  align_filter(data_in, num, channel, height, width);
-  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
-  fpga_flush(*data_in, pixel_num * sizeof(float));
 }
 float find_max(float *data_in, int data_size) {

--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
@@ -58,7 +58,6 @@ void format_image(float **data_in, int channel, int height, int width,
                  int aligned_channel) {
  convert_to_hwc(data_in, channel, height, width);
  align_image(data_in, channel, height, width, aligned_channel);
-  fpga_flush(*data_in, aligned_channel * height * width * sizeof(float));
 }
 void concat_images(int16_t **images_in, float **scales_in, void *image_out,
@@ -70,8 +69,6 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
  scale_out[1] = 0.0;
  for (int i = 0; i < image_num; i++) {
    scale_out[0] = std::max(*scale_out, scales_in[i][0]);
-    fpga_invalidate(images_in[i],
-                    height * width * aligned_channel_num[i] * sizeof(int16_t));
  }
  scale_out[1] = 1 / scale_out[0];
@@ -86,8 +83,6 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
      tmp_channel_sum += channel_num[i];
    }
  }
-  fpga_flush(image_out, hw * out_channel * sizeof(int16_t));
 }
 }  // namespace image

--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
@@ -40,4 +40,8 @@ REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
 REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(elementwise_add, ops::ElementwiseAddOp);
+#endif
 #endif
--- a/src/operators/kernel/arm/slice_kernel.cpp
+++ b/src/operators/kernel/arm/slice_kernel.cpp
@@ -17,6 +17,14 @@ limitations under the License. */
 #include "operators/kernel/slice_kernel.h"
 namespace paddle_mobile {
-namespace operators {}
+namespace operators {
+template <>
+bool SliceKernel<CPU, float>::Init(SliceParam<CPU>* param) {
+  return true;
+}
+template <>
+void SliceKernel<CPU, float>::Compute(const SliceParam<CPU>& param) {}
+}  // namespace operators
 }  // namespace paddle_mobile
 #endif
--- a/src/operators/kernel/fpga/V2/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/concat_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef CONCAT_OP
 #include "operators/kernel/concat_kernel.h"
+#include "fpga/V2/api.h"
 namespace paddle_mobile {
 namespace operators {
@@ -68,7 +69,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
 template <>
 void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) {
-  ComputeFPGAConcat(param.FpgaArgs());
+  fpga::ComputeFPGAConcat(param.FpgaArgs());
 }
 template class ConcatKernel<FPGA, float>;

--- a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef ELEMENTWISEADD_OP
+#include "operators/kernel/elementwise_add_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Builds the EWAddArgs used later by Compute() and stores them on `param`.
+// The output tensor is formatted with format_fp16_ofm() before its buffer is
+// requested. For each input the data pointer, scale pointer and dims()[1..3]
+// (as channels, height, width) are recorded, with zero padding. ReLU is
+// disabled and both constants are set to 0x3c00. Always returns true.
+template <>
+bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
+  auto *x_tensor = const_cast<LoDTensor *>(param->InputX());
+  auto *y_tensor = const_cast<LoDTensor *>(param->InputY());
+  auto *out_tensor = param->Out();
+  auto x_data = x_tensor->data<float>();
+  auto y_data = y_tensor->data<float>();
+  // Format the output first, then ask for its buffer.
+  int out_channels_aligned =
+      fpga::get_aligned_channel_num(x_tensor->dims()[1]);
+  fpga::format_fp16_ofm(out_tensor, out_channels_aligned);
+  auto out_data = out_tensor->mutable_data<float>();
+  fpga::EWAddArgs add_args = {0};
+  add_args.relu_enabled = false;
+  add_args.const0 = 0x3c00;  // =1
+  add_args.const1 = 0x3c00;  // =1
+  // First operand.
+  auto &lhs = add_args.image0;
+  lhs.address = x_data;
+  lhs.scale_address = x_tensor->scale;
+  lhs.channels = static_cast<uint32_t>(x_tensor->dims()[1]);
+  lhs.height = static_cast<uint32_t>(x_tensor->dims()[2]);
+  lhs.width = static_cast<uint32_t>(x_tensor->dims()[3]);
+  lhs.pad_height = 0;
+  lhs.pad_width = 0;
+  // Second operand.
+  auto &rhs = add_args.image1;
+  rhs.address = y_data;
+  rhs.scale_address = y_tensor->scale;
+  rhs.channels = static_cast<uint32_t>(y_tensor->dims()[1]);
+  rhs.height = static_cast<uint32_t>(y_tensor->dims()[2]);
+  rhs.width = static_cast<uint32_t>(y_tensor->dims()[3]);
+  rhs.pad_height = 0;
+  rhs.pad_width = 0;
+  // Destination.
+  add_args.output.address = out_data;
+  add_args.output.scale_address = out_tensor->scale;
+  param->SetFpgaArgs(add_args);
+  return true;
+}
+// Runs the element-wise add by handing the arguments stored on `param` to
+// fpga::ComputeFpgaEWAdd(); its return value is not inspected.
+template <>
+void ElementwiseAddKernel<FPGA, float>::Compute(
+    const ElementwiseAddParam<FPGA> &param) {
+  fpga::ComputeFpgaEWAdd(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
@@ -21,7 +21,7 @@ namespace operators {
 template <>
 bool ElementwiseAddReluKernel<FPGA, float>::Init(
    ElementwiseAddReluParam<FPGA> *param) {
-  bool relu_enabled = true;
+  bool relu_enabled = false;
  auto *input_x = const_cast<LoDTensor *>(param->InputX());
  auto *input_y = const_cast<LoDTensor *>(param->InputY());
  auto *out = param->Out();

--- a/src/operators/kernel/fpga/V2/slice_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SLICE_OP
+#include "operators/kernel/slice_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// FPGA specialization of the slice kernel initializer: performs no setup and
+// always reports success.
+template <>
+bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
+  return true;
+}
+// FPGA specialization of the slice kernel: the body is empty, so calling it
+// does nothing.
+template <>
+void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
@@ -49,12 +49,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
  Tensor *out = param.Out();
  fpga::PerformBypass(param.FpgaArgs());
-  fpga::fpga_invalidate(
-      (void *)in_x->data<float>(),                           // NOLINT
-      fpga::get_aligned_channel_num((int)in_x->dims()[1]) *  // NOLINT
-          sizeof(float));
  math::SoftmaxFuntor<CPU, float>()(in_x, out);
-  fpga::fpga_flush(out->data<float>(), out->memory_size());
 }
 }  // namespace operators

--- a/src/operators/kernel/slice_kernel.h
+++ b/src/operators/kernel/slice_kernel.h
@@ -24,7 +24,8 @@ template <typename DeviceType, typename T>
 class SliceKernel
    : public framework::OpKernelBase<DeviceType, SliceParam<DeviceType>> {
 public:
-  void Compute(const SliceParam<DeviceType>& param) {}
+  void Compute(const SliceParam<DeviceType>& param);
+  bool Init(SliceParam<DeviceType>* param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -436,6 +436,16 @@ class ConvParam : public OpParam {
 #ifdef PADDLE_MOBILE_CL
  int offset_;
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::SplitConvArgs fpga_conv_args;
+ public:
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 template <typename Dtype>
 Print &operator<<(Print &printer, const ConvParam<Dtype> &conv_param);
@@ -580,15 +590,6 @@ class MulParam : OpParam {
  GType *out_;
  int x_num_col_dims_;
  int y_num_col_dims_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -1641,15 +1642,6 @@ class FusionConvAddParam : public ConvParam<Dtype> {
  RType *bias_;
  int axis_;
  RType *output_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 template <typename Dtype>
@@ -1696,15 +1688,6 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> {
  RType *output_;
  RType *alpha_;
  std::string mode_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -1754,15 +1737,6 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> {
  std::string keyOutput_;
  std::string keyX1_;
  std::string keyY1_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -1829,16 +1803,6 @@ class FusionConvAddBNReluParam : public ConvParam<Dtype> {
  bool is_test_;
  RType *new_bias_;
  RType *new_scale_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -1916,15 +1880,6 @@ class FusionConvBNAddReluParam : public ConvParam<Dtype> {
  std::string keyBNY_;
  std::string keyX_;
  std::string keyY_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -1983,15 +1938,6 @@ class FusionConvBNParam : public ConvParam<Dtype> {
  bool is_test_;
  RType *new_bias_;
  RType *new_scale_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -2058,15 +2004,6 @@ class FusionConvAddBNParam : public ConvParam<Dtype> {
  bool is_test_;
  RType *new_bias_;
  RType *new_scale_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -2184,15 +2121,6 @@ class FusionConvBNReluParam : public ConvParam<Dtype> {
  bool is_test_;
  RType *new_bias_;
  RType *new_scale_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif

--- a/src/operators/slice_op.cpp
+++ b/src/operators/slice_op.cpp
@@ -34,5 +34,7 @@ REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
 #ifdef PADDLE_MOBILE_MALI_GPU
 REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(slice, ops::SliceOp);
+#endif
 #endif
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -130,10 +130,12 @@ if (CON GREATER -1)
  set(FUSION_ELEMENTWISEADDRELU_OP ON)
  set(FUSION_FC_OP ON)
  set(POOL_OP ON)
-  set(CONCAT_OP ON)
  set(SOFTMAX_OP ON)
  set(FUSION_CONVBNRELU_OP ON)
  set(FUSION_CONVBN_OP ON)
+#  set(CONV_TRANSPOSE_OP ON)
+#  set(SLICE_OP ON)
+#  set(ELEMENTWISEADD_OP ON)
  set(FOUND_MATCH ON)
 endif()