Merge pull request #1287 from zhangyang0701/develop

update V2 for FPGA track

Merge pull request #1287 from zhangyang0701/develop
update V2 for FPGA track
693490ce · Chon · GitHub · b328934d · 4d4bd0da · 693490ce
34 changed files
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -71,6 +71,8 @@ const char *G_OP_TYPE_SUM = "sum";
 const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
+// Defined without `extern`, like the other G_OP_TYPE_* definitions in this
+// file; the matching `extern` declarations are in types.h. An initialized
+// definition marked `extern` is legal but triggers a compiler warning.
+const char *G_OP_TYPE_TANH = "tanh";
+const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
 std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
@@ -129,5 +131,7 @@ std::unordered_map<
        {G_OP_TYPE_SUM, {{"X"}, {"Out"}}},
        {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
        {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
-        {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}};
+        {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
+        {G_OP_TYPE_TANH, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}};
 }  // namespace paddle_mobile
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -139,6 +139,9 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
 extern const char *G_OP_TYPE_QUANTIZE;
 extern const char *G_OP_TYPE_DEQUANTIZE;
+extern const char *G_OP_TYPE_TANH;
+extern const char *G_OP_TYPE_FUSION_DECONV_RELU;
 extern std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
    op_input_output_key;

--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -16,27 +16,29 @@ limitations under the License. */
 #include <algorithm>
 #include "fpga/V2/bias_scale.h"
 #include "fpga/V2/config.h"
+#include "fpga/V2/driver/driver.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
 namespace paddle_mobile {
 namespace fpga {
 static std::map<void *, size_t> memory_map;
 int open_device() {
-  int ret = open_device_driver();
+  int ret = driver::open_device_driver();
  return ret;
 }
 int close_device() {
-  int ret = close_device_driver();
+  int ret = driver::close_device_driver();
  return ret;
 }
 void *fpga_malloc(size_t size) {
  static uint64_t counter = 0;
 #ifdef PADDLE_MOBILE_ZU5
-  auto ptr = fpga_malloc_driver(size);
+  auto ptr = driver::fpga_malloc_driver(size);
 #else
  auto ptr = malloc(size);
 #endif
@@ -55,7 +57,7 @@ void fpga_free(void *ptr) {
    size = iter->second;
    memory_map.erase(iter);
 #ifdef PADDLE_MOBILE_ZU5
-    fpga_free_driver(ptr);
+    driver::fpga_free_driver(ptr);
 #else
    free(ptr);
 #endif
@@ -66,26 +68,27 @@ void fpga_free(void *ptr) {
    DLOG << "Invalid pointer";
  }
 }
+void fpga_copy(void *dest, const void *src, size_t num) {
-half fp32_2_fp16(float fp32_num) {
+#ifdef PADDLE_MOBILE_ZU5
-  unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
+  driver::fpga_copy_driver(dest, src, num);
-  auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
+#else
-                  (((tmp & 0x7f800000) >> 13) - (112 << 10)));
+  memcpy(dest, src, num);
-  if (tmp & 0x1000) {
+#endif
-    t++;  // roundoff
-  }
-  return t;
 }
-float fp16_2_fp32(half fp16_num) {
+int fpga_flush(void *address, size_t size) {
-  int frac = (fp16_num & 0x3ff);
+#ifdef PADDLE_MOBILE_ZU5
-  int exp = ((fp16_num & 0x7c00) >> 10) + 112;
+  return driver::fpga_flush_driver(address, size);
-  int s = fp16_num & 0x8000;
+#else
-  int tmp = 0;
+  return 0;
-  float fp32_num;
+#endif
-  tmp = s << 16 | exp << 23 | frac << 13;
+}
-  fp32_num = *(float *)&tmp;  // NOLINT
+int fpga_invalidate(void *address, size_t size) {
-  return fp32_num;
+#ifdef PADDLE_MOBILE_ZU5
+  return driver::fpga_invalidate_driver(address, size);
+#else
+  return 0;
+#endif
 }
 void format_image(framework::Tensor *image_tensor) {
@@ -240,7 +243,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
-  arg->conv_args =
+  arg->conv_arg =
      (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
  arg->concat_arg.image_num = arg->split_num;
@@ -258,28 +261,33 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
      (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
  for (int i = 0; i < n; i++) {
-    arg->conv_args[i].relu_enabled = relu_enabled;
+    arg->conv_arg[i].relu_enabled = relu_enabled;
-    arg->conv_args[i].sb_address = bs_ptr;
+    arg->conv_arg[i].sb_address = bs_ptr;
-    arg->conv_args[i].filter_address = (int8_t *)filter_ptr;  // NOLINT
+    arg->conv_arg[i].filter_address = (int8_t *)filter_ptr;  // NOLINT
-    arg->conv_args[i].filter_scale_address = filter->scale;
+    arg->conv_arg[i].filter_scale_address = filter->scale;
-    arg->conv_args[i].filter_num = arg->filter_num;
+    arg->conv_arg[i].filter_num = arg->filter_num;
-    arg->conv_args[i].group_num = (uint32_t)group_num;
+    arg->conv_arg[i].group_num = (uint32_t)group_num;
-    arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h;
+    arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
-    arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w;
+    arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
-    arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
-    arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];
-    arg->conv_args[i].image.address = input_ptr;
+    arg->conv_arg[i].image.address = input_ptr;
-    arg->conv_args[i].image.scale_address = input->scale;
+    arg->conv_arg[i].image.scale_address = input->scale;
-    arg->conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
-    arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
+    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
-    arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
+    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
-    arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
+    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
-    arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
+    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
-    arg->conv_args[i].output.address = out_ptr;
+    arg->conv_arg[i].output.address = out_ptr;
-    arg->conv_args[i].output.scale_address = out->scale;
+    arg->conv_arg[i].output.scale_address = out->scale;
+    int num_after_alignment =
+        filter::calc_aligned_num((int)input->dims()[1], arg->filter_num);
+    arg->conv_arg[i].free_space =
+        fpga_malloc(num_after_alignment * 2 * sizeof(half));
  }
 }

--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -14,118 +14,20 @@ limitations under the License. */
 #pragma once
-#include <stdint.h>
-#include <cstddef>
-#include <iostream>
-#include <limits>
-#include "fpga/V2/driver/driver.h"
 #include "fpga/V2/driver/pe.h"
+#include "fpga/V2/fpga_common.h"
 #include "framework/tensor.h"
 namespace paddle_mobile {
 namespace fpga {
-enum DataType {
-  DATA_TYPE_FP32 = 1,
-  DATA_TYPE_FP16 = 0,
-};
-enum LayoutType {
-  LAYOUT_CHW = 1,
-  LAYOUT_HWC = 0,
-};
-struct KernelArgs {
-  uint32_t width;
-  uint32_t height;
-  uint32_t stride_w;
-  uint32_t stride_h;
-};
-struct ImageInputArgs {
-  void* address;         // input featuremap virtual address
-  float* scale_address;  // input scale address;
-  uint32_t channels;
-  uint32_t width;  // featuremap width
-  uint32_t height;
-  uint32_t pad_width;  // padding width;
-  uint32_t pad_height;
-};
-struct ImageOutputArgs {
-  void* address;         // output result address;
-  float* scale_address;  // output scale address;
-  uint64_t timer_cnt;    // time counter for FPGA computation
-};
-struct ConvArgs {
-  bool relu_enabled;
-  void* sb_address;  // scale and bias are interlaced;
-  void* filter_address;
-  float* filter_scale_address;
-  uint32_t filter_num;
-  uint32_t group_num;
-  struct KernelArgs kernel;
-  struct ImageInputArgs image;  // input image;
-  struct ImageOutputArgs output;
-};
-struct ConcatArgs {
-  uint32_t image_num;
-  half** images_in;
-  float** scales_in;
-  void* image_out;
-  float* scale_out;
-  uint32_t* channel_num;
-  uint32_t* aligned_channel_num;
-  uint32_t out_channel;
-  uint32_t height;
-  uint32_t width;
-};
-struct SplitConvArgs {
-  uint32_t split_num;
-  uint32_t group_num;
-  uint32_t filter_num;
-  struct ImageOutputArgs output;
-  struct ConvArgs* conv_args;
-  struct ConcatArgs concat_arg;
-};
-struct PoolingArgs {
-  int16_t mode;  // mode: 0:max, 1:avg
-  half kernel_reciprocal;
-  struct KernelArgs kernel;
-  struct ImageInputArgs image;  // input image;
-  struct ImageOutputArgs output;
-};
-struct EWAddArgs {
-  bool relu_enabled;
-  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
-  uint32_t const1;
-  struct ImageInputArgs image0;
-  struct ImageInputArgs image1;
-  struct ImageOutputArgs output;
-};
-struct BypassArgs {
-  enum DataType input_data_type;
-  enum DataType output_data_type;
-  enum LayoutType input_layout_type;
-  enum LayoutType output_layout_type;
-  struct ImageInputArgs image;
-  struct ImageOutputArgs output;
-};
 int open_device();
 int close_device();
 void* fpga_malloc(size_t size);
 void fpga_free(void* ptr);
+void fpga_copy(void* dest, const void* src, size_t num);
-static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
+int fpga_flush(void* address, size_t size);
+int fpga_invalidate(void* address, size_t size);
 float filter_find_max(framework::Tensor* filter_tensor);
 int get_aligned_channel_num(int channel_num);
@@ -153,8 +55,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
                    bool relu_enabled, int group_num, int stride_h,
                    int stride_w, int padding_h, int padding_w, float* bs_ptr);
-half fp32_2_fp16(float fp32_num);
-float fp16_2_fp32(half fp16_num);
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V2/bias_scale.cpp
+++ b/src/fpga/V2/bias_scale.cpp
@@ -27,7 +27,7 @@ void align_element(float **data_in, int num, int num_after_alignment) {
      (float *)fpga_malloc(total_element * sizeof(float));  // NOLINT
  memset(ptr_aligned, 0, total_element * sizeof(float));
-  for (int i = 1; i < num; i++) {
+  for (int i = 0; i < num; i++) {
    ptr_aligned[i * 2 + 0] = ptr_unaligned[i];
    ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num];
  }
@@ -39,6 +39,7 @@ void align_element(float **data_in, int num, int num_after_alignment) {
 void format_bias_scale_array(float **data_in, int num,
                             int num_after_alignment) {
  align_element(data_in, num, num_after_alignment);
+  fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float));
 }
 }  // namespace bias_scale

--- a/src/fpga/V2/driver/bitmap.cpp
+++ b/src/fpga/V2/driver/bitmap.cpp
@@ -57,8 +57,8 @@ static uint64_t ffs(uint64_t data) {
  uint64_t bit = 0;
  int i = 0;
-  for (i = 0; i < sizeof(data); i++) {
+  for (i = 0; i < sizeof(data) * 8; i++) {
-    if (data & (1 << i)) {
+    if (data & (1UL << i)) {
      bit = i;
      break;
    }

--- a/src/fpga/V2/driver/bitmap.h
+++ b/src/fpga/V2/driver/bitmap.h
@@ -25,7 +25,7 @@ limitations under the License. */
 #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
 #define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
-#define round_down(x, y) ((x) & ((y)-1))
+#define round_down(x, y) ((x) & ~((y)-1))
 namespace fpga_bitmap {
 void bitmap_set(uint64_t *map, unsigned int start, int len);

--- a/src/fpga/V2/driver/driver.cpp
+++ b/src/fpga/V2/driver/driver.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <unistd.h>
 #include <algorithm>
@@ -32,6 +33,7 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {
+namespace driver {
 struct FPGA_INFO g_fpgainfo;
 int open_drvdevice() {
@@ -43,7 +45,8 @@ int open_drvdevice() {
 int open_memdevice() {
  if (g_fpgainfo.fd_mem == -1) {
-    g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
+    // g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
+    g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR);
  }
  return g_fpgainfo.fd_mem;
 }
@@ -51,7 +54,6 @@ int open_memdevice() {
 void pl_reset() {
  // DLOG << "PL RESET";
-  // reg_writeq(0x5a, REG_FPGA_RESET);
  usleep(100 * 1000);
 }
@@ -131,7 +133,7 @@ int pl_get_status() { return 0; }
 int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
  uint64_t i = 0;
  /*timeout精确性待确认*/
-  int64_t timeout = time * CPU_FREQ / 1000000;
+  int64_t timeout = time * 6;
  for (i = 0; i < timeout; i++) {
    if (val == reg_readq(reg)) {
@@ -173,9 +175,14 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
 }
 void memory_release(struct fpga_memory *memory) {
-  pthread_mutex_lock(&memory->mutex);
-  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
-  pthread_mutex_unlock(&memory->mutex);
+  // Free every buffer still recorded in the address-to-size map.
+  // Walk a snapshot of the map: fpga_free_driver() erases the entry it frees
+  // from g_fpgainfo.fpga_addr2size_map, which would invalidate an iterator
+  // into the live container.
+  std::map<void *, size_t> map = g_fpgainfo.fpga_addr2size_map;
+  std::map<void *, size_t>::iterator iter;
+  for (iter = map.begin(); iter != map.end(); iter++) {
+    // Pass the recorded address; the previous code passed a local that was
+    // always nullptr, so nothing was ever released.
+    fpga_free_driver(iter->first);
+  }
 }
 int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
@@ -238,7 +245,6 @@ int init_fpga_memory(struct fpga_memory *memory) {
    return rc;
  }
-  // spin_lock_init(&memory->spin);
  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
  fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);  // NOTE reserve fpga page 0.
@@ -293,9 +299,23 @@ void *fpga_reg_malloc(size_t size) {
  return ret;
 }
+// Unmaps `ptr` if it is recorded in g_fpgainfo.fpga_addr2size_map and removes
+// its entry; logs "Invalid pointer" otherwise. The allocation bitmap is not
+// touched here.
+// Always returns nullptr: the declared return type is void*, and falling off
+// the end of a non-void function is undefined behaviour.
+void *fpga_reg_free(void *ptr) {
+  size_t size = 0;
+  auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
+  if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
+    size = iter->second;
+    g_fpgainfo.fpga_addr2size_map.erase(iter);
+    munmap(ptr, size);
+  } else {
+    DLOG << "Invalid pointer";
+  }
+  return nullptr;
+}
 void *fpga_malloc_driver(size_t size) {
  void *ret = nullptr;
  uint64_t phy_addr = 0;
+  int i = 0;
  memory_request(g_fpgainfo.memory_info, size, &phy_addr);
@@ -311,17 +331,70 @@ void *fpga_malloc_driver(size_t size) {
+// Releases a buffer recorded in g_fpgainfo.fpga_addr2size_map: the entry is
+// erased, the mapping is unmapped, and the buffer's pages are cleared in the
+// allocation bitmap while holding memory_info->mutex. Logs "Invalid pointer"
+// when `ptr` is not in the map.
 void fpga_free_driver(void *ptr) {
  size_t size = 0;
+  uint32_t pos = 0;
+  uint64_t p_addr = 0;
  auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
  if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
    size = iter->second;
    g_fpgainfo.fpga_addr2size_map.erase(iter);
    munmap(ptr, size);
+    // NOTE(review): the address is translated after munmap(); confirm that
+    // vaddr_to_paddr() does not depend on the mapping still being present.
+    p_addr = vaddr_to_paddr(ptr);
+    // Index of the buffer's first page, relative to mem_start.
+    pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
+    /* Clear nr[pos] bitmap bits starting at pos. */
+    pthread_mutex_lock(&g_fpgainfo.memory_info->mutex);
+    fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
+                              g_fpgainfo.memory_info->nr[pos]);
+    pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
  } else {
    DLOG << "Invalid pointer";
  }
 }
+// Sends request `req` with argument `arg` to the descriptor stored in
+// g_fpgainfo.fd_mem and hands back whatever ioctl() returned.
+static inline int do_ioctl(unsigned long req, const void *arg) {
+  const int status = ioctl(g_fpgainfo.fd_mem, req, arg);
+  return status;
+}
+// Issues IOCTL_MEMCACHE_FLUSH for `size` bytes starting at `address`.
+// The request carries the buffer's physical address expressed as an offset
+// from FPGA_MEM_PHY_ADDR rather than the virtual pointer itself.
+// Returns the result of the ioctl call.
+int fpga_flush_driver(void *address, size_t size) {
+  const uint64_t phy_offset = vaddr_to_paddr(address) - FPGA_MEM_PHY_ADDR;
+  struct MemoryCacheArgs request;
+  request.offset = (void *)phy_offset;  // NOLINT
+  request.size = size;
+  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &request);
+}
+// Issues IOCTL_MEMCACHE_INVAL for `size` bytes starting at `address`.
+// The request carries the buffer's physical address expressed as an offset
+// from FPGA_MEM_PHY_ADDR rather than the virtual pointer itself.
+// Returns the result of the ioctl call.
+int fpga_invalidate_driver(void *address, size_t size) {
+  const uint64_t phy_offset = vaddr_to_paddr(address) - FPGA_MEM_PHY_ADDR;
+  struct MemoryCacheArgs request;
+  request.offset = (void *)phy_offset;  // NOLINT
+  request.size = size;
+  return do_ioctl(IOCTL_MEMCACHE_INVAL, &request);
+}
+// Copies `num` bytes from `src` to `dest`, one byte per iteration, after
+// logging both addresses and the length.
+void fpga_copy_driver(void *dest, const void *src, size_t num) {
+  DLOG << "dest:" << dest << " src:" << src << " size:" << num;
+  int8_t *out = (int8_t *)dest;            // NOLINT
+  const int8_t *in = (const int8_t *)src;  // NOLINT
+  for (uint64_t idx = 0; idx < num; ++idx) {
+    out[idx] = in[idx];
+  }
+}
 int open_device_driver() {
  g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR;
  g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR;
@@ -347,12 +420,13 @@ int open_device_driver() {
 int close_device_driver() {
  pl_destroy();
-  fpga_free_driver(g_fpgainfo.FpgaRegVirAddr);
+  fpga_reg_free(g_fpgainfo.FpgaRegVirAddr);
  memory_release(g_fpgainfo.memory_info);
  destroy_fpga_memory(g_fpgainfo.memory_info);
  return 0;
 }
+}  // namespace driver
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V2/driver/driver.h
+++ b/src/fpga/V2/driver/driver.h
@@ -24,6 +24,7 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {
+namespace driver {
 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
@@ -47,6 +48,15 @@ const int PE_IDX_BYPASS = 3;
 enum pe_status { IDLE = 0, BUSY = 1 };
+// Argument block for the IOCTL_MEMCACHE_* requests below.
+struct MemoryCacheArgs {
+  // driver.cpp stores the buffer's physical address minus FPGA_MEM_PHY_ADDR
+  // here, cast to a pointer.
+  void *offset;
+  // Length in bytes of the range the request applies to.
+  size_t size;
+};
+// NOTE(review): 'FPGA' is a multi-character constant, whose value is
+// implementation-defined; confirm it matches the kernel driver's definition.
+#define IOCTL_FPGA_MAGIC 'FPGA'
+// Request codes 12 (invalidate) and 13 (flush), both taking MemoryCacheArgs.
+#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
+#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
 struct fpga_pe {
  char type_name[MAX_TYPE_NAME_LENTH + 1];
  struct pe_data_s *outer;
@@ -95,26 +105,39 @@ extern struct FPGA_INFO g_fpgainfo;
 inline uint64_t reg_readq(uint32_t offset) {
  // DLOG << "offset : " << offset;
-  uint64_t value =
+  uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
-      *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset);  // NOLINT
+                                          offset);  // NOLINT
  return value;
 }
 inline void reg_writeq(uint64_t value, uint32_t offset) {
  // DLOG << "offset : " << offset << ", value : " << value;
-  *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset) =  // NOLINT
+  *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
+                         offset) =  // NOLINT
      value;
 }
 int open_device_driver();
 int close_device_driver();
 void *fpga_malloc_driver(size_t size);
 void fpga_free_driver(void *ptr);
+void fpga_copy_driver(void *dest, const void *src, size_t num);
+int fpga_flush_driver(void *address, size_t size);
+int fpga_invalidate_driver(void *address, size_t size);
 /*pe*/
 uint64_t vaddr_to_paddr(void *address);
 int fpga_regpoll(uint64_t reg, uint64_t val, int time);
+}  // namespace driver
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V2/driver/pe.cpp
+++ b/src/fpga/V2/driver/pe.cpp
@@ -20,29 +20,29 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {
-#define MUL8(x) (x * 8)
+#define MUL8(x) ((x)*8)
 #define BYPASS_DONE 1
 float Findfp16Max() {
  uint16_t abs_vals[16];
  uint64_t max_fp16;
-  max_fp16 = reg_readq(MUL8(49));
+  max_fp16 = driver::reg_readq(MUL8(49));
  abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
  abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
  abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
  abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(50));
+  max_fp16 = driver::reg_readq(MUL8(50));
  abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
  abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
  abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
  abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(51));
+  max_fp16 = driver::reg_readq(MUL8(51));
  abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16));         // NOLINT
  abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16));   // NOLINT
  abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
  abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(52));
+  max_fp16 = driver::reg_readq(MUL8(52));
  abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16));
  abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
  abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
@@ -58,7 +58,7 @@ float Findfp16Max() {
 }
 int ComputeFpgaConv(const struct SplitConvArgs &args) {
-  ComputeBasicConv(args.conv_args[0]);
+  // Runs the first entry of the split only and forwards its status. The
+  // function is declared to return int, so falling off the end without a
+  // return statement was undefined behaviour.
+  return ComputeBasicConv(args.conv_arg[0]);
 }
 int ComputeBasicConv(const struct ConvArgs &args) {
@@ -166,8 +166,8 @@ int PerformBypass(const struct BypassArgs &args) {
  return 0;
 #endif
-  uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
+  uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
-  uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
+  uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
  uint64_t bp_enable;
  int64_t length;
  uint64_t pixels;
@@ -196,16 +196,16 @@ int PerformBypass(const struct BypassArgs &args) {
  }
  // start bypass
-  reg_writeq(ifm_src_paddr, MUL8(27));
+  driver::reg_writeq(ifm_src_paddr, MUL8(27));
-  reg_writeq(ifm_dst_paddr, MUL8(28));
+  driver::reg_writeq(ifm_dst_paddr, MUL8(28));
-  reg_writeq(0, MUL8(0));
+  driver::reg_writeq(0, MUL8(0));
-  reg_writeq(bp_enable, MUL8(0));
+  driver::reg_writeq(bp_enable, MUL8(0));
  // poll
  int ret = -1;
-  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
+  ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
  if (ret != -1) {
    // clear "irq"
-    reg_readq(MUL8(63));
+    driver::reg_readq(MUL8(63));
  }
  // get max value
  if ((!args.input_data_type) && (!args.output_data_type)) {

--- a/src/fpga/V2/driver/pe.h
+++ b/src/fpga/V2/driver/pe.h
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "fpga/V2/api.h"
+#include "fpga/V2/fpga_common.h"
 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
@@ -94,6 +94,7 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
  convert_to_hwc(data_in, num, channel, height, width);
  align_filter(data_in, num, channel, height, width);
  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  fpga_flush(*data_in, pixel_num * sizeof(float));
 }
 void convert_fc_filter(float **data_in, int num, int chw) {
@@ -113,6 +114,8 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
  int chw = channel * height * width;
  convert_fc_filter(data_in, num, chw);
  align_filter(data_in, num, channel, height, width);
+  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  fpga_flush(*data_in, pixel_num * sizeof(float));
 }
 float find_max(float *data_in, int data_size) {

--- a/src/fpga/V2/fpga_common.cpp
+++ b/src/fpga/V2/fpga_common.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fpga/V2/fpga_common.h>
+#include <cstring>
+namespace paddle_mobile {
+namespace fpga {
+// Converts a 32-bit float to a 16-bit half-precision bit pattern.
+//
+// The exponent is re-biased by subtracting 112 (127 - 15), the mantissa is
+// truncated to its top 10 bits, and the result is rounded up when the highest
+// discarded mantissa bit is set.
+//
+// Inputs whose biased exponent is below 113 (zero, denormals and other values
+// too small for a normal half) return 0. Previously the exponent subtraction
+// wrapped around for them, e.g. 0.0f became 0x4000, the encoding of 2.0.
+// Inputs too large for half precision are not range-checked (unchanged).
+int16_t fp32_2_fp16(float fp32_num) {
+  // Copy the bits with memcpy into a 32-bit integer. Reading the float
+  // through `unsigned long *` touched 8 bytes on LP64 targets and violated
+  // the aliasing rules.
+  uint32_t bits = 0;
+  static_assert(sizeof(bits) == sizeof(fp32_num), "float must be 32 bits");
+  std::memcpy(&bits, &fp32_num, sizeof(bits));
+  if (((bits & 0x7f800000u) >> 23) < 113u) {
+    return 0;  // underflow: flush to zero
+  }
+  auto t = (int16_t)(((bits & 0x007fffffu) >> 13) |  // NOLINT
+                     ((bits & 0x80000000u) >> 16) |
+                     (((bits & 0x7f800000u) >> 13) - (112u << 10)));
+  if (bits & 0x1000u) {
+    t++;  // roundoff
+  }
+  return t;
+}
+// Converts a 16-bit half-precision bit pattern to a 32-bit float.
+//
+// The 10-bit mantissa is shifted into the top of the float mantissa, the
+// exponent is re-biased by adding 112 (127 - 15), and the sign bit is moved to
+// bit 31. Both zero encodings (0x0000 and 0x8000) return 0; previously 0x8000
+// fell through to the general path and decoded to a small negative number.
+// Half-precision subnormals, infinities and NaNs get no special treatment
+// (unchanged).
+float fp16_2_fp32(int16_t fp16_num) {
+  // Work on the unsigned bit pattern so no shift touches a signed sign bit.
+  const uint32_t half_bits = (uint16_t)fp16_num;  // NOLINT
+  if ((half_bits & 0x7fffu) == 0) {
+    return 0;
+  }
+  const uint32_t frac = half_bits & 0x3ffu;
+  const uint32_t exp = ((half_bits & 0x7c00u) >> 10) + 112u;
+  const uint32_t sign = half_bits & 0x8000u;
+  const uint32_t bits = (sign << 16) | (exp << 23) | (frac << 13);
+  // memcpy is the well-defined way to reinterpret the bits as a float.
+  float fp32_num;
+  static_assert(sizeof(bits) == sizeof(fp32_num), "float must be 32 bits");
+  std::memcpy(&fp32_num, &bits, sizeof(fp32_num));
+  return fp32_num;
+}
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/fpga_common.h
+++ b/src/fpga/V2/fpga_common.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <cstdint>
+namespace paddle_mobile {
+namespace fpga {
+enum DataType {
+  DATA_TYPE_FP32 = 1,
+  DATA_TYPE_FP16 = 0,
+};
+enum LayoutType {
+  LAYOUT_CHW = 1,
+  LAYOUT_HWC = 0,
+};
+struct KernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_w;
+  uint32_t stride_h;
+};
+struct ImageInputArgs {
+  void* address;         // input featuremap virtual address
+  float* scale_address;  // input scale address;
+  uint32_t channels;
+  uint32_t width;  // featuremap width
+  uint32_t height;
+  uint32_t pad_width;  // padding width;
+  uint32_t pad_height;
+};
+struct ImageOutputArgs {
+  void* address;         // output result address;
+  float* scale_address;  // output scale address;
+  uint64_t timer_cnt;    // time counter for FPGA computation
+};
+// Parameters describing a single convolution: filter, scale/bias, kernel
+// geometry, input image and output buffer.
+struct ConvArgs {
+  bool relu_enabled;
+  void* sb_address;  // scale and bias
+  void* filter_address;
+  float* filter_scale_address;
+  // Used by FPGA logic. fill_split_arg() in api.cpp allocates
+  // num_after_alignment * 2 * sizeof(half) bytes for it with fpga_malloc().
+  void* free_space;
+  uint32_t filter_num;
+  uint32_t group_num;
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image
+  struct ImageOutputArgs output;
+};
+struct ConcatArgs {
+  uint32_t image_num;
+  int16_t** images_in;
+  float** scales_in;
+  void* image_out;
+  float* scale_out;
+  uint32_t* channel_num;
+  uint32_t* aligned_channel_num;
+  uint32_t out_channel;
+  uint32_t height;
+  uint32_t width;
+};
+struct SplitConvArgs {
+  uint32_t split_num;
+  uint32_t group_num;
+  uint32_t filter_num;
+  struct ImageOutputArgs output;
+  struct ConvArgs* conv_arg;
+  struct ConcatArgs concat_arg;
+};
+struct PoolingArgs {
+  int16_t mode;  // mode: 0:max, 1:avg
+  int16_t kernel_reciprocal;
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+struct EWAddArgs {
+  bool relu_enabled;
+  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
+  uint32_t const1;
+  struct ImageInputArgs image0;
+  struct ImageInputArgs image1;
+  struct ImageOutputArgs output;
+};
+struct BypassArgs {
+  enum DataType input_data_type;
+  enum DataType output_data_type;
+  enum LayoutType input_layout_type;
+  enum LayoutType output_layout_type;
+  struct ImageInputArgs image;
+  struct ImageOutputArgs output;
+};
+struct DeconvArgs {
+  struct ConvArgs conv_arg;
+};
+// Rounds `num` up to a multiple of `x` using integer arithmetic
+// (for non-negative `num` and positive `x`).
+static inline int align_to_x(int num, int x) {
+  const int blocks = (num + x - 1) / x;
+  return blocks * x;
+}
+int16_t fp32_2_fp16(float fp32_num);
+float fp16_2_fp32(int16_t fp16_num);
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
@@ -58,6 +58,7 @@ void format_image(float **data_in, int channel, int height, int width,
                  int aligned_channel) {
  convert_to_hwc(data_in, channel, height, width);
  align_image(data_in, channel, height, width, aligned_channel);
+  fpga_flush(*data_in, aligned_channel * height * width * sizeof(float));
 }
 void concat_images(int16_t **images_in, float **scales_in, void *image_out,
@@ -69,6 +70,8 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
  scale_out[1] = 0.0;
  for (int i = 0; i < image_num; i++) {
    scale_out[0] = std::max(*scale_out, scales_in[i][0]);
+    fpga_invalidate(images_in[i],
+                    height * width * aligned_channel_num[i] * sizeof(int16_t));
  }
  scale_out[1] = 1 / scale_out[0];
@@ -83,6 +86,7 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
      tmp_channel_sum += channel_num[i];
    }
  }
+  fpga_flush(image_out, hw * out_channel * sizeof(int16_t));
 }
 }  // namespace image

--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "framework/program/var_desc.h"
 #include "framework/scope.h"
 #include "framework/tensor.h"
+#include "memory/t_malloc.h"
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
 #include <queue>
@@ -86,8 +87,10 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
  }
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
+  int i = 0;
  auto &ops = ops_of_block_[*to_predict_block.get()];
  for (const auto &op : ops) {
+    DLOG << "Initialize op[" << i++ << "]: " << op->Type();
    op->Init();
  }
 }
@@ -102,8 +105,8 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
    // should be moved into operator init function
    float min_value;
    float max_value;
-    memcpy(&min_value, data_buf, sizeof(float));
+    memory::Copy(&min_value, data_buf, sizeof(float));
-    memcpy(&max_value, data_buf + sizeof(float), sizeof(float));
+    memory::Copy(&max_value, data_buf + sizeof(float), sizeof(float));
    data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(data_buf);
@@ -112,7 +115,7 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
    }
    data_buf += size * sizeof(uint8_t);
  } else {
-    memcpy(tensor_data, *data_buf, size * sizeof(Dtype));
+    memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
    *data_buf += size * sizeof(Dtype);
  }
 }
@@ -128,7 +131,7 @@ void Executor<Dtype, P>::LoadMemory(
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
-  memcpy(&lod_level, *data_buf, sizeof(uint64_t));
+  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);
  auto *lod = tensor->mutable_lod();
@@ -137,7 +140,7 @@ void Executor<Dtype, P>::LoadMemory(
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
-    memcpy(tmp_dim.data(), *data_buf, size);
+    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }

--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "operators/math/gemm.h"
 namespace paddle_mobile {
-static std::mutex lc;
 template <typename Dtype, Precision P>
 void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
 #ifdef _OPENMP
@@ -148,8 +147,8 @@ double PaddleMobile<Dtype, P>::GetPredictTime() {
  }
  paddle_mobile::operators::math::Gemm gemm;
  auto time1 = paddle_mobile::time();
-  gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
+  //  gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
-             static_cast<float>(0), c, ldc, false, nullptr);
+  //             static_cast<float>(0), c, ldc, false, nullptr);
  auto time2 = paddle_mobile::time();
  double cost = paddle_mobile::time_diff(time1, time2);
  paddle_mobile::memory::Free(a);
@@ -199,6 +198,7 @@ void PaddleMobile<Dtype, P>::Predict_To(int end) {
 #endif
 #ifdef PADDLE_MOBILE_CL
+static std::mutex lc;
 template <typename Dtype, Precision P>
 void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
  std::lock_guard<std::mutex> lock(lc);

--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -32,7 +32,7 @@ const int MALLOC_ALIGN = 64;
 namespace fpga = paddle_mobile::fpga;
 void Copy(void *dst, const void *src, size_t num) {
-  std::memcpy(dst, src, num);
+  fpga::fpga_copy(dst, src, num);
 }
 void *Alloc(size_t size) { return fpga::fpga_malloc(size); }

--- a/src/operators/conv_transpose_op.cpp
+++ b/src/operators/conv_transpose_op.cpp
@@ -27,6 +27,7 @@ REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose);
 #ifdef PADDLE_MOBILE_MALI_GPU
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(conv2d_transpose, ops::ConvOpTranspose);
 #endif
 #endif
--- a/src/operators/fusion_deconv_relu_op.cpp
+++ b/src/operators/fusion_deconv_relu_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVRELU_OP
+#include "operators/fusion_deconv_relu_op.h"
+namespace paddle_mobile {
+namespace operators {}
+}  // namespace paddle_mobile
+namespace ops = paddle_mobile::operators;
+// Operator registration, one preprocessor section per backend. Only the FPGA
+// section registers fusion_deconv_relu; the CPU and Mali GPU sections are
+// intentionally left empty in this file.
+#ifdef PADDLE_MOBILE_CPU
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_deconv_relu, ops::FusionDeconvReluOp);
+#endif
+#endif  // FUSION_DECONVRELU_OP
--- a/src/operators/fusion_deconv_relu_op.h
+++ b/src/operators/fusion_deconv_relu_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVRELU_OP
+#pragma once
+#include <string>
+#include <vector>
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/deconv_relu_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+// Fusion matcher for a conv2d_transpose node immediately followed by a relu
+// node. When the pattern matches, the chain is folded into a single node whose
+// type is the fused deconv+relu op.
+class FusionDeconvReluMatcher : public framework::FusionOpMatcher {
+ public:
+  // Builds the pattern to match: G_OP_TYPE_CONV_TRANSPOSE -> G_OP_TYPE_RELU.
+  FusionDeconvReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+  // Folds the matched chain rooted at `node` into one node of type Type().
+  // The fold depth is the depth of the pattern built in the constructor; no
+  // input-key remapping is requested (empty map argument).
+  // NOTE(review): removed_nodes presumably receives the nodes that were folded
+  // away; confirm against framework::Node::Folder.
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(), {}, removed_nodes);
+  }
+  // Type name given to the folded node. This must be the deconv+relu type;
+  // returning the fc+relu type here would label the fused node as the wrong
+  // operator.
+  std::string Type() { return G_OP_TYPE_FUSION_DECONV_RELU; }
+};
+// Fused conv2d_transpose + relu operator. It shares its parameter type with
+// the plain transposed convolution (FusionDeconvReluParam) and dispatches to
+// DeconvReluKernel for the selected device.
+template <typename DeviceType, typename T>
+class FusionDeconvReluOp : public framework::OperatorWithKernel<
+                               DeviceType, FusionDeconvReluParam<DeviceType>,
+                               operators::DeconvReluKernel<DeviceType, T>> {
+ public:
+  // Forwards all arguments unchanged to the OperatorWithKernel base.
+  FusionDeconvReluOp(const string &type, const VariableNameMap &inputs,
+                     const VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs,
+                     std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionDeconvReluParam<DeviceType>,
+            operators::DeconvReluKernel<DeviceType, T>>(type, inputs, outputs,
+                                                        attrs, scope) {}
+  // Validates the input/filter/attribute ranks and resizes the output tensor.
+  //
+  // Output shape is {in_dims[0], filter_dims[1] * groups, spatial...} where
+  // each spatial extent is
+  //   (in - 1) * stride - 2 * padding + dilation * (kernel - 1) + 1.
+  void InferShape() const override {
+    auto input = this->param_.Input();
+    auto in_dims = input->dims();
+    auto filter = this->param_.Filter();
+    auto filter_dims = filter->dims();
+    std::vector<int> strides = this->param_.Strides();
+    std::vector<int> paddings = this->param_.Paddings();
+    std::vector<int> dilations = this->param_.Dilations();
+    int groups = this->param_.Groups();
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == 4 || in_dims.size() == 5,
+        "ConvTransposeOp input should be 4-D or 5-D tensor.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == filter_dims.size(),
+        "ConvTransposeOp input dimension and filter dimension "
+        "should be the same.");
+    // The first two input dimensions are not spatial, so the number of
+    // strides must equal the rank minus two.
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() - strides.size() == 2U,
+        "ConvTransposeOp input dimension and strides dimension should "
+        "be consistent.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
+                          "ConvTransposeOp paddings dimension and strides "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
+                          "ConvTransposeOp paddings dimension and dilations "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims[1] == filter_dims[0],
+        "In ConvTransposeOp, The number of input channels should "
+        "be equal to the number of filter's channels.");
+    std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
+    for (size_t i = 0; i < strides.size(); ++i) {
+      // Effective kernel extent once dilation is applied.
+      auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
+      output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
+                             2 * paddings[i] + filter_extent);
+    }
+    this->param_.Output()->Resize(framework::make_ddim(output_shape));
+  }
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif  // FUSION_DECONVRELU_OP
--- a/src/operators/kernel/deconv_relu_kernel.h
+++ b/src/operators/kernel/deconv_relu_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVRELU_OP
+#pragma once
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+using framework::OpKernelBase;
+// Kernel interface for the fused deconv + relu operator, parameterized on the
+// device type and element type. Only declarations live here; Init and Compute
+// are defined per backend as explicit specializations.
+template <typename DeviceType, typename T>
+class DeconvReluKernel
+    : public OpKernelBase<DeviceType, FusionDeconvReluParam<DeviceType>> {
+ public:
+  // Runs the kernel on an already-initialized parameter set.
+  void Compute(const FusionDeconvReluParam<DeviceType> &param);
+  // One-time setup for `param`; returns a bool status.
+  bool Init(FusionDeconvReluParam<DeviceType> *param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef CONV_TRANSPOSE_OP
+#include "operators/kernel/conv_transpose_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// FPGA specialization of Init. Placeholder: performs no setup on `param` and
+// always returns true.
+template <>
+bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
+  return true;
+}
+// FPGA specialization of Compute. Placeholder: the body is empty, so `param`
+// is not read and no output is written.
+// TODO: implement the FPGA transposed convolution.
+template <>
+void ConvTransposeKernel<FPGA, float>::Compute(
+    const ConvTransposeParam<FPGA> &param) {}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVRELU_OP
+#include "operators/kernel/deconv_relu_kernel.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+// FPGA specialization of Init. Placeholder: performs no setup on `param` and
+// always returns true.
+template <>
+bool DeconvReluKernel<FPGA, float>::Init(FusionDeconvReluParam<FPGA> *param) {
+  return true;
+}
+// FPGA specialization of Compute. Placeholder: the body is empty, so `param`
+// is not read and no output is written.
+// TODO: implement the fused deconv + relu computation for FPGA.
+template <>
+void DeconvReluKernel<FPGA, float>::Compute(
+    const FusionDeconvReluParam<FPGA> &param) {}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/slice_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp
@@ -24,6 +24,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
 }
 template <>
 void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
 }  // namespace operators
 }  // namespace paddle_mobile
 #endif
--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
@@ -49,7 +49,12 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
  Tensor *out = param.Out();
  fpga::PerformBypass(param.FpgaArgs());
+  fpga::fpga_invalidate(
+      (void *)in_x->data<float>(),                           // NOLINT
+      fpga::get_aligned_channel_num((int)in_x->dims()[1]) *  // NOLINT
+          sizeof(float));
  math::SoftmaxFuntor<CPU, float>()(in_x, out);
+  fpga::fpga_flush(out->data<float>(), out->memory_size());
 }
 }  // namespace operators

--- a/src/operators/kernel/fpga/V2/tanh_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/tanh_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef TANH_OP
+#include "operators/kernel/tanh_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// FPGA specialization of Init. Placeholder: performs no setup on `param` and
+// always returns true.
+template <>
+bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
+  return true;
+}
+// FPGA specialization of Compute. Placeholder: the body is empty, so `param`
+// is not read and no output is written.
+// TODO: implement tanh for FPGA.
+template <>
+void TanhKernel<FPGA, float>::Compute(const TanhParam<FPGA> &param) {}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/tanh_kernel.h
+++ b/src/operators/kernel/tanh_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#ifdef TANH_OP
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+using framework::OpKernelBase;
+// Kernel interface for the tanh operator, parameterized on the device type
+// and element type. Only declarations live here; Init and Compute are defined
+// per backend as explicit specializations.
+template <typename DeviceType, typename T>
+class TanhKernel : public OpKernelBase<DeviceType, TanhParam<DeviceType>> {
+ public:
+  // Runs the kernel on an already-initialized parameter set.
+  void Compute(const TanhParam<DeviceType>& param);
+  // One-time setup for `param`; returns a bool status.
+  bool Init(TanhParam<DeviceType>* param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1534,6 +1534,27 @@ class ReluParam<GPU_CL> : public ReluParamBase<GPU_CL> {
 #endif
+#ifdef TANH_OP
+// Parameter holder for the tanh operator: one input tensor ("X") and one
+// output tensor ("Out"), both resolved from the scope at construction time.
+template <typename Dtype>
+class TanhParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+ public:
+  // Looks up the input and output variables in `scope`. `attrs` is accepted
+  // for signature uniformity but is not read: tanh has no attributes here.
+  TanhParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+            const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+  }
+  // Input tensor (read-only view).
+  const RType *InputX() const { return input_x_; }
+  // Output tensor; mutable even through a const TanhParam.
+  RType *Out() const { return out_; }
+ private:
+  // Non-owning pointers as far as this class is concerned: nothing here
+  // allocates or frees them.
+  RType *input_x_;
+  RType *out_;
+};
+#endif
 #ifdef PRELU_OP
 template <typename Dtype>
 class PReluParam : public OpParam {
@@ -2229,9 +2250,24 @@ class ConvTransposeParam : public OpParam {
  vector<int> paddings_;
  vector<int> dilations_;
  int groups;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::DeconvArgs fpga_conv_args;
+ public:
+  const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 #endif
+#ifdef FUSION_DECONVRELU_OP
+// The fused deconv + relu op reuses ConvTransposeParam unchanged: this alias
+// adds no members of its own.
+template <typename Dtype>
+using FusionDeconvReluParam = ConvTransposeParam<Dtype>;
+#endif
 #ifdef GRU_OP
 template <typename Dtype>
 class GruParam : public OpParam {

--- a/src/operators/tanh_op.cpp
+++ b/src/operators/tanh_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef TANH_OP
+#include "operators/tanh_op.h"
+namespace paddle_mobile {
+namespace operators {
+// Shape inference for tanh: the output tensor is resized to exactly the
+// dimensions of the input tensor.
+template <typename DeviceType, typename T>
+void TanhOp<DeviceType, T>::InferShape() const {
+  this->param_.Out()->Resize(this->param_.InputX()->dims());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_FPGA
+// Register under the lower-case op type name so it agrees with
+// G_OP_TYPE_TANH ("tanh") and with the lower-case names used by the other
+// REGISTER_OPERATOR_FPGA calls (conv2d_transpose, fusion_deconv_relu).
+REGISTER_OPERATOR_FPGA(tanh, ops::TanhOp);
+#endif
+#endif
--- a/src/operators/tanh_op.h
+++ b/src/operators/tanh_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef TANH_OP
+#pragma once
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/tanh_kernel.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+// tanh operator. Binds TanhParam and TanhKernel for the selected device into
+// the generic OperatorWithKernel machinery; the only op-specific logic is the
+// shape inference, which is defined out of line.
+template <typename DeviceType, typename T>
+class TanhOp : public framework::OperatorWithKernel<
+                   DeviceType, TanhParam<DeviceType>,
+                   operators::TanhKernel<DeviceType, T>> {
+ public:
+  // Forwards all arguments unchanged to the OperatorWithKernel base.
+  TanhOp(const std::string &type, const VariableNameMap &inputs,
+         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+         std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, TanhParam<DeviceType>,
+                                      operators::TanhKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+  // Declared here, defined out of line.
+  void InferShape() const override;
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -73,6 +73,9 @@ list(FIND NET "FPGA_NET_V2" CON)
 if (CON GREATER -1)
    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-resnet50 paddle-mobile)
+    ADD_EXECUTABLE(test-pe fpga/test_pe.cpp)
+    target_link_libraries(test-pe paddle-mobile)
    set(FOUND_MATCH ON)
 endif ()

--- a/test/fpga/test_pe.cpp
+++ b/test/fpga/test_pe.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
+#include "fpga/V2/filter.h"
+namespace fpga = paddle_mobile::fpga;
+// Problem size for the test. C is the input channel count and N is used as
+// the output channel count when sizing buffers; H and W are the spatial
+// extents. G is not referenced anywhere else in this file.
+static const uint32_t N = 64;
+static const uint32_t C = 3;
+static const uint32_t H = 224;
+static const uint32_t W = 224;
+static const uint32_t G = 1;
+// Element types handed to the bypass: FP32 in, FP16 out.
+fpga::DataType input_type = fpga::DATA_TYPE_FP32;
+fpga::DataType output_type = fpga::DATA_TYPE_FP16;
+// Buffers shared between the helpers below. ifm/ofm and their scale buffers
+// are allocated in format_data(); filter and filter_scale are never assigned
+// or read in this file.
+void* ifm = nullptr;
+void* ofm = nullptr;
+void* filter = nullptr;
+void* ifm_scale = nullptr;
+void* ofm_scale = nullptr;
+void* filter_scale = nullptr;
+// Element counts (not byte counts) of ifm and ofm, set by format_data().
+int ifm_size = 0, ofm_size = 0;
+// Allocates and fills the global input/output buffers used by the test.
+//
+// - Allocates 8-byte scale buffers for the input and the output.
+// - Sizes ifm/ofm from the aligned channel counts of C and N times H * W.
+// - Zeroes both buffers, then writes h + w + c * 0.1 into the first C
+//   channels of every pixel of ifm; the padding channels stay zero.
+// - Flushes both buffers through fpga::fpga_flush.
+void format_data() {
+  ifm_scale = fpga::fpga_malloc(8);
+  ofm_scale = fpga::fpga_malloc(8);
+  int ifm_channel = fpga::filter::calc_aligned_channel(C);
+  int ofm_channel = fpga::filter::calc_aligned_channel(N);
+  // Only logged below; not used for sizing.
+  int num = fpga::filter::calc_aligned_num(N, C);
+  DLOG << "ifm_channel = " << ifm_channel;
+  DLOG << "ofm_channel = " << ofm_channel;
+  DLOG << "aligned_num = " << num;
+  ifm_size = ifm_channel * H * W;
+  ofm_size = ofm_channel * H * W;
+  ifm = fpga::fpga_malloc(ifm_size * sizeof(float));
+  ofm = fpga::fpga_malloc(ofm_size * sizeof(int16_t));
+  memset(ifm, 0, ifm_size * sizeof(float));
+  memset(ofm, 0, ofm_size * sizeof(int16_t));
+  for (int h = 0; h < H; h++) {
+    for (int w = 0; w < W; w++) {
+      for (int c = 0; c < C; c++) {
+        // Row-major (h, w, channel) indexing with the aligned channel count
+        // as the innermost stride.
+        int index = h * W * ifm_channel + w * ifm_channel + c;
+        (reinterpret_cast<float*>(ifm))[index] = h + w + c * 0.1f;
+        // DLOG << index << ":" << ((float *) ifm)[index];
+      }
+    }
+  }
+  fpga::fpga_flush(ifm, ifm_size * sizeof(float));
+  fpga::fpga_flush(ofm, ofm_size * sizeof(int16_t));
+}
+// Logs roughly `num` evenly spaced samples of an FP16 buffer, converted to
+// FP32 for display.
+//
+// ptr:        buffer of `total_size` half-precision values stored as int16_t.
+// total_size: number of elements in the buffer.
+// num:        approximate number of samples to print.
+//
+// Does nothing for a null buffer or a non-positive size/sample count.
+void print_fp16(int16_t* ptr, int total_size, int num) {
+  if (ptr == nullptr || total_size <= 0 || num <= 0) {
+    return;
+  }
+  fpga::fpga_invalidate(ptr, total_size * sizeof(int16_t));
+  int stride = total_size / num;
+  // Asking for more samples than there are elements would give a zero stride
+  // and an endless loop; fall back to printing every element.
+  if (stride == 0) {
+    stride = 1;
+  }
+  for (int i = 0; i < total_size; i += stride) {
+    DLOG << fpga::fp16_2_fp32(ptr[i]);
+  }
+}
+// Logs roughly `num` evenly spaced samples of an FP32 buffer.
+//
+// ptr:        buffer of `total_size` floats.
+// total_size: number of elements in the buffer.
+// num:        approximate number of samples to print.
+//
+// Does nothing for a null buffer or a non-positive size/sample count.
+void print_fp32(float* ptr, int total_size, int num) {
+  if (ptr == nullptr || total_size <= 0 || num <= 0) {
+    return;
+  }
+  fpga::fpga_invalidate(ptr, total_size * sizeof(float));
+  int stride = total_size / num;
+  // Asking for more samples than there are elements would give a zero stride
+  // and an endless loop; fall back to printing every element.
+  if (stride == 0) {
+    stride = 1;
+  }
+  for (int i = 0; i < total_size; i += stride) {
+    DLOG << ptr[i];
+  }
+}
+// Runs one bypass operation from the global ifm buffer (FP32) to the global
+// ofm buffer (FP16), using the global scale buffers. format_data() must have
+// been called first so the buffers exist.
+void test_bypass() {
+  // Value-initialize so that any field not assigned below is zero rather
+  // than indeterminate when the struct is handed to the driver.
+  fpga::BypassArgs args = {};
+  args.input_data_type = input_type;
+  args.output_data_type = output_type;
+  args.image.address = ifm;
+  args.image.height = H;
+  args.image.width = W;
+  args.image.channels = C;
+  args.image.scale_address = reinterpret_cast<float*>(ifm_scale);
+  args.output.address = ofm;
+  args.output.scale_address = reinterpret_cast<float*>(ofm_scale);
+  fpga::PerformBypass(args);
+}
+// Test driver: opens the device, prepares the buffers, prints samples of the
+// input, runs the bypass, then prints samples of the output.
+int main() {
+  // The return value of open_device() is not checked here.
+  paddle_mobile::fpga::open_device();
+  format_data();
+  DLOG << "format data done";
+  print_fp32(reinterpret_cast<float*>(ifm), ifm_size, 200);
+  DLOG << "print input done";
+  test_bypass();
+  DLOG << "test done";
+  // NOTE(review): ofm was allocated with ofm_size elements, but only the
+  // first ifm_size are sampled here; confirm this is intended.
+  print_fp16(reinterpret_cast<int16_t*>(ofm), ifm_size, 200);
+  std::cout << "Computation done" << std::endl;
+  return 0;
+}
+#endif
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -133,9 +133,11 @@ if (CON GREATER -1)
  set(SOFTMAX_OP ON)
  set(FUSION_CONVBNRELU_OP ON)
  set(FUSION_CONVBN_OP ON)
-#  set(CONV_TRANSPOSE_OP ON)
+  set(CONV_TRANSPOSE_OP ON)
-#  set(SLICE_OP ON)
+  set(FUSION_DECONVRELU_OP ON)
-#  set(ELEMENTWISEADD_OP ON)
+  set(SLICE_OP ON)
+  set(TANH_OP ON)
+  set(ELEMENTWISEADD_OP ON)
  set(FOUND_MATCH ON)
 endif()
@@ -445,3 +447,9 @@ if (DEQUANT_OP)
  add_definitions(-DDEQUANT_OP)
 endif()
+if (TANH_OP)
+  add_definitions(-DTANH_OP)
+endif()
+if (FUSION_DECONVRELU_OP)
+  add_definitions(-DFUSION_DECONVRELU_OP)
+endif()
\ No newline at end of file