From c812a2513fd944cc719cb7210d8d265075838e16 Mon Sep 17 00:00:00 2001 From: zhangyang Date: Mon, 12 Nov 2018 22:45:44 +0800 Subject: [PATCH] update V2 for FPGA track --- src/common/types.cpp | 6 +- src/common/types.h | 3 + src/fpga/V2/api.cpp | 98 +++++++------- src/fpga/V2/api.h | 109 +-------------- src/fpga/V2/bias_scale.cpp | 3 +- src/fpga/V2/driver/bitmap.cpp | 4 +- src/fpga/V2/driver/bitmap.h | 2 +- src/fpga/V2/driver/driver.cpp | 90 +++++++++++-- src/fpga/V2/driver/driver.h | 29 +++- src/fpga/V2/driver/pe.cpp | 28 ++-- src/fpga/V2/driver/pe.h | 3 +- src/fpga/V2/filter.cpp | 3 + src/fpga/V2/fpga_common.cpp | 44 ++++++ src/fpga/V2/fpga_common.h | 125 ++++++++++++++++++ src/fpga/V2/image.cpp | 4 + src/framework/executor.cpp | 13 +- src/io/paddle_mobile.cpp | 6 +- src/memory/t_malloc.cpp | 2 +- src/operators/conv_transpose_op.cpp | 1 + src/operators/fusion_deconv_relu_op.cpp | 32 +++++ src/operators/fusion_deconv_relu_op.h | 107 +++++++++++++++ src/operators/kernel/deconv_relu_kernel.h | 39 ++++++ .../kernel/fpga/V2/conv_transpose_kernel.cpp | 34 +++++ .../kernel/fpga/V2/deconv_relu_kernel.cpp | 36 +++++ src/operators/kernel/fpga/V2/slice_kernel.cpp | 1 + .../kernel/fpga/V2/softmax_kernel.cpp | 5 + src/operators/kernel/fpga/V2/tanh_kernel.cpp | 33 +++++ src/operators/kernel/tanh_kernel.h | 37 ++++++ src/operators/op_param.h | 36 +++++ src/operators/tanh_op.cpp | 35 +++++ src/operators/tanh_op.h | 44 ++++++ test/CMakeLists.txt | 3 + test/fpga/test_pe.cpp | 111 ++++++++++++++++ tools/op.cmake | 14 +- 34 files changed, 947 insertions(+), 193 deletions(-) create mode 100644 src/fpga/V2/fpga_common.cpp create mode 100644 src/fpga/V2/fpga_common.h create mode 100644 src/operators/fusion_deconv_relu_op.cpp create mode 100644 src/operators/fusion_deconv_relu_op.h create mode 100644 src/operators/kernel/deconv_relu_kernel.h create mode 100644 src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp create mode 100644 src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp create mode 100644 src/operators/kernel/fpga/V2/tanh_kernel.cpp create mode 100644 src/operators/kernel/tanh_kernel.h create mode 100644 src/operators/tanh_op.cpp create mode 100644 src/operators/tanh_op.h create mode 100644 test/fpga/test_pe.cpp diff --git a/src/common/types.cpp b/src/common/types.cpp index ede49478ce..510313d9fe 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -71,6 +71,8 @@ const char *G_OP_TYPE_SUM = "sum"; const char *G_OP_TYPE_QUANTIZE = "quantize"; const char *G_OP_TYPE_DEQUANTIZE = "dequantize"; +extern const char *G_OP_TYPE_TANH = "tanh"; +extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu"; std::unordered_map< std::string, std::pair, std::vector>> @@ -129,5 +131,7 @@ std::unordered_map< {G_OP_TYPE_SUM, {{"X"}, {"Out"}}}, {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}}, - {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}}; + {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}, + {G_OP_TYPE_TANH, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}}; } // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h index 70f6debf87..4cd35ac910 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -139,6 +139,9 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL; extern const char *G_OP_TYPE_QUANTIZE; extern const char *G_OP_TYPE_DEQUANTIZE; +extern const char *G_OP_TYPE_TANH; +extern const char *G_OP_TYPE_FUSION_DECONV_RELU; + extern std::unordered_map< std::string, std::pair, 
std::vector>> op_input_output_key; diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp index 324ee4f538..2f8a9f119e 100644 --- a/src/fpga/V2/api.cpp +++ b/src/fpga/V2/api.cpp @@ -16,27 +16,29 @@ limitations under the License. */ #include #include "fpga/V2/bias_scale.h" #include "fpga/V2/config.h" +#include "fpga/V2/driver/driver.h" #include "fpga/V2/filter.h" #include "fpga/V2/image.h" namespace paddle_mobile { namespace fpga { + static std::map memory_map; int open_device() { - int ret = open_device_driver(); + int ret = driver::open_device_driver(); return ret; } int close_device() { - int ret = close_device_driver(); + int ret = driver::close_device_driver(); return ret; } void *fpga_malloc(size_t size) { static uint64_t counter = 0; #ifdef PADDLE_MOBILE_ZU5 - auto ptr = fpga_malloc_driver(size); + auto ptr = driver::fpga_malloc_driver(size); #else auto ptr = malloc(size); #endif @@ -55,7 +57,7 @@ void fpga_free(void *ptr) { size = iter->second; memory_map.erase(iter); #ifdef PADDLE_MOBILE_ZU5 - fpga_free_driver(ptr); + driver::fpga_free_driver(ptr); #else free(ptr); #endif @@ -66,26 +68,27 @@ void fpga_free(void *ptr) { DLOG << "Invalid pointer"; } } - -half fp32_2_fp16(float fp32_num) { - unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT - auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | - (((tmp & 0x7f800000) >> 13) - (112 << 10))); - if (tmp & 0x1000) { - t++; // roundoff - } - return t; +void fpga_copy(void *dest, const void *src, size_t num) { +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_copy_driver(dest, src, num); +#else + memcpy(dest, src, num); +#endif } -float fp16_2_fp32(half fp16_num) { - int frac = (fp16_num & 0x3ff); - int exp = ((fp16_num & 0x7c00) >> 10) + 112; - int s = fp16_num & 0x8000; - int tmp = 0; - float fp32_num; - tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; // NOLINT - return fp32_num; +int fpga_flush(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_flush_driver(address, size); +#else + return 0; +#endif +} +int fpga_invalidate(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_invalidate_driver(address, size); +#else + return 0; +#endif } void format_image(framework::Tensor *image_tensor) { @@ -240,7 +243,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->filter_num = (uint32_t)filter->dims()[0]; arg->output.address = out_ptr; arg->output.scale_address = out->scale; - arg->conv_args = + arg->conv_arg = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT arg->concat_arg.image_num = arg->split_num; @@ -258,28 +261,33 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT for (int i = 0; i < n; i++) { - arg->conv_args[i].relu_enabled = relu_enabled; - arg->conv_args[i].sb_address = bs_ptr; - arg->conv_args[i].filter_address = (int8_t *)filter_ptr; // NOLINT - arg->conv_args[i].filter_scale_address = filter->scale; - arg->conv_args[i].filter_num = arg->filter_num; - arg->conv_args[i].group_num = (uint32_t)group_num; - - arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h; - arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w; - arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; - arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; - - arg->conv_args[i].image.address = input_ptr; - arg->conv_args[i].image.scale_address = input->scale; - arg->conv_args[i].image.channels = 
(uint32_t)input->dims()[1]; - arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; - arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; - arg->conv_args[i].image.pad_height = (uint32_t)padding_h; - arg->conv_args[i].image.pad_width = (uint32_t)padding_w; - - arg->conv_args[i].output.address = out_ptr; - arg->conv_args[i].output.scale_address = out->scale; + arg->conv_arg[i].relu_enabled = relu_enabled; + arg->conv_arg[i].sb_address = bs_ptr; + arg->conv_arg[i].filter_address = (int8_t *)filter_ptr; // NOLINT + arg->conv_arg[i].filter_scale_address = filter->scale; + arg->conv_arg[i].filter_num = arg->filter_num; + arg->conv_arg[i].group_num = (uint32_t)group_num; + + arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; + arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; + arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2]; + arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3]; + + arg->conv_arg[i].image.address = input_ptr; + arg->conv_arg[i].image.scale_address = input->scale; + arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1]; + arg->conv_arg[i].image.height = (uint32_t)input->dims()[2]; + arg->conv_arg[i].image.width = (uint32_t)input->dims()[3]; + arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; + arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; + + arg->conv_arg[i].output.address = out_ptr; + arg->conv_arg[i].output.scale_address = out->scale; + + int num_after_alignment = + filter::calc_aligned_num((int)input->dims()[1], arg->filter_num); + arg->conv_arg[i].free_space = + fpga_malloc(num_after_alignment * 2 * sizeof(half)); } } diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h index aac97bec22..1f4a203936 100644 --- a/src/fpga/V2/api.h +++ b/src/fpga/V2/api.h @@ -14,118 +14,20 @@ limitations under the License. 
*/ #pragma once -#include -#include -#include -#include -#include "fpga/V2/driver/driver.h" #include "fpga/V2/driver/pe.h" +#include "fpga/V2/fpga_common.h" #include "framework/tensor.h" namespace paddle_mobile { namespace fpga { -enum DataType { - DATA_TYPE_FP32 = 1, - DATA_TYPE_FP16 = 0, -}; - -enum LayoutType { - LAYOUT_CHW = 1, - LAYOUT_HWC = 0, -}; - -struct KernelArgs { - uint32_t width; - uint32_t height; - uint32_t stride_w; - uint32_t stride_h; -}; - -struct ImageInputArgs { - void* address; // input featuremap virtual address - float* scale_address; // input scale address; - uint32_t channels; - uint32_t width; // featuremap width - uint32_t height; - uint32_t pad_width; // padding width; - uint32_t pad_height; -}; - -struct ImageOutputArgs { - void* address; // output result address; - float* scale_address; // output scale address; - uint64_t timer_cnt; // time counter for FPGA computation -}; - -struct ConvArgs { - bool relu_enabled; - void* sb_address; // scale and bias are interlaced; - void* filter_address; - float* filter_scale_address; - uint32_t filter_num; - uint32_t group_num; - - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct ConcatArgs { - uint32_t image_num; - half** images_in; - float** scales_in; - void* image_out; - float* scale_out; - uint32_t* channel_num; - uint32_t* aligned_channel_num; - uint32_t out_channel; - uint32_t height; - uint32_t width; -}; - -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_args; - struct ConcatArgs concat_arg; -}; - -struct PoolingArgs { - int16_t mode; // mode: 0:max, 1:avg - half kernel_reciprocal; - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct EWAddArgs { - bool relu_enabled; - - uint32_t const0; // output0 = const0 x input0 + const1 x input1; - uint32_t const1; - struct ImageInputArgs image0; - struct ImageInputArgs image1; - struct ImageOutputArgs output; -}; - -struct BypassArgs { - enum DataType input_data_type; - enum DataType output_data_type; - enum LayoutType input_layout_type; - enum LayoutType output_layout_type; - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - int open_device(); int close_device(); void* fpga_malloc(size_t size); void fpga_free(void* ptr); - -static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } +void fpga_copy(void* dest, const void* src, size_t num); +int fpga_flush(void* address, size_t size); +int fpga_invalidate(void* address, size_t size); float filter_find_max(framework::Tensor* filter_tensor); int get_aligned_channel_num(int channel_num); @@ -153,8 +55,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, bool relu_enabled, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float* bs_ptr); -half fp32_2_fp16(float fp32_num); -float fp16_2_fp32(half fp16_num); - } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V2/bias_scale.cpp b/src/fpga/V2/bias_scale.cpp index 8a0fd42619..3afd3f51bb 100644 --- a/src/fpga/V2/bias_scale.cpp +++ b/src/fpga/V2/bias_scale.cpp @@ -27,7 +27,7 @@ void align_element(float **data_in, int num, int num_after_alignment) { (float *)fpga_malloc(total_element * sizeof(float)); // NOLINT memset(ptr_aligned, 0, total_element * sizeof(float)); - for (int i = 1; i < num; i++) { + for (int i = 0; i < num; i++) { ptr_aligned[i * 
2 + 0] = ptr_unaligned[i];
     ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num];
   }
@@ -39,6 +39,7 @@ void align_element(float **data_in, int num, int num_after_alignment) {
 
 void format_bias_scale_array(float **data_in, int num,
                              int num_after_alignment) {
   align_element(data_in, num, num_after_alignment);
+  fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float));
 }
 
 }  // namespace bias_scale
diff --git a/src/fpga/V2/driver/bitmap.cpp b/src/fpga/V2/driver/bitmap.cpp
index 9c99f6446c..c612faa6ae 100644
--- a/src/fpga/V2/driver/bitmap.cpp
+++ b/src/fpga/V2/driver/bitmap.cpp
@@ -57,8 +57,8 @@ static uint64_t ffs(uint64_t data) {
   uint64_t bit = 0;
   int i = 0;
 
-  for (i = 0; i < sizeof(data); i++) {
-    if (data & (1 << i)) {
+  for (i = 0; i < sizeof(data) * 8; i++) {
+    if (data & (1UL << i)) {
       bit = i;
       break;
     }
diff --git a/src/fpga/V2/driver/bitmap.h b/src/fpga/V2/driver/bitmap.h
index 272cddf233..4cb1673d91 100644
--- a/src/fpga/V2/driver/bitmap.h
+++ b/src/fpga/V2/driver/bitmap.h
@@ -25,7 +25,7 @@ limitations under the License. */
 #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
 #define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
 
-#define round_down(x, y) ((x) & ((y)-1))
+#define round_down(x, y) ((x) & ~((y)-1))
 
 namespace fpga_bitmap {
 void bitmap_set(uint64_t *map, unsigned int start, int len);
diff --git a/src/fpga/V2/driver/driver.cpp b/src/fpga/V2/driver/driver.cpp
index ed78fa5ebc..d7e7178267 100644
--- a/src/fpga/V2/driver/driver.cpp
+++ b/src/fpga/V2/driver/driver.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <unistd.h>
@@ -32,6 +33,7 @@ limitations under the License. */
 
 namespace paddle_mobile {
 namespace fpga {
+namespace driver {
 struct FPGA_INFO g_fpgainfo;
 
 int open_drvdevice() {
@@ -43,7 +45,8 @@ int open_memdevice() {
   if (g_fpgainfo.fd_mem == -1) {
-    g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
+    // g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
+    g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR);
   }
   return g_fpgainfo.fd_mem;
 }
@@ -51,7 +54,6 @@ int open_memdevice() {
 
 void pl_reset() {
   // DLOG << "PL RESET";
-  // reg_writeq(0x5a, REG_FPGA_RESET);
   usleep(100 * 1000);
 }
 
@@ -131,7 +133,7 @@ int pl_get_status() { return 0; }
 
 int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
   uint64_t i = 0;
   /* timeout accuracy to be confirmed */
-  int64_t timeout = time * CPU_FREQ / 1000000;
+  int64_t timeout = time * 6;
 
   for (i = 0; i < timeout; i++) {
     if (val == reg_readq(reg)) {
@@ -173,9 +175,14 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
 }
 
 void memory_release(struct fpga_memory *memory) {
-  pthread_mutex_lock(&memory->mutex);
-  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
-  pthread_mutex_unlock(&memory->mutex);
+  /*unmap memory*/
+  std::map<void *, size_t> map = g_fpgainfo.fpga_addr2size_map;
+  std::map<void *, size_t>::iterator iter;
+  for (iter = map.begin(); iter != map.end(); iter++) {
+    fpga_free_driver(iter->first);
+  }
 }
 
 int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
@@ -238,7 +245,6 @@ int init_fpga_memory(struct fpga_memory *memory) {
     return rc;
   }
 
-  // spin_lock_init(&memory->spin);
   fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
   fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);  // NOTE reserve fpga page 0.
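fpga_free_driver (further down in this diff) erases the freed block from g_fpgainfo.fpga_addr2size_map, so the rewritten memory_release above deliberately walks a copy of that map rather than the live one; otherwise each free would invalidate the iterator mid-loop. A standalone sketch of the idiom, with simplified, hypothetical names standing in for the driver's bookkeeping:

    #include <map>

    static std::map<void *, size_t> live_allocations;  // stand-in for fpga_addr2size_map

    static void free_one(void *ptr) {
      // Like fpga_free_driver, this erases from the live map.
      live_allocations.erase(ptr);
    }

    static void release_all() {
      // Walk a snapshot so erasures in free_one() cannot invalidate the
      // iterator being advanced.
      std::map<void *, size_t> snapshot = live_allocations;
      for (auto iter = snapshot.begin(); iter != snapshot.end(); ++iter) {
        free_one(iter->first);
      }
    }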
@@ -293,9 +299,23 @@ void *fpga_reg_malloc(size_t size) {
   return ret;
 }
 
+void fpga_reg_free(void *ptr) {
+  size_t size = 0;
+
+  auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
+  if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
+    size = iter->second;
+    g_fpgainfo.fpga_addr2size_map.erase(iter);
+    munmap(ptr, size);
+  } else {
+    DLOG << "Invalid pointer";
+  }
+}
+
 void *fpga_malloc_driver(size_t size) {
   void *ret = nullptr;
   uint64_t phy_addr = 0;
+  int i = 0;
 
   memory_request(g_fpgainfo.memory_info, size, &phy_addr);
 
@@ -311,17 +331,70 @@ void *fpga_malloc_driver(size_t size) {
 
 void fpga_free_driver(void *ptr) {
   size_t size = 0;
+  uint32_t pos = 0;
+  uint64_t p_addr = 0;
 
   auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
   if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
     size = iter->second;
     g_fpgainfo.fpga_addr2size_map.erase(iter);
     munmap(ptr, size);
+
+    p_addr = vaddr_to_paddr(ptr);
+    pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
+
+    /*clear bitmap*/
+    pthread_mutex_lock(&g_fpgainfo.memory_info->mutex);
+    fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
+                              g_fpgainfo.memory_info->nr[pos]);
+    pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
   } else {
     DLOG << "Invalid pointer";
   }
 }
 
+static inline int do_ioctl(unsigned long req, const void *arg) {
+  return ioctl(g_fpgainfo.fd_mem, req, arg);
+}
+
+int fpga_flush_driver(void *address, size_t size) {
+  struct MemoryCacheArgs args;
+  uint64_t p_addr;
+
+  p_addr = vaddr_to_paddr(address);
+
+  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
+  args.size = size;
+
+  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
+}
+
+int fpga_invalidate_driver(void *address, size_t size) {
+  struct MemoryCacheArgs args;
+  uint64_t p_addr;
+
+  p_addr = vaddr_to_paddr(address);
+
+  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
+  args.size = size;
+
+  return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
+}
+
+void fpga_copy_driver(void *dest, const void *src, size_t num) {
+  uint64_t i;
+
+  DLOG << "dest:" << dest << " src:" << src << " size:" << num;
+
+  for (i = 0; i < num; i++) {
+    // DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
+    // usleep(1);
+    *((int8_t *)dest + i) = *((int8_t *)src + i);
+  }
+
+  return;
+}
+
 int open_device_driver() {
   g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR;
   g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR;
@@ -347,12 +420,13 @@ int open_device_driver() {
 
 int close_device_driver() {
   pl_destroy();
-  fpga_free_driver(g_fpgainfo.FpgaRegVirAddr);
+  fpga_reg_free(g_fpgainfo.FpgaRegVirAddr);
   memory_release(g_fpgainfo.memory_info);
   destroy_fpga_memory(g_fpgainfo.memory_info);
 
   return 0;
 }
+}  // namespace driver
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/driver/driver.h b/src/fpga/V2/driver/driver.h
index ee01454ac5..633e95ea82 100644
--- a/src/fpga/V2/driver/driver.h
+++ b/src/fpga/V2/driver/driver.h
@@ -24,6 +24,7 @@ limitations under the License.
*/ namespace paddle_mobile { namespace fpga { +namespace driver { #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) @@ -47,6 +48,15 @@ const int PE_IDX_BYPASS = 3; enum pe_status { IDLE = 0, BUSY = 1 }; +struct MemoryCacheArgs { + void *offset; + size_t size; +}; + +#define IOCTL_FPGA_MAGIC 'FPGA' +#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) +#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) + struct fpga_pe { char type_name[MAX_TYPE_NAME_LENTH + 1]; struct pe_data_s *outer; @@ -95,26 +105,39 @@ extern struct FPGA_INFO g_fpgainfo; inline uint64_t reg_readq(uint32_t offset) { // DLOG << "offset : " << offset; - uint64_t value = - *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset); // NOLINT + uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + + offset); // NOLINT return value; } inline void reg_writeq(uint64_t value, uint32_t offset) { // DLOG << "offset : " << offset << ", value : " << value; - *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset) = // NOLINT + *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + + offset) = // NOLINT value; } int open_device_driver(); + int close_device_driver(); + void *fpga_malloc_driver(size_t size); + void fpga_free_driver(void *ptr); + +void fpga_copy_driver(void *dest, const void *src, size_t num); + +int fpga_flush_driver(void *address, size_t size); + +int fpga_invalidate_driver(void *address, size_t size); + /*pe*/ uint64_t vaddr_to_paddr(void *address); + int fpga_regpoll(uint64_t reg, uint64_t val, int time); +} // namespace driver } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/driver/pe.cpp index 52cde04601..2e806bfb37 100644 --- a/src/fpga/V2/driver/pe.cpp +++ b/src/fpga/V2/driver/pe.cpp @@ -20,29 +20,29 @@ limitations under the License. 
*/
 
 namespace paddle_mobile {
 namespace fpga {
-#define MUL8(x) (x * 8)
+#define MUL8(x) ((x)*8)
 #define BYPASS_DONE 1
 
 float Findfp16Max() {
   uint16_t abs_vals[16];
   uint64_t max_fp16;
 
-  max_fp16 = reg_readq(MUL8(49));
+  max_fp16 = driver::reg_readq(MUL8(49));
   abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
   abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(50));
+  max_fp16 = driver::reg_readq(MUL8(50));
   abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
   abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(51));
+  max_fp16 = driver::reg_readq(MUL8(51));
   abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16));         // NOLINT
   abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16));   // NOLINT
   abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(52));
+  max_fp16 = driver::reg_readq(MUL8(52));
   abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16));
   abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
@@ -58,7 +58,7 @@ float Findfp16Max() {
 }
 
 int ComputeFpgaConv(const struct SplitConvArgs &args) {
-  ComputeBasicConv(args.conv_args[0]);
+  return ComputeBasicConv(args.conv_arg[0]);
 }
 
 int ComputeBasicConv(const struct ConvArgs &args) {
@@ -166,8 +166,8 @@ int PerformBypass(const struct BypassArgs &args) {
   return 0;
 #endif
 
-  uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
-  uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
+  uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
+  uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
   uint64_t bp_enable;
   int64_t length;
   uint64_t pixels;
@@ -196,16 +196,16 @@
   }
 
   // start bypass
-  reg_writeq(ifm_src_paddr, MUL8(27));
-  reg_writeq(ifm_dst_paddr, MUL8(28));
-  reg_writeq(0, MUL8(0));
-  reg_writeq(bp_enable, MUL8(0));
+  driver::reg_writeq(ifm_src_paddr, MUL8(27));
+  driver::reg_writeq(ifm_dst_paddr, MUL8(28));
+  driver::reg_writeq(0, MUL8(0));
+  driver::reg_writeq(bp_enable, MUL8(0));
   // poll
   int ret = -1;
-  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
+  ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
   if (ret != -1) {
     // clear "irq"
-    reg_readq(MUL8(63));
+    driver::reg_readq(MUL8(63));
   }
   // get max value
   if ((!args.input_data_type) && (!args.output_data_type)) {
diff --git a/src/fpga/V2/driver/pe.h b/src/fpga/V2/driver/pe.h
index 4ec3ccb01d..4903bf4c33 100644
--- a/src/fpga/V2/driver/pe.h
+++ b/src/fpga/V2/driver/pe.h
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "fpga/V2/api.h"
+
+#include "fpga/V2/fpga_common.h"
 
 namespace paddle_mobile {
 namespace fpga {
diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp
index 39d67b2d2d..ce278edbee 100644
--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
@@ -94,6 +94,7 @@ void format_filter(float **data_in, int num, int channel, int height,
                    int width) {
   convert_to_hwc(data_in, num, channel, height, width);
   align_filter(data_in, num, channel, height, width);
   int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  fpga_flush(*data_in, pixel_num * sizeof(float));
 }
 
 void convert_fc_filter(float **data_in, int num, int chw) {
@@ -113,6 +114,8 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
   int chw = channel * height * width;
   convert_fc_filter(data_in, num, chw);
   align_filter(data_in, num, channel, height, width);
+  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  fpga_flush(*data_in, pixel_num * sizeof(float));
 }
 
 float find_max(float *data_in, int data_size) {
diff --git a/src/fpga/V2/fpga_common.cpp b/src/fpga/V2/fpga_common.cpp
new file mode 100644
index 0000000000..01bca30a9c
--- /dev/null
+++ b/src/fpga/V2/fpga_common.cpp
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/V2/fpga_common.h"
+namespace paddle_mobile {
+namespace fpga {
+
+int16_t fp32_2_fp16(float fp32_num) {
+  unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
+  auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
+                     (((tmp & 0x7f800000) >> 13) - (112 << 10)));
+  if (tmp & 0x1000) {
+    t++;  // roundoff
+  }
+  return t;
+}
+
+float fp16_2_fp32(int16_t fp16_num) {
+  if (0 == fp16_num) {
+    return 0;
+  }
+  int frac = (fp16_num & 0x3ff);
+  int exp = ((fp16_num & 0x7c00) >> 10) + 112;
+  int s = fp16_num & 0x8000;
+  int tmp = 0;
+  float fp32_num;
+  tmp = s << 16 | exp << 23 | frac << 13;
+  fp32_num = *(float *)&tmp;  // NOLINT
+  return fp32_num;
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/fpga_common.h b/src/fpga/V2/fpga_common.h
new file mode 100644
index 0000000000..1862d84350
--- /dev/null
+++ b/src/fpga/V2/fpga_common.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+
+namespace paddle_mobile {
+namespace fpga {
+
+enum DataType {
+  DATA_TYPE_FP32 = 1,
+  DATA_TYPE_FP16 = 0,
+};
+
+enum LayoutType {
+  LAYOUT_CHW = 1,
+  LAYOUT_HWC = 0,
+};
+
+struct KernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_w;
+  uint32_t stride_h;
+};
+
+struct ImageInputArgs {
+  void* address;         // input featuremap virtual address
+  float* scale_address;  // input scale address;
+  uint32_t channels;
+  uint32_t width;  // featuremap width
+  uint32_t height;
+  uint32_t pad_width;  // padding width;
+  uint32_t pad_height;
+};
+
+struct ImageOutputArgs {
+  void* address;         // output result address;
+  float* scale_address;  // output scale address;
+  uint64_t timer_cnt;    // time counter for FPGA computation
+};
+
+struct ConvArgs {
+  bool relu_enabled;
+  void* sb_address;  // scale and bias
+  void* filter_address;
+  float* filter_scale_address;
+  void* free_space;  // used by FPGA logic
+  uint32_t filter_num;
+  uint32_t group_num;
+
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+
+struct ConcatArgs {
+  uint32_t image_num;
+  int16_t** images_in;
+  float** scales_in;
+  void* image_out;
+  float* scale_out;
+  uint32_t* channel_num;
+  uint32_t* aligned_channel_num;
+  uint32_t out_channel;
+  uint32_t height;
+  uint32_t width;
+};
+
+struct SplitConvArgs {
+  uint32_t split_num;
+  uint32_t group_num;
+  uint32_t filter_num;
+  struct ImageOutputArgs output;
+  struct ConvArgs* conv_arg;
+  struct ConcatArgs concat_arg;
+};
+
+struct PoolingArgs {
+  int16_t mode;  // mode: 0:max, 1:avg
+  int16_t kernel_reciprocal;
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+
+struct EWAddArgs {
+  bool relu_enabled;
+  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
+  uint32_t const1;
+  struct ImageInputArgs image0;
+  struct ImageInputArgs image1;
+  struct ImageOutputArgs output;
+};
+
+struct BypassArgs {
+  enum DataType input_data_type;
+  enum DataType output_data_type;
+  enum LayoutType input_layout_type;
+  enum LayoutType output_layout_type;
+  struct ImageInputArgs image;
+  struct ImageOutputArgs output;
+};
+
+struct DeconvArgs {
+  struct ConvArgs conv_arg;
+};
+
+static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
+
+int16_t fp32_2_fp16(float fp32_num);
+float fp16_2_fp32(int16_t fp16_num);
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp
index 4ce76cd00f..26829bfba6 100644
--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
@@ -58,6 +58,7 @@ void format_image(float **data_in, int channel, int height, int width,
                   int aligned_channel) {
   convert_to_hwc(data_in, channel, height, width);
   align_image(data_in, channel, height, width, aligned_channel);
+  fpga_flush(*data_in, aligned_channel * height * width * sizeof(float));
 }
 
 void concat_images(int16_t **images_in, float **scales_in, void *image_out,
@@ -69,6 +70,8 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
   scale_out[1] = 0.0;
   for (int i = 0; i < image_num; i++) {
     scale_out[0] = std::max(*scale_out, scales_in[i][0]);
+    fpga_invalidate(images_in[i],
+                    height * width * aligned_channel_num[i] * sizeof(int16_t));
   }
   scale_out[1] = 1 / scale_out[0];
 
@@ -83,6 +86,7 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
       tmp_channel_sum += channel_num[i];
     }
   }
+  fpga_flush(image_out, hw * out_channel * sizeof(int16_t));
 }
 
 }  // namespace image
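Taken together, the V2 changes above establish a cache-maintenance discipline for the DDR region shared with the FPGA: flush the CPU cache after the host writes a buffer the FPGA will read, and invalidate before the host reads a buffer the FPGA wrote (format_filter, format_image and concat_images already follow it). A minimal sketch of that protocol using this patch's API, with made-up buffer sizes and the FPGA job itself elided:

    #include <cstring>
    #include "fpga/V2/api.h"

    namespace fpga = paddle_mobile::fpga;

    void cache_protocol_sketch() {
      size_t in_bytes = 1024 * sizeof(float);  // hypothetical sizes
      size_t out_bytes = 1024 * sizeof(int16_t);
      void *in = fpga::fpga_malloc(in_bytes);
      void *out = fpga::fpga_malloc(out_bytes);

      std::memset(in, 0, in_bytes);
      // The host wrote through its cache; push the dirty lines to DDR so
      // the FPGA reads current data.
      fpga::fpga_flush(in, in_bytes);

      // ... launch an FPGA job that reads `in` and writes `out` ...

      // The FPGA wrote DDR behind the CPU cache; drop the stale lines
      // before the host reads the result.
      fpga::fpga_invalidate(out, out_bytes);

      fpga::fpga_free(out);
      fpga::fpga_free(in);
    }

On non-ZU5 builds fpga_flush and fpga_invalidate compile to no-ops and fpga_copy falls back to memcpy, so code written against this discipline stays portable.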
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 0ed3a5d323..c7ef09ed5a 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "framework/program/var_desc.h"
 #include "framework/scope.h"
 #include "framework/tensor.h"
+#include "memory/t_malloc.h"
 
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
 #include <queue>
@@ -86,8 +87,10 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
   }
   std::shared_ptr<framework::BlockDesc> to_predict_block =
       to_predict_program_->Block(0);
+  int i = 0;
   auto &ops = ops_of_block_[*to_predict_block.get()];
   for (const auto &op : ops) {
+    DLOG << "Initialize op[" << i++ << "]: " << op->Type();
     op->Init();
   }
 }
@@ -102,8 +105,8 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
     // should be moved into operator init function
     float min_value;
     float max_value;
-    memcpy(&min_value, data_buf, sizeof(float));
-    memcpy(&max_value, data_buf + sizeof(float), sizeof(float));
+    memory::Copy(&min_value, data_buf, sizeof(float));
+    memory::Copy(&max_value, data_buf + sizeof(float), sizeof(float));
     data_buf += 2 * sizeof(float);
     const float factor = (max_value - min_value) / 255.0;
     const uint8_t *uint8_data = reinterpret_cast<const uint8_t *>(data_buf);
@@ -112,7 +115,7 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
     }
     data_buf += size * sizeof(uint8_t);
   } else {
-    memcpy(tensor_data, *data_buf, size * sizeof(Dtype));
+    memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
     *data_buf += size * sizeof(Dtype);
   }
 }
@@ -128,7 +131,7 @@ void Executor<Dtype, P>::LoadMemory(
   // lod information
   // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
   uint64_t lod_level = 0;
-  memcpy(&lod_level, *data_buf, sizeof(uint64_t));
+  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
   *data_buf += sizeof(uint64_t);
 
   auto *lod = tensor->mutable_lod();
@@ -137,7 +140,7 @@ void Executor<Dtype, P>::LoadMemory(
     uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
     *data_buf += sizeof(uint64_t);
     std::vector<size_t> tmp_dim(size / sizeof(size_t));
-    memcpy(tmp_dim.data(), *data_buf, size);
+    memory::Copy(tmp_dim.data(), *data_buf, size);
     (*lod)[i] = std::move(tmp_dim);
     *data_buf += size;
   }
diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp
index fca870860e..1a28373f6a 100644
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -21,7 +21,6 @@ limitations under the License. */
*/ #include "operators/math/gemm.h" namespace paddle_mobile { -static std::mutex lc; template void PaddleMobile::SetThreadNum(int num) { #ifdef _OPENMP @@ -148,8 +147,8 @@ double PaddleMobile::GetPredictTime() { } paddle_mobile::operators::math::Gemm gemm; auto time1 = paddle_mobile::time(); - gemm.Sgemm(m, n, k, static_cast(1), a, lda, b, ldb, - static_cast(0), c, ldc, false, nullptr); +// gemm.Sgemm(m, n, k, static_cast(1), a, lda, b, ldb, +// static_cast(0), c, ldc, false, nullptr); auto time2 = paddle_mobile::time(); double cost = paddle_mobile::time_diff(time1, time2); paddle_mobile::memory::Free(a); @@ -199,6 +198,7 @@ void PaddleMobile::Predict_To(int end) { #endif #ifdef PADDLE_MOBILE_CL +static std::mutex lc; template void PaddleMobile::SetCLPath(std::string path) { std::lock_guard lock(lc); diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp index 129f82a19d..2fb74d1880 100644 --- a/src/memory/t_malloc.cpp +++ b/src/memory/t_malloc.cpp @@ -32,7 +32,7 @@ const int MALLOC_ALIGN = 64; namespace fpga = paddle_mobile::fpga; void Copy(void *dst, const void *src, size_t num) { - std::memcpy(dst, src, num); + fpga::fpga_copy(dst, src, num); } void *Alloc(size_t size) { return fpga::fpga_malloc(size); } diff --git a/src/operators/conv_transpose_op.cpp b/src/operators/conv_transpose_op.cpp index 4d9eefaa85..d09a793745 100644 --- a/src/operators/conv_transpose_op.cpp +++ b/src/operators/conv_transpose_op.cpp @@ -27,6 +27,7 @@ REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose); #ifdef PADDLE_MOBILE_MALI_GPU #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(conv2d_transpose, ops::ConvOpTranspose); #endif #endif diff --git a/src/operators/fusion_deconv_relu_op.cpp b/src/operators/fusion_deconv_relu_op.cpp new file mode 100644 index 0000000000..daae39c951 --- /dev/null +++ b/src/operators/fusion_deconv_relu_op.cpp @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVRELU_OP + +#include "operators/fusion_deconv_relu_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_deconv_relu, ops::FusionDeconvReluOp); +#endif + +#endif diff --git a/src/operators/fusion_deconv_relu_op.h b/src/operators/fusion_deconv_relu_op.h new file mode 100644 index 0000000000..e87d5d3798 --- /dev/null +++ b/src/operators/fusion_deconv_relu_op.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVRELU_OP
+#pragma once
+#include <string>
+#include <vector>
+
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/deconv_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusionDeconvReluMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionDeconvReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(), {}, removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_DECONV_RELU; }
+};
+
+template <typename DeviceType, typename T>
+class FusionDeconvReluOp : public framework::OperatorWithKernel<
+                               DeviceType, FusionDeconvReluParam<DeviceType>,
+                               operators::DeconvReluKernel<DeviceType, T>> {
+ public:
+  FusionDeconvReluOp(const string &type, const VariableNameMap &inputs,
+                     const VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs,
+                     std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionDeconvReluParam<DeviceType>,
+            operators::DeconvReluKernel<DeviceType, T>>(type, inputs, outputs,
+                                                        attrs, scope) {}
+
+  void InferShape() const {
+    auto input = this->param_.Input();
+    auto in_dims = input->dims();
+
+    auto filter = this->param_.Filter();
+    auto filter_dims = filter->dims();
+
+    std::vector<int> strides = this->param_.Strides();
+    std::vector<int> paddings = this->param_.Paddings();
+    std::vector<int> dilations = this->param_.Dilations();
+
+    int groups = this->param_.Groups();
+
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == 4 || in_dims.size() == 5,
+        "ConvTransposeOp input should be 4-D or 5-D tensor.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == filter_dims.size(),
+        "ConvTransposeOp input dimension and filter dimension "
+        "should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() - strides.size() == 2U,
+        "ConvTransposeOp input dimension and strides dimension should "
+        "be consistent.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
+                          "ConvTransposeOp paddings dimension and strides "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
+                          "ConvTransposeOp paddings dimension and dilations "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims[1] == filter_dims[0],
+        "In ConvTransposeOp, The number of input channels should "
+        "be equal to the number of filter's channels.");
+
+    std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
+    for (size_t i = 0; i < strides.size(); ++i) {
+      auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
+      output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
+                             2 * paddings[i] + filter_extent);
+    }
+    this->param_.Output()->Resize(framework::make_ddim(output_shape));
+  }
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // FUSION_DECONVRELU_OP
diff --git a/src/operators/kernel/deconv_relu_kernel.h b/src/operators/kernel/deconv_relu_kernel.h
new file mode 100644
index 0000000000..bc85f1ffee
--- /dev/null
+++ b/src/operators/kernel/deconv_relu_kernel.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVRELU_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class DeconvReluKernel
+    : public OpKernelBase<DeviceType, FusionDeconvReluParam<DeviceType>> {
+ public:
+  void Compute(const FusionDeconvReluParam<DeviceType> &param);
+
+  bool Init(FusionDeconvReluParam<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
new file mode 100644
index 0000000000..3284ddcdec
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONV_TRANSPOSE_OP
+
+#include "operators/kernel/conv_transpose_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
+  return true;
+}
+
+template <>
+void ConvTransposeKernel<FPGA, float>::Compute(
+    const ConvTransposeParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp
new file mode 100644
index 0000000000..bf3556609a
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVRELU_OP
+
+#include "operators/kernel/deconv_relu_kernel.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DeconvReluKernel<FPGA, float>::Init(FusionDeconvReluParam<FPGA> *param) {
+  return true;
+}
+
+template <>
+void DeconvReluKernel<FPGA, float>::Compute(
+    const FusionDeconvReluParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/slice_kernel.cpp b/src/operators/kernel/fpga/V2/slice_kernel.cpp
index b0df0cb65d..bc3fbfd796 100644
--- a/src/operators/kernel/fpga/V2/slice_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp
@@ -24,6 +24,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
 }
 template <>
 void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
+
 }  // namespace operators
 }  // namespace paddle_mobile
 #endif
diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
index 5cfccf8779..bbdb35b715 100644
--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
@@ -49,7 +49,12 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   Tensor *out = param.Out();
 
   fpga::PerformBypass(param.FpgaArgs());
+  fpga::fpga_invalidate(
+      (void *)in_x->data<float>(),                           // NOLINT
+      fpga::get_aligned_channel_num((int)in_x->dims()[1]) *  // NOLINT
+          sizeof(float));
 
   math::SoftmaxFuntor<CPU, float>()(in_x, out);
+  fpga::fpga_flush(out->data<float>(), out->memory_size());
 }
 
 }  // namespace operators
diff --git a/src/operators/kernel/fpga/V2/tanh_kernel.cpp b/src/operators/kernel/fpga/V2/tanh_kernel.cpp
new file mode 100644
index 0000000000..46dd3a0f6f
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/tanh_kernel.cpp
@@ -0,0 +1,33 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TANH_OP
+
+#include "operators/kernel/tanh_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
+  return true;
+}
+
+template <>
+void TanhKernel<FPGA, float>::Compute(const TanhParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/tanh_kernel.h b/src/operators/kernel/tanh_kernel.h
new file mode 100644
index 0000000000..035f64f840
--- /dev/null
+++ b/src/operators/kernel/tanh_kernel.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef TANH_OP
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class TanhKernel : public OpKernelBase<DeviceType, TanhParam<DeviceType>> {
+ public:
+  void Compute(const TanhParam<DeviceType>& param);
+  bool Init(TanhParam<DeviceType>* param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 5666f8e9c9..d65ca66364 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1534,6 +1534,27 @@ class ReluParam : public ReluParamBase {
 
 #endif
 
+#ifdef TANH_OP
+template <typename Dtype>
+class TanhParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  TanhParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+            const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+  }
+  const RType *InputX() const { return input_x_; }
+  RType *Out() const { return out_; }
+
+ private:
+  RType *input_x_;
+  RType *out_;
+};
+#endif
+
 #ifdef PRELU_OP
 template <typename Dtype>
 class PReluParam : public OpParam {
@@ -2229,9 +2250,24 @@ class ConvTransposeParam : public OpParam {
   vector<int> paddings_;
   vector<int> dilations_;
   int groups;
+
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  fpga::DeconvArgs fpga_conv_args;
+
+ public:
+  const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 #endif
 
+#ifdef FUSION_DECONVRELU_OP
+template <typename Dtype>
+using FusionDeconvReluParam = ConvTransposeParam<Dtype>;
+#endif
+
 #ifdef GRU_OP
 template <typename Dtype>
 class GruParam : public OpParam {
diff --git a/src/operators/tanh_op.cpp b/src/operators/tanh_op.cpp
new file mode 100644
index 0000000000..454cdfa269
--- /dev/null
+++ b/src/operators/tanh_op.cpp
@@ -0,0 +1,35 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TANH_OP
+
+#include "operators/tanh_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+void TanhOp<DeviceType, T>::InferShape() const {
+  this->param_.Out()->Resize(this->param_.InputX()->dims());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(tanh, ops::TanhOp);
+#endif
+
+#endif
diff --git a/src/operators/tanh_op.h b/src/operators/tanh_op.h
new file mode 100644
index 0000000000..82b0e4e9a0
--- /dev/null
+++ b/src/operators/tanh_op.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TANH_OP
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/tanh_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class TanhOp : public framework::OperatorWithKernel<
+                   DeviceType, TanhParam<DeviceType>,
+                   operators::TanhKernel<DeviceType, T>> {
+ public:
+  TanhOp(const std::string &type, const VariableNameMap &inputs,
+         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+         std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, TanhParam<DeviceType>,
+                                      operators::TanhKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+  void InferShape() const override;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 79bed19be3..52a1bf3070 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -73,6 +73,9 @@ list(FIND NET "FPGA_NET_V2" CON)
 if (CON GREATER -1)
     ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-resnet50 paddle-mobile)
+
+    ADD_EXECUTABLE(test-pe fpga/test_pe.cpp)
+    target_link_libraries(test-pe paddle-mobile)
 
     set(FOUND_MATCH ON)
 endif ()
diff --git a/test/fpga/test_pe.cpp b/test/fpga/test_pe.cpp
new file mode 100644
index 0000000000..f5f2708b9e
--- /dev/null
+++ b/test/fpga/test_pe.cpp
@@ -0,0 +1,111 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
+#include "fpga/V2/filter.h"
+
+namespace fpga = paddle_mobile::fpga;
+
+static const uint32_t N = 64;
+static const uint32_t C = 3;
+static const uint32_t H = 224;
+static const uint32_t W = 224;
+static const uint32_t G = 1;
+
+fpga::DataType input_type = fpga::DATA_TYPE_FP32;
+fpga::DataType output_type = fpga::DATA_TYPE_FP16;
+
+void* ifm = nullptr;
+void* ofm = nullptr;
+void* filter = nullptr;
+void* ifm_scale = nullptr;
+void* ofm_scale = nullptr;
+void* filter_scale = nullptr;
+
+int ifm_size = 0, ofm_size = 0;
+
+void format_data() {
+  ifm_scale = fpga::fpga_malloc(8);
+  ofm_scale = fpga::fpga_malloc(8);
+  int ifm_channel = fpga::filter::calc_aligned_channel(C);
+  int ofm_channel = fpga::filter::calc_aligned_channel(N);
+  int num = fpga::filter::calc_aligned_num(N, C);
+  DLOG << "ifm_channel = " << ifm_channel;
+  DLOG << "ofm_channel = " << ofm_channel;
+  DLOG << "aligned_num = " << num;
+  ifm_size = ifm_channel * H * W;
+  ofm_size = ofm_channel * H * W;
+  ifm = fpga::fpga_malloc(ifm_size * sizeof(float));
+  ofm = fpga::fpga_malloc(ofm_size * sizeof(int16_t));
+  memset(ifm, 0, ifm_size * sizeof(float));
+  memset(ofm, 0, ofm_size * sizeof(int16_t));
+
+  for (int h = 0; h < H; h++) {
+    for (int w = 0; w < W; w++) {
+      for (int c = 0; c < C; c++) {
+        int index = h * W * ifm_channel + w * ifm_channel + c;
+        (reinterpret_cast<float*>(ifm))[index] = h + w + c * 0.1f;
+        // DLOG << index << ":" << ((float *) ifm)[index];
+      }
+    }
+  }
+  fpga::fpga_flush(ifm, ifm_size * sizeof(float));
+  fpga::fpga_flush(ofm, ofm_size * sizeof(int16_t));
+}
+
+void print_fp16(int16_t* ptr, int total_size, int num) {
+  fpga::fpga_invalidate(ptr, total_size * sizeof(int16_t));
+  int stride = total_size / num;
+  for (int i = 0; i < total_size; i += stride) {
+    DLOG << fpga::fp16_2_fp32(ptr[i]);
+  }
+}
+
+void print_fp32(float* ptr, int total_size, int num) {
+  fpga::fpga_invalidate(ptr, total_size * sizeof(float));
+  int stride = total_size / num;
+  for (int i = 0; i < total_size; i += stride) {
+    DLOG << ptr[i];
+  }
+}
+
+void test_bypass() {
+  fpga::BypassArgs args;
+  args.input_data_type = input_type;
+  args.output_data_type = output_type;
+  args.image.address = ifm;
+  args.image.height = H;
+  args.image.width = W;
+  args.image.channels = C;
+  args.image.scale_address = reinterpret_cast<float*>(ifm_scale);
+  args.output.address = ofm;
+  args.output.scale_address = reinterpret_cast<float*>(ofm_scale);
+  fpga::PerformBypass(args);
+}
+
+int main() {
+  paddle_mobile::fpga::open_device();
+  format_data();
+  DLOG << "format data done";
+  print_fp32(reinterpret_cast<float*>(ifm), ifm_size, 200);
+  DLOG << "print input done";
+  test_bypass();
+  DLOG << "test done";
+  print_fp16(reinterpret_cast<int16_t*>(ofm), ofm_size, 200);
+  std::cout << "Computation done" << std::endl;
+  return 0;
+}
+
+#endif
diff --git a/tools/op.cmake b/tools/op.cmake
index 7d19591efc..ae1ac1a4ff 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -133,9 +133,11 @@ if (CON GREATER -1)
     set(SOFTMAX_OP ON)
     set(FUSION_CONVBNRELU_OP ON)
     set(FUSION_CONVBN_OP ON)
-#    set(CONV_TRANSPOSE_OP ON)
-#    set(SLICE_OP ON)
-#    set(ELEMENTWISEADD_OP ON)
+    set(CONV_TRANSPOSE_OP ON)
+    set(FUSION_DECONVRELU_OP ON)
+    set(SLICE_OP ON)
+    set(TANH_OP ON)
+    set(ELEMENTWISEADD_OP ON)
     set(FOUND_MATCH ON)
 endif()
 
@@ -445,3 +447,9 @@ if (DEQUANT_OP)
     add_definitions(-DDEQUANT_OP)
 endif()
 
+if (TANH_OP)
+    add_definitions(-DTANH_OP)
+endif()
+if (FUSION_DECONVRELU_OP)
+    add_definitions(-DFUSION_DECONVRELU_OP)
+endif()
\ No newline at end of file
--
GitLab
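The fp32/fp16 converters moved into fpga_common.cpp handle normalized values only: fp16_2_fp32 special-cases zero on the way back, but fp32_2_fp16 does not treat zero, denormals or infinities specially. A small round-trip check within those limits (hypothetical test values, not part of the patch):

    #include <cstdint>
    #include <cstdio>
    #include "fpga/V2/fpga_common.h"

    int main() {
      // Normalized values only; zero, denormals and infinities are
      // outside the converters' supported range.
      float samples[] = {1.0f, -2.5f, 0.333f, 100.0f};
      for (float v : samples) {
        int16_t h = paddle_mobile::fpga::fp32_2_fp16(v);
        float back = paddle_mobile::fpga::fp16_2_fp32(h);
        std::printf("%f -> 0x%04x -> %f\n", v,
                    static_cast<unsigned>(static_cast<uint16_t>(h)), back);
      }
      return 0;
    }

For 1.0f this prints 0x3c00 and recovers 1.0 exactly; values with more than ten significant mantissa bits come back rounded, as expected for fp16.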