diff --git a/src/common/types.cpp b/src/common/types.cpp
index ede49478ce1a56ca603f5f8580d4d0c231d6616d..510313d9fee0940d7162ea2c6b09426f6d9ce17a 100644
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -71,6 +71,8 @@ const char *G_OP_TYPE_SUM = "sum";
 
 const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
+extern const char *G_OP_TYPE_TANH = "tanh";
+extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
 
 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
@@ -129,5 +131,7 @@ std::unordered_map<
         {G_OP_TYPE_SUM, {{"X"}, {"Out"}}},
         {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
         {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
-        {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}};
+        {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
+        {G_OP_TYPE_TANH, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}};
 }  // namespace paddle_mobile
diff --git a/src/common/types.h b/src/common/types.h
index 70f6debf8756211cf49a62d074010e2bc6d4eaa7..4cd35ac91084f6518858c97cf4c0e8da5b09555b 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -139,6 +139,9 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
 extern const char *G_OP_TYPE_QUANTIZE;
 extern const char *G_OP_TYPE_DEQUANTIZE;
 
+extern const char *G_OP_TYPE_TANH;
+extern const char *G_OP_TYPE_FUSION_DECONV_RELU;
+
 extern std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
     op_input_output_key;
diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp
index 324ee4f5381a20a9a34000045b130d61f71ec116..41adc979264b5c881cdad6a3d4a00ee245ae85d5 100644
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -16,27 +16,47 @@ limitations under the License. */
 #include <algorithm>
 #include "fpga/V2/bias_scale.h"
 #include "fpga/V2/config.h"
+<<<<<<< HEAD
+=======
+#include "fpga/V2/driver/driver.h"
+>>>>>>> upstream/develop
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
 
 namespace paddle_mobile {
 namespace fpga {
+<<<<<<< HEAD
 static std::map<void *, size_t> memory_map;
 
 int open_device() {
   int ret = open_device_driver();
+=======
+
+static std::map<void *, size_t> memory_map;
+
+int open_device() {
+  int ret = driver::open_device_driver();
+>>>>>>> upstream/develop
   return ret;
 }
 
 int close_device() {
+<<<<<<< HEAD
   int ret = close_device_driver();
+=======
+  int ret = driver::close_device_driver();
+>>>>>>> upstream/develop
   return ret;
 }
 
 void *fpga_malloc(size_t size) {
   static uint64_t counter = 0;
 #ifdef PADDLE_MOBILE_ZU5
+<<<<<<< HEAD
   auto ptr = fpga_malloc_driver(size);
+=======
+  auto ptr = driver::fpga_malloc_driver(size);
+>>>>>>> upstream/develop
 #else
   auto ptr = malloc(size);
 #endif
@@ -55,7 +75,11 @@ void fpga_free(void *ptr) {
     size = iter->second;
     memory_map.erase(iter);
 #ifdef PADDLE_MOBILE_ZU5
+<<<<<<< HEAD
     fpga_free_driver(ptr);
+=======
+    driver::fpga_free_driver(ptr);
+>>>>>>> upstream/develop
 #else
     free(ptr);
 #endif
@@ -66,6 +90,7 @@ void fpga_free(void *ptr) {
     DLOG << "Invalid pointer";
   }
 }
+<<<<<<< HEAD
 
 half fp32_2_fp16(float fp32_num) {
   unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
@@ -86,6 +111,29 @@ float fp16_2_fp32(half fp16_num) {
   tmp = s << 16 | exp << 23 | frac << 13;
   fp32_num = *(float *)&tmp;  // NOLINT
   return fp32_num;
+=======
+void fpga_copy(void *dest, const void *src, size_t num) {
+#ifdef PADDLE_MOBILE_ZU5
+  driver::fpga_copy_driver(dest, src, num);
+#else
+  memcpy(dest, src, num);
+#endif
+}
+
+int fpga_flush(void *address, size_t size) {
+#ifdef PADDLE_MOBILE_ZU5
+  return driver::fpga_flush_driver(address, size);
+#else
+  return 0;
+#endif
+}
+int fpga_invalidate(void *address, size_t size) {
+#ifdef PADDLE_MOBILE_ZU5
+  return driver::fpga_invalidate_driver(address, size);
+#else
+  return 0;
+#endif
+>>>>>>> upstream/develop
 }
 
 void format_image(framework::Tensor *image_tensor) {
@@ -240,7 +288,11 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
   arg->filter_num = (uint32_t)filter->dims()[0];
   arg->output.address = out_ptr;
   arg->output.scale_address = out->scale;
+<<<<<<< HEAD
   arg->conv_args =
+=======
+  arg->conv_arg =
+>>>>>>> upstream/develop
       (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
 
   arg->concat_arg.image_num = arg->split_num;
@@ -258,6 +310,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
       (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
 
   for (int i = 0; i < n; i++) {
+<<<<<<< HEAD
     arg->conv_args[i].relu_enabled = relu_enabled;
     arg->conv_args[i].sb_address = bs_ptr;
     arg->conv_args[i].filter_address = (int8_t *)filter_ptr;  // NOLINT
@@ -280,6 +333,35 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
 
     arg->conv_args[i].output.address = out_ptr;
     arg->conv_args[i].output.scale_address = out->scale;
+=======
+    arg->conv_arg[i].relu_enabled = relu_enabled;
+    arg->conv_arg[i].sb_address = bs_ptr;
+    arg->conv_arg[i].filter_address = (int8_t *)filter_ptr;  // NOLINT
+    arg->conv_arg[i].filter_scale_address = filter->scale;
+    arg->conv_arg[i].filter_num = arg->filter_num;
+    arg->conv_arg[i].group_num = (uint32_t)group_num;
+
+    arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
+    arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
+    arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
+    arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];
+
+    arg->conv_arg[i].image.address = input_ptr;
+    arg->conv_arg[i].image.scale_address = input->scale;
+    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
+    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
+    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
+    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
+    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
+
+    arg->conv_arg[i].output.address = out_ptr;
+    arg->conv_arg[i].output.scale_address = out->scale;
+
+    int num_after_alignment =
+        filter::calc_aligned_num((int)input->dims()[1], arg->filter_num);
+    arg->conv_arg[i].free_space =
+        fpga_malloc(num_after_alignment * 2 * sizeof(half));
+>>>>>>> upstream/develop
   }
 }
 
diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h
index aac97bec225a4940f710172c115e06452469d289..a59a5ccbe3f269b1cdfd33e4a1abb8f50291ea20 100644
--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -14,17 +14,23 @@ limitations under the License. */
 
 #pragma once
 
+<<<<<<< HEAD
 #include <stdint.h>
 #include <cstddef>
 #include <iostream>
 #include <limits>
 #include "fpga/V2/driver/driver.h"
 #include "fpga/V2/driver/pe.h"
+=======
+#include "fpga/V2/driver/pe.h"
+#include "fpga/V2/fpga_common.h"
+>>>>>>> upstream/develop
 #include "framework/tensor.h"
 
 namespace paddle_mobile {
 namespace fpga {
 
+<<<<<<< HEAD
 enum DataType {
   DATA_TYPE_FP32 = 1,
   DATA_TYPE_FP16 = 0,
@@ -120,12 +126,20 @@ struct BypassArgs {
   struct ImageOutputArgs output;
 };
 
+=======
+>>>>>>> upstream/develop
 int open_device();
 int close_device();
 void* fpga_malloc(size_t size);
 void fpga_free(void* ptr);
+<<<<<<< HEAD
 
 static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
+=======
+void fpga_copy(void* dest, const void* src, size_t num);
+int fpga_flush(void* address, size_t size);
+int fpga_invalidate(void* address, size_t size);
+>>>>>>> upstream/develop
 
 float filter_find_max(framework::Tensor* filter_tensor);
 int get_aligned_channel_num(int channel_num);
@@ -153,8 +167,11 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
                     bool relu_enabled, int group_num, int stride_h,
                     int stride_w, int padding_h, int padding_w, float* bs_ptr);
 
+<<<<<<< HEAD
 half fp32_2_fp16(float fp32_num);
 float fp16_2_fp32(half fp16_num);
 
+=======
+>>>>>>> upstream/develop
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/bias_scale.cpp b/src/fpga/V2/bias_scale.cpp
index 8a0fd426194f6ab5e699f084ff6277920d8c89b4..bbf443321d8ed73fb50b10896bed6bfc3926ca5d 100644
--- a/src/fpga/V2/bias_scale.cpp
+++ b/src/fpga/V2/bias_scale.cpp
@@ -27,7 +27,11 @@ void align_element(float **data_in, int num, int num_after_alignment) {
       (float *)fpga_malloc(total_element * sizeof(float));  // NOLINT
   memset(ptr_aligned, 0, total_element * sizeof(float));
 
+<<<<<<< HEAD
   for (int i = 1; i < num; i++) {
+=======
+  for (int i = 0; i < num; i++) {
+>>>>>>> upstream/develop
     ptr_aligned[i * 2 + 0] = ptr_unaligned[i];
     ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num];
   }
@@ -39,6 +43,10 @@ void align_element(float **data_in, int num, int num_after_alignment) {
 void format_bias_scale_array(float **data_in, int num,
                              int num_after_alignment) {
   align_element(data_in, num, num_after_alignment);
+<<<<<<< HEAD
+=======
+  fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float));
+>>>>>>> upstream/develop
 }
 
 }  // namespace bias_scale
diff --git a/src/fpga/V2/driver/bitmap.cpp b/src/fpga/V2/driver/bitmap.cpp
index 9c99f6446caf80f78a5c8737a41a4a80f93395d9..b3794ef36464bf5ccb09e7b3ec15c3447f5288b0 100644
--- a/src/fpga/V2/driver/bitmap.cpp
+++ b/src/fpga/V2/driver/bitmap.cpp
@@ -57,8 +57,13 @@ static uint64_t ffs(uint64_t data) {
   uint64_t bit = 0;
   int i = 0;
 
+<<<<<<< HEAD
   for (i = 0; i < sizeof(data); i++) {
     if (data & (1 << i)) {
+=======
+  for (i = 0; i < sizeof(data) * 8; i++) {
+    if (data & (1UL << i)) {
+>>>>>>> upstream/develop
       bit = i;
       break;
     }
diff --git a/src/fpga/V2/driver/bitmap.h b/src/fpga/V2/driver/bitmap.h
index 272cddf23367e17759a4493ace64119a9e351595..ae2d11b34b53064e0491c4b1e2ac0a6a12b00559 100644
--- a/src/fpga/V2/driver/bitmap.h
+++ b/src/fpga/V2/driver/bitmap.h
@@ -25,7 +25,11 @@ limitations under the License. */
 #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
 #define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
 
+<<<<<<< HEAD
 #define round_down(x, y) ((x) & ((y)-1))
+=======
+#define round_down(x, y) ((x) & ~((y)-1))
+>>>>>>> upstream/develop
 
 namespace fpga_bitmap {
 void bitmap_set(uint64_t *map, unsigned int start, int len);
diff --git a/src/fpga/V2/driver/driver.cpp b/src/fpga/V2/driver/driver.cpp
index ed78fa5ebcc089e136ebc2a79d56874885735879..7bebabf29c984e55683f125538ef8c23cdac048a 100644
--- a/src/fpga/V2/driver/driver.cpp
+++ b/src/fpga/V2/driver/driver.cpp
@@ -17,6 +17,10 @@ limitations under the License. */
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+<<<<<<< HEAD
+=======
+#include <sys/ioctl.h>
+>>>>>>> upstream/develop
 #include <sys/mman.h>
 #include <unistd.h>
 #include <algorithm>
@@ -32,6 +36,10 @@ limitations under the License. */
 
 namespace paddle_mobile {
 namespace fpga {
+<<<<<<< HEAD
+=======
+namespace driver {
+>>>>>>> upstream/develop
 struct FPGA_INFO g_fpgainfo;
 
 int open_drvdevice() {
@@ -43,7 +51,12 @@ int open_drvdevice() {
 
 int open_memdevice() {
   if (g_fpgainfo.fd_mem == -1) {
+<<<<<<< HEAD
     g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
+=======
+    // g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
+    g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR);
+>>>>>>> upstream/develop
   }
   return g_fpgainfo.fd_mem;
 }
@@ -51,7 +64,10 @@ int open_memdevice() {
 void pl_reset() {
   // DLOG << "PL RESET";
 
+<<<<<<< HEAD
   // reg_writeq(0x5a, REG_FPGA_RESET);
+=======
+>>>>>>> upstream/develop
   usleep(100 * 1000);
 }
 
@@ -131,7 +147,11 @@ int pl_get_status() { return 0; }
 int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
   uint64_t i = 0;
   /*timeout精确性待确认*/
+<<<<<<< HEAD
   int64_t timeout = time * CPU_FREQ / 1000000;
+=======
+  int64_t timeout = time * 6;
+>>>>>>> upstream/develop
 
   for (i = 0; i < timeout; i++) {
     if (val == reg_readq(reg)) {
@@ -173,9 +193,20 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
 }
 
 void memory_release(struct fpga_memory *memory) {
+<<<<<<< HEAD
   pthread_mutex_lock(&memory->mutex);
   fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
   pthread_mutex_unlock(&memory->mutex);
+=======
+  void *ptr = nullptr;
+
+  /*unmap memory*/
+  std::map<void *, size_t> map = g_fpgainfo.fpga_addr2size_map;
+  std::map<void *, size_t>::iterator iter;
+  for (iter = map.begin(); iter != map.end(); iter++) {
+    fpga_free_driver(ptr);
+  }
+>>>>>>> upstream/develop
 }
 
 int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
@@ -238,7 +269,10 @@ int init_fpga_memory(struct fpga_memory *memory) {
     return rc;
   }
 
+<<<<<<< HEAD
   // spin_lock_init(&memory->spin);
+=======
+>>>>>>> upstream/develop
   fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
   fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);  // NOTE reserve fpga page 0.
 
@@ -293,9 +327,29 @@ void *fpga_reg_malloc(size_t size) {
   return ret;
 }
 
+<<<<<<< HEAD
+void *fpga_malloc_driver(size_t size) {
+  void *ret = nullptr;
+  uint64_t phy_addr = 0;
+=======
+void *fpga_reg_free(void *ptr) {
+  size_t size = 0;
+
+  auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
+  if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
+    size = iter->second;
+    g_fpgainfo.fpga_addr2size_map.erase(iter);
+    munmap(ptr, size);
+  } else {
+    DLOG << "Invalid pointer";
+  }
+}
+
 void *fpga_malloc_driver(size_t size) {
   void *ret = nullptr;
   uint64_t phy_addr = 0;
+  int i = 0;
+>>>>>>> upstream/develop
 
   memory_request(g_fpgainfo.memory_info, size, &phy_addr);
 
@@ -311,17 +365,79 @@ void *fpga_malloc_driver(size_t size) {
 
 void fpga_free_driver(void *ptr) {
   size_t size = 0;
+<<<<<<< HEAD
+=======
+  uint32_t pos = 0;
+  uint64_t p_addr = 0;
+>>>>>>> upstream/develop
 
   auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
   if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
     size = iter->second;
     g_fpgainfo.fpga_addr2size_map.erase(iter);
     munmap(ptr, size);
+<<<<<<< HEAD
+=======
+
+    p_addr = vaddr_to_paddr(ptr);
+    pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
+
+    /*clear bitmap*/
+    pthread_mutex_lock(&g_fpgainfo.memory_info->mutex);
+    fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
+                              g_fpgainfo.memory_info->nr[pos]);
+    pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
+>>>>>>> upstream/develop
   } else {
     DLOG << "Invalid pointer";
   }
 }
 
+<<<<<<< HEAD
+=======
+static inline int do_ioctl(unsigned long req, const void *arg) {
+  return ioctl(g_fpgainfo.fd_mem, req, arg);
+}
+
+int fpga_flush_driver(void *address, size_t size) {
+  struct MemoryCacheArgs args;
+  uint64_t p_addr;
+
+  p_addr = vaddr_to_paddr(address);
+
+  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
+  args.size = size;
+
+  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
+}
+
+int fpga_invalidate_driver(void *address, size_t size) {
+  struct MemoryCacheArgs args;
+  uint64_t p_addr;
+
+  p_addr = vaddr_to_paddr(address);
+
+  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
+  args.size = size;
+
+  return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
+}
+
+void fpga_copy_driver(void *dest, const void *src, size_t num) {
+  uint64_t i;
+
+  DLOG << "dest:" << dest << " src:" << src << " size:" << num;
+
+  for (i = 0; i < num; i++) {
+    // DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
+    // usleep(1);
+    *((int8_t *)dest + i) = *((int8_t *)src + i);
+  }
+
+  return;
+}
+
+>>>>>>> upstream/develop
 int open_device_driver() {
   g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR;
   g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR;
@@ -347,12 +463,20 @@ int open_device_driver() {
 
 int close_device_driver() {
   pl_destroy();
+<<<<<<< HEAD
   fpga_free_driver(g_fpgainfo.FpgaRegVirAddr);
+=======
+  fpga_reg_free(g_fpgainfo.FpgaRegVirAddr);
+>>>>>>> upstream/develop
   memory_release(g_fpgainfo.memory_info);
   destroy_fpga_memory(g_fpgainfo.memory_info);
 
   return 0;
 }
 
+<<<<<<< HEAD
+=======
+}  // namespace driver
+>>>>>>> upstream/develop
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/driver/driver.h b/src/fpga/V2/driver/driver.h
index ee01454ac593e7b5a146a8fac4f81a957c2b1e95..018229882fc45fe29dc4ad23139a854249e0d107 100644
--- a/src/fpga/V2/driver/driver.h
+++ b/src/fpga/V2/driver/driver.h
@@ -24,6 +24,10 @@ limitations under the License. */
 
 namespace paddle_mobile {
 namespace fpga {
+<<<<<<< HEAD
+=======
+namespace driver {
+>>>>>>> upstream/develop
 
 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
 
@@ -47,6 +51,18 @@ const int PE_IDX_BYPASS = 3;
 
 enum pe_status { IDLE = 0, BUSY = 1 };
 
+<<<<<<< HEAD
+=======
+struct MemoryCacheArgs {
+  void *offset;
+  size_t size;
+};
+
+#define IOCTL_FPGA_MAGIC 'FPGA'
+#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
+#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
+
+>>>>>>> upstream/develop
 struct fpga_pe {
   char type_name[MAX_TYPE_NAME_LENTH + 1];
   struct pe_data_s *outer;
@@ -95,19 +111,30 @@ extern struct FPGA_INFO g_fpgainfo;
 
 inline uint64_t reg_readq(uint32_t offset) {
   // DLOG << "offset : " << offset;
+<<<<<<< HEAD
   uint64_t value =
       *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset);  // NOLINT
+=======
+  uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
+                                          offset);  // NOLINT
+>>>>>>> upstream/develop
 
   return value;
 }
 
 inline void reg_writeq(uint64_t value, uint32_t offset) {
   // DLOG << "offset : " << offset << ", value : " << value;
+<<<<<<< HEAD
   *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset) =  // NOLINT
+=======
+  *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
+                         offset) =  // NOLINT
+>>>>>>> upstream/develop
       value;
 }
 
 int open_device_driver();
+<<<<<<< HEAD
 int close_device_driver();
 void *fpga_malloc_driver(size_t size);
 void fpga_free_driver(void *ptr);
@@ -116,5 +143,27 @@ void fpga_free_driver(void *ptr);
 uint64_t vaddr_to_paddr(void *address);
 int fpga_regpoll(uint64_t reg, uint64_t val, int time);
 
+=======
+
+int close_device_driver();
+
+void *fpga_malloc_driver(size_t size);
+
+void fpga_free_driver(void *ptr);
+
+void fpga_copy_driver(void *dest, const void *src, size_t num);
+
+int fpga_flush_driver(void *address, size_t size);
+
+int fpga_invalidate_driver(void *address, size_t size);
+
+/*pe*/
+
+uint64_t vaddr_to_paddr(void *address);
+
+int fpga_regpoll(uint64_t reg, uint64_t val, int time);
+
+}  // namespace driver
+>>>>>>> upstream/develop
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/driver/pe.cpp
index 52cde04601bc5e002ce2d8e15b3bdb1ce64b340a..741db868820c2e520f731aec5f46153a6c562841 100644
--- a/src/fpga/V2/driver/pe.cpp
+++ b/src/fpga/V2/driver/pe.cpp
@@ -20,29 +20,49 @@ limitations under the License. */
 
 namespace paddle_mobile {
 namespace fpga {
+<<<<<<< HEAD
 #define MUL8(x) (x * 8)
+=======
+#define MUL8(x) ((x)*8)
+>>>>>>> upstream/develop
 #define BYPASS_DONE 1
 
 float Findfp16Max() {
   uint16_t abs_vals[16];
   uint64_t max_fp16;
 
+<<<<<<< HEAD
   max_fp16 = reg_readq(MUL8(49));
+=======
+  max_fp16 = driver::reg_readq(MUL8(49));
+>>>>>>> upstream/develop
   abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
   abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+<<<<<<< HEAD
   max_fp16 = reg_readq(MUL8(50));
+=======
+  max_fp16 = driver::reg_readq(MUL8(50));
+>>>>>>> upstream/develop
   abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
   abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+<<<<<<< HEAD
   max_fp16 = reg_readq(MUL8(51));
+=======
+  max_fp16 = driver::reg_readq(MUL8(51));
+>>>>>>> upstream/develop
   abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16));         // NOLINT
   abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16));   // NOLINT
   abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+<<<<<<< HEAD
   max_fp16 = reg_readq(MUL8(52));
+=======
+  max_fp16 = driver::reg_readq(MUL8(52));
+>>>>>>> upstream/develop
   abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16));
   abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
@@ -58,7 +78,11 @@ float Findfp16Max() {
 }
 
 int ComputeFpgaConv(const struct SplitConvArgs &args) {
+<<<<<<< HEAD
   ComputeBasicConv(args.conv_args[0]);
+=======
+  ComputeBasicConv(args.conv_arg[0]);
+>>>>>>> upstream/develop
 }
 
 int ComputeBasicConv(const struct ConvArgs &args) {
@@ -166,8 +190,13 @@ int PerformBypass(const struct BypassArgs &args) {
   return 0;
 #endif
 
+<<<<<<< HEAD
   uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
   uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
+=======
+  uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
+  uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
+>>>>>>> upstream/develop
   uint64_t bp_enable;
   int64_t length;
   uint64_t pixels;
@@ -196,6 +225,7 @@ int PerformBypass(const struct BypassArgs &args) {
   }
 
   // start bypass
+<<<<<<< HEAD
   reg_writeq(ifm_src_paddr, MUL8(27));
   reg_writeq(ifm_dst_paddr, MUL8(28));
   reg_writeq(0, MUL8(0));
@@ -206,6 +236,18 @@ int PerformBypass(const struct BypassArgs &args) {
   if (ret != -1) {
     // clear "irq"
     reg_readq(MUL8(63));
+=======
+  driver::reg_writeq(ifm_src_paddr, MUL8(27));
+  driver::reg_writeq(ifm_dst_paddr, MUL8(28));
+  driver::reg_writeq(0, MUL8(0));
+  driver::reg_writeq(bp_enable, MUL8(0));
+  // poll
+  int ret = -1;
+  ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
+  if (ret != -1) {
+    // clear "irq"
+    driver::reg_readq(MUL8(63));
+>>>>>>> upstream/develop
   }
   // get max value
   if ((!args.input_data_type) && (!args.output_data_type)) {
diff --git a/src/fpga/V2/driver/pe.h b/src/fpga/V2/driver/pe.h
index 4ec3ccb01db1859d4265484644a1e1704cc836c7..ae9f45fc25264d28677d16565911507905dad2a6 100644
--- a/src/fpga/V2/driver/pe.h
+++ b/src/fpga/V2/driver/pe.h
@@ -12,7 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+<<<<<<< HEAD
 #include "fpga/V2/api.h"
+=======
+
+#include "fpga/V2/fpga_common.h"
+>>>>>>> upstream/develop
 
 namespace paddle_mobile {
 namespace fpga {
diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp
index 39d67b2d2d6213baf674dc0bbc3e96f4f182e3c6..67d3fe5b45efb9e31a441fc773ca4d06e560291e 100644
--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
@@ -94,6 +94,10 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
   convert_to_hwc(data_in, num, channel, height, width);
   align_filter(data_in, num, channel, height, width);
   int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+<<<<<<< HEAD
+=======
+  fpga_flush(*data_in, pixel_num * sizeof(float));
+>>>>>>> upstream/develop
 }
 
 void convert_fc_filter(float **data_in, int num, int chw) {
@@ -113,6 +117,11 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
   int chw = channel * height * width;
   convert_fc_filter(data_in, num, chw);
   align_filter(data_in, num, channel, height, width);
+<<<<<<< HEAD
+=======
+  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  fpga_flush(*data_in, pixel_num * sizeof(float));
+>>>>>>> upstream/develop
 }
 
 float find_max(float *data_in, int data_size) {
diff --git a/src/fpga/V2/fpga_common.cpp b/src/fpga/V2/fpga_common.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..01bca30a9ccf79232e1f28bbf77b1c030632f5bc
--- /dev/null
+++ b/src/fpga/V2/fpga_common.cpp
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fpga/V2/fpga_common.h>
+namespace paddle_mobile {
+namespace fpga {
+
+int16_t fp32_2_fp16(float fp32_num) {
+  unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
+  auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
+                     (((tmp & 0x7f800000) >> 13) - (112 << 10)));
+  if (tmp & 0x1000) {
+    t++;  // roundoff
+  }
+  return t;
+}
+
+float fp16_2_fp32(int16_t fp16_num) {
+  if (0 == fp16_num) {
+    return 0;
+  }
+  int frac = (fp16_num & 0x3ff);
+  int exp = ((fp16_num & 0x7c00) >> 10) + 112;
+  int s = fp16_num & 0x8000;
+  int tmp = 0;
+  float fp32_num;
+  tmp = s << 16 | exp << 23 | frac << 13;
+  fp32_num = *(float *)&tmp;  // NOLINT
+  return fp32_num;
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/fpga_common.h b/src/fpga/V2/fpga_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..1862d843503ee8faf58caf038202e198ca079905
--- /dev/null
+++ b/src/fpga/V2/fpga_common.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+
+namespace paddle_mobile {
+namespace fpga {
+
+enum DataType {
+  DATA_TYPE_FP32 = 1,
+  DATA_TYPE_FP16 = 0,
+};
+
+enum LayoutType {
+  LAYOUT_CHW = 1,
+  LAYOUT_HWC = 0,
+};
+
+struct KernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_w;
+  uint32_t stride_h;
+};
+
+struct ImageInputArgs {
+  void* address;         // input featuremap virtual address
+  float* scale_address;  // input scale address;
+  uint32_t channels;
+  uint32_t width;  // featuremap width
+  uint32_t height;
+  uint32_t pad_width;  // padding width;
+  uint32_t pad_height;
+};
+
+struct ImageOutputArgs {
+  void* address;         // output result address;
+  float* scale_address;  // output scale address;
+  uint64_t timer_cnt;    // time counter for FPGA computation
+};
+
+struct ConvArgs {
+  bool relu_enabled;
+  void* sb_address;  // scale and bias
+  void* filter_address;
+  float* filter_scale_address;
+  void* free_space;  // used by FPGA logic
+  uint32_t filter_num;
+  uint32_t group_num;
+
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+
+struct ConcatArgs {
+  uint32_t image_num;
+  int16_t** images_in;
+  float** scales_in;
+  void* image_out;
+  float* scale_out;
+  uint32_t* channel_num;
+  uint32_t* aligned_channel_num;
+  uint32_t out_channel;
+  uint32_t height;
+  uint32_t width;
+};
+
+struct SplitConvArgs {
+  uint32_t split_num;
+  uint32_t group_num;
+  uint32_t filter_num;
+  struct ImageOutputArgs output;
+  struct ConvArgs* conv_arg;
+  struct ConcatArgs concat_arg;
+};
+
+struct PoolingArgs {
+  int16_t mode;  // mode: 0:max, 1:avg
+  int16_t kernel_reciprocal;
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+
+struct EWAddArgs {
+  bool relu_enabled;
+  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
+  uint32_t const1;
+  struct ImageInputArgs image0;
+  struct ImageInputArgs image1;
+  struct ImageOutputArgs output;
+};
+
+struct BypassArgs {
+  enum DataType input_data_type;
+  enum DataType output_data_type;
+  enum LayoutType input_layout_type;
+  enum LayoutType output_layout_type;
+  struct ImageInputArgs image;
+  struct ImageOutputArgs output;
+};
+
+struct DeconvArgs {
+  struct ConvArgs conv_arg;
+};
+static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
+int16_t fp32_2_fp16(float fp32_num);
+float fp16_2_fp32(int16_t fp16_num);
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp
index 4ce76cd00fb72cc1292efa5be6cc0d0fe7d93107..ac9d7cbb3b246adf18138652e9761f56b9923163 100644
--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
@@ -58,6 +58,10 @@ void format_image(float **data_in, int channel, int height, int width,
                   int aligned_channel) {
   convert_to_hwc(data_in, channel, height, width);
   align_image(data_in, channel, height, width, aligned_channel);
+<<<<<<< HEAD
+=======
+  fpga_flush(*data_in, aligned_channel * height * width * sizeof(float));
+>>>>>>> upstream/develop
 }
 
 void concat_images(int16_t **images_in, float **scales_in, void *image_out,
@@ -69,6 +73,11 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
   scale_out[1] = 0.0;
   for (int i = 0; i < image_num; i++) {
     scale_out[0] = std::max(*scale_out, scales_in[i][0]);
+<<<<<<< HEAD
+=======
+    fpga_invalidate(images_in[i],
+                    height * width * aligned_channel_num[i] * sizeof(int16_t));
+>>>>>>> upstream/develop
   }
   scale_out[1] = 1 / scale_out[0];
 
@@ -83,6 +92,10 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
       tmp_channel_sum += channel_num[i];
     }
   }
+<<<<<<< HEAD
+=======
+  fpga_flush(image_out, hw * out_channel * sizeof(int16_t));
+>>>>>>> upstream/develop
 }
 
 }  // namespace image
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 0ed3a5d32385963c67d898defc58ab019a09c156..dede260802acd2b46e932b46e3fc038cb13fd386 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -26,6 +26,10 @@ limitations under the License. */
 #include "framework/program/var_desc.h"
 #include "framework/scope.h"
 #include "framework/tensor.h"
+<<<<<<< HEAD:src/framework/executor.cpp
+=======
+#include "memory/t_malloc.h"
+>>>>>>> upstream/develop:src/framework/executor.cpp
 
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
 #include <queue>
@@ -86,8 +90,10 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
   }
   std::shared_ptr<framework::BlockDesc> to_predict_block =
       to_predict_program_->Block(0);
+  int i = 0;
   auto &ops = ops_of_block_[*to_predict_block.get()];
   for (const auto &op : ops) {
+    DLOG << "Initialize op[" << i++ << "]: " << op->Type();
     op->Init();
   }
 }
@@ -102,8 +108,8 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
     // should be moved into operator init function
     float min_value;
     float max_value;
-    memcpy(&min_value, data_buf, sizeof(float));
-    memcpy(&max_value, data_buf + sizeof(float), sizeof(float));
+    memory::Copy(&min_value, data_buf, sizeof(float));
+    memory::Copy(&max_value, data_buf + sizeof(float), sizeof(float));
     data_buf += 2 * sizeof(float);
     const float factor = (max_value - min_value) / 255.0;
     const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(data_buf);
@@ -112,7 +118,7 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
     }
     data_buf += size * sizeof(uint8_t);
   } else {
-    memcpy(tensor_data, *data_buf, size * sizeof(Dtype));
+    memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
     *data_buf += size * sizeof(Dtype);
   }
 }
@@ -128,7 +134,7 @@ void Executor<Dtype, P>::LoadMemory(
   // lod information
   // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
   uint64_t lod_level = 0;
-  memcpy(&lod_level, *data_buf, sizeof(uint64_t));
+  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
   *data_buf += sizeof(uint64_t);
 
   auto *lod = tensor->mutable_lod();
@@ -137,7 +143,7 @@ void Executor<Dtype, P>::LoadMemory(
     uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
     *data_buf += sizeof(uint64_t);
     std::vector<size_t> tmp_dim(size / sizeof(size_t));
-    memcpy(tmp_dim.data(), *data_buf, size);
+    memory::Copy(tmp_dim.data(), *data_buf, size);
     (*lod)[i] = std::move(tmp_dim);
     *data_buf += size;
   }
diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc
index 144cf127a44c78279ca1d95815646a4f01fed6bd..8088f0b8c9f600ce2422af500ab66a68e1341fc8 100644
--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -52,7 +52,6 @@ bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
   paddle_mobile_->SetThreadNum(config.thread_num);
   return true;
 }
-
 template <typename Dtype, Precision P>
 bool PaddleMobilePredictor<Dtype, P>::Run(
     const std::vector<PaddleTensor> &inputs,
diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h
index 3c9ffa00c7e749d1c9d77562b2db0b42ee605164..5326f864a4b5238c8498ee1fe9e5810ca0a657cf 100644
--- a/src/io/paddle_inference_api.h
+++ b/src/io/paddle_inference_api.h
@@ -98,7 +98,6 @@ class PaddlePredictor {
   virtual bool Run(const std::vector<PaddleTensor>& inputs,
                    std::vector<PaddleTensor>* output_data,
                    int batch_size = -1) = 0;
-
   // Destroy the Predictor.
   virtual ~PaddlePredictor() = default;
 
diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp
index cfd1a1c87671cfb598aad586b421f046830b10d9..5d631990cbc269b3c7ad54e9d85fa9a978a4560e 100644
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "io/paddle_mobile.h"
-
+#ifdef PADDLE_MOBILE_CL
+#include <CL/cl.h>
+#include "framework/cl/cl_tensor.h"
+#endif
+#include "common/common.h"
+#include "operators/math/gemm.h"
 namespace paddle_mobile {
 
 static std::mutex lc;
@@ -119,6 +124,40 @@ void PaddleMobile<Dtype, P>::Clear() {
   loader_ = nullptr;
 }
 
+template <typename Dtype, Precision P>
+double PaddleMobile<Dtype, P>::GetPredictTime() {
+  int m = 32;
+  int n = 224 * 224;
+  int k = 27;
+  int lda = k;
+  int ldb = n;
+  int ldc = n;
+  float *a =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * k));
+  float *b =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * k * n));
+  float *c =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
+  int t1 = 1;
+  int t2 = 1;
+  for (int i = 0; i < m * k; ++i) {
+    a[i] = t1 + rand() % t2;
+  }
+  for (int i = 0; i < k * n; ++i) {
+    b[i] = t1 + rand() % t2;
+  }
+  paddle_mobile::operators::math::Gemm gemm;
+  auto time1 = paddle_mobile::time();
+  //  gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
+  //             static_cast<float>(0), c, ldc, false, nullptr);
+  auto time2 = paddle_mobile::time();
+  double cost = paddle_mobile::time_diff(time1, time2);
+  paddle_mobile::memory::Free(a);
+  paddle_mobile::memory::Free(b);
+  paddle_mobile::memory::Free(c);
+  return cost;
+}
+
 template <typename Dtype, Precision P>
 PaddleMobile<Dtype, P>::~PaddleMobile() {
   executor_ = nullptr;
@@ -160,6 +199,10 @@ void PaddleMobile<Dtype, P>::Predict_To(int end) {
 #endif
 
 #ifdef PADDLE_MOBILE_CL
+<<<<<<< HEAD
+=======
+static std::mutex lc;
+>>>>>>> upstream/develop
 template <typename Dtype, Precision P>
 void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
   std::lock_guard<std::mutex> lock(lc);
@@ -167,6 +210,211 @@ void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
     framework::CLEngine::Instance()->setClPath(path);
   }
 }
+<<<<<<< HEAD
+=======
+template <>
+double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
+  cl_int status;
+  cl_uint nPlatform;
+  clGetPlatformIDs(0, NULL, &nPlatform);
+  cl_platform_id *listPlatform =
+      (cl_platform_id *)malloc(nPlatform * sizeof(cl_platform_id));
+  clGetPlatformIDs(nPlatform, listPlatform, NULL);
+  cl_uint nDevice = 0;
+  clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
+  cl_device_id *listDevice =
+      (cl_device_id *)malloc(nDevice * sizeof(cl_device_id));
+  clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, nDevice, listDevice,
+                 NULL);
+  cl_context context =
+      clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
+  cl_command_queue queue =
+      clCreateCommandQueue(context, listDevice[0], 0, &status);
+
+  int n = 1;
+  int c = 3;
+  int h = 224;
+  int w = 224;
+  float *input = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * 3 * 224 * 224));
+  float *filter = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * 32 * 27));
+  int input_w = w * (c + 3) / 4;
+  int input_h = n * h;
+  int filter_w = 3 * (3 + 3) / 4;
+  int filter_h = 32 * 3;
+  int output_w = 224 * (32 + 3) / 4;
+  int output_h = 1 * 224;
+
+  framework::DDim input_dims = {1, 3, 224, 224};
+  framework::CLTensor input_cl_tensor(context, queue);
+  input_cl_tensor.Resize(input_dims);
+  cl_mem inputBuffer = input_cl_tensor.mutable_with_data<float>(input);
+
+  framework::DDim filter_dims = {32, 3, 3, 3};
+  framework::CLTensor filter_cl_tensor(context, queue);
+  input_cl_tensor.Resize(filter_dims);
+  cl_mem filterBuffer = filter_cl_tensor.mutable_with_data<float>(filter);
+
+  cl_mem cl_filter_image = NULL;
+  cl_mem cl_input_image = NULL;
+  cl_mem cl_output_image = NULL;
+  cl_image_format cf = {.image_channel_order = CL_RGBA,
+                        .image_channel_data_type = CL_HALF_FLOAT};
+  cl_input_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, input_w,
+                                   input_h, 0, NULL, &status);
+  cl_filter_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf,
+                                    filter_w, filter_h, 0, NULL, &status);
+  cl_output_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf,
+                                    output_w, output_h, 0, NULL, &status);
+  char *code;
+  std::string path = framework::CLEngine::Instance()->GetCLPath() +
+                     "/cl_kernel/feed_kernel.cl";
+  size_t length = readText(path.c_str(), &code);
+  cl_program program = clCreateProgramWithSource(
+      context, 1, (const char **)&code, &length, NULL);
+  std::string path1 = "-cl-fast-relaxed-math -I " +
+                      framework::CLEngine::Instance()->GetCLPath() +
+                      "/cl_kernel";
+  clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL);
+  cl_kernel kernel = clCreateKernel(program, "feed", &status);
+
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(cl_int), &input_w);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_int), &input_h);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c);
+  CL_CHECK_ERRORS(status);
+
+  size_t global_work_size[2] = {input_w, input_h};
+
+  //  cl_event out_event = param.Out()->GetClEvent();
+
+  status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size,
+                                  NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &filterBuffer);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_filter_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(cl_int), &filter_w);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_int), &filter_h);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c);
+  CL_CHECK_ERRORS(status);
+
+  size_t global_work_size1[2] = {filter_w, filter_h};
+
+  //  cl_event out_event = param.Out()->GetClEvent();
+
+  status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size1,
+                                  NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+
+  clFinish(queue);
+  queue = clCreateCommandQueue(context, listDevice[0], 0, &status);
+
+  path = framework::CLEngine::Instance()->GetCLPath() +
+         "/cl_kernel/conv_kernel.cl";
+  size_t length1 = readText(path.c_str(), &code);
+  program = clCreateProgramWithSource(context, 1, (const char **)&code,
+                                      &length1, &status);
+  CL_CHECK_ERRORS(status);
+  clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL);
+  kernel = clCreateKernel(program, "conv_3x3", &status);
+  CL_CHECK_ERRORS(status);
+
+  int c_block = (32 + 3) / 4;
+  int nh = n * h;
+  int stride = 1;
+  int offset = 0;
+  int input_c = (c + 3) / 4;
+  int dilation = 1;
+  int input_width = 224;
+  int input_height = 224;
+  int output_width = 224;
+  int output_height = 224;
+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &cl_input_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &cl_filter_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &cl_output_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+
+  //  cl_event out_event = param.Output()->GetClEvent();
+  //  cl_event wait_event = param.Input()->GetClEvent();
+  size_t global_work_size2[3] = {8, 224, 224};
+  auto time1 = paddle_mobile::time();
+  status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size2,
+                                  NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+  clFinish(queue);
+  auto time2 = paddle_mobile::time();
+  paddle_mobile::memory::Free(input);
+  paddle_mobile::memory::Free(filter);
+  return paddle_mobile::time_diff(time1, time2);
+}
+template <typename Dtype, Precision P>
+int PaddleMobile<Dtype, P>::readText(
+    const char *kernelPath,
+    char **pcode)  // 读取文本文件放入 pcode，返回字符串长度
+{
+  FILE *fp;
+  int size;
+  // printf("<readText> File: %s\n", kernelPath);
+  fp = fopen(kernelPath, "rb");
+  if (!fp) {
+    printf("<readText> Open file failed\n");
+    return -1;
+  }
+  if (fseek(fp, 0, SEEK_END) != 0) {
+    printf("<readText> Seek end of file failed\n");
+    return -1;
+  }
+  if ((size = ftell(fp)) < 0) {
+    printf("<readText> Get file position failed\n");
+    return -1;
+  }
+  rewind(fp);
+  if ((*pcode = (char *)malloc(size + 1)) == NULL) {
+    printf("<readText> Allocate space failed\n");
+    return -1;
+  }
+  fread(*pcode, 1, size, fp);
+  (*pcode)[size] = '\0';
+  fclose(fp);
+  return size + 1;
+}
+
+>>>>>>> upstream/develop
 #endif
 
 template class PaddleMobile<CPU, Precision::FP32>;
diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h
index 778b173f3e64f27f6bdf8329a2979ebbdf955633..1972f5f4430f0a0f363e16316758174f54312d71 100644
--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -65,6 +65,7 @@ class PaddleMobile {
 
   void SetThreadNum(int num);
   void Clear();
+  double GetPredictTime();
 
   ~PaddleMobile();
 
@@ -80,6 +81,11 @@ class PaddleMobile {
 #ifdef PADDLE_MOBILE_CL
  public:
   void SetCLPath(std::string cl_path);
+<<<<<<< HEAD
+=======
+  int readText(const char *kernelPath,
+               char **pcode);  // 读取文本文件放入 pcode，返回字符串长度
+>>>>>>> upstream/develop
 #endif
 
  private:
diff --git a/src/io/paddle_test_inference_api.cpp b/src/io/paddle_test_inference_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..97410ff32e31298bfd35abcc7dfc8cef61fe017a
--- /dev/null
+++ b/src/io/paddle_test_inference_api.cpp
@@ -0,0 +1,35 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "io/paddle_test_inference_api.h"
+#include "io/paddle_mobile.h"
+namespace paddle_mobile {
+template <typename Dtype, Precision P>
+double PaddleTester<Dtype, P>::CaculatePredictTime(std::string *cl_path) {
+  PaddleMobile<Dtype, P> paddle_mobile;
+#ifdef PADDLE_MOBILE_CL
+  if (cl_path) {
+    paddle_mobile.SetCLPath(*cl_path);
+  }
+
+#endif
+  return paddle_mobile.GetPredictTime();
+}
+template class PaddleTester<CPU, Precision::FP32>;
+template class PaddleTester<FPGA, Precision::FP32>;
+template class PaddleTester<GPU_MALI, Precision::FP32>;
+
+template class PaddleTester<GPU_CL, Precision::FP32>;
+
+}  // namespace paddle_mobile
diff --git a/src/io/paddle_test_inference_api.h b/src/io/paddle_test_inference_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..b203bac43d17cafd7655911df5a5116b215413bd
--- /dev/null
+++ b/src/io/paddle_test_inference_api.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains the definition of a simple Inference API for Paddle.
+ *
+ * ATTENTION: It requires some C++ features, for lower version C++ or C, we
+ * might release another API.
+ */
+
+#pragma once
+#include "common/types.h"
+#include "string"
+namespace paddle_mobile {
+template <typename Dtype, Precision P = Precision::FP32>
+class PaddleTester {
+ public:
+  double CaculatePredictTime(std::string *cl_path = nullptr);
+};
+
+}  // namespace paddle_mobile
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 129f82a19d6f5c2a25174863a007a36b96af29ad..2fb74d18809f174810866a990396bb0279d256f5 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -32,7 +32,7 @@ const int MALLOC_ALIGN = 64;
 namespace fpga = paddle_mobile::fpga;
 
 void Copy(void *dst, const void *src, size_t num) {
-  std::memcpy(dst, src, num);
+  fpga::fpga_copy(dst, src, num);
 }
 
 void *Alloc(size_t size) { return fpga::fpga_malloc(size); }
diff --git a/src/operators/conv_transpose_op.cpp b/src/operators/conv_transpose_op.cpp
index 4d9eefaa85be51c9c2409ca044a6da4874566e1c..d09a7937453f3bd2c20d9e6bc1a03d4375d57491 100644
--- a/src/operators/conv_transpose_op.cpp
+++ b/src/operators/conv_transpose_op.cpp
@@ -27,6 +27,7 @@ REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose);
 #ifdef PADDLE_MOBILE_MALI_GPU
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(conv2d_transpose, ops::ConvOpTranspose);
 #endif
 
 #endif
diff --git a/src/operators/fusion_deconv_relu_op.cpp b/src/operators/fusion_deconv_relu_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..daae39c951b34fa05962f936c28381f7d5d4e15c
--- /dev/null
+++ b/src/operators/fusion_deconv_relu_op.cpp
@@ -0,0 +1,32 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVRELU_OP
+
+#include "operators/fusion_deconv_relu_op.h"
+
+namespace paddle_mobile {
+namespace operators {}
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_deconv_relu, ops::FusionDeconvReluOp);
+#endif
+
+#endif
diff --git a/src/operators/fusion_deconv_relu_op.h b/src/operators/fusion_deconv_relu_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e87d5d3798930d745b82c8e5a3cca793c12ee4b1
--- /dev/null
+++ b/src/operators/fusion_deconv_relu_op.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVRELU_OP
+#pragma once
+#include <string>
+#include <vector>
+
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/deconv_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusionDeconvReluMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionDeconvReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(), {}, removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_FC_RELU; }
+};
+
+template <typename DeviceType, typename T>
+class FusionDeconvReluOp : public framework::OperatorWithKernel<
+                               DeviceType, FusionDeconvReluParam<DeviceType>,
+                               operators::DeconvReluKernel<DeviceType, T>> {
+ public:
+  FusionDeconvReluOp(const string &type, const VariableNameMap &inputs,
+                     const VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs,
+                     std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionDeconvReluParam<DeviceType>,
+            operators::DeconvReluKernel<DeviceType, T>>(type, inputs, outputs,
+                                                        attrs, scope) {}
+
+  void InferShape() const {
+    auto input = this->param_.Input();
+    auto in_dims = input->dims();
+
+    auto filter = this->param_.Filter();
+    auto filter_dims = filter->dims();
+
+    std::vector<int> strides = this->param_.Strides();
+    std::vector<int> paddings = this->param_.Paddings();
+    std::vector<int> dilations = this->param_.Dilations();
+
+    int groups = this->param_.Groups();
+
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == 4 || in_dims.size() == 5,
+        "ConvTransposeOp intput should be 4-D or 5-D tensor.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == filter_dims.size(),
+        "ConvTransposeOp input dimension and filter dimension "
+        "should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() - strides.size() == 2U,
+        "ConvTransposeOp input dimension and strides dimension should "
+        "be consistent.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
+                          "ConvTransposeOp paddings dimension and strides "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
+                          "ConvTransposeOp paddings dimension and dilations "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims[1] == filter_dims[0],
+        "In ConvTransposeOp, The number of input channels should "
+        "be equal to the number of filter's channels.");
+
+    std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
+    for (size_t i = 0; i < strides.size(); ++i) {
+      auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
+      output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
+                             2 * paddings[i] + filter_extent);
+    }
+    this->param_.Output()->Resize(framework::make_ddim(output_shape));
+  }
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // FUSION_FC_RELU_OP
diff --git a/src/operators/kernel/arm/fetch_kernel.cpp b/src/operators/kernel/arm/fetch_kernel.cpp
index 62d0e678891e4f54471f95de08242a3e72f7a385..72aca9002a777c187179913f0ae7ba2c342d1422 100644
--- a/src/operators/kernel/arm/fetch_kernel.cpp
+++ b/src/operators/kernel/arm/fetch_kernel.cpp
@@ -8,7 +8,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+<<<<<<< HEAD
 #ifdef FUSION_CONVADD_OP
+=======
+>>>>>>> upstream/develop
 #include "operators/kernel/fetch_kernel.h"
 namespace paddle_mobile {
 namespace operators {
@@ -23,4 +26,7 @@ void FetchKernel<CPU, float>::Compute(const FetchParam<CPU> &param) {
 template class FetchKernel<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
+<<<<<<< HEAD
 #endif
+=======
+>>>>>>> upstream/develop
diff --git a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
index 80d741d859af633299120bfec9f4cfeeaeb47194..0d8ca183aebeecc68a5162d798f8d6a81d62f325 100644
--- a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
@@ -13,14 +13,31 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+<<<<<<< HEAD
 __kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w)
+=======
+__kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w,int c)
+>>>>>>> upstream/develop
  {
         int i = get_global_id(0);
         int j = get_global_id(1);
         half4 pixel;
         pixel.x = convert_half(in[(i * w + j)]);
+<<<<<<< HEAD
         pixel.y = convert_half(in[h * w + (i * w + j)]);
         pixel.z = convert_half(in[2 * h * w + (i * w + j)]);
+=======
+        if(c>=2){
+        pixel.y = convert_half(in[h * w + (i * w + j)]);
+        }else{
+        pixel.y = 0.0;
+        }
+        if(c>=3){
+        pixel.z = convert_half(in[2 * h * w + (i * w + j)]);
+        }else{
+         pixel.z = 0.0;
+        }
+>>>>>>> upstream/develop
         pixel.w = 0.0;
         int2 coords;
         coords.x = j;
diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp
index ad5fb9cdbcd00dad56579297c010c3912e3dca24..68fffebd7047fd5dee5151790a12c26f7dd74daf 100644
--- a/src/operators/kernel/cl/feed_kernel.cpp
+++ b/src/operators/kernel/cl/feed_kernel.cpp
@@ -34,6 +34,10 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
   const float *input_data = input->data<float>();
   int numel = input->numel();
   cl_mem cl_image = output->GetCLImage();
+<<<<<<< HEAD
+=======
+  int c = input->dims()[1];
+>>>>>>> upstream/develop
   int height = output->dims()[2];
   int width = output->dims()[3];
   CLTensor input_cl_tensor(this->cl_helper_.CLContext(),
@@ -49,6 +53,11 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
   CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 3, sizeof(cl_int), &height);
   CL_CHECK_ERRORS(status);
+<<<<<<< HEAD
+=======
+  status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c);
+  CL_CHECK_ERRORS(status);
+>>>>>>> upstream/develop
 
   size_t global_work_size[2] = {width, height};
 
diff --git a/src/operators/kernel/deconv_relu_kernel.h b/src/operators/kernel/deconv_relu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc85f1ffee19abe3941bd9d90fb8dfd04280ce14
--- /dev/null
+++ b/src/operators/kernel/deconv_relu_kernel.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVRELU_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class DeconvReluKernel
+    : public OpKernelBase<DeviceType, FusionDeconvReluParam<DeviceType>> {
+ public:
+  void Compute(const FusionDeconvReluParam<DeviceType> &param);
+
+  bool Init(FusionDeconvReluParam<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3284ddcdece3ab7fcf4fb4458a59d39c452ad1ce
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONV_TRANSPOSE_OP
+
+#include "operators/kernel/conv_transpose_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
+  return true;
+}
+
+template <>
+void ConvTransposeKernel<FPGA, float>::Compute(
+    const ConvTransposeParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bf3556609a4ec2476521a9b8e80192f71aef4f52
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVRELU_OP
+
+#include "operators/kernel/deconv_relu_kernel.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DeconvReluKernel<FPGA, float>::Init(FusionDeconvReluParam<FPGA> *param) {
+  return true;
+}
+
+template <>
+void DeconvReluKernel<FPGA, float>::Compute(
+    const FusionDeconvReluParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/slice_kernel.cpp b/src/operators/kernel/fpga/V2/slice_kernel.cpp
index b0df0cb65d44fe864c0e135c582b418826b9e00d..e4f5d3bc6b27a31ddc7537f69ca225338cf7c025 100644
--- a/src/operators/kernel/fpga/V2/slice_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp
@@ -24,6 +24,10 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
 }
 template <>
 void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
+<<<<<<< HEAD
+=======
+
+>>>>>>> upstream/develop
 }  // namespace operators
 }  // namespace paddle_mobile
 #endif
diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
index 5cfccf8779bfb1839f1bfe70dade765a975bf982..dcca441875613d64d172dffb4bf56ddcceef7b27 100644
--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
@@ -49,7 +49,16 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   Tensor *out = param.Out();
 
   fpga::PerformBypass(param.FpgaArgs());
+<<<<<<< HEAD
   math::SoftmaxFuntor<CPU, float>()(in_x, out);
+=======
+  fpga::fpga_invalidate(
+      (void *)in_x->data<float>(),                           // NOLINT
+      fpga::get_aligned_channel_num((int)in_x->dims()[1]) *  // NOLINT
+          sizeof(float));
+  math::SoftmaxFuntor<CPU, float>()(in_x, out);
+  fpga::fpga_flush(out->data<float>(), out->memory_size());
+>>>>>>> upstream/develop
 }
 
 }  // namespace operators
diff --git a/src/operators/kernel/fpga/V2/tanh_kernel.cpp b/src/operators/kernel/fpga/V2/tanh_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..46dd3a0f6f8819f6485243a445725554943ab2bf
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/tanh_kernel.cpp
@@ -0,0 +1,33 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TANH_OP
+
+#include "operators/kernel/tanh_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
+  return true;
+}
+
+template <>
+void TanhKernel<FPGA, float>::Compute(const TanhParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/tanh_kernel.h b/src/operators/kernel/tanh_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..035f64f840b0aae8970f1aa284054a7984fc7ed6
--- /dev/null
+++ b/src/operators/kernel/tanh_kernel.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef TANH_OP
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class TanhKernel : public OpKernelBase<DeviceType, TanhParam<DeviceType>> {
+ public:
+  void Compute(const TanhParam<DeviceType>& param);
+  bool Init(TanhParam<DeviceType>* param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index 605fa17c3c70ec3151cc1a2fb249edab336548a1..d3e6de3134ff91f47c66c927194a5ba688e931b0 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -3230,6 +3230,8 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
 
   int L1 = 64 / max_threads * 1024;
   KC = k;
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
   if (m > n) {
     // 对 A 分块
     MC = L1 / (KC * sizeof(float));
@@ -3255,7 +3257,7 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
 
     packedB = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-    (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
+    (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB);
     packedA = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
   } else {
@@ -3284,12 +3286,10 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
 
     packedA = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-    (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
+    (*this.*procPackA)(m, KC, m % MR, A, lda, packedA);
     packedB = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
   }
-  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
   packedC = static_cast<float *>(
       paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));
 
@@ -3352,6 +3352,8 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
 
   int L1 = 64 / max_threads * 1024;
   KC = k;
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
   if (m > n) {
     // 对 A 分块
     MC = L1 / (KC * sizeof(float));
@@ -3377,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
 
     packedB = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-    (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
+    (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB);
     packedA = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
   } else {
@@ -3405,12 +3407,10 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
 
     packedA = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-    (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
+    (*this.*procPackA)(m, KC, m % MR, A, lda, packedA);
     packedB = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
   }
-  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
   packedC = static_cast<float *>(
       paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));
 
@@ -3480,6 +3480,8 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
 
   int L1 = 8 * 1024;
   KC = k;
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
   if (m > n) {
     // 对 A 分块
     MC = L1 / (KC * sizeof(float));
@@ -3505,7 +3507,7 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
 
     packedB = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-    (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
+    (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB);
     packedA = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
   } else {
@@ -3533,12 +3535,10 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
 
     packedA = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-    (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
+    (*this.*procPackA)(m, KC, m % MR, A, lda, packedA);
     packedB = static_cast<float *>(
         paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
   }
-  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
   packedC = static_cast<float *>(
       paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));
 
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 5666f8e9c97482c13414fc9c4d4b54e7f96bcbca..e41162a05067b3a35e065f975d14dccf5f3a9142 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1532,6 +1532,30 @@ class ReluParam<GPU_CL> : public ReluParamBase<GPU_CL> {
 };
 #endif
 
+<<<<<<< HEAD
+=======
+#endif
+
+#ifdef TANH_OP
+template <typename Dtype>
+class TanhParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  TanhParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+            const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+  }
+  const RType *InputX() const { return input_x_; }
+  RType *Out() const { return out_; }
+
+ private:
+  RType *input_x_;
+  RType *out_;
+};
+>>>>>>> upstream/develop
 #endif
 
 #ifdef PRELU_OP
@@ -2229,9 +2253,24 @@ class ConvTransposeParam : public OpParam {
   vector<int> paddings_;
   vector<int> dilations_;
   int groups;
+
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  fpga::DeconvArgs fpga_conv_args;
+
+ public:
+  const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 #endif
 
+#ifdef FUSION_DECONVRELU_OP
+template <typename Dtype>
+using FusionDeconvReluParam = ConvTransposeParam<Dtype>;
+#endif
+
 #ifdef GRU_OP
 template <typename Dtype>
 class GruParam : public OpParam {
diff --git a/src/operators/tanh_op.cpp b/src/operators/tanh_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..454cdfa26942eda225a811317e907b1989bcf61b
--- /dev/null
+++ b/src/operators/tanh_op.cpp
@@ -0,0 +1,35 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TANH_OP
+
+#include "operators/tanh_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+void TanhOp<DeviceType, T>::InferShape() const {
+  this->param_.Out()->Resize(this->param_.InputX()->dims());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(Tanh, ops::TanhOp);
+#endif
+
+#endif
diff --git a/src/operators/tanh_op.h b/src/operators/tanh_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..82b0e4e9a07ae4fd3e4885790d5832065ed3eb49
--- /dev/null
+++ b/src/operators/tanh_op.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TANH_OP
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/tanh_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class TanhOp : public framework::OperatorWithKernel<
+                   DeviceType, TanhParam<DeviceType>,
+                   operators::TanhKernel<DeviceType, T>> {
+ public:
+  TanhOp(const std::string &type, const VariableNameMap &inputs,
+         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+         std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, TanhParam<DeviceType>,
+                                      operators::TanhKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+  void InferShape() const override;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 79bed19be341f7e1ec01e5961b64fac4df7571eb..4fa8f62c2f4c76032e1ddd4a3a32847bcb9c1dba 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -68,11 +68,22 @@ if (CON GREATER -1)
     target_link_libraries(test-resnet50 paddle-mobile)
     set(FOUND_MATCH ON)
 endif ()
+<<<<<<< HEAD
 
 list(FIND NET "FPGA_NET_V2" CON)
 if (CON GREATER -1)
     ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-resnet50 paddle-mobile)
+=======
+
+list(FIND NET "FPGA_NET_V2" CON)
+if (CON GREATER -1)
+    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-resnet50 paddle-mobile)
+
+    ADD_EXECUTABLE(test-pe fpga/test_pe.cpp)
+    target_link_libraries(test-pe paddle-mobile)
+>>>>>>> upstream/develop
     set(FOUND_MATCH ON)
 endif ()
 
diff --git a/test/fpga/test_pe.cpp b/test/fpga/test_pe.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f5f2708b9e628af80433be4e7ccbb205d3fcd6f6
--- /dev/null
+++ b/test/fpga/test_pe.cpp
@@ -0,0 +1,111 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
+#include "fpga/V2/filter.h"
+
+namespace fpga = paddle_mobile::fpga;
+
+static const uint32_t N = 64;
+static const uint32_t C = 3;
+static const uint32_t H = 224;
+static const uint32_t W = 224;
+static const uint32_t G = 1;
+
+fpga::DataType input_type = fpga::DATA_TYPE_FP32;
+fpga::DataType output_type = fpga::DATA_TYPE_FP16;
+
+void* ifm = nullptr;
+void* ofm = nullptr;
+void* filter = nullptr;
+void* ifm_scale = nullptr;
+void* ofm_scale = nullptr;
+void* filter_scale = nullptr;
+
+int ifm_size = 0, ofm_size = 0;
+
+void format_data() {
+  ifm_scale = fpga::fpga_malloc(8);
+  ofm_scale = fpga::fpga_malloc(8);
+  int ifm_channel = fpga::filter::calc_aligned_channel(C);
+  int ofm_channel = fpga::filter::calc_aligned_channel(N);
+  int num = fpga::filter::calc_aligned_num(N, C);
+  DLOG << "ifm_channel = " << ifm_channel;
+  DLOG << "ofm_channel = " << ofm_channel;
+  DLOG << "aligned_num = " << num;
+  ifm_size = ifm_channel * H * W;
+  ofm_size = ofm_channel * H * W;
+  ifm = fpga::fpga_malloc(ifm_size * sizeof(float));
+  ofm = fpga::fpga_malloc(ofm_size * sizeof(int16_t));
+  memset(ifm, 0, ifm_size * sizeof(float));
+  memset(ofm, 0, ofm_size * sizeof(int16_t));
+
+  for (int h = 0; h < H; h++) {
+    for (int w = 0; w < W; w++) {
+      for (int c = 0; c < C; c++) {
+        int index = h * W * ifm_channel + w * ifm_channel + c;
+        (reinterpret_cast<float*>(ifm))[index] = h + w + c * 0.1f;
+        // DLOG << index << ":" << ((float *) ifm)[index];
+      }
+    }
+  }
+  fpga::fpga_flush(ifm, ifm_size * sizeof(float));
+  fpga::fpga_flush(ofm, ofm_size * sizeof(int16_t));
+}
+
+void print_fp16(int16_t* ptr, int total_size, int num) {
+  fpga::fpga_invalidate(ptr, total_size * sizeof(int16_t));
+  int stride = total_size / num;
+  for (int i = 0; i < total_size; i += stride) {
+    DLOG << fpga::fp16_2_fp32(ptr[i]);
+  }
+}
+
+void print_fp32(float* ptr, int total_size, int num) {
+  fpga::fpga_invalidate(ptr, total_size * sizeof(float));
+  int stride = total_size / num;
+  for (int i = 0; i < total_size; i += stride) {
+    DLOG << ptr[i];
+  }
+}
+
+void test_bypass() {
+  fpga::BypassArgs args;
+  args.input_data_type = input_type;
+  args.output_data_type = output_type;
+  args.image.address = ifm;
+  args.image.height = H;
+  args.image.width = W;
+  args.image.channels = C;
+  args.image.scale_address = reinterpret_cast<float*>(ifm_scale);
+  args.output.address = ofm;
+  args.output.scale_address = reinterpret_cast<float*>(ofm_scale);
+  fpga::PerformBypass(args);
+}
+
+int main() {
+  paddle_mobile::fpga::open_device();
+  format_data();
+  DLOG << "format data done";
+  print_fp32(reinterpret_cast<float*>(ifm), ifm_size, 200);
+  DLOG << "print input done";
+  test_bypass();
+  DLOG << "test done";
+  print_fp16(reinterpret_cast<int16_t*>(ofm), ifm_size, 200);
+  std::cout << "Computation done" << std::endl;
+  return 0;
+}
+
+#endif
diff --git a/test/net/test_yologpu.cpp b/test/net/test_yologpu.cpp
index b00cbef0277f44c65ab951227176721599b0559e..301580597521793a1af21d51b7961555685272ef 100644
--- a/test/net/test_yologpu.cpp
+++ b/test/net/test_yologpu.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <iostream>
+<<<<<<< HEAD
 #include "../test_helper.h"
 #include "../test_include.h"
 
@@ -24,6 +25,81 @@ int main() {
   //                     std::string(g_mobilenet_detect) + "/params", true);
 
   auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
+=======
+#include <thread>
+#include "../../src/common/types.h"
+#include "../../src/io/paddle_test_inference_api.h"
+#include "../test_helper.h"
+#include "../test_include.h"
+void t1() {
+  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile_gpu;
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile_cpu;
+  paddle_mobile::PaddleTester<paddle_mobile::CPU> paddle_test_cpu;
+  paddle_mobile::PaddleTester<paddle_mobile::GPU_CL> paddle_test_gpu;
+  printf("cpu time:%f\n", paddle_test_cpu.CaculatePredictTime());
+  std::string path = "/data/local/tmp/bin";
+  printf("gpu time:%f\n", paddle_test_gpu.CaculatePredictTime(&path));
+  //    paddle_mobile.SetThreadNum(4);
+#ifdef PADDLE_MOBILE_CL
+  paddle_mobile_gpu.SetCLPath("/data/local/tmp/bin");
+#endif
+  auto time1 = paddle_mobile::time();
+  auto isok = paddle_mobile_gpu.Load(std::string(g_yolo_mul) + "/model",
+                                     std::string(g_yolo_mul) + "/params", true);
+
+  //  auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
+  if (isok) {
+    auto time2 = paddle_mobile::time();
+    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
+              << std::endl;
+
+    std::vector<float> input;
+    std::vector<int64_t> dims{1, 3, 416, 416};
+    GetInput<float>(g_yolo_img, &input, dims);
+
+    std::vector<float> vec_result;
+    //            = paddle_mobile.Predict(input, dims);
+
+    auto time3 = paddle_mobile::time();
+    int max = 10;
+    for (int i = 0; i < max; ++i) {
+      vec_result = paddle_mobile_gpu.Predict(input, dims);
+    }
+    auto time4 = paddle_mobile::time();
+
+    //    auto time3 = paddle_mobile::time();
+
+    //    for (int i = 0; i < 10; ++i) {
+    //      auto vec_result = paddle_mobile.Predict(input, dims);
+    //    }
+
+    //    auto time4 = paddle_mobile::time();
+
+    std::cout << "predict cost :"
+              << paddle_mobile::time_diff(time3, time4) / max << "ms"
+              << std::endl;
+    std::vector<float>::iterator biggest =
+        std::max_element(std::begin(vec_result), std::end(vec_result));
+    std::cout << " Max element is " << *biggest << " at position "
+              << std::distance(std::begin(vec_result), biggest) << std::endl;
+    //        for (float i : vec_result) {
+    //            std::cout << i << std::endl;
+    //        }
+  }
+}
+
+void t2() {
+  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
+  //    paddle_mobile.SetThreadNum(4);
+#ifdef PADDLE_MOBILE_CL
+  paddle_mobile.SetCLPath("/data/local/tmp/bin");
+#endif
+  auto time1 = paddle_mobile::time();
+  auto isok = paddle_mobile.Load(std::string(g_yolo_mul) + "/model",
+                                 std::string(g_yolo_mul) + "/params", true);
+
+  //  auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
+>>>>>>> upstream/develop
   if (isok) {
     auto time2 = paddle_mobile::time();
     std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
@@ -62,5 +138,69 @@ int main() {
     //            std::cout << i << std::endl;
     //        }
   }
+<<<<<<< HEAD
+=======
+}
+
+void t3() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  //    paddle_mobile.SetThreadNum(4);
+  //#ifdef PADDLE_MOBILE_CL
+  //  paddle_mobile.SetCLPath("/data/local/tmp/bin");
+  //#endif
+  auto time1 = paddle_mobile::time();
+  auto isok = paddle_mobile.Load(std::string(g_yolo_mul) + "/model",
+                                 std::string(g_yolo_mul) + "/params", true);
+
+  //  auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
+  if (isok) {
+    auto time2 = paddle_mobile::time();
+    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
+              << std::endl;
+
+    std::vector<float> input;
+    std::vector<int64_t> dims{1, 3, 416, 416};
+    GetInput<float>(g_yolo_img, &input, dims);
+
+    std::vector<float> vec_result = paddle_mobile.Predict(input, dims);
+
+    auto time3 = paddle_mobile::time();
+    int max = 10;
+    for (int i = 0; i < max; ++i) {
+      vec_result = paddle_mobile.Predict(input, dims);
+    }
+    auto time4 = paddle_mobile::time();
+
+    //    auto time3 = paddle_mobile::time();
+
+    //    for (int i = 0; i < 10; ++i) {
+    //      auto vec_result = paddle_mobile.Predict(input, dims);
+    //    }
+
+    //    auto time4 = paddle_mobile::time();
+
+    std::cout << "predict cost :"
+              << paddle_mobile::time_diff(time3, time4) / max << "ms"
+              << std::endl;
+    std::vector<float>::iterator biggest =
+        std::max_element(std::begin(vec_result), std::end(vec_result));
+    std::cout << " Max element is " << *biggest << " at position "
+              << std::distance(std::begin(vec_result), biggest) << std::endl;
+    //        for (float i : vec_result) {
+    //            std::cout << i << std::endl;
+    //        }
+  }
+}
+
+int main() {
+  //  std::thread th1(t1);
+  //      std::thread th2(t2);
+  //  std::thread th3(t3);
+  std::thread th1(t1);
+  //  th1.join();
+  //      th2.join();
+  //  th3.join();
+  th1.join();
+>>>>>>> upstream/develop
   return 0;
 }
diff --git a/tools/op.cmake b/tools/op.cmake
index 7d19591efc0e0a1bc36da914df0acd663aee811c..71424179d7b8b95929b935fcd98f4791a3cdcd38 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -133,9 +133,17 @@ if (CON GREATER -1)
   set(SOFTMAX_OP ON)
   set(FUSION_CONVBNRELU_OP ON)
   set(FUSION_CONVBN_OP ON)
+<<<<<<< HEAD
 #  set(CONV_TRANSPOSE_OP ON)
 #  set(SLICE_OP ON)
 #  set(ELEMENTWISEADD_OP ON)
+=======
+  set(CONV_TRANSPOSE_OP ON)
+  set(FUSION_DECONVRELU_OP ON)
+  set(SLICE_OP ON)
+  set(TANH_OP ON)
+  set(ELEMENTWISEADD_OP ON)
+>>>>>>> upstream/develop
   set(FOUND_MATCH ON)
 endif()
 
@@ -445,3 +453,9 @@ if (DEQUANT_OP)
   add_definitions(-DDEQUANT_OP)
 endif()
 
+if (TANH_OP)
+  add_definitions(-DTANH_OP)
+endif()
+if (FUSION_DECONVRELU_OP)
+  add_definitions(-DFUSION_DECONVRELU_OP)
+endif()
\ No newline at end of file