add V2 for FPGA track

eee2ba17 · zhangyang · 49fe469a · eee2ba17 · eee2ba17 · eee2ba17
51 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,6 +10,10 @@ option(CPU "armv7 with neon" ON)
 option(GPU_MALI "mali gpu" OFF)
 option(GPU_CL "opencl gpu" OFF)
 option(FPGA "fpga" OFF)
+if(FPGA)  # the V1/V2 switches are only declared when the FPGA backend is enabled
+    option(FPGAV1 "fpga v1" ON)  # defaults to ON
+    option(FPGAV2 "fpga v2" OFF)  # defaults to OFF
+endif()


 project(paddle-mobile)
@@ -119,8 +123,43 @@ else()
 endif()

 if(FPGA)
-    message("FPGA mode enabled")
    add_definitions(-DPADDLE_MOBILE_FPGA)
+    file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc)  # sources excluded from every FPGA build
+    foreach(f ${_tmp_list})
+        list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+    endforeach()
+    file(GLOB_RECURSE _tmp_list_h src/operators/math/*.h)
+    foreach(f ${_tmp_list_h})
+        list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+    endforeach()
+    list(APPEND PADDLE_MOBILE_CC src/operators/math/softmax.cpp)  # softmax is re-added after the math/ purge above
+    list(APPEND PADDLE_MOBILE_H src/operators/math/softmax.h)  # variable names are case-sensitive: _H, not _h
+    list(APPEND PADDLE_MOBILE_H src/operators/math/math_func_neon.h)
+    if(FPGAV1)
+        message("FPGA_V1 enabled")
+        add_definitions(-DPADDLE_MOBILE_FPGA_V1)
+        file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp)  # V1 build: drop the V2 sources
+        foreach(f ${_tmp_list})
+            list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+        endforeach()
+        file(GLOB_RECURSE _tmp_list_h src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h)  # headers are tracked in PADDLE_MOBILE_H
+        foreach(f ${_tmp_list_h})
+            list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+        endforeach()
+    endif()
+    if(FPGAV2)
+        message("FPGA_V2 enabled")
+        add_definitions(-DPADDLE_MOBILE_FPGA_V2)
+        file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp)  # V2 build: drop the V1 sources
+        foreach(f ${_tmp_list})
+            list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+        endforeach()
+        file(GLOB_RECURSE _tmp_list_h src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h)  # headers are tracked in PADDLE_MOBILE_H
+        foreach(f ${_tmp_list_h})
+            list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+        endforeach()
+    endif()
+
 else()
    file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
    foreach(f ${_tmp_list})
@@ -166,8 +205,10 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)

 # NET default
-if(FPGA)
-    set(NET "FPGAnets" CACHE STRING "select net type")
+if(FPGA AND FPGAV1)  # FPGAV1/FPGAV2 are only declared under FPGA; a stale cache entry must not pick an FPGA net
+    set(NET "FPGA_NET_V1" CACHE STRING "select net type")
+elseif(FPGA AND FPGAV2)
+    set(NET "FPGA_NET_V2" CACHE STRING "select net type")
 else()
    set(NET "default" CACHE STRING "select net type")
 endif()

--- a/doc/development_fpga.md
+++ b/doc/development_fpga.md
 # FPGA开发文档

-FPGA平台的代码在Xilinx ZCU102 revision 1.0开发板测试Resnet50成功，预测结果正确。
+FPGA平台的代码分为V1和V2。其中V1在Xilinx ZCU102 revision 1.0开发板测试Resnet50成功，预测结果正确。以下描述适用于复现V1运行的结果。

 ## 准备硬件
 ___
@@ -17,7 +17,7 @@ ___
 ## 编译工程
 ___
 1. 将最新的paddle mobile 代码复制到ZCU102开发板中。
-2. 进入paddle-mobile根目录， CMakeLists.txt 设置平台为 option(FPGA "fpga support" ON)。CPU和MALI\_GPU选项设置为OFF。
+2. 进入paddle-mobile根目录， CMakeLists.txt 设置平台为 option(FPGA "fpga support" ON)。CPU和MALI\_GPU选项设置为OFF。设置option(FPGAV1 "fpga v1" ON), option(FPGAV2 "fpga v2" OFF)。
 2. 执行以下命令，可在./test/build下生成test-resnet50可执行程序。
    * mkdir build
    * cd build

--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "fpga/api.h"
+#include "fpga/V1/api.h"
 #include <fcntl.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <algorithm>
 #include <map>
-#include "fpga/bias_scale.h"
-#include "fpga/filter.h"
-#include "fpga/image.h"
+#include "fpga/V1/bias_scale.h"
+#include "fpga/V1/filter.h"
+#include "fpga/V1/image.h"
 #define FPGA_TEST_MODE
 #define PADDLE_MOBILE_OS_LINUX


--- a/src/fpga/api.h
+++ b/src/fpga/api.h
--- a/src/fpga/bias_scale.cpp
+++ b/src/fpga/bias_scale.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "fpga/bias_scale.h"
+#include "fpga/V1/bias_scale.h"
 #include <memory.h>
-#include "fpga/api.h"
+#include "fpga/V1/api.h"

 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/bias_scale.h
+++ b/src/fpga/bias_scale.h
--- a/src/fpga/filter.cpp
+++ b/src/fpga/filter.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "fpga/filter.h"
+#include "fpga/V1/filter.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/api.h"
+#include "fpga/V1/api.h"

 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/filter.h
+++ b/src/fpga/filter.h
--- a/src/fpga/image.cpp
+++ b/src/fpga/image.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "fpga/image.h"
+#include "fpga/V1/image.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/api.h"
+#include "fpga/V1/api.h"

 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/image.h
+++ b/src/fpga/image.h
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/V2/api.h"
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include "fpga/V2/bias_scale.h"
+#include "fpga/V2/filter.h"
+#include "fpga/V2/image.h"
+#define FPGA_TEST_MODE
+// #define PADDLE_MOBILE_OS_LINUX
+
+namespace paddle_mobile {
+namespace fpga {
+
+// File-local driver state shared by the helpers below.
+static int fd = -1;  // device descriptor; stays -1 until open_device() stores the result of open()
+static const char *device_path = "/dev/fpgadrv0";  // path passed to open()
+// Size of each buffer handed out by fpga_malloc(), keyed by its address;
+// entries are erased again by fpga_free().
+static std::map<void *, size_t> memory_map;
+
+// Sends request `req` with argument `arg` to the device behind `fd`.
+// When PADDLE_MOBILE_OS_LINUX is defined the ioctl result is enforced to be
+// 0 and returned; otherwise no ioctl is issued and -1 is returned.
+static inline int do_ioctl(int req, const void *arg) {
+#ifdef PADDLE_MOBILE_OS_LINUX
+  int result = ioctl(fd, req, (uint64_t)arg);
+  PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly");
+  return result;
+#else
+  return -1;
+#endif
+}
+
+// Opens the FPGA device node on first use and returns the descriptor.
+// Later calls return the descriptor stored by the first call. The result
+// of open() is stored unchecked, so a failed open yields -1 and is retried
+// on the next call.
+int open_device() {
+  if (fd != -1) {
+    return fd;
+  }
+  fd = open(device_path, O_RDWR);
+  return fd;
+}
+
+// Memory management: allocates `size` bytes and records the allocation in
+// memory_map so that fpga_free() can look up its size later.
+// With PADDLE_MOBILE_OS_LINUX the buffer is mapped from the device with
+// mmap64(); otherwise it comes from malloc(). The result is recorded and
+// returned without being checked for failure.
+void *fpga_malloc(size_t size) {
+  static uint64_t counter = 0;  // running total of requested bytes
+
+#ifdef PADDLE_MOBILE_OS_LINUX
+  void *buffer =
+      mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+#else
+  void *buffer = malloc(size);
+#endif
+  counter += size;
+  memory_map.emplace(buffer, size);
+  return buffer;
+}
+
+// Releases a buffer previously returned by fpga_malloc().
+// Pointers that are not present in memory_map are reported through DLOG
+// and otherwise left untouched.
+void fpga_free(void *ptr) {
+  static uint64_t counter = 0;  // running total of released bytes
+
+  auto entry = memory_map.find(ptr);
+  if (entry == memory_map.end()) {
+    DLOG << "Invalid pointer";
+    return;
+  }
+
+  const size_t bytes = entry->second;
+  memory_map.erase(entry);
+#ifdef PADDLE_MOBILE_OS_LINUX
+  munmap(ptr, bytes);
+#else
+  free(ptr);
+#endif
+  counter += bytes;
+}
+
+// Copies `num` bytes from `src` to `dest` with memcpy().
+void fpga_copy(void *dest, const void *src, size_t num) {
+  memcpy(dest, src, num);
+}
+
+// Issues IOCTL_MEMCACHE_FLUSH for the `size` bytes starting at `address`.
+// Returns the value produced by do_ioctl().
+int fpga_flush(void *address, size_t size) {
+  struct MemoryCacheArgs cache_args = {address, size};
+  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &cache_args);
+}
+
+// Issues IOCTL_MEMCACHE_INVAL for the `size` bytes starting at `address`.
+// Returns the value produced by do_ioctl().
+int fpga_invalidate(void *address, size_t size) {
+  struct MemoryCacheArgs cache_args = {address, size};
+  return do_ioctl(IOCTL_MEMCACHE_INVAL, &cache_args);
+}
+
+// Converts an IEEE-754 single-precision value to the 16-bit `half` bit
+// pattern: the top 10 mantissa bits are kept, the sign bit is moved to bit
+// 15, and the exponent is re-biased by subtracting 112. The result is
+// incremented by one when mantissa bit 12 is set (roundoff). There is no
+// dedicated handling of zero, subnormal, infinite or NaN inputs.
+//
+// The float is read as exactly 32 bits through memcpy(). Reading it through
+// an `unsigned long *` would touch 8 bytes wherever long is 64 bits wide and
+// is also an aliasing violation.
+half fp32_2_fp16(float fp32_num) {
+  uint32_t bits = 0;
+  memcpy(&bits, &fp32_num, sizeof(bits));
+  auto t = (half)(((bits & 0x007fffff) >> 13) | ((bits & 0x80000000) >> 16) |
+                  (((bits & 0x7f800000) >> 13) - (112 << 10)));  // NOLINT
+  if (bits & 0x1000) {
+    t++;  // roundoff
+  }
+  return t;
+}
+
+// Expands a 16-bit `half` bit pattern to an IEEE-754 single-precision
+// float: the sign moves to bit 31, the 5-bit exponent is re-biased by
+// adding 112, and the 10-bit fraction is shifted into the top of the
+// mantissa. There is no dedicated handling of zero, subnormal, infinite or
+// NaN inputs.
+//
+// The bit pattern is assembled in an unsigned 32-bit integer and copied
+// into the float with memcpy(). This avoids shifting a signed int into its
+// sign bit and reading an int object through a `float *`.
+float fp16_2_fp32(half fp16_num) {
+  const uint32_t frac = fp16_num & 0x3ff;
+  const uint32_t exp = ((fp16_num & 0x7c00) >> 10) + 112;
+  const uint32_t sign = fp16_num & 0x8000;
+  const uint32_t bits = sign << 16 | exp << 23 | frac << 13;
+  float fp32_num;
+  memcpy(&fp32_num, &bits, sizeof(fp32_num));
+  return fp32_num;
+}
+
+// Logs the convolution parameters (when FPGA_TEST_MODE is defined) and
+// submits them to the driver through IOCTL_CONFIG_CONV.
+// Returns the value produced by do_ioctl().
+int ComputeBasicConv(const struct ConvArgs &args) {
+#ifdef FPGA_TEST_MODE
+  const struct ImageInputArgs &input = args.image;
+  const struct KernelArgs &kernel = args.kernel;
+  const struct ImageOutputArgs &output = args.output;
+
+  DLOG << "======Compute Basic Conv======";
+  DLOG << "   relu_enabled:" << args.relu_enabled
+       << "   sb_address:" << args.sb_address
+       << "   filter_address:" << args.filter_address
+       << "   filter_num:" << args.filter_num
+       << "   group_num:" << args.group_num;
+  DLOG << "   image_address:" << input.address
+       << "   image_scale_address:" << input.scale_address
+       << "   image_channels:" << input.channels
+       << "   image_height:" << input.height
+       << "   image_width:" << input.width
+       << "   pad_height:" << input.pad_height
+       << "   pad_width:" << input.pad_width;
+  DLOG << "   kernel_height:" << kernel.height
+       << "   kernel_width:" << kernel.width
+       << "   stride_h:" << kernel.stride_h
+       << "   stride_w:" << kernel.stride_w;
+  DLOG << "   out_address:" << output.address
+       << "   out_scale_address:" << output.scale_address;
+#endif
+  return do_ioctl(IOCTL_CONFIG_CONV, &args);
+}
+
+// Runs the convolution described by `args`.
+// Only the first entry of args.conv_args is submitted; fill_split_arg()
+// sets split_num to 1. Returns the status reported by ComputeBasicConv().
+// Falling off the end of a non-void function without a return is undefined
+// behaviour, so the status is returned explicitly.
+int ComputeFpgaConv(const struct SplitConvArgs &args) {
+  return ComputeBasicConv(args.conv_args[0]);
+}
+
+// Logs the pooling parameters (when FPGA_TEST_MODE is defined) and submits
+// them to the driver through IOCTL_CONFIG_POOLING.
+// Returns the value produced by do_ioctl().
+int ComputeFpgaPool(const struct PoolingArgs &args) {
+#ifdef FPGA_TEST_MODE
+  const struct ImageInputArgs &input = args.image;
+  const struct KernelArgs &kernel = args.kernel;
+  const struct ImageOutputArgs &output = args.output;
+
+  DLOG << "=============ComputeFpgaPool===========";
+  DLOG << "   mode:" << args.mode
+       << "   kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
+  DLOG << "   image_address:" << input.address
+       << "   image_scale_address:" << input.scale_address
+       << "   image_channels:" << input.channels
+       << "   image_height:" << input.height
+       << "   image_width:" << input.width
+       << "   pad_height:" << input.pad_height
+       << "   pad_width:" << input.pad_width;
+  DLOG << "   kernel_height:" << kernel.height
+       << "   kernel_width:" << kernel.width
+       << "   stride_h:" << kernel.stride_h
+       << "   stride_w:" << kernel.stride_w;
+  DLOG << "   out_address:" << output.address
+       << "   out_scale_address:" << output.scale_address;
+#endif
+
+  return do_ioctl(IOCTL_CONFIG_POOLING, &args);
+}
+
+// Logs the element-wise add parameters (when FPGA_TEST_MODE is defined) and
+// submits them to the driver through IOCTL_CONFIG_EW.
+// Returns the value produced by do_ioctl().
+int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
+#ifdef FPGA_TEST_MODE
+  DLOG << "=============ComputeFpgaEWAdd===========";
+  DLOG << "   relu_enabled:" << args.relu_enabled
+       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
+       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
+  DLOG << "   image0_address:" << args.image0.address
+       << "   image0_scale_address:" << args.image0.scale_address
+       << "   image0_channels:" << args.image0.channels
+       << "   image0_height:" << args.image0.height
+       << "   image0_width:" << args.image0.width
+       << "   pad0_height:" << args.image0.pad_height
+       << "   pad0_width:" << args.image0.pad_width;
+  // The last label is "pad1_width" so that image1's fields follow the same
+  // naming as image0's in the log output.
+  DLOG << "   image1_address:" << args.image1.address
+       << "   image1_scale_address:" << args.image1.scale_address
+       << "   image1_channels:" << args.image1.channels
+       << "   image1_height:" << args.image1.height
+       << "   image1_width:" << args.image1.width
+       << "   pad1_height:" << args.image1.pad_height
+       << "   pad1_width:" << args.image1.pad_width;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+
+  return do_ioctl(IOCTL_CONFIG_EW, &args);
+}
+// Logs the bypass parameters (when FPGA_TEST_MODE is defined) and submits
+// them to the driver through IOCTL_CONFIG_BYPASS.
+// Returns the value produced by do_ioctl().
+int PerformBypass(const struct BypassArgs &args) {
+#ifdef FPGA_TEST_MODE
+  const struct ImageInputArgs &input = args.image;
+  const struct ImageOutputArgs &output = args.output;
+
+  DLOG << "=============ComputeFpgaBypass===========";
+  DLOG << "   input_type:" << args.input_data_type
+       << "   output_type:" << args.output_data_type
+       << "   input_layout_type:" << args.input_layout_type
+       << "   output_layout_type:" << args.output_layout_type;
+  DLOG << "   image_address:" << input.address
+       << "   image_scale_address:" << input.scale_address
+       << "   image_channels:" << input.channels
+       << "   image_height:" << input.height
+       << "   image_width:" << input.width
+       << "   pad_height:" << input.pad_height
+       << "   pad_width:" << input.pad_width;
+  DLOG << "   out_address:" << output.address
+       << "   out_scale_address:" << output.scale_address;
+#endif
+
+  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
+}
+
+// Logs the concat parameters (when FPGA_TEST_MODE is defined) and forwards
+// them to image::concat_images(). No ioctl is issued here. Always
+// returns 0.
+int ComputeFPGAConcat(const struct ConcatArgs &args) {
+#ifdef FPGA_TEST_MODE
+  DLOG << "=============ComputeFpgaConcat===========";
+  DLOG << "   Image_num: " << args.image_num
+       << "   out_address:" << args.image_out
+       << "   out_scale_address:" << args.scale_out
+       << "   out_channel:" << args.out_channel;
+  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
+  int index = 0;
+  while (index < args.image_num) {
+    DLOG << "   " << index << "th:        ";
+    DLOG << "   channel_num:" << args.channel_num[index]
+         << "   aligned_channel_num:" << args.aligned_channel_num[index]
+         << "   image_address:" << args.images_in[index]
+         << "   image_scale_address:" << args.scales_in[index];
+    index++;
+  }
+#endif
+
+  image::concat_images(args.images_in, args.scales_in, args.image_out,
+                       args.scale_out, args.image_num, args.channel_num,
+                       args.height, args.width, args.aligned_channel_num,
+                       args.out_channel);
+  return 0;
+}
+
+// Copies the tensor's float data into a buffer from fpga_malloc(), lets
+// image::format_image() process that buffer using the aligned channel
+// count, and re-points the tensor at the result.
+// Dimensions 1, 2 and 3 of the tensor are read as channel, height, width.
+void format_image(framework::Tensor *image_tensor) {
+  auto shape = image_tensor->dims();
+  auto source = image_tensor->data<float>();
+  size_t byte_count = shape[1] * shape[2] * shape[3] * sizeof(float);
+  auto staged = (float *)fpga_malloc(byte_count);  // NOLINT
+  fpga_copy(staged, source, byte_count);
+
+  const int channel = static_cast<int>(shape[1]);
+  const int height = static_cast<int>(shape[2]);
+  const int width = static_cast<int>(shape[3]);
+  image::format_image(&staged, channel, height, width,
+                      filter::calc_aligned_channel(channel));
+  image_tensor->reset_data_ptr(staged);
+}
+
+// Allocates a zero-filled output buffer of `half` elements and attaches it
+// to `ofm_tensor`. A 4-D tensor gets dims[2] * dims[3] * aligned_channel
+// elements, a 2-D tensor gets aligned_channel elements. Any other rank is
+// logged and results in a zero-byte request.
+void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
+  auto dims = ofm_tensor->dims();
+  const auto rank = dims.size();
+  size_t memory_size = 0;
+  if (rank == 2) {
+    memory_size = aligned_channel * sizeof(half);
+  } else if (rank == 4) {
+    memory_size = dims[2] * dims[3] * aligned_channel * sizeof(half);
+  } else {
+    DLOG << "Wrong ofm dimension";
+  }
+  void *buffer = fpga_malloc(memory_size);
+  memset(buffer, 0, memory_size);
+  ofm_tensor->reset_data_ptr(buffer);
+}
+
+// Allocates a zero-filled output buffer of float elements and attaches it
+// to `ofm_tensor`. A 4-D tensor gets dims[2] * dims[3] * aligned_channel
+// elements, a 2-D tensor gets aligned_channel elements. Any other rank is
+// logged and results in a zero-byte request.
+void format_fp32_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
+  auto dims = ofm_tensor->dims();
+  const auto rank = dims.size();
+  size_t memory_size = 0;
+  if (rank == 2) {
+    memory_size = aligned_channel * sizeof(float);
+  } else if (rank == 4) {
+    memory_size = dims[2] * dims[3] * aligned_channel * sizeof(float);
+  } else {
+    DLOG << "Wrong ofm dimension";
+  }
+  void *buffer = fpga_malloc(memory_size);
+  memset(buffer, 0, memory_size);
+  ofm_tensor->reset_data_ptr(buffer);
+}
+
+// Returns filter::find_max() evaluated over all numel() float elements of
+// the filter tensor.
+float filter_find_max(framework::Tensor *filter_tensor) {
+  const int element_count = static_cast<int>(filter_tensor->numel());
+  return filter::find_max(filter_tensor->data<float>(), element_count);
+}
+
+// Thin wrapper: returns filter::calc_aligned_channel(channel_num).
+int get_aligned_channel_num(int channel_num) {
+  const int aligned = filter::calc_aligned_channel(channel_num);
+  return aligned;
+}
+
+// Returns filter::calc_aligned_num() for the filter's first two dimensions
+// (dims[0] as the filter count, dims[1] as the channel count).
+int get_aligned_filter_num(framework::Tensor *filter_tensor) {
+  auto shape = filter_tensor->dims();
+  const int num = static_cast<int>(shape[0]);
+  const int channel = static_cast<int>(shape[1]);
+  return filter::calc_aligned_num(num, channel);
+}
+
+// Returns the aligned filter count of `filter_tensor`, aligned once more
+// with get_aligned_channel_num().
+int get_conv_output_channel(framework::Tensor *filter_tensor) {
+  return get_aligned_channel_num(get_aligned_filter_num(filter_tensor));
+}
+// Stores max_value / 127 and 127 / max_value in the tensor's scale pair,
+// copies the float filter data into a buffer from fpga_malloc(), lets
+// filter::format_filter() process that buffer, and re-points the tensor at
+// the result. The tensor is read as 4-D: (num, channel, height, width).
+void format_filter(framework::Tensor *filter_tensor, float max_value,
+                   int group_num) {
+  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
+  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);
+
+  auto shape = filter_tensor->dims();
+  auto source = filter_tensor->data<float>();
+  size_t byte_count =
+      shape[0] * shape[1] * shape[2] * shape[3] * sizeof(float);
+  auto staged = (float *)fpga_malloc(byte_count);  // NOLINT
+  fpga_copy(staged, source, byte_count);
+
+  filter::format_filter(&staged, static_cast<int>(shape[0]),
+                        static_cast<int>(shape[1]), static_cast<int>(shape[2]),
+                        static_cast<int>(shape[3]), group_num, max_value);
+  filter_tensor->reset_data_ptr(staged);
+}
+
+// Fully-connected counterpart of format_filter(): stores the scale pair,
+// copies the float data into a buffer from fpga_malloc(), and lets
+// filter::format_fc_filter() process it with a group count of 1.
+// The tensor is indexed as 4-D: (num, channel, height, width).
+void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
+  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
+  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);
+
+  auto shape = filter_tensor->dims();
+  auto source = filter_tensor->data<float>();
+  size_t byte_count =
+      shape[0] * shape[1] * shape[2] * shape[3] * sizeof(float);
+  auto staged = (float *)fpga_malloc(byte_count);  // NOLINT
+  fpga_copy(staged, source, byte_count);
+
+  filter::format_fc_filter(&staged, static_cast<int>(shape[0]),
+                           static_cast<int>(shape[1]),
+                           static_cast<int>(shape[2]),
+                           static_cast<int>(shape[3]), 1, max_value);
+  filter_tensor->reset_data_ptr(staged);
+}
+
+// Computes the aligned element count from `filter_channel` and hands the
+// bias/scale array to bias_scale::format_bias_scale_array(), which replaces
+// *bias_scale_array with a newly allocated buffer.
+// NOTE(review): `filter_channel` is passed as both arguments of
+// filter::calc_aligned_num(), and the callers in this file pass an already
+// aligned filter count as `filter_channel` -- confirm this is intended.
+void format_bias_scale_array(float **bias_scale_array, int filter_num,
+                             int filter_channel) {
+  int num_after_alignment =
+      filter::calc_aligned_num(filter_channel, filter_channel);
+  bias_scale::format_bias_scale_array(bias_scale_array, filter_num,
+                                      num_after_alignment);
+}
+
+// Allocates out_channel * height * width `half` elements with
+// fpga_malloc(), resizes `out` to {1, out_channel, height, width} and
+// attaches the new buffer to it.
+void format_concat_output(framework::Tensor *out, int height, int width,
+                          uint32_t out_channel) {
+  const size_t byte_count = out_channel * height * width * sizeof(half);
+  void *buffer = fpga_malloc(byte_count);
+  out->Resize(framework::make_ddim({1, out_channel, height, width}));
+  out->reset_data_ptr(buffer);
+}
+
+// Prepares a convolution's filter, bias/scale array and output tensor:
+// formats the filter with its maximum value, formats the bias/scale array
+// for the aligned filter count, and allocates the fp16 output buffer.
+// Returns the aligned output channel count (also written to DLOG).
+// NOTE(review): `bs_ptr` is received by value while the formatting step
+// frees the old array and stores the new address only in this local copy,
+// so the caller's pointer is not updated -- confirm how callers obtain the
+// formatted array.
+int format_conv_data(framework::Tensor *filter_tensor,
+                     framework::Tensor *ofm_tensor, float *bs_ptr, int group) {
+  const float max_value = fpga::filter_find_max(filter_tensor);
+  fpga::format_filter(filter_tensor, max_value, group);
+
+  const int aligned_num = get_aligned_filter_num(filter_tensor);
+  const int filter_count = static_cast<int>(filter_tensor->dims()[0]);
+  fpga::format_bias_scale_array(&bs_ptr, filter_count, aligned_num);
+
+  const int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
+  fpga::format_fp16_ofm(ofm_tensor, aligned_channel);
+  DLOG << aligned_channel;
+  return aligned_channel;
+}
+
+// Fully-connected counterpart of format_conv_data(): formats the filter
+// with format_fc_filter(), formats the bias/scale array for the aligned
+// filter count, and allocates the fp16 output buffer.
+// Returns the aligned output channel count (also written to DLOG).
+// NOTE(review): `bs_ptr` is received by value while the formatting step
+// frees the old array and stores the new address only in this local copy,
+// so the caller's pointer is not updated -- confirm how callers obtain the
+// formatted array.
+int format_fc_data(framework::Tensor *filter_tensor,
+                   framework::Tensor *ofm_tensor, float *bs_ptr) {
+  const float max_value = fpga::filter_find_max(filter_tensor);
+  fpga::format_fc_filter(filter_tensor, max_value);
+
+  const int aligned_num = get_aligned_filter_num(filter_tensor);
+  const int filter_count = static_cast<int>(filter_tensor->dims()[0]);
+  fpga::format_bias_scale_array(&bs_ptr, filter_count, aligned_num);
+
+  const int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
+  fpga::format_fp16_ofm(ofm_tensor, aligned_channel);
+  DLOG << aligned_channel;
+  return aligned_channel;
+}
+
+// Fills `arg` for a convolution that runs as a single split (split_num is
+// fixed to 1): the top-level output, the per-split ConvArgs array and the
+// concat bookkeeping arrays, all allocated with fpga_malloc().
+// Each ConvArgs entry points at the same input, filter, bias/scale and
+// output buffers. concat_arg.aligned_channel_num and concat_arg.out_channel
+// are not assigned here, and the entries of images_in, scales_in and
+// channel_num are left uninitialised.
+void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
+                    framework::Tensor *out, framework::Tensor *filter,
+                    bool relu_enabled, int group_num, int stride_h,
+                    int stride_w, int padding_h, int padding_w, float *bs_ptr) {
+  auto input_ptr = input->data<float>();
+  auto filter_ptr = filter->data<float>();
+  auto out_ptr = out->data<float>();
+
+  arg->group_num = (uint32_t)group_num;
+  arg->split_num = 1;
+  arg->filter_num = (uint32_t)filter->dims()[0];
+  arg->output.address = out_ptr;
+  arg->output.scale_address = out->scale;
+  arg->conv_args =
+      (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
+
+  arg->concat_arg.image_num = arg->split_num;
+  arg->concat_arg.image_out = out_ptr;
+  arg->concat_arg.scale_out = out->scale;
+  arg->concat_arg.height = (uint32_t)out->dims()[2];
+  arg->concat_arg.width = (uint32_t)out->dims()[3];
+
+  int n = arg->split_num;
+  // Each array is sized with the element type it actually stores.
+  arg->concat_arg.images_in =
+      (half **)fpga_malloc(n * sizeof(half *));  // NOLINT
+  arg->concat_arg.scales_in =
+      (float **)fpga_malloc(n * sizeof(float *));  // NOLINT
+  arg->concat_arg.channel_num =
+      (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
+
+  for (int i = 0; i < n; i++) {
+    struct ConvArgs &conv = arg->conv_args[i];
+
+    conv.relu_enabled = relu_enabled;
+    conv.sb_address = bs_ptr;
+    conv.filter_address = (int8_t *)filter_ptr;  // NOLINT
+    conv.filter_scale_address = filter->scale;
+    conv.filter_num = arg->filter_num;
+    conv.group_num = (uint32_t)group_num;
+
+    conv.kernel.stride_h = (uint32_t)stride_h;
+    conv.kernel.stride_w = (uint32_t)stride_w;
+    conv.kernel.height = (uint32_t)filter->dims()[2];
+    conv.kernel.width = (uint32_t)filter->dims()[3];
+
+    conv.image.address = input_ptr;
+    conv.image.scale_address = input->scale;
+    conv.image.channels = (uint32_t)input->dims()[1];
+    conv.image.height = (uint32_t)input->dims()[2];
+    conv.image.width = (uint32_t)input->dims()[3];
+    conv.image.pad_height = (uint32_t)padding_h;
+    conv.image.pad_width = (uint32_t)padding_w;
+
+    conv.output.address = out_ptr;
+    conv.output.scale_address = out->scale;
+  }
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <cstddef>
+#include <iostream>
+#include <limits>
+#include "framework/tensor.h"
+
+namespace paddle_mobile {
+namespace fpga {
+
+// Element type selector; used for the input/output data type fields of
+// BypassArgs.
+enum DataType {
+  DATA_TYPE_FP32 = 1,
+  DATA_TYPE_FP16 = 0,
+};
+
+// Memory layout selector; used for the input/output layout fields of
+// BypassArgs.
+enum LayoutType {
+  LAYOUT_CHW = 1,
+  LAYOUT_HWC = 0,
+};
+
+// Argument block for IOCTL_VERSION.
+struct VersionArgs {
+  void* buffer;
+};
+
+// Argument block for IOCTL_MEM_COPY.
+struct MemoryCopyArgs {
+  void* src;
+  void* dest;
+  size_t size;
+};
+
+// Kernel window size and strides, embedded in ConvArgs and PoolingArgs.
+struct KernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_w;
+  uint32_t stride_h;
+};
+
+// Description of an input feature map: buffer, scale pointer, shape and
+// padding.
+struct ImageInputArgs {
+  void* address;         // input featuremap virtual address
+  float* scale_address;  // input scale address;
+  uint32_t channels;
+  uint32_t width;  // featuremap width
+  uint32_t height;
+  uint32_t pad_width;  // padding width;
+  uint32_t pad_height;
+};
+
+// Description of an output buffer and its scale pointer.
+struct ImageOutputArgs {
+  void* address;         // output result address;
+  float* scale_address;  // output scale address;
+  uint64_t timer_cnt;    // time counter for FPGA computation
+};
+
+// Argument block for IOCTL_CONFIG_CONV: one convolution pass.
+struct ConvArgs {
+  bool relu_enabled;
+  void* sb_address;  // scale and bias are interlaced;
+  void* filter_address;
+  float* filter_scale_address;
+  uint32_t filter_num;
+  uint32_t group_num;
+
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+
+// Arguments consumed by ComputeFPGAConcat(): `image_num` input images with
+// per-image scale and channel arrays, plus the output buffer description.
+struct ConcatArgs {
+  uint32_t image_num;
+  half** images_in;
+  float** scales_in;
+  void* image_out;
+  float* scale_out;
+  uint32_t* channel_num;
+  uint32_t* aligned_channel_num;
+  uint32_t out_channel;
+  uint32_t height;
+  uint32_t width;
+};
+
+// A convolution expressed as `split_num` ConvArgs entries plus the concat
+// description for their results; filled in by fill_split_arg().
+struct SplitConvArgs {
+  uint32_t split_num;
+  uint32_t group_num;
+  uint32_t filter_num;
+  struct ImageOutputArgs output;
+  struct ConvArgs* conv_args;
+  struct ConcatArgs concat_arg;
+};
+
+// Argument block for IOCTL_CONFIG_POOLING.
+struct PoolingArgs {
+  int16_t mode;  // mode: 0:max, 1:avg
+  half kernel_reciprocal;
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+
+// Argument block for IOCTL_CONFIG_EW (element-wise add of two images).
+struct EWAddArgs {
+  bool relu_enabled;
+
+  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
+  uint32_t const1;
+  struct ImageInputArgs image0;
+  struct ImageInputArgs image1;
+  struct ImageOutputArgs output;
+};
+
+// Argument block for IOCTL_CONFIG_BYPASS: input/output data types and
+// layouts plus the image and output descriptions.
+struct BypassArgs {
+  enum DataType input_data_type;
+  enum DataType output_data_type;
+  enum LayoutType input_layout_type;
+  enum LayoutType output_layout_type;
+  struct ImageInputArgs image;
+  struct ImageOutputArgs output;
+};
+
+// Argument block for IOCTL_FPGA_REG_WRITE.
+struct FpgaRegWriteArgs {
+  uint64_t address;  //
+  uint64_t value;
+};
+
+// Argument block for IOCTL_FPGA_REG_READ.
+struct FpgaRegReadArgs {
+  uint64_t address;
+  uint64_t value;
+};
+
+// Argument block for IOCTL_MEMCACHE_INVAL and IOCTL_MEMCACHE_FLUSH.
+struct MemoryCacheArgs {
+  void* address;
+  size_t size;
+};
+
+// ioctl request codes for the FPGA driver, all built with _IOW on the same
+// magic value.
+// NOTE(review): 'FPGA' is a multi-character constant whose value is
+// implementation-defined -- confirm it matches what the driver expects.
+#define IOCTL_FPGA_MAGIC 'FPGA'
+
+#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
+
+#define IOCTL_SEPARATOR_0 10
+
+// Memory requests (numbers 11-13).
+#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
+#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
+#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
+
+#define IOCTL_SEPARATOR_1 20
+
+// Compute configuration and register access requests (numbers 21-29).
+// NOTE(review): IOCTL_FPGA_REG_READ is also declared with _IOW -- confirm
+// against the driver's definition.
+#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
+#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
+#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
+#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
+#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
+#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
+
+//============================== API =============================
+
+// Device handle management.
+// NOTE(review): fpga/V2/api.cpp as added in this change defines
+// open_device() but contains no definition of close_device() -- confirm one
+// exists elsewhere before calling it.
+int open_device();
+int close_device();
+
+// Buffer management: allocate, release, copy, and cache flush/invalidate.
+void* fpga_malloc(size_t size);
+void fpga_free(void* ptr);
+void fpga_copy(void* dst, const void* src, size_t num);
+int fpga_flush(void* address, size_t size);
+int fpga_invalidate(void* address, size_t size);
+
+// Compute entry points; each takes a filled-in argument block.
+int PerformBypass(const struct BypassArgs& args);
+int ComputeFpgaConv(const struct SplitConvArgs& args);
+int ComputeFpgaPool(const struct PoolingArgs& args);
+int ComputeFpgaEWAdd(const struct EWAddArgs& args);
+int ComputeFPGAConcat(const struct ConcatArgs& args);
+
+// Rounds a non-negative `num` up to the next multiple of a positive `x`
+// using integer arithmetic.
+static inline int align_to_x(int num, int x) {
+  const int blocks = (num + x - 1) / x;
+  return blocks * x;
+}
+
+// Filter queries: maximum value and aligned filter/channel counts.
+float filter_find_max(framework::Tensor* filter_tensor);
+int get_aligned_channel_num(int channel_num);
+int get_aligned_filter_num(framework::Tensor* filter_tensor);
+int get_conv_output_channel(framework::Tensor* filter_tensor);
+
+// Tensor formatting helpers; each re-points the tensor at a buffer obtained
+// from fpga_malloc().
+void format_image(framework::Tensor* image_tensor);
+void format_fp16_ofm(framework::Tensor* ofm_tensor,
+                     int aligned_channel);  // only allocate memory
+void format_fp32_ofm(framework::Tensor* ofm_tensor, int aligned_channel);
+
+void format_filter(framework::Tensor* filter_tensor, float max_value,
+                   int group_num);
+void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
+void format_bias_scale_array(float** bias_scale_array, int filter_num,
+                             int filter_channel);
+void format_concat_output(framework::Tensor* out, int height, int width,
+                          uint32_t out_channel);
+// Combined helpers: format filter, bias/scale and output in one call and
+// return the aligned output channel count.
+int format_conv_data(framework::Tensor* filter_tensor,
+                     framework::Tensor* ofm_tensor, float* bs_ptr, int group);
+int format_fc_data(framework::Tensor* filter_tensor,
+                   framework::Tensor* ofm_tensor, float* bs_ptr);
+// Fills a SplitConvArgs for a single-split convolution.
+void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
+                    framework::Tensor* out, framework::Tensor* filter,
+                    bool relu_enabled, int group_num, int stride_h,
+                    int stride_w, int padding_h, int padding_w, float* bs_ptr);
+
+// Conversions between float and the 16-bit `half` bit pattern.
+half fp32_2_fp16(float fp32_num);
+float fp16_2_fp32(half fp16_num);
+
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/bias_scale.cpp
+++ b/src/fpga/V2/bias_scale.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/V2/bias_scale.h"
+#include <memory.h>
+#include "fpga/V2/api.h"
+
+namespace paddle_mobile {
+namespace fpga {
+namespace bias_scale {
+
+// Re-packs a bias/scale array into an interleaved, zero-padded buffer.
+//
+// On entry *data_in is read as `num` values followed by another `num`
+// values (indices i and i + num). On return *data_in points to a new buffer
+// from fpga_malloc() holding 2 * num_after_alignment floats, in which the
+// pair for index i sits at [2 * i] and [2 * i + 1] and the remaining slots
+// are zero. The old buffer is released with fpga_free().
+//
+// The copy starts at index 0 so that the first pair is carried over too;
+// starting at 1 would leave it zero-filled.
+void align_element(float **data_in, int num, int num_after_alignment) {
+  float *ptr_unaligned = *data_in;
+  int total_element = 2 * num_after_alignment;  // including bias & scale
+  float *ptr_aligned =
+      (float *)fpga_malloc(total_element * sizeof(float));  // NOLINT
+  memset(ptr_aligned, 0, total_element * sizeof(float));
+
+  for (int i = 0; i < num; i++) {
+    ptr_aligned[i * 2 + 0] = ptr_unaligned[i];
+    ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num];
+  }
+
+  fpga_free(ptr_unaligned);
+  *data_in = ptr_aligned;
+}
+
+// Re-packs the bias/scale array with align_element() and then flushes the
+// resulting 2 * num_after_alignment floats with fpga_flush().
+void format_bias_scale_array(float **data_in, int num,
+                             int num_after_alignment) {
+  align_element(data_in, num, num_after_alignment);
+  const size_t byte_count = 2 * num_after_alignment * sizeof(float);
+  fpga_flush(*data_in, byte_count);
+}
+
+}  // namespace bias_scale
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/bias_scale.h
+++ b/src/fpga/V2/bias_scale.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle_mobile {
+namespace fpga {
+namespace bias_scale {
+
+// Replaces *data_in with an interleaved, zero-padded buffer of
+// 2 * num_after_alignment floats.
+void align_element(float **data_in, int num, int num_after_alignment);
+// Calls align_element() and then flushes the new buffer with fpga_flush().
+void format_bias_scale_array(float **data_in, int num, int num_after_alignment);
+
+}  // namespace bias_scale
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/V2/filter.h"
+#include <memory.h>
+#include <algorithm>
+#include "fpga/V2/api.h"
+
+namespace paddle_mobile {
+namespace fpga {
+namespace filter {
+
+// Returns the smallest of 16, 32 or 64 that is >= channel, or 128 when
+// channel exceeds 64.
+int calc_channel_parallelism(int channel) {
+  const int thresholds[] = {16, 32, 64};
+  for (int limit : thresholds) {
+    if (channel <= limit) {
+      return limit;
+    }
+  }
+  return 128;
+}
+// Aligns the channel count with align_to_x, using the channel parallelism
+// selected for that count as the alignment unit.
+int calc_aligned_channel(int channel) {
+  const int parallelism = calc_channel_parallelism(channel);
+  return align_to_x(channel, parallelism);
+}
+
+// Divides FILTER_PARALLELISM by the channel parallelism chosen for `channel`
+// (integer division).
+int calc_num_parallelism(int channel) {
+  const int channel_parallelism = calc_channel_parallelism(channel);
+  return FILTER_PARALLELISM / channel_parallelism;
+}
+
+// Aligns the filter count with align_to_x, using the filter-number
+// parallelism derived from `channel` as the alignment unit.
+int calc_aligned_num(int num, int channel) {
+  const int num_parallelism = calc_num_parallelism(channel);
+  return align_to_x(num, num_parallelism);
+}
+
+// Total element count of a filter bank once both the filter number and the
+// channel count have been aligned: aligned_num * aligned_channel * height *
+// width.
+int calc_aligned_total_pixel_num(int num, int channel, int height, int width) {
+  const int filter_count = calc_aligned_num(num, channel);
+  const int channel_count = calc_aligned_channel(channel);
+  return filter_count * channel_count * height * width;
+}
+
+// Reorders every filter from channel-major (C, H, W) to channel-minor
+// (H, W, C) element order.
+//
+// *data_in is read sequentially, written into a newly allocated buffer of
+// num * channel * height * width floats, then replaced by that buffer; the
+// original buffer is released with fpga_free.
+void convert_to_hwc(float **data_in, int num, int channel, int height,
+                    int width) {
+  float *chw_data = *data_in;
+  const float *src = chw_data;
+  int chw = channel * height * width;
+  int64_t row_stride = width * channel;
+  float *hwc_data = (float *)fpga_malloc(chw * num * sizeof(float));  // NOLINT
+  for (int n = 0; n < num; n++) {
+    float *filter_out = hwc_data + n * chw;
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        float *row_out = filter_out + h * row_stride;
+        for (int w = 0; w < width; w++) {
+          // Source advances linearly; destination scatters by (h, w, c).
+          row_out[w * channel + c] = *src++;
+        }
+      }
+    }
+  }
+  *data_in = hwc_data;
+  fpga_free(chw_data);
+}
+
+// Pads the channel dimension of an (N, H, W, C) filter bank up to the aligned
+// channel count.
+//
+// A zero-filled buffer of calc_aligned_total_pixel_num(...) floats is
+// allocated, each pixel's `channel` values are copied to the start of an
+// aligned_channel-wide slot, *data_in is replaced by the new buffer and the
+// old buffer is released with fpga_free.
+void align_filter(float **data_in, int num, int channel, int height,
+                  int width) {
+  // The per-pixel stride must be the aligned channel count, i.e. the same
+  // value calc_aligned_total_pixel_num() uses to size the buffer. The raw
+  // channel parallelism is capped at 128, so using it as the stride would
+  // make consecutive copies overlap whenever channel > 128.
+  int aligned_channel = calc_aligned_channel(channel);
+  int hw = height * width;
+  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float));  // NOLINT
+  float *temp = *data_in;
+  memset(new_data, 0, pixel_num * sizeof(float));
+  for (int i = 0; i < num; i++) {
+    for (int j = 0; j < hw; j++) {
+      memcpy(new_data + i * aligned_channel * hw + j * aligned_channel,
+             temp + i * channel * hw + j * channel, channel * sizeof(float));
+    }
+  }
+  *data_in = new_data;
+  fpga_free(temp);
+}
+
+// Converts a convolution filter bank to HWC order, pads its channels, and
+// flushes the aligned buffer with fpga_flush. `group_num` and `max` are
+// accepted but not used here.
+void format_filter(float **data_in, int num, int channel, int height, int width,
+                   int group_num, float max) {
+  convert_to_hwc(data_in, num, channel, height, width);
+  align_filter(data_in, num, channel, height, width);
+  const int aligned_pixels =
+      calc_aligned_total_pixel_num(num, channel, height, width);
+  fpga_flush(*data_in, aligned_pixels * sizeof(float));
+}
+
+// Transposes a fully-connected weight matrix from (chw, num) to (num, chw)
+// element order into a newly allocated buffer, stores that buffer in
+// *data_in and releases the original with fpga_free.
+void convert_fc_filter(float **data_in, int num, int chw) {
+  float *source = *data_in;
+  float *transposed =
+      (float *)fpga_malloc(chw * num * sizeof(float));  // NOLINT
+  for (int n = 0; n < num; n++) {
+    float *row_out = transposed + n * chw;
+    for (int c = 0; c < chw; c++) {
+      row_out[c] = source[num * c + n];
+    }
+  }
+  *data_in = transposed;
+  fpga_free(source);
+}
+
+// Transposes a fully-connected weight matrix, pads its channels, and flushes
+// the aligned buffer with fpga_flush. `group_num` and `max` are accepted but
+// not used here.
+void format_fc_filter(float **data_in, int num, int channel, int height,
+                      int width, int group_num, float max) {
+  const int elements_per_filter = channel * height * width;
+  convert_fc_filter(data_in, num, elements_per_filter);
+  align_filter(data_in, num, channel, height, width);
+  const int aligned_pixels =
+      calc_aligned_total_pixel_num(num, channel, height, width);
+  fpga_flush(*data_in, aligned_pixels * sizeof(float));
+}
+
+// Returns the largest absolute value among the first data_size elements of
+// data_in, or 0 when data_size is not positive.
+float find_max(float *data_in, int data_size) {
+  float largest = 0.0;
+  for (int idx = 0; idx < data_size; ++idx) {
+    float magnitude = data_in[idx];
+    if (!(magnitude > 0)) {
+      magnitude = -magnitude;
+    }
+    if (largest < magnitude) {
+      largest = magnitude;
+    }
+  }
+  return largest;
+}
+
+// Rounds half away from zero by shifting the value 0.5 towards its own sign
+// and then truncating to signed char.
+signed char float_to_int8(float fdata) {
+  // Narrow back to float before truncating, as the compound assignment in the
+  // straightforward form would.
+  const float shifted = fdata < 0.0 ? fdata - 0.5 : fdata + 0.5;
+  return (signed char)shifted;
+}
+
+// Quantizes data_size floats to signed 8-bit values scaled by 127 / max.
+//
+// The quantized bytes are written to a newly allocated buffer whose address is
+// stored back into *data_in (cast to float *); the original float buffer is
+// released with fpga_free.
+void quantize(float **data_in, int data_size, float max) {
+  float *float_data = *data_in;
+  float fix_range = 127;
+  float scale = fix_range / max;
+
+  signed char *quantized =
+      (signed char *)fpga_malloc(data_size * sizeof(char));
+  for (int idx = 0; idx < data_size; idx++) {
+    const float scaled = float_data[idx] * scale;
+    quantized[idx] = float_to_int8(scaled);
+  }
+  *data_in = (float *)quantized;  // NOLINT
+  fpga_free(float_data);
+}
+
+}  // namespace filter
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/filter.h
+++ b/src/fpga/V2/filter.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#define FILTER_PARALLELISM 1024
+namespace paddle_mobile {
+namespace fpga {
+namespace filter {
+
+// Returns 16, 32, 64 or 128 depending on which threshold `channel` fits under.
+int calc_channel_parallelism(int channel);
+// Channel count aligned to the parallelism chosen for `channel`.
+int calc_aligned_channel(int channel);
+// FILTER_PARALLELISM divided by the channel parallelism for `channel`.
+int calc_num_parallelism(int channel);
+// Filter count aligned to calc_num_parallelism(channel).
+int calc_aligned_num(int num, int channel);
+// aligned_num * aligned_channel * height * width.
+int calc_aligned_total_pixel_num(int num, int channel, int height, int width);
+// Reorders each filter from CHW to HWC order; replaces *data_in with a new
+// buffer and frees the old one.
+void convert_to_hwc(float** data_in, int num, int channel, int height,
+                    int width);
+// HWC conversion + channel padding + fpga_flush. group_num and max are not
+// used by the implementation in filter.cpp.
+void format_filter(float** data_in, int num, int channel, int height, int width,
+                   int group_num, float max);
+// Transposes a (chw, num) matrix to (num, chw); replaces *data_in with a new
+// buffer and frees the old one.
+void convert_fc_filter(float** data_in, int num, int chw);
+// FC transpose + channel padding + fpga_flush. group_num and max are not used
+// by the implementation in filter.cpp.
+void format_fc_filter(float** data_in, int num, int channel, int height,
+                      int width, int group_num, float max);
+// Largest absolute value among the first data_size elements of data_in.
+float find_max(float* data_in, int data_size);
+}  // namespace filter
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/V2/image.h"
+#include <memory.h>
+#include <algorithm>
+#include "fpga/V2/api.h"
+
+namespace paddle_mobile {
+namespace fpga {
+namespace image {
+
+// Reorders a single image from channel-major (C, H, W) to channel-minor
+// (H, W, C) element order into a newly allocated buffer, stores that buffer
+// in *data_in and releases the original with fpga_free.
+void convert_to_hwc(float **data_in, int channel, int height, int width) {
+  float *chw_data = *data_in;
+  const float *src = chw_data;
+  float *hwc_data =
+      (float *)fpga_malloc(channel * height * width * sizeof(float));  // NOLINT
+  int64_t row_stride = width * channel;
+  for (int c = 0; c < channel; c++) {
+    for (int h = 0; h < height; h++) {
+      float *row_out = hwc_data + h * row_stride;
+      for (int w = 0; w < width; w++) {
+        // Source advances linearly; destination scatters by (h, w, c).
+        row_out[w * channel + c] = *src++;
+      }
+    }
+  }
+  *data_in = hwc_data;
+  fpga_free(chw_data);
+}
+// Pads the channel dimension of an (H, W, C) image to aligned_channel.
+//
+// Does nothing when the image is already aligned. Otherwise a zero-filled
+// buffer of aligned_channel * height * width floats is allocated, each
+// pixel's `channel` values are copied to the start of its aligned slot,
+// *data_in is replaced by the new buffer and the old one is released with
+// fpga_free.
+void align_image(float **data_in, int channel, int height, int width,
+                 int aligned_channel) {
+  if (channel == aligned_channel) {
+    return;
+  }
+  float *unaligned = *data_in;
+  float *padded =
+      (float *)fpga_malloc(aligned_channel * height * width *  // NOLINT
+                           sizeof(float));                     // NOLINT
+  memset(padded, 0, aligned_channel * height * width * sizeof(float));
+
+  for (int pixel = 0; pixel < height * width; pixel++) {
+    float *dst = padded + pixel * aligned_channel;
+    const float *src = unaligned + pixel * channel;
+    memcpy(dst, src, channel * sizeof(float));
+  }
+  *data_in = padded;
+  fpga_free(unaligned);
+}
+
+// Converts an image to HWC order, pads its channels to aligned_channel, and
+// flushes the aligned_channel * height * width floats with fpga_flush.
+void format_image(float **data_in, int channel, int height, int width,
+                  int aligned_channel) {
+  convert_to_hwc(data_in, channel, height, width);
+  align_image(data_in, channel, height, width, aligned_channel);
+  const auto flush_bytes = aligned_channel * height * width * sizeof(float);
+  fpga_flush(*data_in, flush_bytes);
+}
+
+// Concatenates image_num feature maps along the channel direction.
+//
+// scale_out[0] receives the maximum of the inputs' scales_in[i][0] (floored
+// at 0) and scale_out[1] its reciprocal. Each input is invalidated with
+// fpga_invalidate before being read. For every pixel, channel_num[i] int16
+// values are copied from input i (whose pixels are aligned_channel_num[i]
+// apart) into consecutive positions of the out_channel-wide output pixel.
+// The output is flushed with fpga_flush at the end.
+void concat_images(int16_t **images_in, float **scales_in, void *image_out,
+                   float *scale_out, int image_num, const uint32_t *channel_num,
+                   int height, int width, const uint32_t *aligned_channel_num,
+                   int out_channel) {
+  scale_out[0] = 0.0;
+  scale_out[1] = 0.0;
+  for (int img = 0; img < image_num; img++) {
+    scale_out[0] = std::max(*scale_out, scales_in[img][0]);
+    fpga_invalidate(
+        images_in[img],
+        height * width * aligned_channel_num[img] * sizeof(int16_t));
+  }
+  scale_out[1] = 1 / scale_out[0];
+
+  int hw = height * width;
+  int16_t *out_base = (int16_t *)image_out;  // NOLINT
+  for (int pixel = 0; pixel < hw; pixel++) {
+    int16_t *pixel_out = out_base + pixel * out_channel;
+    int channel_offset = 0;
+    for (int img = 0; img < image_num; img++) {
+      memcpy(pixel_out + channel_offset,
+             images_in[img] + pixel * aligned_channel_num[img],
+             channel_num[img] * sizeof(int16_t));
+      channel_offset += channel_num[img];
+    }
+  }
+
+  fpga_flush(image_out, hw * out_channel * sizeof(int16_t));
+}
+
+}  // namespace image
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/image.h
+++ b/src/fpga/V2/image.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+
+namespace paddle_mobile {
+namespace fpga {
+namespace image {
+
+// Reorders one image from CHW to HWC order; replaces *data_in with a new
+// buffer and frees the old one.
+void convert_to_hwc(float **data_in, int channel, int height, int width);
+// Zero-pads each pixel's channels up to aligned_channel; a no-op when
+// channel == aligned_channel.
+void align_image(float **data_in, int channel, int height, int width,
+                 int aligned_channel);
+// convert_to_hwc + align_image, followed by fpga_flush of the result.
+void format_image(float **data_in, int channel, int height, int width,
+                  int aligned_channel);
+void concat_images(
+    int16_t **images_in, float **scales_in, void *image_out, float *scale_out,
+    int image_num, const uint32_t *channel_num, int height, int width,
+    const uint32_t *aligned_channel_num,
+    int out_channel);  // Concat featuremaps along channel direction
+
+}  // namespace image
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -16,10 +16,12 @@ limitations under the License. */
 #include <cstdlib>
 #include <cstring>

-#ifdef PADDLE_MOBILE_FPGA
-
-#include "fpga/api.h"
+#ifdef PADDLE_MOBILE_FPGA_V1
+#include "fpga/V1/api.h"
+#endif

+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
 #endif

 namespace paddle_mobile {

--- a/src/operators/kernel/fpga/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/concat_kernel.cpp
--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
--- a/src/operators/kernel/fpga/dropout_kernel.cpp
+++ b/src/operators/kernel/fpga/dropout_kernel.cpp
--- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp
--- a/src/operators/kernel/fpga/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
--- a/src/operators/kernel/fpga/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/feed_kernel.cpp
--- a/src/operators/kernel/fpga/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/fetch_kernel.cpp
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef FUSION_CONVADD_OP

 #include "operators/kernel/fetch_kernel.h"

@@ -32,5 +31,3 @@ template class FetchKernel<FPGA, float>;

 }  // namespace operators
 }  // namespace paddle_mobile
-
-#endif
--- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
--- a/src/operators/kernel/fpga/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/pool_kernel.cpp
@@ -34,7 +34,7 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
  fpga::PoolingArgs poolArgs = {0};
  poolArgs.mode = pooling_type == "max" ? 0 : 1;  // max:0, avg:1
  poolArgs.kernel_reciprocal =
-      fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1])));
+      fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1])));  // NOLINT
  poolArgs.image.address = input_ptr;
  poolArgs.image.channels = (uint32_t)input->dims()[1];
  poolArgs.image.height = (uint32_t)input->dims()[2];

--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -14,11 +14,9 @@ limitations under the License. */

 #ifdef SOFTMAX_OP

-#include "../softmax_kernel.h"
-#include "../central-arm-func/softmax_arm_func.h"
-#include "common/types.h"
-#include "fpga/api.h"
-#include "operators/math/softmax.h"
+#include "operators/kernel/softmax_kernel.h"
+#include "operators/kernel/central-arm-func/softmax_arm_func.h"
+
 namespace paddle_mobile {
 namespace operators {


--- a/src/operators/kernel/fpga/V2/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/concat_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONCAT_OP
+
+#include "operators/kernel/concat_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Builds the fpga::ConcatArgs descriptor for this op and stores it on the
+// param.
+//
+// Four per-input tables (image pointers, scale pointers, channel counts and
+// aligned channel counts) are allocated with fpga_malloc and filled from the
+// inputs; every input must share the height and width of the first one. The
+// output tensor is formatted with format_concat_output before the descriptor
+// is populated. Always returns true.
+template <>
+bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
+  auto inputs = param->Inputs();
+  auto out = param->Out();
+  auto image_num = inputs.size();
+
+  auto image_ptrs =
+      (half **)fpga::fpga_malloc(image_num * sizeof(int *));  // NOLINT
+  auto scale_ptrs =
+      (float **)fpga::fpga_malloc(image_num * sizeof(float *));  // NOLINT
+  auto channels =
+      (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t));  // NOLINT
+  auto aligned_channels =
+      (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t));  // NOLINT
+
+  auto height = inputs[0]->dims()[2];
+  auto width = inputs[0]->dims()[3];
+  auto out_channel =
+      (uint32_t)fpga::get_aligned_channel_num((int)out->dims()[1]);  // NOLINT
+
+  for (int idx = 0; idx < image_num; idx++) {
+    auto input = inputs[idx];
+    PADDLE_MOBILE_ENFORCE(
+        input->dims()[2] == height && input->dims()[3] == width,
+        "Image height & width should be unified");
+    image_ptrs[idx] = (half *)input->data<float>();  // NOLINT
+    channels[idx] = (uint32_t)input->dims()[1];
+    aligned_channels[idx] =
+        (uint32_t)fpga::get_aligned_channel_num(channels[idx]);
+    scale_ptrs[idx] = input->scale;
+  }
+  fpga::format_concat_output(out, (int)height, (int)width,  // NOLINT
+                             out_channel);
+
+  fpga::ConcatArgs args = {0};
+  args.image_num = (uint32_t)image_num;
+  args.images_in = image_ptrs;
+  args.scales_in = scale_ptrs;
+  args.channel_num = channels;
+  args.aligned_channel_num = aligned_channels;
+  args.image_out = (half *)out->data<float>();  // NOLINT
+  args.scale_out = out->scale;
+  args.out_channel = out_channel;
+  args.height = (uint32_t)height;
+  args.width = (uint32_t)width;
+  param->SetFpgaArgs(args);
+  return true;
+}
+
+// Runs the concat described by the arguments returned from param.FpgaArgs().
+template <>
+void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) {
+  const auto &concat_args = param.FpgaArgs();
+  ComputeFPGAConcat(concat_args);
+}
+template class ConcatKernel<FPGA, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDBN_OP
+
+#include "operators/kernel/conv_add_bn_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Folds the add-bias and batch-norm parameters into one per-channel
+// scale/bias pair and prepares the FPGA convolution arguments (ReLU off).
+//
+// For each output channel:
+//   scale = bn_scale / sqrt(bn_var + epsilon)
+//   bias  = bn_bias + (conv_bias - bn_mean) * scale
+// The folded values are stored on the param as new tensors and also written
+// to a 2 * channel buffer (biases first, then scales) that is handed to
+// format_conv_data and fill_split_arg. Always returns true.
+template <>
+bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto input = const_cast<Tensor *>(param->Input());
+
+  auto bias = param->Bias();
+  auto conv_bias = bias->data<float>();
+  auto filter = const_cast<Tensor *>(param->Filter());
+
+  auto out = param->Output();
+
+  auto bn_mean = param->InputMean()->data<float>();
+  auto bn_var = param->InputVariance()->data<float>();
+  auto bn_scale = param->InputScale()->data<float>();
+  auto bn_shift = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
+                            bias->dims()[0] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+
+  const int channel = out->dims()[1];
+  auto bias_scale =
+      reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
+  auto fused_scale = new Tensor();
+  auto fused_bias = new Tensor();
+  auto fused_scale_data = fused_scale->mutable_data<float>({channel});
+  auto fused_bias_data = fused_bias->mutable_data<float>({channel});
+
+  for (int c = 0; c < channel; c++) {
+    const float stddev = static_cast<float>(pow((bn_var[c] + epsilon), 0.5));
+    fused_scale_data[c] = bn_scale[c] / stddev;
+    fused_bias_data[c] =
+        bn_shift[c] + (conv_bias[c] - bn_mean[c]) * fused_scale_data[c];
+    // Biases occupy the first half of the buffer, scales the second half.
+    bias_scale[c] = fused_bias_data[c];
+    bias_scale[c + channel] = fused_scale_data[c];
+  }
+  param->SetNewScale(fused_scale);
+  param->SetNewBias(fused_bias);
+
+  fpga::format_conv_data(filter, out, bias_scale, param->Groups());
+
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bias_scale);
+  param->SetFpgaArgs(conv_arg);
+
+  return true;
+}
+
+// Runs the convolution described by the arguments returned from
+// param.FpgaArgs().
+template <>
+void ConvAddBNKernel<FPGA, float>::Compute(
+    const FusionConvAddBNParam<FPGA> &param) {
+  const auto &conv_args = param.FpgaArgs();
+  fpga::ComputeFpgaConv(conv_args);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDBNRELU_OP
+
+#include "operators/kernel/conv_add_bn_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Folds the add-bias and batch-norm parameters into one per-channel
+// scale/bias pair and prepares the FPGA convolution arguments (ReLU on).
+//
+// For each output channel:
+//   scale = bn_scale / sqrt(bn_var + epsilon)
+//   bias  = bn_bias + (conv_bias - bn_mean) * scale
+// The folded values are stored on the param as new tensors and also written
+// to a 2 * channel buffer (biases first, then scales) that is handed to
+// format_conv_data and fill_split_arg. Always returns true.
+template <>
+bool ConvAddBNReluKernel<FPGA, float>::Init(
+    FusionConvAddBNReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input = const_cast<Tensor *>(param->Input());
+  const Tensor *bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
+                            bias->dims()[0] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+
+  const int channel = out->dims()[1];
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias->mutable_data<float>({channel});
+
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] /
+                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
+    new_bias_ptr[i] =
+        bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
+    // Scales live in the second half of the buffer, `channel` entries after
+    // the matching bias. A fixed offset of 2 would overwrite bias slots and
+    // leave the scale half uninitialised.
+    bs_ptr[i + channel] = new_scale_ptr[i];
+    bs_ptr[i] = new_bias_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+
+  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+
+// Runs the convolution described by the arguments returned from
+// param.FpgaArgs().
+template <>
+void ConvAddBNReluKernel<FPGA, float>::Compute(
+    const FusionConvAddBNReluParam<FPGA> &param) {
+  const auto &conv_args = param.FpgaArgs();
+  fpga::ComputeFpgaConv(conv_args);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDRELU_OP
+
+#include "operators/kernel/conv_add_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Prepares the FPGA convolution arguments for conv + bias + ReLU.
+//
+// A 2 * channel buffer is filled with the bias values in its first half and
+// a scale of 1 for every channel in its second half, then handed to
+// format_conv_data and fill_split_arg. Always returns true.
+template <>
+bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input = const_cast<Tensor *>(param->Input());
+  const Tensor *bias = param->Bias();
+  auto conv_bias = bias->data<float>();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  int channel = out->dims()[1];
+  auto bias_scale =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  float *scale_half = bias_scale + channel;
+  for (int c = 0; c < channel; c++) {
+    scale_half[c] = 1;
+    bias_scale[c] = conv_bias[c];
+  }
+
+  fpga::format_conv_data(filter, out, bias_scale, param->Groups());
+
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bias_scale);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+
+// Runs the convolution described by the arguments returned from
+// param.FpgaArgs().
+template <>
+void ConvAddReluKernel<FPGA, float>::Compute(
+    const FusionConvAddReluParam<FPGA> &param) {
+  const auto &conv_args = param.FpgaArgs();
+  fpga::ComputeFpgaConv(conv_args);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVBN_OP
+
+#include "operators/kernel/conv_bn_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Folds the batch-norm parameters into one per-channel scale/bias pair and
+// prepares the FPGA convolution arguments (ReLU off).
+//
+// For each output channel:
+//   scale = bn_scale / sqrt(bn_var + epsilon)
+//   bias  = bn_bias + (0 - bn_mean) * scale
+// The folded values are stored on the param as new tensors and also written
+// to a 2 * channel buffer (biases first, then scales) that is handed to
+// format_conv_data and fill_split_arg. Always returns true.
+template <>
+bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto input = const_cast<Tensor *>(param->Input());
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+  auto bn_mean = param->InputMean()->data<float>();
+  auto bn_var = param->InputVariance()->data<float>();
+  auto bn_scale = param->InputScale()->data<float>();
+  auto bn_shift = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+  const int channel = out->dims()[1];
+  auto bias_scale =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  auto fused_scale = new Tensor();
+  auto fused_bias = new Tensor();
+  auto fused_scale_data = fused_scale->mutable_data<float>({channel});
+  auto fused_bias_data = fused_bias->mutable_data<float>({channel});
+
+  for (int c = 0; c < channel; c++) {
+    const float stddev = static_cast<float>(pow((bn_var[c] + epsilon), 0.5));
+    fused_scale_data[c] = bn_scale[c] / stddev;
+    fused_bias_data[c] = bn_shift[c] + (0 - bn_mean[c]) * fused_scale_data[c];
+    // Biases occupy the first half of the buffer, scales the second half.
+    bias_scale[c] = fused_bias_data[c];
+    bias_scale[c + channel] = fused_scale_data[c];
+  }
+  param->SetNewScale(fused_scale);
+  param->SetNewBias(fused_bias);
+
+  fpga::format_conv_data(filter, out, bias_scale, param->Groups());
+
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bias_scale);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+
+// Runs the convolution described by the arguments returned from
+// param.FpgaArgs().
+template <>
+void ConvBNKernel<FPGA, float>::Compute(const FusionConvBNParam<FPGA> &param) {
+  const auto &conv_args = param.FpgaArgs();
+  fpga::ComputeFpgaConv(conv_args);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVBNRELU_OP
+
+#include "operators/kernel/conv_bn_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Folds the batch-norm parameters into one per-channel scale/bias pair and
+// prepares the FPGA convolution arguments (ReLU on).
+//
+// For each output channel:
+//   scale = bn_scale / sqrt(bn_var + epsilon)
+//   bias  = bn_bias + (0 - bn_mean) * scale
+// The folded values are stored on the param as new tensors and also written
+// to a 2 * channel buffer (biases first, then scales) that is handed to
+// format_conv_data and fill_split_arg. Always returns true.
+template <>
+bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input = const_cast<Tensor *>(param->Input());
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+  auto bn_mean = param->InputMean()->data<float>();
+  auto bn_var = param->InputVariance()->data<float>();
+  auto bn_scale = param->InputScale()->data<float>();
+  auto bn_shift = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+  const int channel = out->dims()[1];
+  auto bias_scale =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  auto fused_scale = new Tensor();
+  auto fused_bias = new Tensor();
+  auto fused_scale_data = fused_scale->mutable_data<float>({channel});
+  auto fused_bias_data = fused_bias->mutable_data<float>({channel});
+
+  for (int c = 0; c < channel; c++) {
+    const float stddev = static_cast<float>(pow((bn_var[c] + epsilon), 0.5));
+    fused_scale_data[c] = bn_scale[c] / stddev;
+    fused_bias_data[c] = bn_shift[c] + (0 - bn_mean[c]) * fused_scale_data[c];
+    // Biases occupy the first half of the buffer, scales the second half.
+    bias_scale[c] = fused_bias_data[c];
+    bias_scale[c + channel] = fused_scale_data[c];
+  }
+  param->SetNewScale(fused_scale);
+  param->SetNewBias(fused_bias);
+
+  fpga::format_conv_data(filter, out, bias_scale, param->Groups());
+
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bias_scale);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+
+// Runs the convolution described by the arguments returned from
+// param.FpgaArgs().
+template <>
+void ConvBNReluKernel<FPGA, float>::Compute(
+    const FusionConvBNReluParam<FPGA> &param) {
+  const auto &conv_args = param.FpgaArgs();
+  fpga::ComputeFpgaConv(conv_args);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V2/dropout_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/dropout_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef DROPOUT_OP
+
+#include "operators/kernel/dropout_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Makes the output tensor share the input tensor's data through
+// ShareDataWith; nothing else is set up. Always returns true.
+template <>
+bool DropoutKernel<FPGA, float>::Init(DropoutParam<FPGA> *param) {
+  param->Out()->ShareDataWith(*param->InputX());
+  return true;
+}
+
+// No work at run time: Init has already made Out() share InputX()'s data.
+template <>
+void DropoutKernel<FPGA, float>::Compute(const DropoutParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_ELEMENTWISEADDRELU_OP
+
+#include "operators/kernel/elementwise_add_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Builds the FPGA element-wise add (with ReLU) arguments from the two input
+// tensors and the output tensor, then stores them on param via SetFpgaArgs.
+// Always returns true.
+template <>
+bool ElementwiseAddReluKernel<FPGA, float>::Init(
+    ElementwiseAddReluParam<FPGA> *param) {
+  bool with_relu = true;
+  auto *lhs = const_cast<LoDTensor *>(param->InputX());
+  auto *rhs = const_cast<LoDTensor *>(param->InputY());
+  auto *result = param->Out();
+  auto lhs_data = lhs->data<float>();
+  auto rhs_data = rhs->data<float>();
+
+  // Format the output for the aligned channel count of the first input, then
+  // fetch its buffer.
+  int aligned_channels = fpga::get_aligned_channel_num(lhs->dims()[1]);
+  fpga::format_fp16_ofm(result, aligned_channels);
+  auto result_data = result->mutable_data<float>();
+
+  fpga::EWAddArgs args = {0};
+  args.relu_enabled = with_relu;
+  args.const0 = 0x3c00;  // =1
+  args.const1 = 0x3c00;  // =1
+
+  // First operand.
+  args.image0.address = lhs_data;
+  args.image0.scale_address = lhs->scale;
+  args.image0.channels = static_cast<uint32_t>(lhs->dims()[1]);
+  args.image0.height = static_cast<uint32_t>(lhs->dims()[2]);
+  args.image0.width = static_cast<uint32_t>(lhs->dims()[3]);
+  args.image0.pad_height = 0;
+  args.image0.pad_width = 0;
+
+  // Second operand.
+  args.image1.address = rhs_data;
+  args.image1.scale_address = rhs->scale;
+  args.image1.channels = static_cast<uint32_t>(rhs->dims()[1]);
+  args.image1.height = static_cast<uint32_t>(rhs->dims()[2]);
+  args.image1.width = static_cast<uint32_t>(rhs->dims()[3]);
+  args.image1.pad_height = 0;
+  args.image1.pad_width = 0;
+
+  // Destination.
+  args.output.address = result_data;
+  args.output.scale_address = result->scale;
+
+  param->SetFpgaArgs(args);
+  return true;
+}
+
+// Passes the arguments stored on param (see SetFpgaArgs in Init) to
+// fpga::ComputeFpgaEWAdd.
+template <>
+void ElementwiseAddReluKernel<FPGA, float>::Compute(
+    const ElementwiseAddReluParam<FPGA> &param) {
+  fpga::ComputeFpgaEWAdd(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_FCRELU_OP
+#include "operators/kernel/fc_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Sets up the fully-connected + ReLU op to run through the FPGA conv path.
+//
+// Builds a bias/scale buffer (bias copied from InputZ, scale fixed to 1),
+// resizes the filter to {num, filter_channel, height, width} and the output to
+// {1, channel, 1, 1}, formats both through fpga::format_fc_data and stores the
+// filled fpga::SplitConvArgs on param via SetFpgaArgs. Always returns true.
+template <>
+bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input_x = const_cast<LoDTensor *>(param->InputX());
+  auto filter = const_cast<Tensor *>(param->InputY());
+  auto input_z = param->InputZ();
+  auto input_z_ptr = input_z->data<float>();
+  auto out = param->Out();
+  // NOTE(review): this check (dims()[1] == filter dims()[0]) combined with the
+  // numel() check below only holds when the remaining input dimensions
+  // multiply to 1 -- confirm that is intended.
+  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
+                        "Image channel should be equal to weight number");
+  int channel = (uint32_t)out->dims()[1];
+  // bs_ptr holds 2 * channel floats: biases in [0, channel) and scales in
+  // [channel, 2 * channel).
+  // NOTE(review): the fpga_malloc result is used without a null check.
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i + channel] = 1;
+    bs_ptr[i] = input_z_ptr[i];
+  }
+
+  int num = (uint32_t)filter->dims()[1];
+  int chw = (uint32_t)filter->dims()[0];
+  PADDLE_MOBILE_ENFORCE(
+      chw == input_x->numel(),
+      "Filter element num should be equal to IFM element num");
+  int height = (uint32_t)input_x->dims()[2];
+  int width = (uint32_t)input_x->dims()[3];
+  int filter_channel = chw / height / width;
+
+  out->Resize(framework::make_ddim({1, channel, 1, 1}));
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
+  fpga::format_fc_data(filter, out, bs_ptr);
+
+  // NOTE(review): the literal arguments 1, 1, 1, 0, 0 are presumably group,
+  // stride h/w and padding h/w -- confirm against fpga::fill_split_arg.
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
+                       0, 0, bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+// Passes the arguments stored on param (see SetFpgaArgs in Init) to
+// fpga::ComputeFpgaConv.
+template <>
+void FusionFcReluKernel<FPGA, float>::Compute(
+    const FusionFcReluParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "operators/kernel/feed_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Formats the output tensor through fpga::format_fp16_ofm, using the aligned
+// channel count derived from the output's dims()[1]. Always returns true.
+template <>
+bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
+  Tensor *output = param->Out();
+  int aligned_channel = fpga::get_aligned_channel_num(output->dims()[1]);
+  fpga::format_fp16_ofm(output, aligned_channel);
+  return true;
+}
+
+// Moves the fed input into the output tensor with an FPGA bypass operation
+// configured as FP32 / CHW input to FP16 / HWC output (see the args fields
+// set below).
+template <>
+void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
+  auto input =
+      reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
+  // NOTE(review): input_ptr is read before fpga::format_image(input). If
+  // format_image replaces the tensor's buffer, this pointer refers to the
+  // old, unformatted data -- confirm against the V2 format_image.
+  auto input_ptr = input->data<float>();
+  fpga::format_image(input);
+  Tensor *output = param.Out();
+  auto output_ptr = output->data<float>();
+
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
+
+  args.input_data_type = fpga::DATA_TYPE_FP32;
+  args.output_data_type = fpga::DATA_TYPE_FP16;
+  args.input_layout_type = fpga::LAYOUT_CHW;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.image.address = reinterpret_cast<void *>(input_ptr);
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
+  args.image.pad_height = 0;
+  args.image.pad_width = 0;
+  args.output.address = output_ptr;
+  args.output.scale_address = output->scale;
+  fpga::PerformBypass(args);
+}
+template class FeedKernel<FPGA, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/fpga/V2/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fetch_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "operators/kernel/fetch_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Nothing to prepare for fetch; always returns true.
+template <>
+bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
+  return true;
+}
+
+// Makes the output tensor share the input tensor's data through
+// ShareDataWith.
+template <>
+void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
+  param.Out()->ShareDataWith(*(param.InputX()));
+}
+
+template class FetchKernel<FPGA, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_FC_OP
+
+#include "operators/kernel/fusion_fc_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Sets up the fully-connected op (no ReLU) to run through the FPGA conv path.
+//
+// Builds a bias/scale buffer (bias copied from InputZ, scale fixed to 1),
+// resizes the filter to {num, filter_channel, height, width} and the output to
+// {1, channel, 1, 1}, formats both through fpga::format_fc_data and stores the
+// filled fpga::SplitConvArgs on param via SetFpgaArgs. Always returns true.
+template <>
+bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto input_x = const_cast<LoDTensor *>(param->InputX());
+  auto filter = const_cast<Tensor *>(param->InputY());
+  const Tensor *input_z = param->InputZ();
+  auto input_z_ptr = input_z->data<float>();
+  auto out = param->Out();
+
+  // NOTE(review): this check (dims()[1] == filter dims()[0]) combined with the
+  // numel() check below only holds when the remaining input dimensions
+  // multiply to 1 -- confirm that is intended.
+  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
+                        "Image channel should be equal to weight number");
+  int channel = (uint32_t)out->dims()[1];
+  // bs_ptr holds 2 * channel floats: biases in [0, channel) and scales in
+  // [channel, 2 * channel).
+  // NOTE(review): the fpga_malloc result is used without a null check.
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i + channel] = 1;
+    bs_ptr[i] = input_z_ptr[i];
+  }
+  int num = (uint32_t)filter->dims()[1];
+  int chw = (uint32_t)filter->dims()[0];
+  PADDLE_MOBILE_ENFORCE(
+      chw == input_x->numel(),
+      "Filter element num should be equal to IFM element num");
+  int height = (uint32_t)input_x->dims()[2];
+  int width = (uint32_t)input_x->dims()[3];
+  int filter_channel = chw / height / width;
+
+  out->Resize(framework::make_ddim({1, channel, 1, 1}));
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
+  fpga::format_fc_data(filter, out, bs_ptr);
+
+  // NOTE(review): the literal arguments 1, 1, 1, 0, 0 are presumably group,
+  // stride h/w and padding h/w -- confirm against fpga::fill_split_arg.
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
+                       0, 0, bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+
+// Passes the arguments stored on param (see SetFpgaArgs in Init) to
+// fpga::ComputeFpgaConv.
+template <>
+void FusionFcKernel<FPGA, float>::Compute(const FusionFcParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V2/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/pool_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POOL_OP
+
+#include "operators/kernel/pool_kernel.h"
+
+// NOTE(review): forward declaration in the global namespace. The kernel below
+// only uses fpga::PoolingArgs, so this looks unused -- confirm before removing.
+class PoolingArgs;
+namespace paddle_mobile {
+namespace operators {
+
+// Builds the FPGA pooling arguments from the op parameters (kernel size,
+// strides, paddings, pooling type) and stores them on param via SetFpgaArgs.
+// Always returns true.
+template <>
+bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
+  auto *in_tensor = const_cast<Tensor *>(param->Input());
+  auto in_data = in_tensor->data<float>();
+
+  // Format the output for its aligned channel count, then fetch its buffer.
+  Tensor *out_tensor = param->Output();
+  int aligned_channels =
+      fpga::get_aligned_channel_num(static_cast<int>(out_tensor->dims()[1]));
+  fpga::format_fp16_ofm(out_tensor, aligned_channels);
+  auto out_data = out_tensor->mutable_data<float>();
+
+  vector<int> kernel_hw = param->Ksize();
+  vector<int> stride_hw = param->Strides();
+  vector<int> pad_hw = param->Paddings();
+  std::string pool_type = param->PoolingType();
+  const int kernel_area = kernel_hw[0] * kernel_hw[1];
+
+  fpga::PoolingArgs args = {0};
+  if (pool_type == "max") {
+    args.mode = 0;  // max pooling
+  } else {
+    args.mode = 1;  // every other type is handled as average pooling
+  }
+  args.kernel_reciprocal =
+      fpga::fp32_2_fp16(static_cast<float>(1.0 / kernel_area));
+
+  // Input image description.
+  args.image.address = in_data;
+  args.image.scale_address = in_tensor->scale;
+  args.image.channels = static_cast<uint32_t>(in_tensor->dims()[1]);
+  args.image.height = static_cast<uint32_t>(in_tensor->dims()[2]);
+  args.image.width = static_cast<uint32_t>(in_tensor->dims()[3]);
+  args.image.pad_height = static_cast<uint32_t>(pad_hw[0]);
+  args.image.pad_width = static_cast<uint32_t>(pad_hw[1]);
+
+  // Pooling window.
+  args.kernel.height = static_cast<uint32_t>(kernel_hw[0]);
+  args.kernel.width = static_cast<uint32_t>(kernel_hw[1]);
+  args.kernel.stride_h = static_cast<uint32_t>(stride_hw[0]);
+  args.kernel.stride_w = static_cast<uint32_t>(stride_hw[1]);
+
+  // Destination.
+  args.output.address = out_data;
+  args.output.scale_address = out_tensor->scale;
+
+  param->SetFpgaArgs(args);
+  return true;
+}
+
+// Passes the arguments stored on param (see SetFpgaArgs in Init) to
+// fpga::ComputeFpgaPool.
+template <>
+void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
+  fpga::ComputeFpgaPool(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SOFTMAX_OP
+
+#include "operators/kernel/softmax_kernel.h"
+#include "operators/kernel/central-arm-func/softmax_arm_func.h"
+namespace paddle_mobile {
+namespace operators {
+
+// Prepares softmax for the FPGA build: allocates an intermediate float tensor
+// of shape {1, input dims()[1]} and builds bypass arguments that convert the
+// FP16 / HWC input into that tensor as FP32 / CHW. Both the tensor and the
+// arguments are stored on param. Always returns true.
+template <>
+bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
+  auto input = const_cast<Tensor *>(param->InputX());
+  auto input_ptr = input->data<float>();
+  // NOTE(review): allocated with new and handed to param via SetFloatInput;
+  // presumably param takes ownership -- confirm.
+  auto float_input = new Tensor;
+  float_input->mutable_data<float>({1, input->dims()[1]});
+  // NOTE(review): the meaning of the literal 8 is not visible here -- confirm
+  // against fpga::format_fp32_ofm.
+  fpga::format_fp32_ofm(float_input, 8);
+
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_CHW;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = input_ptr;
+  args.image.height = 1;
+  args.image.width = 1;
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.output.address = float_input->data<float>();
+  args.output.scale_address = float_input->scale;
+  param->SetFloatInput(float_input);
+  param->SetFpgaArgs(args);
+  return true;
+}
+
+// Runs the bypass configured in Init to fill the intermediate float tensor,
+// then computes softmax on the CPU from that tensor into Out().
+template <>
+void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
+  Tensor *in_x = param.FloatInput();
+  Tensor *out = param.Out();
+
+  fpga::PerformBypass(param.FpgaArgs());
+  // NOTE(review): presumably invalidates the CPU's cached view of the bypass
+  // output so the softmax below reads what the FPGA wrote -- confirm. The
+  // range covers the aligned channel count times sizeof(float) bytes.
+  fpga::fpga_invalidate(
+      (void *)in_x->data<float>(),                           // NOLINT
+      fpga::get_aligned_channel_num((int)in_x->dims()[1]) *  // NOLINT
+          sizeof(float));
+  math::SoftmaxFuntor<CPU, float>()(in_x, out);
+  // NOTE(review): presumably writes the CPU result back to memory for later
+  // FPGA-side readers -- confirm.
+  fpga::fpga_flush(out->data<float>(), out->memory_size());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -23,8 +23,13 @@ limitations under the License. */
 #include "framework/scope.h"
 #include "framework/tensor.h"
 #include "framework/variable.h"
-#ifdef PADDLE_MOBILE_FPGA
-#include "fpga/api.h"
+
+#ifdef PADDLE_MOBILE_FPGA_V1
+#include "fpga/V1/api.h"
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
 #endif

 #ifdef PADDLE_MOBILE_CL

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -62,15 +62,18 @@ if (CON GREATER -1)

 endif ()

-list(FIND NET "FPGAnets" CON)
+list(FIND NET "FPGA_NET_V1" CON)
 if (CON GREATER -1)
    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-resnet50 paddle-mobile)
-
-#    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
-#    target_link_libraries(test-resnet paddle-mobile)
    set(FOUND_MATCH ON)
+endif ()

+list(FIND NET "FPGA_NET_V2" CON)
+if (CON GREATER -1 AND NOT TARGET test-resnet50)  # FPGA_NET_V1 above declares the same target; skip if it already exists
+    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-resnet50 paddle-mobile)
+    set(FOUND_MATCH ON)
 endif ()

 list(FIND NET "mobilenetssd" CON)

--- a/test/fpga/test_resnet50.cpp
+++ b/test/fpga/test_resnet50.cpp
@@ -13,7 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
 #include "../test_include.h"
-#include "fpga/api.h"
+
+#ifdef PADDLE_MOBILE_FPGA_V1
+#include "fpga/V1/api.h"
+#endif
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
+#endif
+
 void readStream(std::string filename, float *buf) {
  std::ifstream in;
  in.open(filename, std::ios::in);

--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -106,9 +106,9 @@ if (CON GREATER -1)
  set(FOUND_MATCH ON)
 endif()

-list(FIND NET "FPGAnets" CON)
+list(FIND NET "FPGA_NET_V1" CON)
 if (CON GREATER -1)
-  message("FPGAnets enabled")
+  message("FPGA_NET_V1 enabled")
  set(FUSION_CONVADDRELU_OP ON)
  set(FUSION_CONVADDBNRELU_OP ON)
  set(FUSION_CONVADDBN_OP ON)
@@ -124,6 +124,19 @@ if (CON GREATER -1)
  set(FOUND_MATCH ON)
 endif()

+list(FIND NET "FPGA_NET_V2" CON)  # CON stays -1 when NET has no such entry
+if (NOT CON EQUAL -1)
+  message("FPGA_NET_V2 enabled")
+  set(CONCAT_OP ON)
+  set(POOL_OP ON)
+  set(SOFTMAX_OP ON)
+  set(FUSION_FC_OP ON)
+  set(FUSION_CONVBN_OP ON)
+  set(FUSION_CONVBNRELU_OP ON)
+  set(FUSION_ELEMENTWISEADDRELU_OP ON)
+  set(FOUND_MATCH ON)
+endif()
+
 list(FIND NET "nlp" CON)
 if (CON GREATER -1)
  message("nlp enabled")