Merge branch 'develop' into develop

70b60b36 · xiebaiyuan · GitHub · 66cc1502 · 29c0ae43 · 70b60b36
80 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,6 +10,10 @@ option(CPU "armv7 with neon" ON)
 option(GPU_MALI "mali gpu" OFF)
 option(GPU_CL "opencl gpu" OFF)
 option(FPGA "fpga" OFF)
+if(FPGA)
+    option(FPGAV1 "fpga v1" ON)
+    option(FPGAV2 "fpga v2" OFF)
+endif()
 project(paddle-mobile)
@@ -119,8 +123,43 @@ else()
 endif()
 if(FPGA)
-    message("FPGA mode enabled")
    add_definitions(-DPADDLE_MOBILE_FPGA)
+    file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc)
+    foreach(f ${_tmp_list})
+        list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+    endforeach()
+    file(GLOB_RECURSE _tmp_list_h src/operators/math/*.h)
+    foreach(f ${_tmp_list_h})
+        list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+    endforeach()
+    list(APPEND PADDLE_MOBILE_CC src/operators/math/softmax.cpp)
+    list(APPEND PADDLE_MOBILE_h src/operators/math/softmax.h)
+    list(APPEND PADDLE_MOBILE_h src/operators/math/math_func_neon.h)
+    if(FPGAV1)
+        message("FPGA_V1 enabled")
+        add_definitions(-DPADDLE_MOBILE_FPGA_V1)
+        file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp)
+        foreach(f ${_tmp_list})
+            list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+        endforeach()
+        file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h)
+        foreach(f ${_tmp_list})
+            list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+        endforeach()
+    endif()
+    if(FPGAV2)
+        message("FPGA_V2 enabled")
+        add_definitions(-DPADDLE_MOBILE_FPGA_V2)
+        file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp)
+        foreach(f ${_tmp_list})
+            list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+        endforeach()
+        file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h)
+        foreach(f ${_tmp_list})
+            list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+        endforeach()
+    endif()
 else()
    file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
    foreach(f ${_tmp_list})
@@ -166,8 +205,10 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
 # NET default
-if(FPGA)
+if(FPGAV1)
-    set(NET "FPGAnets" CACHE STRING "select net type")
+    set(NET "FPGA_NET_V1" CACHE STRING "select net type")
+elseif(FPGAV2)
+    set(NET "FPGA_NET_V2" CACHE STRING "select net type")
 else()
    set(NET "default" CACHE STRING "select net type")
 endif()

--- a/README.md
+++ b/README.md
@@ -19,7 +19,11 @@
 - 支持ZU5、ZU9等FPGA开发板
 - 支持树莓派等arm-linux开发板
-## Demo目录
+## Demo
+- [ANDROID](https://github.com/xiebaiyuan/paddle-mobile-demo)
+### 原Domo目录
 [https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
 ## 文档
@@ -33,8 +37,7 @@
 开发文档主要是关于编译、运行等问题。做为开发者，它可以和贡献文档共同结合使用。
 * [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
-* [Android_CPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
+* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
-* [Android_GPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android_GPU.md)
 * [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
 * [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md)

--- a/demo/ReadMe.md
+++ b/demo/ReadMe.md
-## PADDLE MOBILE DEMOS
+## Demo 下载路径
- [ANDROID](https://github.com/xiebaiyuan/paddle-mobile-demo)
---
-### 原DEMO下载路径
 - [ANDROID](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip)
 - [IOS](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip)

--- a/doc/development_fpga.md
+++ b/doc/development_fpga.md
 # FPGA开发文档
-FPGA平台的代码在Xilinx ZCU102 revision 1.0开发板测试Resnet50成功，预测结果正确。
+FPGA平台的代码分为V1和V2。其中V1在Xilinx ZCU102 revision 1.0开发板测试Resnet50成功，预测结果正确。以下描述适用于复现V1运行的结果。
 ## 准备硬件
 ___
@@ -17,7 +17,7 @@ ___
 ## 编译工程
 ___
 1. 将最新的paddle mobile 代码复制到ZCU102开发板中。
-2. 进入paddle-mobile根目录， CMakeLists.txt 设置平台为 option(FPGA "fpga support" ON)。CPU和MALI\_GPU选项设置为OFF。
+2. 进入paddle-mobile根目录， CMakeLists.txt 设置平台为 option(FPGA "fpga support" ON)。CPU和MALI\_GPU选项设置为OFF。设置option(FPGAV1 "fpga v1" ON), option(FPGAV2 "fpga v2" OFF)。
 2. 执行以下命令，可在./test/build下生成test-resnet50可执行程序。
    * mkdir build
    * cd build

--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -82,6 +82,7 @@ std::unordered_map<
        {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
        {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}},
        {G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
        {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
        {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},

--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "fpga/api.h"
+#include "fpga/V1/api.h"
 #include <fcntl.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <algorithm>
 #include <map>
-#include "fpga/bias_scale.h"
+#include "fpga/V1/bias_scale.h"
-#include "fpga/filter.h"
+#include "fpga/V1/filter.h"
-#include "fpga/image.h"
+#include "fpga/V1/image.h"
 #define FPGA_TEST_MODE
 #define PADDLE_MOBILE_OS_LINUX

--- a/src/fpga/api.h
+++ b/src/fpga/api.h
--- a/src/fpga/bias_scale.cpp
+++ b/src/fpga/bias_scale.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "fpga/bias_scale.h"
+#include "fpga/V1/bias_scale.h"
 #include <memory.h>
-#include "fpga/api.h"
+#include "fpga/V1/api.h"
 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/bias_scale.h
+++ b/src/fpga/bias_scale.h
--- a/src/fpga/filter.cpp
+++ b/src/fpga/filter.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "fpga/filter.h"
+#include "fpga/V1/filter.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/api.h"
+#include "fpga/V1/api.h"
 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/filter.h
+++ b/src/fpga/filter.h
--- a/src/fpga/image.cpp
+++ b/src/fpga/image.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "fpga/image.h"
+#include "fpga/V1/image.h"
 #include <memory.h>
 #include <algorithm>
-#include "fpga/api.h"
+#include "fpga/V1/api.h"
 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/image.h
+++ b/src/fpga/image.h
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "fpga/V2/api.h"
+#include <algorithm>
+#include "fpga/V2/bias_scale.h"
+#include "fpga/V2/config.h"
+#include "fpga/V2/filter.h"
+#include "fpga/V2/image.h"
+namespace paddle_mobile {
+namespace fpga {
+static std::map<void *, size_t> memory_map;
+int open_device() {
+  int ret = open_device_driver();
+  return ret;
+}
+int close_device() {
+  int ret = close_device_driver();
+  return ret;
+}
+void *fpga_malloc(size_t size) {
+  static uint64_t counter = 0;
+#ifdef PADDLE_MOBILE_ZU5
+  auto ptr = fpga_malloc_driver(size);
+#else
+  auto ptr = malloc(size);
+#endif
+  counter += size;
+  memory_map.insert(std::make_pair(ptr, size));
+  //  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
+  //       << counter << " bytes";
+  return ptr;
+}
+void fpga_free(void *ptr) {
+  static uint64_t counter = 0;
+  size_t size = 0;
+  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
+  if (iter != memory_map.end()) {
+    size = iter->second;
+    memory_map.erase(iter);
+#ifdef PADDLE_MOBILE_ZU5
+    fpga_free_driver(ptr);
+#else
+    free(ptr);
+#endif
+    counter += size;
+    //    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
+    //         << counter << " bytes";
+  } else {
+    DLOG << "Invalid pointer";
+  }
+}
+half fp32_2_fp16(float fp32_num) {
+  unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
+  auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
+                  (((tmp & 0x7f800000) >> 13) - (112 << 10)));
+  if (tmp & 0x1000) {
+    t++;  // roundoff
+  }
+  return t;
+}
+float fp16_2_fp32(half fp16_num) {
+  int frac = (fp16_num & 0x3ff);
+  int exp = ((fp16_num & 0x7c00) >> 10) + 112;
+  int s = fp16_num & 0x8000;
+  int tmp = 0;
+  float fp32_num;
+  tmp = s << 16 | exp << 23 | frac << 13;
+  fp32_num = *(float *)&tmp;  // NOLINT
+  return fp32_num;
+}
+void format_image(framework::Tensor *image_tensor) {
+  auto dims = image_tensor->dims();
+  auto channel = dims[1], height = dims[2], width = dims[3];
+  auto data_ptr = image_tensor->data<float>();
+  size_t memory_size = channel * height * width * sizeof(float);
+  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
+  memcpy(new_data, data_ptr, memory_size);
+  int aligned_channel = filter::calc_aligned_channel((int)channel);  // NOLINT
+  image::format_image(&new_data, (int)channel, (int)height,          // NOLINT
+                      (int)width,                                    // NOLINT
+                      aligned_channel);
+  image_tensor->reset_data_ptr(new_data);
+}
+void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
+  auto dims = ofm_tensor->dims();
+  size_t memory_size = 0;
+  if (dims.size() == 4) {
+    auto height = dims[2], width = dims[3];
+    memory_size = (height + 1) / 2 * 2 * width * aligned_channel * sizeof(half);
+  } else if (dims.size() == 2) {
+    memory_size = aligned_channel * sizeof(half);
+  } else {
+    DLOG << "Wrong ofm dimension";
+  }
+  auto p = fpga_malloc(memory_size);
+  memset(p, 0, memory_size);
+  ofm_tensor->reset_data_ptr(p);
+}
+void format_fp32_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
+  auto dims = ofm_tensor->dims();
+  size_t memory_size = 0;
+  if (dims.size() == 4) {
+    auto height = dims[2], width = dims[3];
+    memory_size = height * width * aligned_channel * sizeof(float);
+  } else if (dims.size() == 2) {
+    memory_size = aligned_channel * sizeof(float);
+  } else {
+    DLOG << "Wrong ofm dimension";
+  }
+  auto p = fpga_malloc(memory_size);
+  memset(p, 0, memory_size);
+  ofm_tensor->reset_data_ptr(p);
+}
+float filter_find_max(framework::Tensor *filter_tensor) {
+  auto filter_ptr = filter_tensor->data<float>();
+  return filter::find_max(filter_ptr, (int)filter_tensor->numel());  // NOLINT
+}
+int get_aligned_channel_num(int channel_num) {
+  return filter::calc_aligned_channel(channel_num);
+}
+int get_aligned_filter_num(framework::Tensor *filter_tensor) {
+  auto dims = filter_tensor->dims();
+  return filter::calc_aligned_num((int)dims[0], (int)dims[1]);  // NOLINT
+}
+int get_conv_output_channel(framework::Tensor *filter_tensor) {
+  int aligned_filter_num = get_aligned_filter_num(filter_tensor);
+  return get_aligned_channel_num(aligned_filter_num);
+}
+void format_filter(framework::Tensor *filter_tensor, float max_value,
+                   int group_num) {
+  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
+  filter_tensor->scale[1] = float(127.0 / max_value);  // NOLINT
+  auto dims = filter_tensor->dims();
+  auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
+  auto data_ptr = filter_tensor->data<float>();
+  size_t memory_size = num * channel * height * width * sizeof(float);
+  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
+  memcpy(new_data, data_ptr, memory_size);
+  filter::format_filter(&new_data, (int)num, (int)channel,  // NOLINT
+                        (int)height,                        // NOLINT
+                        (int)width, group_num, max_value);  // NOLINT
+  filter_tensor->reset_data_ptr(new_data);
+}
+void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
+  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
+  filter_tensor->scale[1] = float(127.0 / max_value);  // NOLINT
+  auto dims = filter_tensor->dims();
+  auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
+  auto data_ptr = filter_tensor->data<float>();
+  size_t memory_size = num * channel * height * width * sizeof(float);
+  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
+  memcpy(new_data, data_ptr, memory_size);
+  filter::format_fc_filter(&new_data, (int)num, (int)channel,  // NOLINT
+                           (int)height,                        // NOLINT
+                           (int)width, 1, max_value);          // NOLINT
+  filter_tensor->reset_data_ptr(new_data);
+}
+void format_bias_scale_array(float **bias_scale_array, int filter_num,
+                             int filter_channel) {
+  int num_after_alignment =
+      filter::calc_aligned_num(filter_channel, filter_channel);
+  bias_scale::format_bias_scale_array(bias_scale_array, filter_num,
+                                      num_after_alignment);
+}
+void format_concat_output(framework::Tensor *out, int height, int width,
+                          uint32_t out_channel) {
+  auto data_ptr = fpga_malloc(out_channel * height * width * sizeof(half));
+  auto ddim = framework::make_ddim({1, out_channel, height, width});
+  out->Resize(ddim);
+  out->reset_data_ptr(data_ptr);
+}
+int format_conv_data(framework::Tensor *filter_tensor,
+                     framework::Tensor *ofm_tensor, float *bs_ptr, int group) {
+  float max_value = fpga::filter_find_max(filter_tensor);
+  fpga::format_filter(filter_tensor, max_value, group);
+  int aligned_num = get_aligned_filter_num(filter_tensor);
+  fpga::format_bias_scale_array(&bs_ptr,
+                                (int)filter_tensor->dims()[0],  // NOLINT
+                                aligned_num);
+  int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
+  fpga::format_fp16_ofm(ofm_tensor, aligned_channel);
+  DLOG << aligned_channel;
+  return aligned_channel;
+}
+int format_fc_data(framework::Tensor *filter_tensor,
+                   framework::Tensor *ofm_tensor, float *bs_ptr) {
+  float max_value = fpga::filter_find_max(filter_tensor);
+  fpga::format_fc_filter(filter_tensor, max_value);
+  int aligned_num = get_aligned_filter_num(filter_tensor);
+  fpga::format_bias_scale_array(&bs_ptr,
+                                (int)filter_tensor->dims()[0],  // NOLINT
+                                aligned_num);
+  int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
+  fpga::format_fp16_ofm(ofm_tensor, aligned_channel);
+  DLOG << aligned_channel;
+  return aligned_channel;
+}
+void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
+                    framework::Tensor *out, framework::Tensor *filter,
+                    bool relu_enabled, int group_num, int stride_h,
+                    int stride_w, int padding_h, int padding_w, float *bs_ptr) {
+  auto input_ptr = input->data<float>();
+  auto filter_ptr = filter->data<float>();
+  auto out_ptr = out->data<float>();
+  arg->group_num = (uint32_t)group_num;
+  arg->split_num = 1;
+  arg->filter_num = (uint32_t)filter->dims()[0];
+  arg->output.address = out_ptr;
+  arg->output.scale_address = out->scale;
+  arg->conv_args =
+      (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
+  arg->concat_arg.image_num = arg->split_num;
+  arg->concat_arg.image_out = out_ptr;
+  arg->concat_arg.scale_out = out->scale;
+  arg->concat_arg.height = (uint32_t)out->dims()[2];
+  arg->concat_arg.width = (uint32_t)out->dims()[3];
+  int n = arg->split_num;
+  arg->concat_arg.images_in =
+      (half **)fpga_malloc(n * sizeof(int *));  // NOLINT
+  arg->concat_arg.scales_in =
+      (float **)fpga_malloc(n * sizeof(float *));  // NOLINT
+  arg->concat_arg.channel_num =
+      (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
+  for (int i = 0; i < n; i++) {
+    arg->conv_args[i].relu_enabled = relu_enabled;
+    arg->conv_args[i].sb_address = bs_ptr;
+    arg->conv_args[i].filter_address = (int8_t *)filter_ptr;  // NOLINT
+    arg->conv_args[i].filter_scale_address = filter->scale;
+    arg->conv_args[i].filter_num = arg->filter_num;
+    arg->conv_args[i].group_num = (uint32_t)group_num;
+    arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h;
+    arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w;
+    arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    arg->conv_args[i].image.address = input_ptr;
+    arg->conv_args[i].image.scale_address = input->scale;
+    arg->conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
+    arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
+    arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
+    arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
+    arg->conv_args[i].output.address = out_ptr;
+    arg->conv_args[i].output.scale_address = out->scale;
+  }
+}
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <stdint.h>
+#include <cstddef>
+#include <iostream>
+#include <limits>
+#include "fpga/V2/driver/driver.h"
+#include "fpga/V2/driver/pe.h"
+#include "framework/tensor.h"
+namespace paddle_mobile {
+namespace fpga {
+enum DataType {
+  DATA_TYPE_FP32 = 1,
+  DATA_TYPE_FP16 = 0,
+};
+enum LayoutType {
+  LAYOUT_CHW = 1,
+  LAYOUT_HWC = 0,
+};
+struct KernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_w;
+  uint32_t stride_h;
+};
+struct ImageInputArgs {
+  void* address;         // input featuremap virtual address
+  float* scale_address;  // input scale address;
+  uint32_t channels;
+  uint32_t width;  // featuremap width
+  uint32_t height;
+  uint32_t pad_width;  // padding width;
+  uint32_t pad_height;
+};
+struct ImageOutputArgs {
+  void* address;         // output result address;
+  float* scale_address;  // output scale address;
+  uint64_t timer_cnt;    // time counter for FPGA computation
+};
+struct ConvArgs {
+  bool relu_enabled;
+  void* sb_address;  // scale and bias are interlaced;
+  void* filter_address;
+  float* filter_scale_address;
+  uint32_t filter_num;
+  uint32_t group_num;
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+struct ConcatArgs {
+  uint32_t image_num;
+  half** images_in;
+  float** scales_in;
+  void* image_out;
+  float* scale_out;
+  uint32_t* channel_num;
+  uint32_t* aligned_channel_num;
+  uint32_t out_channel;
+  uint32_t height;
+  uint32_t width;
+};
+struct SplitConvArgs {
+  uint32_t split_num;
+  uint32_t group_num;
+  uint32_t filter_num;
+  struct ImageOutputArgs output;
+  struct ConvArgs* conv_args;
+  struct ConcatArgs concat_arg;
+};
+struct PoolingArgs {
+  int16_t mode;  // mode: 0:max, 1:avg
+  half kernel_reciprocal;
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+struct EWAddArgs {
+  bool relu_enabled;
+  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
+  uint32_t const1;
+  struct ImageInputArgs image0;
+  struct ImageInputArgs image1;
+  struct ImageOutputArgs output;
+};
+struct BypassArgs {
+  enum DataType input_data_type;
+  enum DataType output_data_type;
+  enum LayoutType input_layout_type;
+  enum LayoutType output_layout_type;
+  struct ImageInputArgs image;
+  struct ImageOutputArgs output;
+};
+int open_device();
+int close_device();
+void* fpga_malloc(size_t size);
+void fpga_free(void* ptr);
+static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
+float filter_find_max(framework::Tensor* filter_tensor);
+int get_aligned_channel_num(int channel_num);
+int get_aligned_filter_num(framework::Tensor* filter_tensor);
+int get_conv_output_channel(framework::Tensor* filter_tensor);
+void format_image(framework::Tensor* image_tensor);
+void format_fp16_ofm(framework::Tensor* ofm_tensor,
+                     int aligned_channel);  // only allocate memory
+void format_fp32_ofm(framework::Tensor* ofm_tensor, int aligned_channel);
+void format_filter(framework::Tensor* filter_tensor, float max_value,
+                   int group_num);
+void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
+void format_bias_scale_array(float** bias_scale_array, int filter_num,
+                             int filter_channel);
+void format_concat_output(framework::Tensor* out, int height, int width,
+                          uint32_t out_channel);
+int format_conv_data(framework::Tensor* filter_tensor,
+                     framework::Tensor* ofm_tensor, float* bs_ptr, int group);
+int format_fc_data(framework::Tensor* filter_tensor,
+                   framework::Tensor* ofm_tensor, float* bs_ptr);
+void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
+                    framework::Tensor* out, framework::Tensor* filter,
+                    bool relu_enabled, int group_num, int stride_h,
+                    int stride_w, int padding_h, int padding_w, float* bs_ptr);
+half fp32_2_fp16(float fp32_num);
+float fp16_2_fp32(half fp16_num);
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/bias_scale.cpp
+++ b/src/fpga/V2/bias_scale.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "fpga/V2/bias_scale.h"
+#include <memory.h>
+#include "fpga/V2/api.h"
+namespace paddle_mobile {
+namespace fpga {
+namespace bias_scale {
+void align_element(float **data_in, int num, int num_after_alignment) {
+  float *ptr_unaligned = *data_in;
+  int total_element = 2 * num_after_alignment;  // including bias & scale
+  float *ptr_aligned =
+      (float *)fpga_malloc(total_element * sizeof(float));  // NOLINT
+  memset(ptr_aligned, 0, total_element * sizeof(float));
+  for (int i = 1; i < num; i++) {
+    ptr_aligned[i * 2 + 0] = ptr_unaligned[i];
+    ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num];
+  }
+  fpga_free(ptr_unaligned);
+  *data_in = ptr_aligned;
+}
+void format_bias_scale_array(float **data_in, int num,
+                             int num_after_alignment) {
+  align_element(data_in, num, num_after_alignment);
+}
+}  // namespace bias_scale
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/bias_scale.h
+++ b/src/fpga/V2/bias_scale.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+namespace paddle_mobile {
+namespace fpga {
+namespace bias_scale {
+void align_element(float **data_in, int num, int num_after_alignment);
+void format_bias_scale_array(float **data_in, int num, int num_after_alignment);
+}  // namespace bias_scale
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/config.h
+++ b/src/fpga/V2/config.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#define PADDLE_MOBILE_ZU5
+#define FPGA_PRINT_MODE
--- a/src/fpga/V2/driver/bitmap.cpp
+++ b/src/fpga/V2/driver/bitmap.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "fpga/V2/driver/bitmap.h"
+namespace fpga_bitmap {
+void bitmap_set(uint64_t *map, unsigned int start, int len) {
+  uint64_t *p = map + BIT_WORD(start);
+  const unsigned int size = start + len;
+  int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+  uint64_t mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+  while (len - bits_to_set >= 0) {
+    *p |= mask_to_set;
+    len -= bits_to_set;
+    bits_to_set = BITS_PER_LONG;
+    mask_to_set = ~0UL;
+    p++;
+  }
+  if (len) {
+    mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+    *p |= mask_to_set;
+  }
+}
+void bitmap_clear(uint64_t *map, unsigned int start, int len) {
+  uint64_t *p = map + BIT_WORD(start);
+  const unsigned int size = start + len;
+  int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+  uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+  while (len - bits_to_clear >= 0) {
+    *p &= ~mask_to_clear;
+    len -= bits_to_clear;
+    bits_to_clear = BITS_PER_LONG;
+    mask_to_clear = ~0UL;
+    p++;
+  }
+  if (len) {
+    mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+    *p &= ~mask_to_clear;
+  }
+}
+static uint64_t ffs(uint64_t data) {
+  uint64_t bit = 0;
+  int i = 0;
+  for (i = 0; i < sizeof(data); i++) {
+    if (data & (1 << i)) {
+      bit = i;
+      break;
+    }
+  }
+  return bit;
+}
+static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits,
+                               uint64_t start, uint64_t invert) {
+  uint64_t tmp = 0;
+  if (!nbits || start >= nbits) return nbits;
+  tmp = addr[start / BITS_PER_LONG] ^ invert;
+  /* Handle 1st word. */
+  tmp &= BITMAP_FIRST_WORD_MASK(start);
+  start = round_down(start, BITS_PER_LONG);
+  while (!tmp) {
+    start += BITS_PER_LONG;
+    if (start >= nbits) return nbits;
+    tmp = addr[start / BITS_PER_LONG] ^ invert;
+  }
+  return (start + ffs(tmp)) < nbits ? (start + ffs(tmp)) : nbits;
+}
+uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size,
+                            uint64_t offset) {
+  return _find_next_bit(addr, size, offset, ~0UL);
+}
+uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) {
+  return _find_next_bit(addr, size, offset, 0UL);
+}
+uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size,
+                                        uint64_t start, unsigned int nr,
+                                        uint64_t align_mask,
+                                        uint64_t align_offset) {
+  uint64_t index = 0;
+  uint64_t end = 0;
+  uint64_t i = 0;
+again:
+  index = find_next_zero_bit(map, size, start);
+  /* Align allocation */
+  index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset;
+  end = index + nr;
+  if (end > size) return end;
+  i = find_next_bit(map, end, index);
+  if (i < end) {
+    start = i + 1;
+    goto again;
+  }
+  return index;
+}
+uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
+                                    uint64_t start, unsigned int nr,
+                                    uint64_t align_mask) {
+  return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0);
+}
+}  // namespace fpga_bitmap
--- a/src/fpga/V2/driver/bitmap.h
+++ b/src/fpga/V2/driver/bitmap.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <stdint.h>
+#include <stdio.h>
+#define BITS_PER_LONG 64
+#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
+#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
+#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
+#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
+#define round_down(x, y) ((x) & ((y)-1))
+namespace fpga_bitmap {
+void bitmap_set(uint64_t *map, unsigned int start, int len);
+void bitmap_clear(uint64_t *map, unsigned int start, int len);
+uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
+                                    uint64_t start, unsigned int nr,
+                                    uint64_t align_mask);
+}  // namespace fpga_bitmap
--- a/src/fpga/V2/driver/driver.cpp
+++ b/src/fpga/V2/driver/driver.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include "common/enforce.h"
+#include "fpga/V2/driver/bitmap.h"
+#include "fpga/V2/driver/driver.h"
+namespace paddle_mobile {
+namespace fpga {
+struct FPGA_INFO g_fpgainfo;
+int open_drvdevice() {
+  if (g_fpgainfo.fd_drv == -1) {
+    g_fpgainfo.fd_drv = open(g_fpgainfo.drvdevice_path, O_RDWR);
+  }
+  return g_fpgainfo.fd_drv;
+}
+int open_memdevice() {
+  if (g_fpgainfo.fd_mem == -1) {
+    g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
+  }
+  return g_fpgainfo.fd_mem;
+}
+void pl_reset() {
+  // DLOG << "PL RESET";
+  // reg_writeq(0x5a, REG_FPGA_RESET);
+  usleep(100 * 1000);
+}
+void setup_pe(struct pe_data_s *pe_data, struct fpga_pe *pe,
+              char const *type_name, int pe_idx) {
+  memset(pe, 0, sizeof(struct fpga_pe));
+  pe->outer = pe_data;
+  snprintf(pe->type_name, MAX_TYPE_NAME_LENTH, "%s", type_name);
+  pe->status = IDLE;
+  pe->interrupt_cnt = 0;
+  pe_data->pes[pe_idx] = pe;
+  pe_data->pe_num++;
+}
+void pl_init() {
+  struct pe_data_s *pe_data = nullptr;
+  pl_reset();
+  pe_data = (struct pe_data_s *)malloc(sizeof(struct pe_data_s));
+  if (pe_data == nullptr) {
+    DLOG << "pe_data malloc error!";
+    return;
+  }
+  memset(pe_data, 0, sizeof(struct pe_data_s));
+  pthread_mutex_init(&pe_data->mutex, 0);
+  setup_pe(pe_data, &pe_data->pe_conv, "CONV", PE_IDX_CONV);
+  setup_pe(pe_data, &pe_data->pe_pooling, "POOLING", PE_IDX_POOLING);
+  setup_pe(pe_data, &pe_data->pe_ew, "EW", PE_IDX_EW);
+  setup_pe(pe_data, &pe_data->pe_bypass, "BYPASS", PE_IDX_BYPASS);
+  g_fpgainfo.pe_data = pe_data;
+}
+void pl_destroy() {
+  struct pe_data_s *pe_data = g_fpgainfo.pe_data;
+  pthread_mutex_destroy(&pe_data->mutex);
+  free(pe_data);
+}
+void pl_start() {
+  struct pe_data_s *pe_data = g_fpgainfo.pe_data;
+  pthread_mutex_unlock(&pe_data->mutex);
+}
+void pl_stop() {
+  struct pe_data_s *pe_data = g_fpgainfo.pe_data;
+  pthread_mutex_lock(&pe_data->mutex);
+}
+void pl_reinit() {
+  struct pe_data_s *pe_data = g_fpgainfo.pe_data;
+  struct fpga_pe *pe = nullptr;
+  int i = 0;
+  pl_stop();
+  pl_reset();
+  pl_start();
+  for (i = 0; i < pe_data->pe_num; i++) {
+    pe = pe_data->pes[i];
+    pe->status = IDLE;
+    pe->interrupt_cnt = 0;
+  }
+  pl_start();
+}
+int pl_get_status() { return 0; }
+/*tmie单位us*/
+int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
+  uint64_t i = 0;
+  /*timeout精确性待确认*/
+  int64_t timeout = time * CPU_FREQ / 1000000;
+  for (i = 0; i < timeout; i++) {
+    if (val == reg_readq(reg)) {
+      break;
+    }
+  }
+  if (i <= timeout) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+/*内存管理*/
+int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
+  uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
+  unsigned int nr = (unsigned int)_nr;
+  int ret = 0;
+  pthread_mutex_lock(&memory->mutex);
+  unsigned int pos = (unsigned int)fpga_bitmap::bitmap_find_next_zero_area(
+      memory->bitmap, memory->page_num, 0, nr, 0);
+  if (pos <= memory->page_num) {
+    uint64_t address_ofset =
+        memory->mem_start + ((uint64_t)pos) * FPGA_PAGE_SIZE;
+    fpga_bitmap::bitmap_set(memory->bitmap, pos, nr);
+    memory->nr[pos] = nr;
+    *addr = address_ofset;
+  } else {
+    ret = -ENOMEM;
+  }
+  pthread_mutex_unlock(&memory->mutex);
+  return ret;
+}
+void memory_release(struct fpga_memory *memory) {
+  pthread_mutex_lock(&memory->mutex);
+  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
+  pthread_mutex_unlock(&memory->mutex);
+}
+int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
+  int rc = 0;
+  uint64_t *bitmap = nullptr;
+  unsigned int *nr = nullptr;
+  // 不允许多份memory创建，所以创建memory结构体不存在互斥
+  // pthread_mutex_lock(&memory->mutex);
+  memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE);
+  memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG);
+  bitmap =
+      (uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long);  // NOLINT
+  if (!bitmap) {
+    rc = -EFAULT;
+    return rc;
+  }
+  memory->bitmap = bitmap;
+  nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int));
+  if (!nr) {
+    rc = -EFAULT;
+    free(bitmap);
+    return rc;
+  }
+  memory->nr = nr;
+  memory->mem_start = FPGA_MEM_PHY_ADDR;
+  memory->mem_end = FPGA_MEM_SIZE;
+  // pthread_mutex_unlock(memory->mutex);
+  return rc;
+}
+int create_fpga_memory(struct fpga_memory **memory_info) {
+  int rc = 0;
+  *memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory));
+  if (*memory_info == NULL) {
+    rc = -EFAULT;
+    return rc;
+  }
+  pthread_mutex_init(&((*memory_info)->mutex), nullptr);
+  rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE);
+  if (rc) {
+    free(*memory_info);
+  }
+  return rc;
+}
+int init_fpga_memory(struct fpga_memory *memory) {
+  int rc = 0;
+  if (!memory) {
+    rc = -EFAULT;
+    return rc;
+  }
+  // spin_lock_init(&memory->spin);
+  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
+  fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);  // NOTE reserve fpga page 0.
+  return 0;
+}
+void destroy_fpga_memory(struct fpga_memory *memory) {
+  if (memory) {
+    free(memory->nr);
+    free(memory->bitmap);
+    free(memory);
+  }
+}
+int fpga_memory_add() {
+  int rc = 0;
+  rc = create_fpga_memory(&g_fpgainfo.memory_info);
+  if (rc) {
+    return rc;
+  }
+  rc = init_fpga_memory(g_fpgainfo.memory_info);
+  if (rc) {
+    destroy_fpga_memory(g_fpgainfo.memory_info);
+    return rc;
+  }
+  return 0;
+}
+uint64_t vaddr_to_paddr(void *address) {
+  uint64_t paddr = 0;
+  auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address);
+  if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
+    paddr = iter->second;
+  } else {
+    DLOG << "Invalid pointer";
+  }
+  return paddr;
+}
+void *fpga_reg_malloc(size_t size) {
+  void *ret = nullptr;
+  ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+               g_fpgainfo.fd_drv, FPGA_REG_PHY_ADDR);
+  // PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
+  g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size));
+  return ret;
+}
+void *fpga_malloc_driver(size_t size) {
+  void *ret = nullptr;
+  uint64_t phy_addr = 0;
+  memory_request(g_fpgainfo.memory_info, size, &phy_addr);
+  ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+               g_fpgainfo.fd_mem, phy_addr);
+  PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
+  g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr));
+  g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size));
+  return ret;
+}
+void fpga_free_driver(void *ptr) {
+  size_t size = 0;
+  auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
+  if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
+    size = iter->second;
+    g_fpgainfo.fpga_addr2size_map.erase(iter);
+    munmap(ptr, size);
+  } else {
+    DLOG << "Invalid pointer";
+  }
+}
+int open_device_driver() {
+  g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR;
+  g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR;
+  g_fpgainfo.FpgaRegVirAddr = nullptr;
+  g_fpgainfo.pe_data = nullptr;
+  g_fpgainfo.drvdevice_path = "/dev/fpgadrv0";
+  g_fpgainfo.memdevice_path = "/dev/fpgamem0";
+  g_fpgainfo.fd_drv = -1;
+  g_fpgainfo.fd_mem = -1;
+  int ret = 0;
+  ret = open_drvdevice();
+  ret |= open_memdevice();
+  g_fpgainfo.FpgaRegVirAddr =
+      (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE);  // NOLINT
+  fpga_memory_add();
+  pl_init();
+  return ret;
+}
+int close_device_driver() {
+  pl_destroy();
+  fpga_free_driver(g_fpgainfo.FpgaRegVirAddr);
+  memory_release(g_fpgainfo.memory_info);
+  destroy_fpga_memory(g_fpgainfo.memory_info);
+  return 0;
+}
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/driver/driver.h
+++ b/src/fpga/V2/driver/driver.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <cstring>
+#include <map>
+#include "common/log.h"
+namespace paddle_mobile {
+namespace fpga {
+#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
+#define FPGA_REG_PHY_ADDR 0xa0000000
+#define FPGA_REG_SIZE 0x1000
+#define FPGA_MEM_PHY_ADDR 0x20000000
+#define FPGA_MEM_SIZE 0x20000000
+#define CPU_FREQ 1000000000
+#define FPGA_PAGE_SIZE (16UL * 1024UL)
+// PE related macros
+const int MAX_NUM_PES = 6;
+const size_t MAX_TYPE_NAME_LENTH = 8;
+const int PE_IDX_CONV = 0;
+const int PE_IDX_POOLING = 1;
+const int PE_IDX_EW = 2;
+const int PE_IDX_BYPASS = 3;
+enum pe_status { IDLE = 0, BUSY = 1 };
+struct fpga_pe {
+  char type_name[MAX_TYPE_NAME_LENTH + 1];
+  struct pe_data_s *outer;
+  pe_status status;  // 0=idle 1=busy -1=fail
+  uint64_t interrupt_cnt;
+};
+struct pe_data_s {
+  pthread_mutex_t mutex;
+  struct fpga_pe pe_conv;
+  struct fpga_pe pe_pooling;
+  struct fpga_pe pe_ew;
+  struct fpga_pe pe_bypass;
+  struct fpga_pe *pes[MAX_NUM_PES];
+  int pe_num;
+};
+struct fpga_memory {
+  pthread_mutex_t mutex;
+  uint64_t *bitmap;
+  unsigned int *nr;
+  unsigned int page_num;
+  unsigned int page_num_long;
+  uint64_t mem_start;
+  uint64_t mem_end;
+};
+struct FPGA_INFO {
+  uint64_t FpgaRegPhyAddr;
+  uint64_t FpgaMemPhyAddr;
+  pthread_t poll_pid;
+  void *FpgaRegVirAddr;
+  struct pe_data_s *pe_data;
+  std::map<void *, size_t> fpga_addr2size_map;
+  std::map<void *, uint64_t> fpga_vaddr2paddr_map;
+  const char *drvdevice_path;
+  const char *memdevice_path;
+  struct fpga_memory *memory_info;
+  int fd_drv;
+  int fd_mem;
+};
+extern struct FPGA_INFO g_fpgainfo;
+inline uint64_t reg_readq(uint32_t offset) {
+  // DLOG << "offset : " << offset;
+  uint64_t value =
+      *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset);  // NOLINT
+  return value;
+}
+inline void reg_writeq(uint64_t value, uint32_t offset) {
+  // DLOG << "offset : " << offset << ", value : " << value;
+  *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset) =  // NOLINT
+      value;
+}
+int open_device_driver();
+int close_device_driver();
+void *fpga_malloc_driver(size_t size);
+void fpga_free_driver(void *ptr);
+/*pe*/
+uint64_t vaddr_to_paddr(void *address);
+int fpga_regpoll(uint64_t reg, uint64_t val, int time);
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/driver/pe.cpp
+++ b/src/fpga/V2/driver/pe.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "fpga/V2/driver/pe.h"
+#include "fpga/V2/config.h"
+#include "fpga/V2/driver/driver.h"
+#include "fpga/V2/filter.h"
+#include "fpga/V2/image.h"
+namespace paddle_mobile {
+namespace fpga {
+#define MUL8(x) (x * 8)
+#define BYPASS_DONE 1
+float Findfp16Max() {
+  uint16_t abs_vals[16];
+  uint64_t max_fp16;
+  max_fp16 = reg_readq(MUL8(49));
+  abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
+  abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
+  abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
+  abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+  max_fp16 = reg_readq(MUL8(50));
+  abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
+  abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
+  abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
+  abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+  max_fp16 = reg_readq(MUL8(51));
+  abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16));         // NOLINT
+  abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16));   // NOLINT
+  abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
+  abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+  max_fp16 = reg_readq(MUL8(52));
+  abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16));
+  abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
+  abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
+  abs_vals[15] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+  uint16_t tmp = 0;
+  for (int i = 0; i < 16; i++) {
+    if (tmp < abs_vals[i]) {
+      tmp = abs_vals[i];
+    }
+  }
+  return fp16_2_fp32(tmp) / 127.0f;
+}
+int ComputeFpgaConv(const struct SplitConvArgs &args) {
+  ComputeBasicConv(args.conv_args[0]);
+}
+int ComputeBasicConv(const struct ConvArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "======Compute Basic Conv======";
+  DLOG << "   relu_enabled:" << args.relu_enabled
+       << "   sb_address:" << args.sb_address
+       << "   filter_address:" << args.filter_address
+       << "   filter_num:" << args.filter_num
+       << "   group_num:" << args.group_num;
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   kernel_height:" << args.kernel.height
+       << "   kernel_width:" << args.kernel.width
+       << "   stride_h:" << args.kernel.stride_h
+       << "   stride_w:" << args.kernel.stride_w;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+  return 0;
+}
+int ComputeFpgaPool(const struct PoolingArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaPool===========";
+  DLOG << "   mode:" << args.mode
+       << "   kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   kernel_height:" << args.kernel.height
+       << "   kernel_width:" << args.kernel.width
+       << "   stride_h:" << args.kernel.stride_h
+       << "   stride_w:" << args.kernel.stride_w;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+  return 0;
+}
+int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaEWAdd===========";
+  DLOG << "   relu_enabled:" << args.relu_enabled
+       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
+       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
+  DLOG << "   image0_address:" << args.image0.address
+       << "   image0_scale_address:" << args.image0.scale_address
+       << "   image0_channels:" << args.image0.channels
+       << "   image0_height:" << args.image0.height
+       << "   image0_width:" << args.image0.width
+       << "   pad0_height:" << args.image0.pad_height
+       << "   pad0_width:" << args.image0.pad_width;
+  DLOG << "   image1_address:" << args.image1.address
+       << "   image1_scale_address:" << args.image1.scale_address
+       << "   image1_channels:" << args.image1.channels
+       << "   image1_height:" << args.image1.height
+       << "   image1_width:" << args.image1.width
+       << "   pad1_height:" << args.image1.pad_height
+       << "   pad_width:" << args.image1.pad_width;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+  return 0;
+}
+int PerformBypass(const struct BypassArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaBypass===========";
+  DLOG << "   input_type:" << args.input_data_type
+       << "   output_type:" << args.output_data_type
+       << "   input_layout_type:" << args.input_layout_type
+       << "   output_layout_type:" << args.output_layout_type;
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;
+#endif
+  uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
+  uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
+  uint64_t bp_enable;
+  int64_t length;
+  uint64_t pixels;
+  // fp32->fp16
+  if ((args.input_data_type) && (!args.output_data_type)) {
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(float);
+    bp_enable = 0x8800000000000000 + length;
+  }
+  // fp16->fp32
+  else if ((!args.input_data_type) && (args.output_data_type)) {
+    pixels = filter::calc_aligned_channel((args.image.channels)) *
+             (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);
+    length = align_to_x((int)length, 64);  // NOLINT
+    bp_enable = 0x8a00000000000000 + length;
+  }
+  // fp16->fp16 findmax
+  else if ((!args.input_data_type) && (!args.output_data_type)) {
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);
+    bp_enable = 0x8900000000000000 + length;
+  } else {
+    return -1;
+  }
+  // start bypass
+  reg_writeq(ifm_src_paddr, MUL8(27));
+  reg_writeq(ifm_dst_paddr, MUL8(28));
+  reg_writeq(0, MUL8(0));
+  reg_writeq(bp_enable, MUL8(0));
+  // poll
+  int ret = -1;
+  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
+  if (ret != -1) {
+    // clear "irq"
+    reg_readq(MUL8(63));
+  }
+  // get max value
+  if ((!args.input_data_type) && (!args.output_data_type)) {
+    float scale = Findfp16Max();
+    args.output.scale_address[0] = (float)(1.0 / scale);  // NOLINT
+    args.output.scale_address[1] = scale;
+  }
+  return ret;
+}
+int ComputeFPGAConcat(const struct ConcatArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaConcat===========";
+  DLOG << "   Image_num: " << args.image_num
+       << "   out_address:" << args.image_out
+       << "   out_scale_address:" << args.scale_out
+       << "   out_channel:" << args.out_channel;
+  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
+  for (int i = 0; i < args.image_num; i++) {
+    DLOG << "   " << i << "th:        ";
+    DLOG << "   channel_num:" << args.channel_num[i]
+         << "   aligned_channel_num:" << args.aligned_channel_num[i]
+         << "   image_address:" << args.images_in[i]
+         << "   image_scale_address:" << args.scales_in[i];
+  }
+#endif
+  image::concat_images(args.images_in, args.scales_in, args.image_out,
+                       args.scale_out, args.image_num, args.channel_num,
+                       args.height, args.width, args.aligned_channel_num,
+                       args.out_channel);
+  return 0;
+}
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/driver/pe.h
+++ b/src/fpga/V2/driver/pe.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "fpga/V2/api.h"
+namespace paddle_mobile {
+namespace fpga {
+int PerformBypass(const struct BypassArgs& args);
+int ComputeBasicConv(const struct ConvArgs& args);
+int ComputeFpgaPool(const struct PoolingArgs& args);
+int ComputeFpgaEWAdd(const struct EWAddArgs& args);
+int ComputeFpgaConv(const struct SplitConvArgs& args);
+int ComputeFPGAConcat(const struct ConcatArgs& args);
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "fpga/V2/filter.h"
+#include <memory.h>
+#include <algorithm>
+#include "fpga/V2/api.h"
+namespace paddle_mobile {
+namespace fpga {
+namespace filter {
+int calc_channel_parallelism(int channel) {
+  if (channel <= 16) {
+    return 16;
+  } else if (channel <= 32) {
+    return 32;
+  } else if (channel <= 64) {
+    return 64;
+  } else {
+    return 128;
+  }
+}
+int calc_aligned_channel(int channel) {
+  return align_to_x(channel, calc_channel_parallelism(channel));
+}
+int calc_num_parallelism(int channel) {
+  return FILTER_PARALLELISM / calc_channel_parallelism(channel);
+}
+int calc_aligned_num(int num, int channel) {
+  return align_to_x(num, calc_num_parallelism(channel));
+}
+int calc_aligned_total_pixel_num(int num, int channel, int height, int width) {
+  int aligned_channel = calc_aligned_channel(channel);
+  int aligned_filter_num = calc_aligned_num(num, channel);
+  return aligned_filter_num * aligned_channel * height * width;
+}
+void convert_to_hwc(float **data_in, int num, int channel, int height,
+                    int width) {
+  float *tmp = *data_in;
+  int chw = channel * height * width;
+  float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float));  // NOLINT
+  for (int n = 0; n < num; n++) {
+    int64_t amount_per_row = width * channel;
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        int64_t offset_height = h * amount_per_row;
+        for (int w = 0; w < width; w++) {
+          *(data_tmp + n * chw + offset_height + w * channel + c) =
+              *((*data_in)++);
+        }
+      }
+    }
+  }
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+void align_filter(float **data_in, int num, int channel, int height,
+                  int width) {
+  int aligned_channel = calc_channel_parallelism(channel);
+  int hw = height * width;
+  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float));  // NOLINT
+  float *temp = *data_in;
+  memset(new_data, 0, pixel_num * sizeof(float));
+  for (int i = 0; i < num; i++) {
+    for (int j = 0; j < hw; j++) {
+      memcpy(new_data + i * aligned_channel * hw + j * aligned_channel,
+             temp + i * channel * hw + j * channel, channel * sizeof(float));
+    }
+  }
+  *data_in = new_data;
+  fpga_free(temp);
+}
+void format_filter(float **data_in, int num, int channel, int height, int width,
+                   int group_num, float max) {
+  convert_to_hwc(data_in, num, channel, height, width);
+  align_filter(data_in, num, channel, height, width);
+  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+}
+void convert_fc_filter(float **data_in, int num, int chw) {
+  float *tmp = *data_in;
+  float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float));  // NOLINT
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < chw; c++) {
+      data_tmp[n * chw + c] = (*data_in)[num * c + n];
+    }
+  }
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+void format_fc_filter(float **data_in, int num, int channel, int height,
+                      int width, int group_num, float max) {
+  int chw = channel * height * width;
+  convert_fc_filter(data_in, num, chw);
+  align_filter(data_in, num, channel, height, width);
+}
+float find_max(float *data_in, int data_size) {
+  float max = 0.0;
+  for (int i = 0; i < data_size; ++i) {
+    float value = data_in[i];
+    float abs = value > 0 ? value : -value;
+    max = std::max(max, abs);
+  }
+  return max;
+}
+signed char float_to_int8(float fdata) {
+  if (fdata < 0.0) {
+    fdata -= 0.5;
+  } else {
+    fdata += 0.5;
+  }
+  return (signed char)fdata;
+}
+void quantize(float **data_in, int data_size, float max) {
+  float *tmp = *data_in;
+  float fix_range = 127;
+  float scale = fix_range / max;
+  signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
+  for (int i = 0; i < data_size; i++) {
+    tmp_data[i] = float_to_int8(
+        (*data_in)[i] * scale);  // (signed char)((*data_in)[i] * scale);
+  }
+  *data_in = (float *)tmp_data;  // NOLINT
+  fpga_free(tmp);
+}
+}  // namespace filter
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/filter.h
+++ b/src/fpga/V2/filter.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#define FILTER_PARALLELISM 1024
+namespace paddle_mobile {
+namespace fpga {
+namespace filter {
+int calc_channel_parallelism(int channel);
+int calc_aligned_channel(int channel);
+int calc_num_parallelism(int channel);
+int calc_aligned_num(int num, int channel);
+int calc_aligned_total_pixel_num(int num, int channel, int height, int width);
+void convert_to_hwc(float** data_in, int num, int channel, int height,
+                    int width);
+void format_filter(float** data_in, int num, int channel, int height, int width,
+                   int group_num, float max);
+void convert_fc_filter(float** data_in, int num, int chw);
+void format_fc_filter(float** data_in, int num, int channel, int height,
+                      int width, int group_num, float max);
+float find_max(float* data_in, int data_size);
+}  // namespace filter
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "fpga/V2/image.h"
+#include <memory.h>
+#include <algorithm>
+#include "fpga/V2/api.h"
+namespace paddle_mobile {
+namespace fpga {
+namespace image {
+void convert_to_hwc(float **data_in, int channel, int height, int width) {
+  float *tmp = *data_in;
+  float *data_tmp =
+      (float *)fpga_malloc(channel * height * width * sizeof(float));  // NOLINT
+  int64_t amount_per_row = width * channel;
+  for (int c = 0; c < channel; c++) {
+    for (int h = 0; h < height; h++) {
+      int64_t offset_height = h * amount_per_row;
+      for (int w = 0; w < width; w++) {
+        *(data_tmp + offset_height + w * channel + c) = *((*data_in)++);
+      }
+    }
+  }
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+void align_image(float **data_in, int channel, int height, int width,
+                 int aligned_channel) {
+  if (channel == aligned_channel) return;
+  float *tmp = *data_in;
+  float *new_data =
+      (float *)fpga_malloc(aligned_channel * height * width *  // NOLINT
+                           sizeof(float));                     // NOLINT
+  memset(new_data, 0, aligned_channel * height * width * sizeof(float));
+  for (int i = 0; i < height * width; i++) {
+    memcpy(new_data + i * aligned_channel, tmp + i * channel,
+           channel * sizeof(float));
+  }
+  *data_in = new_data;
+  fpga_free(tmp);
+}
+void format_image(float **data_in, int channel, int height, int width,
+                  int aligned_channel) {
+  convert_to_hwc(data_in, channel, height, width);
+  align_image(data_in, channel, height, width, aligned_channel);
+}
+void concat_images(int16_t **images_in, float **scales_in, void *image_out,
+                   float *scale_out, int image_num, const uint32_t *channel_num,
+                   int height, int width, const uint32_t *aligned_channel_num,
+                   int out_channel) {
+  int hw = height * width;
+  scale_out[0] = 0.0;
+  scale_out[1] = 0.0;
+  for (int i = 0; i < image_num; i++) {
+    scale_out[0] = std::max(*scale_out, scales_in[i][0]);
+  }
+  scale_out[1] = 1 / scale_out[0];
+  for (int j = 0; j < hw; j++) {
+    int tmp_channel_sum = 0;
+    for (int i = 0; i < image_num; i++) {
+      memcpy(
+          (int16_t *)image_out + j * out_channel + tmp_channel_sum,  // NOLINT
+          images_in[i] + j * aligned_channel_num[i],
+          channel_num[i] * sizeof(int16_t));
+      tmp_channel_sum += channel_num[i];
+    }
+  }
+}
+}  // namespace image
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/V2/image.h
+++ b/src/fpga/V2/image.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <stdint.h>
+namespace paddle_mobile {
+namespace fpga {
+namespace image {
+void convert_to_hwc(float **data_in, int channel, int height, int width);
+void align_image(float **data_in, int channel, int height, int width,
+                 int aligned_channel);
+void format_image(float **data_in, int channel, int height, int width,
+                  int aligned_channel);
+void concat_images(
+    int16_t **images_in, float **scales_in, void *image_out, float *scale_out,
+    int image_num, const uint32_t *channel_num, int height, int width,
+    const uint32_t *aligned_channel_num,
+    int out_channel);  // Concat featuremaps along channel direction
+}  // namespace image
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
@@ -90,8 +90,10 @@ class CLEngine {
  bool BuildProgram(cl_program program) {
    cl_int status;
-    status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math -I cl_kernel",
+    std::string path = "-cl-fast-relaxed-math -I " +
-                            0, 0);
+                       CLEngine::Instance()->GetCLPath() + "/cl_kernel";
+    status = clBuildProgram(program, 0, 0, path.c_str(), 0, 0);
    CL_CHECK_ERRORS(status);
@@ -114,6 +116,9 @@ class CLEngine {
  cl_device_id DeviceID(int index = 0) { return devices_[index]; }
+  std::string GetCLPath() { return cl_path_; }
+  void setClPath(std::string cl_path) { cl_path_ = cl_path; }
 private:
  CLEngine() { initialized_ = false; }
@@ -129,6 +134,7 @@ class CLEngine {
  cl_int status_;
+  std::string cl_path_;
  std::unique_ptr<_cl_program, CLProgramDeleter> program_;
  //  bool SetClContext();

--- a/src/framework/cl/cl_scope.h
+++ b/src/framework/cl/cl_scope.h
@@ -58,7 +58,8 @@ class CLScope {
    }
    auto program = CLEngine::Instance()->CreateProgramWith(
-        context_.get(), "./cl_kernel/" + file_name);
+        context_.get(),
+        CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);
    DLOG << " --- begin build program -> " << file_name << " --- ";
    CLEngine::Instance()->BuildProgram(program.get());

--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -649,12 +649,14 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
 template <>
 void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
-  char *origin_data;
+  char *origin_data = nullptr;
+  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outter memory";
    origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
+    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
@@ -701,7 +703,9 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
      }
    }
  }
-  delete origin_data;
+  if (self_alloc) {
+    delete data;
+  }
  LOG(kLOG_INFO) << " end init combine memory ";
 }

--- a/src/framework/loader.cpp
+++ b/src/framework/loader.cpp
@@ -44,7 +44,12 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
        } else {
          auto dim = var_desc->Tensor_desc().Dims();
          PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
-          dim[0] = 1;
+          //          dim[0] = 1;
+          for (auto &d : dim) {
+            if (d < 0) {
+              d *= -1;
+            }
+          }
          auto tensor = var->GetMutable<LoDTensor>();
          tensor->Resize(make_ddim(dim));
        }
@@ -82,6 +87,54 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
    }
  }
 }
+template <>
+const Program<GPU_CL, Precision::FP32>
+Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
+    size_t read_size, const uint8_t *buf, size_t combined_params_len,
+    uint8_t *combined_params_buf, bool optimize, bool quantification) {
+  bool can_add_split = false;
+  PaddleMobile__Framework__Proto__ProgramDesc *c_program;
+  PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null");
+  c_program = paddle_mobile__framework__proto__program_desc__unpack(
+      nullptr, read_size, buf);
+  //
+  PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null");
+  //
+  DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
+  //
+  auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
+  Program<GPU_CL, Precision::FP32> program;
+  program.combined = true;
+  program.originProgram = originProgramDesc;
+  program.quantification = quantification;
+  program.combined_params_len = combined_params_len;
+  program.combined_params_buf = combined_params_buf;
+  auto scope = std::make_shared<Scope>();
+  program.scope = scope;
+  InitMemoryFromProgram(originProgramDesc, scope);
+  if (optimize) {
+    ProgramOptimize program_optimize;
+    program.optimizeProgram =
+        program_optimize.FusionOptimize(originProgramDesc, can_add_split);
+    if (!program.optimizeProgram) {
+      program.optimizeProgram = originProgramDesc;
+    }
+  }
+  if (optimize) {
+    program.optimizeProgram->Description("optimize: ");
+  } else {
+    originProgramDesc->Description("program: ");
+  }
+  paddle_mobile__framework__proto__program_desc__free_unpacked(c_program,
+                                                               nullptr);
+  return program;
+}
 #endif
 /**

--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -29,7 +29,9 @@ PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
 template <typename Dtype, Precision P>
 bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
  paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
+#ifdef PADDLE_MOBILE_CL
+  paddle_mobile_->SetCLPath(config.cl_path);
+#endif
  if (config.memory_pack.from_memory) {
    DLOG << "load from memory!";
    paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size,

--- a/src/io/paddle_inference_api.h
+++ b/src/io/paddle_inference_api.h
@@ -132,6 +132,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config {
  int thread_num = 1;
  std::string prog_file;
  std::string param_file;
+  std::string cl_path;
  struct PaddleModelMemoryPack memory_pack;
 };

--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -158,6 +158,13 @@ void PaddleMobile<Dtype, P>::Predict_To(int end) {
 }
 #endif
+#ifdef PADDLE_MOBILE_CL
+template <typename Dtype, Precision P>
+void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
+  framework::CLEngine::Instance()->setClPath(path);
+}
+#endif
 template class PaddleMobile<CPU, Precision::FP32>;
 template class PaddleMobile<FPGA, Precision::FP32>;
 template class PaddleMobile<GPU_MALI, Precision::FP32>;

--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -26,6 +26,9 @@ limitations under the License. */
 #include "framework/load_ops.h"
 #include "framework/loader.h"
 #include "framework/tensor.h"
+#ifdef PADDLE_MOBILE_CL
+#include "framework/cl/cl_engine.h"
+#endif
 namespace paddle_mobile {
@@ -74,6 +77,11 @@ class PaddleMobile {
  void Predict_To(int end);
 #endif
+#ifdef PADDLE_MOBILE_CL
+ public:
+  void SetCLPath(std::string cl_path);
+#endif
 private:
  std::shared_ptr<framework::Loader<Dtype, P>> loader_;
  std::shared_ptr<framework::Executor<Dtype, P>> executor_;

--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -16,10 +16,12 @@ limitations under the License. */
 #include <cstdlib>
 #include <cstring>
-#ifdef PADDLE_MOBILE_FPGA
+#ifdef PADDLE_MOBILE_FPGA_V1
+#include "fpga/V1/api.h"
-#include "fpga/api.h"
+#endif
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
 #endif
 namespace paddle_mobile {

--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
@@ -40,4 +40,8 @@ REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
 REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(elementwise_add, ops::ElementwiseAddOp);
+#endif
 #endif
--- a/src/operators/kernel/arm/slice_kernel.cpp
+++ b/src/operators/kernel/arm/slice_kernel.cpp
@@ -17,6 +17,14 @@ limitations under the License. */
 #include "operators/kernel/slice_kernel.h"
 namespace paddle_mobile {
-namespace operators {}
+namespace operators {
+template <>
+bool SliceKernel<CPU, float>::Init(SliceParam<CPU>* param) {
+  return true;
+}
+template <>
+void SliceKernel<CPU, float>::Compute(const SliceParam<CPU>& param) {}
+}  // namespace operators
 }  // namespace paddle_mobile
 #endif
--- a/src/operators/kernel/fpga/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/concat_kernel.cpp
--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
--- a/src/operators/kernel/fpga/dropout_kernel.cpp
+++ b/src/operators/kernel/fpga/dropout_kernel.cpp
--- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp
--- a/src/operators/kernel/fpga/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
--- a/src/operators/kernel/fpga/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/feed_kernel.cpp
--- a/src/operators/kernel/fpga/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/fetch_kernel.cpp
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef FUSION_CONVADD_OP
 #include "operators/kernel/fetch_kernel.h"
@@ -32,5 +31,3 @@ template class FetchKernel<FPGA, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
-#endif
--- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
--- a/src/operators/kernel/fpga/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/pool_kernel.cpp
@@ -34,7 +34,7 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
  fpga::PoolingArgs poolArgs = {0};
  poolArgs.mode = pooling_type == "max" ? 0 : 1;  // max:0, avg:1
  poolArgs.kernel_reciprocal =
-      fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1])));
+      fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1])));  // NOLINT
  poolArgs.image.address = input_ptr;
  poolArgs.image.channels = (uint32_t)input->dims()[1];
  poolArgs.image.height = (uint32_t)input->dims()[2];

--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -14,11 +14,9 @@ limitations under the License. */
 #ifdef SOFTMAX_OP
-#include "../softmax_kernel.h"
+#include "operators/kernel/softmax_kernel.h"
-#include "../central-arm-func/softmax_arm_func.h"
+#include "operators/kernel/central-arm-func/softmax_arm_func.h"
-#include "common/types.h"
-#include "fpga/api.h"
-#include "operators/math/softmax.h"
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/fpga/V2/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/concat_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef CONCAT_OP
+#include "operators/kernel/concat_kernel.h"
+#include "fpga/V2/api.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
+  auto inputs = param->Inputs();
+  auto out = param->Out();
+  auto image_num = inputs.size();
+  auto images_in =
+      (half **)fpga::fpga_malloc(image_num * sizeof(int *));  // NOLINT
+  auto scales_in =
+      (float **)fpga::fpga_malloc(image_num * sizeof(float *));  // NOLINT
+  auto channel_num =
+      (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t));  // NOLINT
+  auto aligned_channel_num =
+      (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t));  // NOLINT
+  auto height = inputs[0]->dims()[2];
+  auto width = inputs[0]->dims()[3];
+  auto out_channel =
+      (uint32_t)fpga::get_aligned_channel_num((int)out->dims()[1]);  // NOLINT
+  for (int i = 0; i < image_num; i++) {
+    auto input = inputs[i];
+    PADDLE_MOBILE_ENFORCE(
+        input->dims()[2] == height && input->dims()[3] == width,
+        "Image height & width should be unified");
+    images_in[i] = (half *)input->data<float>();  // NOLINT
+    channel_num[i] = (uint32_t)inputs[i]->dims()[1];
+    aligned_channel_num[i] =
+        (uint32_t)fpga::get_aligned_channel_num(channel_num[i]);
+    scales_in[i] = input->scale;
+  }
+  fpga::format_concat_output(out, (int)height, (int)width,  // NOLINT
+                             out_channel);
+  fpga::ConcatArgs concatArgs = {0};
+  concatArgs.image_num = (uint32_t)image_num;
+  concatArgs.images_in = images_in;
+  concatArgs.scales_in = scales_in;
+  concatArgs.image_out = (half *)out->data<float>();  // NOLINT
+  concatArgs.scale_out = out->scale;
+  concatArgs.channel_num = channel_num;
+  concatArgs.aligned_channel_num = aligned_channel_num;
+  concatArgs.out_channel = out_channel;
+  concatArgs.height = (uint32_t)height;
+  concatArgs.width = (uint32_t)width;
+  param->SetFpgaArgs(concatArgs);
+  return true;
+}
+template <>
+void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) {
+  fpga::ComputeFPGAConcat(param.FpgaArgs());
+}
+template class ConcatKernel<FPGA, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADDBN_OP
+#include "operators/kernel/conv_add_bn_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto input = const_cast<Tensor *>(param->Input());
+  auto bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
+                            bias->dims()[0] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+  const int channel = out->dims()[1];
+  auto bs_ptr =
+      reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias->mutable_data<float>({channel});
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] /
+                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
+    new_bias_ptr[i] =
+        bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
+    bs_ptr[i + channel] = new_scale_ptr[i];
+    bs_ptr[i] = new_bias_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+template <>
+void ConvAddBNKernel<FPGA, float>::Compute(
+    const FusionConvAddBNParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADDBNRELU_OP
+#include "operators/kernel/conv_add_bn_relu_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ConvAddBNReluKernel<FPGA, float>::Init(
+    FusionConvAddBNReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input = const_cast<Tensor *>(param->Input());
+  const Tensor *bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
+                            bias->dims()[0] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+  const int channel = out->dims()[1];
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias->mutable_data<float>({channel});
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] /
+                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
+    new_bias_ptr[i] =
+        bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
+    bs_ptr[i + 2] = new_scale_ptr[i];
+    bs_ptr[i] = new_bias_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+template <>
+void ConvAddBNReluKernel<FPGA, float>::Compute(
+    const FusionConvAddBNReluParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADDRELU_OP
+#include "operators/kernel/conv_add_relu_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input = const_cast<Tensor *>(param->Input());
+  const Tensor *bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  int channel = out->dims()[1];
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i + channel] = 1;
+    bs_ptr[i] = bias_ptr[i];
+  }
+  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+template <>
+void ConvAddReluKernel<FPGA, float>::Compute(
+    const FusionConvAddReluParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVBN_OP
+#include "operators/kernel/conv_bn_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto input = const_cast<Tensor *>(param->Input());
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+  const int channel = out->dims()[1];
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // // NOLINT
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias->mutable_data<float>({channel});
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] /
+                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
+    new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
+    bs_ptr[i + channel] = new_scale_ptr[i];
+    bs_ptr[i] = new_bias_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+template <>
+void ConvBNKernel<FPGA, float>::Compute(const FusionConvBNParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVBNRELU_OP
+#include "operators/kernel/conv_bn_relu_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input = const_cast<Tensor *>(param->Input());
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+  const int channel = out->dims()[1];
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias->mutable_data<float>({channel});
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] /
+                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
+    new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
+    bs_ptr[i + channel] = new_scale_ptr[i];
+    bs_ptr[i] = new_bias_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+template <>
+void ConvBNReluKernel<FPGA, float>::Compute(
+    const FusionConvBNReluParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/dropout_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/dropout_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef DROPOUT_OP
+#include "operators/kernel/dropout_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool DropoutKernel<FPGA, float>::Init(DropoutParam<FPGA> *param) {
+  param->Out()->ShareDataWith(*param->InputX());
+  return true;
+}
+template <>
+void DropoutKernel<FPGA, float>::Compute(const DropoutParam<FPGA> &param) {}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef ELEMENTWISEADD_OP
+#include "operators/kernel/elementwise_add_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto *input_x = const_cast<LoDTensor *>(param->InputX());
+  auto *input_y = const_cast<LoDTensor *>(param->InputY());
+  auto *out = param->Out();
+  auto input_x_ptr = input_x->data<float>();
+  auto input_y_ptr = input_y->data<float>();
+  int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]);
+  fpga::format_fp16_ofm(out, aligned_channel_num);
+  auto out_ptr = out->mutable_data<float>();
+  fpga::EWAddArgs ewaddArgs = {0};
+  ewaddArgs.relu_enabled = relu_enabled;
+  ewaddArgs.const0 = 0x3c00;  // =1
+  ewaddArgs.const1 = 0x3c00;  // =1
+  ewaddArgs.image0.address = input_x_ptr;
+  ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
+  ewaddArgs.image0.scale_address = input_x->scale;
+  ewaddArgs.image0.height = (uint32_t)input_x->dims()[2];
+  ewaddArgs.image0.width = (uint32_t)input_x->dims()[3];
+  ewaddArgs.image0.pad_height = 0;
+  ewaddArgs.image0.pad_width = 0;
+  ewaddArgs.image1.address = input_y_ptr;
+  ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1];
+  ewaddArgs.image1.scale_address = input_y->scale;
+  ewaddArgs.image1.height = (uint32_t)input_y->dims()[2];
+  ewaddArgs.image1.width = (uint32_t)input_y->dims()[3];
+  ewaddArgs.image1.pad_height = 0;
+  ewaddArgs.image1.pad_width = 0;
+  ewaddArgs.output.scale_address = out->scale;
+  ewaddArgs.output.address = out_ptr;
+  param->SetFpgaArgs(ewaddArgs);
+  return true;
+}
+template <>
+void ElementwiseAddKernel<FPGA, float>::Compute(
+    const ElementwiseAddParam<FPGA> &param) {
+  fpga::ComputeFpgaEWAdd(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_ELEMENTWISEADDRELU_OP
+#include "operators/kernel/elementwise_add_relu_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ElementwiseAddReluKernel<FPGA, float>::Init(
+    ElementwiseAddReluParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto *input_x = const_cast<LoDTensor *>(param->InputX());
+  auto *input_y = const_cast<LoDTensor *>(param->InputY());
+  auto *out = param->Out();
+  auto input_x_ptr = input_x->data<float>();
+  auto input_y_ptr = input_y->data<float>();
+  int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]);
+  fpga::format_fp16_ofm(out, aligned_channel_num);
+  auto out_ptr = out->mutable_data<float>();
+  fpga::EWAddArgs ewaddArgs = {0};
+  ewaddArgs.relu_enabled = relu_enabled;
+  ewaddArgs.const0 = 0x3c00;  // =1
+  ewaddArgs.const1 = 0x3c00;  // =1
+  ewaddArgs.image0.address = input_x_ptr;
+  ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
+  ewaddArgs.image0.scale_address = input_x->scale;
+  ewaddArgs.image0.height = (uint32_t)input_x->dims()[2];
+  ewaddArgs.image0.width = (uint32_t)input_x->dims()[3];
+  ewaddArgs.image0.pad_height = 0;
+  ewaddArgs.image0.pad_width = 0;
+  ewaddArgs.image1.address = input_y_ptr;
+  ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1];
+  ewaddArgs.image1.scale_address = input_y->scale;
+  ewaddArgs.image1.height = (uint32_t)input_y->dims()[2];
+  ewaddArgs.image1.width = (uint32_t)input_y->dims()[3];
+  ewaddArgs.image1.pad_height = 0;
+  ewaddArgs.image1.pad_width = 0;
+  ewaddArgs.output.scale_address = out->scale;
+  ewaddArgs.output.address = out_ptr;
+  param->SetFpgaArgs(ewaddArgs);
+  return true;
+}
+template <>
+void ElementwiseAddReluKernel<FPGA, float>::Compute(
+    const ElementwiseAddReluParam<FPGA> &param) {
+  fpga::ComputeFpgaEWAdd(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_FCRELU_OP
+#include "operators/kernel/fc_relu_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input_x = const_cast<LoDTensor *>(param->InputX());
+  auto filter = const_cast<Tensor *>(param->InputY());
+  auto input_z = param->InputZ();
+  auto input_z_ptr = input_z->data<float>();
+  auto out = param->Out();
+  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
+                        "Image channel should be equal to weight number");
+  int channel = (uint32_t)out->dims()[1];
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i + channel] = 1;
+    bs_ptr[i] = input_z_ptr[i];
+  }
+  int num = (uint32_t)filter->dims()[1];
+  int chw = (uint32_t)filter->dims()[0];
+  PADDLE_MOBILE_ENFORCE(
+      chw == input_x->numel(),
+      "Filter element num should be equal to IFM element num");
+  int height = (uint32_t)input_x->dims()[2];
+  int width = (uint32_t)input_x->dims()[3];
+  int filter_channel = chw / height / width;
+  out->Resize(framework::make_ddim({1, channel, 1, 1}));
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
+  fpga::format_fc_data(filter, out, bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
+                       0, 0, bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+template <>
+void FusionFcReluKernel<FPGA, float>::Compute(
+    const FusionFcReluParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/feed_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
+  Tensor *output = param->Out();
+  int aligned_channel = fpga::get_aligned_channel_num(output->dims()[1]);
+  fpga::format_fp16_ofm(output, aligned_channel);
+  return true;
+}
+template <>
+void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
+  auto input =
+      reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
+  auto input_ptr = input->data<float>();
+  fpga::format_image(input);
+  Tensor *output = param.Out();
+  auto output_ptr = output->data<float>();
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
+  args.input_data_type = fpga::DATA_TYPE_FP32;
+  args.output_data_type = fpga::DATA_TYPE_FP16;
+  args.input_layout_type = fpga::LAYOUT_CHW;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.image.address = reinterpret_cast<void *>(input_ptr);
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
+  args.image.pad_height = 0;
+  args.image.pad_width = 0;
+  args.output.address = output_ptr;
+  args.output.scale_address = output->scale;
+  fpga::PerformBypass(args);
+}
+template class FeedKernel<FPGA, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/fpga/V2/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fetch_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/fetch_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
+  return true;
+}
+template <>
+void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
+  param.Out()->ShareDataWith(*(param.InputX()));
+}
+template class FetchKernel<FPGA, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_FC_OP
+#include "operators/kernel/fusion_fc_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto input_x = const_cast<LoDTensor *>(param->InputX());
+  auto filter = const_cast<Tensor *>(param->InputY());
+  const Tensor *input_z = param->InputZ();
+  auto input_z_ptr = input_z->data<float>();
+  auto out = param->Out();
+  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
+                        "Image channel should be equal to weight number");
+  int channel = (uint32_t)out->dims()[1];
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i + channel] = 1;
+    bs_ptr[i] = input_z_ptr[i];
+  }
+  int num = (uint32_t)filter->dims()[1];
+  int chw = (uint32_t)filter->dims()[0];
+  PADDLE_MOBILE_ENFORCE(
+      chw == input_x->numel(),
+      "Filter element num should be equal to IFM element num");
+  int height = (uint32_t)input_x->dims()[2];
+  int width = (uint32_t)input_x->dims()[3];
+  int filter_channel = chw / height / width;
+  out->Resize(framework::make_ddim({1, channel, 1, 1}));
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
+  fpga::format_fc_data(filter, out, bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
+                       0, 0, bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+template <>
+void FusionFcKernel<FPGA, float>::Compute(const FusionFcParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/pool_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POOL_OP
+#include "operators/kernel/pool_kernel.h"
+class PoolingArgs;
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
+  auto *input = const_cast<Tensor *>(param->Input());
+  auto input_ptr = input->data<float>();
+  Tensor *output = param->Output();
+  int aligned_channel_num =
+      fpga::get_aligned_channel_num((int)output->dims()[1]);  // NOLINT
+  fpga::format_fp16_ofm(output, aligned_channel_num);
+  auto output_ptr = output->mutable_data<float>();
+  vector<int> ksize = param->Ksize();
+  vector<int> strides = param->Strides();
+  vector<int> paddings = param->Paddings();
+  std::string pooling_type = param->PoolingType();
+  fpga::PoolingArgs poolArgs = {0};
+  poolArgs.mode = pooling_type == "max" ? 0 : 1;  // max:0, avg:1
+  poolArgs.kernel_reciprocal =
+      fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1])));  // NOLINT
+  poolArgs.image.address = input_ptr;
+  poolArgs.image.channels = (uint32_t)input->dims()[1];
+  poolArgs.image.height = (uint32_t)input->dims()[2];
+  poolArgs.image.width = (uint32_t)input->dims()[3];
+  poolArgs.image.pad_height = (uint32_t)paddings[0];
+  poolArgs.image.pad_width = (uint32_t)paddings[1];
+  poolArgs.image.scale_address = input->scale;
+  poolArgs.output.address = output_ptr;
+  poolArgs.output.scale_address = output->scale;
+  poolArgs.kernel.height = (uint32_t)ksize[0];
+  poolArgs.kernel.width = (uint32_t)ksize[1];
+  poolArgs.kernel.stride_h = (uint32_t)strides[0];
+  poolArgs.kernel.stride_w = (uint32_t)strides[1];
+  param->SetFpgaArgs(poolArgs);
+  return true;
+}
+template <>
+void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
+  fpga::ComputeFpgaPool(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/slice_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SLICE_OP
+#include "operators/kernel/slice_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
+  return true;
+}
+template <>
+void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SOFTMAX_OP
+#include "operators/kernel/softmax_kernel.h"
+#include "operators/kernel/central-arm-func/softmax_arm_func.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
+  auto input = const_cast<Tensor *>(param->InputX());
+  auto input_ptr = input->data<float>();
+  auto float_input = new Tensor;
+  float_input->mutable_data<float>({1, input->dims()[1]});
+  fpga::format_fp32_ofm(float_input, 8);
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_CHW;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = input_ptr;
+  args.image.height = 1;
+  args.image.width = 1;
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.output.address = float_input->data<float>();
+  args.output.scale_address = float_input->scale;
+  param->SetFloatInput(float_input);
+  param->SetFpgaArgs(args);
+  return true;
+}
+template <>
+void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
+  Tensor *in_x = param.FloatInput();
+  Tensor *out = param.Out();
+  fpga::PerformBypass(param.FpgaArgs());
+  math::SoftmaxFuntor<CPU, float>()(in_x, out);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/slice_kernel.h
+++ b/src/operators/kernel/slice_kernel.h
@@ -24,7 +24,8 @@ template <typename DeviceType, typename T>
 class SliceKernel
    : public framework::OpKernelBase<DeviceType, SliceParam<DeviceType>> {
 public:
-  void Compute(const SliceParam<DeviceType>& param) {}
+  void Compute(const SliceParam<DeviceType>& param);
+  bool Init(SliceParam<DeviceType>* param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -1275,7 +1275,8 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
  const int inhxw = in_h * in_w;
  const int outhxw = out_h * out_w;
  /// todo : fix if_pad when w != h
-  const int if_pad = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
+  const int if_pad_r = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
+  const int if_pad_b = in_h - 1 == (out_h - 1) * 2 ? 1 : 0;
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int c = static_cast<int>(input->dims()[1]);
  const float *input_row_ptr;
@@ -1366,7 +1367,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
        elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10);
        elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12);
-        if (!if_pad) {
+        if (!if_pad_b) {
          elewise_res1 =
              vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21);
          elewise_res0 =
@@ -1401,8 +1402,8 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
          w20 * input_const[out2in_mid + in_w - 1] +
          w21 * input_const[out2in_mid + in_w] +
-          (1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
+          (1 - if_pad_r) * (w12 * input_const[out2in_mid + 1] +
-                          w22 * input_const[out2in_mid + in_w + 1]);
+                            w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_h - 1) * 2 * in_w;
@@ -1410,19 +1411,20 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
          w01 * input_const[out2in_mid - in_w] +
          w02 * input_const[out2in_mid - in_w + 1] +
          w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
-          (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
+          (1 - if_pad_b) * (w21 * input_const[out2in_mid + in_w] +
-                          w22 * input_const[out2in_mid + in_w + 1]);
+                            w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2;
      output_data_tmp[out_h * out_w - 1] =
          w00 * input_const[out2in_mid - in_w - 1] +
          w01 * input_const[out2in_mid - in_w] +
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
-          (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] +
+          (1 - if_pad_r) * (w20 * input_const[out2in_mid + in_w - 1] +
-                          w21 * input_const[out2in_mid + in_w] +
+                            w21 * input_const[out2in_mid + in_w]) +
-                          w02 * input_const[out2in_mid - in_w + 1] +
+          (1 - if_pad_b) * (w02 * input_const[out2in_mid - in_w + 1] +
-                          w12 * input_const[out2in_mid + 1] +
+                            w12 * input_const[out2in_mid + 1]) +
-                          w22 * input_const[out2in_mid + in_w + 1]);
+          (1 - if_pad_r) * (1 - if_pad_b) * w22 *
+              input_const[out2in_mid + in_w + 1];
      if (if_bias) {
        output_data_tmp[0] += bias_data[j];
        output_data_tmp[out_w - 1] += bias_data[j];
@@ -1445,9 +1447,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
            w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
            w20 * input_const[out2in_mid + in_w - 1] +
            w21 * input_const[out2in_mid + in_w] +
-            (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] +
+            (1 - if_pad_r) * (w02 * input_const[out2in_mid - in_w + 1] +
-                            w12 * input_const[out2in_mid + 1] +
+                              w12 * input_const[out2in_mid + 1] +
-                            w22 * input_const[out2in_mid + in_w + 1]);
+                              w22 * input_const[out2in_mid + in_w + 1]);
        if (if_bias) {
          output_data_tmp[i * out_w] += bias_data[j];
          output_data_tmp[i * out_w + out_w - 1] += bias_data[j];
@@ -1662,7 +1664,8 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
  const int inhxw = in_h * in_w;
  const int outhxw = out_h * out_w;
  /// todo : fix if_pad when w != h
-  const int if_pad = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
+  const int if_pad_r = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
+  const int if_pad_b = in_h - 1 == (out_h - 1) * 2 ? 1 : 0;
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int c = static_cast<int>(input->dims()[1]);
  const int w_times = (out_w - 2) / 3;
@@ -1756,7 +1759,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
        elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10);
        elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12);
-        if (!if_pad) {
+        if (!if_pad_b) {
          elewise_res1 =
              vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21);
          elewise_res0 =
@@ -1796,8 +1799,8 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
          w20 * input_const[out2in_mid + in_w - 1] +
          w21 * input_const[out2in_mid + in_w] +
-          (1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
+          (1 - if_pad_r) * (w12 * input_const[out2in_mid + 1] +
-                          w22 * input_const[out2in_mid + in_w + 1]);
+                            w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_h - 1) * 2 * in_w;
@@ -1805,19 +1808,20 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
          w01 * input_const[out2in_mid - in_w] +
          w02 * input_const[out2in_mid - in_w + 1] +
          w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
-          (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
+          (1 - if_pad_b) * (w21 * input_const[out2in_mid + in_w] +
-                          w22 * input_const[out2in_mid + in_w + 1]);
+                            w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2;
      output_data_tmp[out_h * out_w - 1] =
          w00 * input_const[out2in_mid - in_w - 1] +
          w01 * input_const[out2in_mid - in_w] +
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
-          (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] +
+          (1 - if_pad_r) * (w20 * input_const[out2in_mid + in_w - 1] +
-                          w21 * input_const[out2in_mid + in_w] +
+                            w21 * input_const[out2in_mid + in_w]) +
-                          w02 * input_const[out2in_mid - in_w + 1] +
+          (1 - if_pad_b) * (w02 * input_const[out2in_mid - in_w + 1] +
-                          w12 * input_const[out2in_mid + 1] +
+                            w12 * input_const[out2in_mid + 1]) +
-                          w22 * input_const[out2in_mid + in_w + 1]);
+          (1 - if_pad_r) * (1 - if_pad_b) * w22 *
+              input_const[out2in_mid + in_w + 1];
      output_data_tmp[0] =
          output_data_tmp[0] * newscale_data[j] + newbias_data[j];
      output_data_tmp[out_w - 1] =
@@ -1857,9 +1861,9 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
            w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
            w20 * input_const[out2in_mid + in_w - 1] +
            w21 * input_const[out2in_mid + in_w] +
-            (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] +
+            (1 - if_pad_r) * (w02 * input_const[out2in_mid - in_w + 1] +
-                            w12 * input_const[out2in_mid + 1] +
+                              w12 * input_const[out2in_mid + 1] +
-                            w22 * input_const[out2in_mid + in_w + 1]);
+                              w22 * input_const[out2in_mid + in_w + 1]);
        output_data_tmp[i * out_w] =
            output_data_tmp[i * out_w] * newscale_data[j] + newbias_data[j];
        output_data_tmp[i * out_w + out_w - 1] =

--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -3307,8 +3307,13 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
      float *local_A = packedA + MC * KC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
-      InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
+      if (bias == nullptr) {
-                          &C(i, 0), ldc, relu, bias + i);
+        InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
+                            &C(i, 0), ldc, relu, nullptr);
+      } else {
+        InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
+                            &C(i, 0), ldc, relu, bias + i);
+      }
    }
  } else {
 #pragma omp parallel for

--- a/src/operators/math/gru_compute.cpp
+++ b/src/operators/math/gru_compute.cpp
@@ -30,20 +30,34 @@ struct GRUUnitFunctor<CPU, T> {
                      const ActivationType active_gate) {
    Gemm gemm;
    if (value.prev_out_value) {
+#ifdef _OPENMP
+      gemm.Sgemm_omp(batch_size, frame_size * 2, frame_size, 1,
+                     value.prev_out_value, frame_size, value.gate_weight,
+                     frame_size * 2, 1, value.gate_value, frame_size * 3, false,
+                     nullptr);
+#else
      gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1,
                 value.prev_out_value, frame_size, value.gate_weight,
                 frame_size * 2, 1, value.gate_value, frame_size * 3, false,
                 nullptr);
+#endif
    }
    forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size,
                         batch_size, active_gate);
    if (value.prev_out_value) {
+#ifdef _OPENMP
+      gemm.Sgemm_omp(batch_size, frame_size, frame_size, 1,
+                     value.reset_output_value, frame_size, value.state_weight,
+                     frame_size, 1, value.gate_value + frame_size * 2,
+                     frame_size * 3, false, nullptr);
+#else
      gemm.Sgemm(batch_size, frame_size, frame_size, 1,
                 value.reset_output_value, frame_size, value.state_weight,
                 frame_size, 1, value.gate_value + frame_size * 2,
                 frame_size * 3, false, nullptr);
+#endif
    }
    forward_final_output(forward::gru_finalOutput<T>(), value, frame_size,

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -23,8 +23,13 @@ limitations under the License. */
 #include "framework/scope.h"
 #include "framework/tensor.h"
 #include "framework/variable.h"
-#ifdef PADDLE_MOBILE_FPGA
-#include "fpga/api.h"
+#ifdef PADDLE_MOBILE_FPGA_V1
+#include "fpga/V1/api.h"
+#endif
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
 #endif
 #ifdef PADDLE_MOBILE_CL
@@ -431,6 +436,16 @@ class ConvParam : public OpParam {
 #ifdef PADDLE_MOBILE_CL
  int offset_;
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::SplitConvArgs fpga_conv_args;
+ public:
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 template <typename Dtype>
 Print &operator<<(Print &printer, const ConvParam<Dtype> &conv_param);
@@ -575,15 +590,6 @@ class MulParam : OpParam {
  GType *out_;
  int x_num_col_dims_;
  int y_num_col_dims_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -1332,23 +1338,23 @@ class Reshape2Param : public OpParam {
    }
  }
-  const RType *InputX() const { return input_x_; }
+  const GType *InputX() const { return input_x_; }
-  const RType *InputShape() const { return input_shape_; }
+  const GType *InputShape() const { return input_shape_; }
-  RType *Out() const { return out_; }
+  GType *Out() const { return out_; }
-  RType *OutputXShape() const { return output_xshape_; }
+  GType *OutputXShape() const { return output_xshape_; }
  const vector<int> &Shape() const { return shape_; }
  const bool &Inplace() const { return inplace_; }
 private:
-  RType *input_x_;
+  GType *input_x_;
-  RType *input_shape_;
+  GType *input_shape_;
-  RType *out_;
+  GType *out_;
-  RType *output_xshape_;
+  GType *output_xshape_;
  vector<int> shape_;
  bool inplace_;
 };
@@ -1636,15 +1642,6 @@ class FusionConvAddParam : public ConvParam<Dtype> {
  RType *bias_;
  int axis_;
  RType *output_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 template <typename Dtype>
@@ -1691,15 +1688,6 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> {
  RType *output_;
  RType *alpha_;
  std::string mode_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -1749,15 +1737,6 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> {
  std::string keyOutput_;
  std::string keyX1_;
  std::string keyY1_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -1824,16 +1803,6 @@ class FusionConvAddBNReluParam : public ConvParam<Dtype> {
  bool is_test_;
  RType *new_bias_;
  RType *new_scale_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -1911,15 +1880,6 @@ class FusionConvBNAddReluParam : public ConvParam<Dtype> {
  std::string keyBNY_;
  std::string keyX_;
  std::string keyY_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -1978,15 +1938,6 @@ class FusionConvBNParam : public ConvParam<Dtype> {
  bool is_test_;
  RType *new_bias_;
  RType *new_scale_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -2053,15 +2004,6 @@ class FusionConvAddBNParam : public ConvParam<Dtype> {
  bool is_test_;
  RType *new_bias_;
  RType *new_scale_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif
@@ -2179,15 +2121,6 @@ class FusionConvBNReluParam : public ConvParam<Dtype> {
  bool is_test_;
  RType *new_bias_;
  RType *new_scale_;
-#ifdef PADDLE_MOBILE_FPGA
- private:
-  fpga::SplitConvArgs fpga_conv_args;
- public:
-  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
-#endif
 };
 #endif

--- a/src/operators/slice_op.cpp
+++ b/src/operators/slice_op.cpp
@@ -34,5 +34,7 @@ REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
 #ifdef PADDLE_MOBILE_MALI_GPU
 REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(slice, ops::SliceOp);
+#endif
 #endif
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -62,15 +62,18 @@ if (CON GREATER -1)
 endif ()
-list(FIND NET "FPGAnets" CON)
+list(FIND NET "FPGA_NET_V1" CON)
 if (CON GREATER -1)
    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-resnet50 paddle-mobile)
-#    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
-#    target_link_libraries(test-resnet paddle-mobile)
    set(FOUND_MATCH ON)
+endif ()
+list(FIND NET "FPGA_NET_V2" CON)
+if (CON GREATER -1)
+    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-resnet50 paddle-mobile)
+    set(FOUND_MATCH ON)
 endif ()
 list(FIND NET "mobilenetssd" CON)

--- a/test/fpga/test_resnet50.cpp
+++ b/test/fpga/test_resnet50.cpp
@@ -13,7 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
 #include "../test_include.h"
-#include "fpga/api.h"
+#ifdef PADDLE_MOBILE_FPGA_V1
+#include "fpga/V1/api.h"
+#endif
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
+#endif
 void readStream(std::string filename, float *buf) {
  std::ifstream in;
  in.open(filename, std::ios::in);

--- a/test/framework/test_load_memory_inference_api.cpp
+++ b/test/framework/test_load_memory_inference_api.cpp
@@ -55,11 +55,11 @@ static char *Get_binary_data(std::string filename) {
 paddle_mobile::PaddleMobileConfig GetConfig() {
  paddle_mobile::PaddleMobileConfig config;
  config.precision = paddle_mobile::PaddleMobileConfig::FP32;
-  config.device = paddle_mobile::PaddleMobileConfig::kCPU;
+  config.device = paddle_mobile::PaddleMobileConfig::kGPU_CL;
  const std::shared_ptr<paddle_mobile::PaddleModelMemoryPack> &memory_pack =
      std::make_shared<paddle_mobile::PaddleModelMemoryPack>();
-  auto model_path = std::string(g_genet_combine) + "/model";
+  auto model_path = std::string(g_mobilenet_combined) + "/model";
-  auto params_path = std::string(g_genet_combine) + "/params";
+  auto params_path = std::string(g_mobilenet_combined) + "/params";
  memory_pack->model_size =
      ReadBuffer(model_path.c_str(), &memory_pack->model_buf);
  std::cout << "sizeBuf: " << memory_pack->model_size << std::endl;

--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <iostream>
+#include "../../src/common/types.h"
 #include "../test_helper.h"
 #include "../test_include.h"
@@ -20,10 +21,15 @@ int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
  //    paddle_mobile.SetThreadNum(4);
  auto time1 = paddle_mobile::time();
-  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
+#ifdef PADDLE_MOBILE_CL
-  //                     std::string(g_mobilenet_detect) + "/params", true);
+  paddle_mobile.SetCLPath("/data/local/tmp/bin");
+#endif
-  auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
+  auto isok =
+      paddle_mobile.Load(std::string(g_mobilenet_mul) + "/model",
+                         std::string(g_mobilenet_mul) + "/params", true);
+  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_mul), true);
  if (isok) {
    auto time2 = paddle_mobile::time();
    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"

--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -106,9 +106,9 @@ if (CON GREATER -1)
  set(FOUND_MATCH ON)
 endif()
-list(FIND NET "FPGAnets" CON)
+list(FIND NET "FPGA_NET_V1" CON)
 if (CON GREATER -1)
-  message("FPGAnets enabled")
+  message("FPGA_NET_V1 enabled")
  set(FUSION_CONVADDRELU_OP ON)
  set(FUSION_CONVADDBNRELU_OP ON)
  set(FUSION_CONVADDBN_OP ON)
@@ -124,6 +124,21 @@ if (CON GREATER -1)
  set(FOUND_MATCH ON)
 endif()
+list(FIND NET "FPGA_NET_V2" CON)
+if (CON GREATER -1)
+  message("FPGA_NET_V2 enabled")
+  set(FUSION_ELEMENTWISEADDRELU_OP ON)
+  set(FUSION_FC_OP ON)
+  set(POOL_OP ON)
+  set(SOFTMAX_OP ON)
+  set(FUSION_CONVBNRELU_OP ON)
+  set(FUSION_CONVBN_OP ON)
+#  set(CONV_TRANSPOSE_OP ON)
+#  set(SLICE_OP ON)
+#  set(ELEMENTWISEADD_OP ON)
+  set(FOUND_MATCH ON)
+endif()
 list(FIND NET "nlp" CON)
 if (CON GREATER -1)
  message("nlp enabled")