From 59bdbff94a0984463d48333004d05978f11d606e Mon Sep 17 00:00:00 2001
From: qnqinan <qnqinan@163.com>
Date: Fri, 19 Apr 2019 15:22:56 +0800
Subject: [PATCH] add static quantization code and update FPGA V2(V3) related
 files

---
 src/fpga/V2/api.cpp                           |  983 ++++++-
 src/fpga/V2/api.h                             |   83 +-
 src/fpga/V2/bias_scale.cpp                    |   77 +-
 src/fpga/V2/bias_scale.h                      |    7 +-
 src/fpga/V2/deconv_bias_scale.cpp             |   48 +
 src/fpga/V2/deconv_bias_scale.h               |   26 +
 src/fpga/V2/deconv_filter.cpp                 |  280 ++
 src/fpga/V2/deconv_filter.h                   |   39 +
 src/fpga/V2/filter.cpp                        |  353 ++-
 src/fpga/V2/filter.h                          |   33 +-
 src/fpga/V2/image.cpp                         |  140 +-
 src/fpga/V2/image.h                           |   64 +-
 src/fpga/V2/pe.cpp                            | 2549 +++++++----------
 src/fpga/common/fpga_common.h                 |   23 +-
 src/framework/executor.cpp                    |   72 +-
 src/framework/executor.h                      |    3 +
 .../fpga/V2/anchor_generator_kernel.cpp       |   87 +
 .../kernel/fpga/V2/concat_kernel.cpp          |   26 +-
 .../kernel/fpga/V2/conv_add_bn_kernel.cpp     |   23 +-
 .../fpga/V2/conv_add_bn_relu_kernel.cpp       |   54 +-
 .../kernel/fpga/V2/conv_add_kernel.cpp        |   18 +-
 .../kernel/fpga/V2/conv_add_relu_kernel.cpp   |   18 +-
 .../kernel/fpga/V2/conv_bn_kernel.cpp         |   31 +-
 .../kernel/fpga/V2/conv_bn_relu_kernel.cpp    |   62 +-
 src/operators/kernel/fpga/V2/conv_kernel.cpp  |   61 +
 .../kernel/fpga/V2/conv_transpose_kernel.cpp  |   89 +
 .../kernel/fpga/V2/deconv_add_bn_kernel.cpp   |   90 +
 .../fpga/V2/deconv_add_bn_relu_kernel.cpp     |   91 +
 .../kernel/fpga/V2/deconv_add_kernel.cpp      |   56 +-
 .../kernel/fpga/V2/deconv_add_relu_kernel.cpp |   56 +-
 .../kernel/fpga/V2/deconv_bn_relu_kernel.cpp  |  108 +
 .../kernel/fpga/V2/elementwise_add_kernel.cpp |  187 +-
 .../fpga/V2/elementwise_add_relu_kernel.cpp   |   20 +-
 .../kernel/fpga/V2/elementwise_mul_kernel.cpp |   93 +
 src/operators/kernel/fpga/V2/feed_kernel.cpp  |  102 +-
 src/operators/kernel/fpga/V2/fetch_kernel.cpp |  104 +-
 .../kernel/fpga/V2/fusion_fc_kernel.cpp       |   22 +-
 .../kernel/fpga/V2/fusion_fc_relu_kernel.cpp  |   75 +
 src/operators/kernel/fpga/V2/pad2d_kernel.cpp |   61 +
 src/operators/kernel/fpga/V2/pool_kernel.cpp  |   54 +-
 .../kernel/fpga/V2/proposal_kernel.cpp        |  567 ++++
 .../kernel/fpga/V2/psroi_pool_kernel.cpp      |  284 ++
 ...deconv_relu_kernel.cpp => relu_kernel.cpp} |   13 +-
 .../kernel/fpga/V2/reshape2_kernel.cpp        |  127 +
 .../kernel/fpga/V2/reshape_kernel.cpp         |   40 +
 .../kernel/fpga/V2/roialign_pool_kernel.cpp   |  296 ++
 .../kernel/fpga/V2/sigmoid_kernel.cpp         |   55 +
 src/operators/kernel/fpga/V2/slice_kernel.cpp |   35 +-
 .../kernel/fpga/V2/softmax_kernel.cpp         |  113 +-
 src/operators/kernel/fpga/V2/split_kernel.cpp |   48 +-
 src/operators/kernel/fpga/V2/tanh_kernel.cpp  |   50 +-
 .../kernel/fpga/V2/transpose2_kernel.cpp      |   21 +-
 52 files changed, 5980 insertions(+), 2037 deletions(-)
 create mode 100644 src/fpga/V2/deconv_bias_scale.cpp
 create mode 100644 src/fpga/V2/deconv_bias_scale.h
 create mode 100644 src/fpga/V2/deconv_filter.cpp
 create mode 100644 src/fpga/V2/deconv_filter.h
 create mode 100644 src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/conv_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/pad2d_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/proposal_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
 rename src/operators/kernel/fpga/V2/{deconv_relu_kernel.cpp => relu_kernel.cpp} (70%)
 create mode 100644 src/operators/kernel/fpga/V2/reshape2_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/reshape_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp
 create mode 100644 src/operators/kernel/fpga/V2/sigmoid_kernel.cpp

diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp
index 6e1090c00e..9d91cf45b2 100644
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -13,77 +13,151 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "fpga/V2/api.h"
+#include <memory>
 #include "fpga/V2/bias_scale.h"
+#include "fpga/V2/deconv_filter.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
 
 namespace paddle_mobile {
 namespace fpga {
 
+#define USE_RELU 1  // cmd bit; its uses in expand_*_arg are commented out
+#define USE_BIAS 2  // cmd bit; always set by expand_conv_arg
+
 void format_image(framework::Tensor *image_tensor) {
   auto dims = image_tensor->dims();
   auto channel = dims[1], height = dims[2], width = dims[3];
-  auto data_ptr = image_tensor->data<float>();
-  size_t memory_size = channel * height * width * sizeof(float);
-  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
-  memcpy(new_data, data_ptr, memory_size);
-  int aligned_channel = filter::calc_aligned_channel((int)channel);  // NOLINT
-  image::format_image(&new_data, (int)channel, (int)height,          // NOLINT
-                      (int)width,                                    // NOLINT
-                      aligned_channel);
-  image_tensor->reset_data_ptr(new_data);
+  kTypeId_t input_type = image_tensor->type();  // selects float or int8_t path
+  if (input_type == type_id<float>()) {
+    auto data_ptr = image_tensor->data<float>();
+    auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
+    float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
+
+    image::format_image<float>(&p_data, channel, height, width);
+    if (p_data != data_ptr && external_ptr == nullptr) {  // buffer was replaced
+      image_tensor->reset_data_ptr(p_data);
+    }
+  } else {  // every non-float tensor is treated as int8_t
+    auto data_ptr = image_tensor->data<int8_t>();
+    auto external_ptr = reinterpret_cast<int8_t *>(image_tensor->external_data);
+    int8_t *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
+
+    image::format_image<int8_t>(&p_data, channel, height, width);
+    if (p_data != data_ptr && external_ptr == nullptr) {  // buffer was replaced
+      image_tensor->reset_data_ptr(p_data);
+    }
+  }
 }
 
-void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
+void format_ofm(framework::Tensor *ofm_tensor) {  // dispatch on tensor type
+  if (ofm_tensor->type() == type_id<float>()) {
+    format_fp32_ofm(ofm_tensor);
+  } else {  // every non-float type takes the fp16 path
+    format_fp16_ofm(ofm_tensor);
+  }
+}  // format_ofm
+void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
   if (dims.size() == 4) {
-    auto height = dims[2], width = dims[3];
-    memory_size = (height + 1) / 2 * 2 * width * aligned_channel * sizeof(half);
+    auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
+    memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
+                  sizeof(half);  // each row (channel * width) is aligned
+  } else if (dims.size() == 2) {
+    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
+  } else {  // memory_size stays 0; the allocation below still runs
+    DLOG << "Wrong ofm dimension";
+  }
+  auto p = fpga_malloc(memory_size);
+  // NOTE: zero-fill is disabled here: memset(p, 0, memory_size);
+  ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(type_id<half>().hash_code());
+  ofm_tensor->fpga_data_num = memory_size / sizeof(half);  // aligned count
+  fpga::fpga_flush(p, memory_size);
+}
+
+void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
+  // Sizes the buffer from the caller-supplied dims, not ofm_tensor->dims().
+  size_t memory_size = 0;
+  if (dims.size() == 4) {
+    auto channel = dims[1], height = dims[2], width = dims[3];
+    memory_size =  // NOTE(review): dims[0] is not multiplied in here
+        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
   } else if (dims.size() == 2) {
-    memory_size = aligned_channel * sizeof(half);
+    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
   } else {
     DLOG << "Wrong ofm dimension";
   }
   auto p = fpga_malloc(memory_size);
-  memset(p, 0, memory_size);
+  // NOTE: zero-fill is disabled here: memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(type_id<half>().hash_code());
+  ofm_tensor->fpga_data_num = memory_size / sizeof(half);  // aligned count
+  fpga::fpga_flush(p, memory_size);
 }
 
-void format_fp32_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
+void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
   if (dims.size() == 4) {
-    auto height = dims[2], width = dims[3];
-    memory_size = height * width * aligned_channel * sizeof(float);
+    auto channel = dims[1], height = dims[2], width = dims[3];
+    memory_size =  // NOTE(review): dims[0] is not multiplied in here
+        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(float);
   } else if (dims.size() == 2) {
-    memory_size = aligned_channel * sizeof(float);
+    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float);
   } else {
     DLOG << "Wrong ofm dimension";
   }
   auto p = fpga_malloc(memory_size);
-  memset(p, 0, memory_size);
+  // NOTE: zero-fill is disabled here: memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(type_id<float>().hash_code());
+  ofm_tensor->fpga_data_num = memory_size / sizeof(float);  // aligned count
+  fpga::fpga_flush(p, memory_size);
 }
 
 float filter_find_max(framework::Tensor *filter_tensor) {
   auto filter_ptr = filter_tensor->data<float>();
-  return filter::find_max(filter_ptr, (int)filter_tensor->numel());  // NOLINT
+  return filter::find_max(filter_ptr, filter_tensor->numel());  // whole tensor
 }
 
-int get_aligned_channel_num(int channel_num) {
-  return filter::calc_aligned_channel(channel_num);
+int get_plit_num(framework::Tensor *filter_tensor) {
+  auto dims = filter_tensor->dims();
+  auto chw = dims[1] * dims[2] * dims[3];  // product of all dims but dims[0]
+  auto num = dims[0];
+  int div_capacity = filter::calc_division_capacity(chw);
+  return filter::calc_split_num(num, div_capacity);
+}  // get_plit_num
+int get_deconv_plit_num(framework::Tensor *filter_tensor, int stride) {
+  auto dims = filter_tensor->dims();
+  auto chw = dims[1] * dims[2] / stride * dims[3] / stride;  // left to right
+  auto num = dims[0] * stride;
+  int div_capacity = filter::calc_division_capacity(chw);
+  return filter::calc_split_num(num, div_capacity);
+}  // get_deconv_plit_num
+
+int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
+  auto dims = filter_tensor->dims();
+  auto chw = dims[1] * dims[2] * dims[3];  // product of all dims but dims[0]
+  auto num = dims[0];
+  int div_capacity = filter::calc_division_capacity(chw);
+  return filter::calc_num_per_div(num, group_num, div_capacity);
 }
 
-int get_aligned_filter_num(framework::Tensor *filter_tensor) {
+int get_deconv_filter_num_per_div(framework::Tensor *filter_tensor,
+                                  int group_num, int stride) {
   auto dims = filter_tensor->dims();
-  return filter::calc_aligned_num((int)dims[0], (int)dims[1]);  // NOLINT
+  auto chw = dims[1] * dims[2] / stride * dims[3] / stride;  // left to right
+  auto num = dims[0] * stride;
+  int div_capacity = filter::calc_division_capacity(chw);
+  return filter::calc_num_per_div(num, group_num, div_capacity);
 }
 
-int get_conv_output_channel(framework::Tensor *filter_tensor) {
-  int aligned_filter_num = get_aligned_filter_num(filter_tensor);
-  return get_aligned_channel_num(aligned_filter_num);
+int get_aligned_filter_element_num(int chw) {
+  return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);  // chw after alignment
 }
+
 void format_filter(framework::Tensor *filter_tensor, float max_value,
                    int group_num) {
   filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
@@ -93,11 +167,47 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
   auto data_ptr = filter_tensor->data<float>();
   size_t memory_size = num * channel * height * width * sizeof(float);
   auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
-  memcpy(new_data, data_ptr, memory_size);
-  filter::format_filter(&new_data, (int)num, (int)channel,  // NOLINT
-                        (int)height,                        // NOLINT
-                        (int)width, group_num, max_value);  // NOLINT
+  fpga_copy(new_data, data_ptr, memory_size);
+  filter::format_filter(&new_data, num, channel, height, width, group_num,
+                        max_value);
+  filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(type_id<int8_t>().hash_code());
+}
+void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
+  auto dims = filter_tensor->dims();
+  auto num = dims[0], height = dims[2], width = dims[3];
+  auto data_ptr = filter_tensor->data<float>();
+  size_t memory_size = num * height * width * sizeof(float);  // dims[1] excluded
+  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
+  fpga_copy(new_data, data_ptr, memory_size);  // work on a copy of the data
+  filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
+  filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(type_id<int16_t>().hash_code());  // tagged int16_t
+}  // format_dwconv_filter
+
+void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
+                           int stride) {
+  auto dims = filter_tensor->dims();
+  auto num = dims[0], height = dims[2], width = dims[3];
+  auto data_ptr = filter_tensor->data<float>();
+  size_t memory_size = num * height * width * sizeof(float);  // dims[1] excluded
+  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
+  fpga_copy(new_data, data_ptr, memory_size);  // work on a copy of the data
+
+  int hw = height * width;
+  deconv_filter::deconv_NC_convert(&new_data, num, 1, hw);
+
+  num = dims[1];  // from here on num/channel are swapped relative to dims
+  int channel = dims[0];
+
+  deconv_filter::DWDconv_format_filter(&new_data, num, channel, height, width,
+                                       scale_ptr, stride);
+
+  //  Disabled: the tensor dims are left unchanged by this function.
+  //  framework::DDim dims_new = framework::make_ddim({num, 1, height, width});
+  //  filter_tensor->Resize(dims_new);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(type_id<int16_t>().hash_code());  // tagged int16_t
 }
 
 void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
@@ -108,73 +218,292 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
   auto data_ptr = filter_tensor->data<float>();
   size_t memory_size = num * channel * height * width * sizeof(float);
   auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
+  fpga_copy(new_data, data_ptr, memory_size);
+  filter::format_fc_filter(&new_data, num, channel, height, width, 1,
+                           max_value);
+  filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(type_id<int8_t>().hash_code());
+}
+void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
+                          int group_num, int stride) {
+  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
+  filter_tensor->scale[1] = float(127.0 / max_value);  // NOLINT
+  auto dims = filter_tensor->dims();
+  auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
+  auto data_ptr = filter_tensor->data<float>();
+  size_t memory_size = num * channel * height * width * sizeof(float);
+  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
   memcpy(new_data, data_ptr, memory_size);
-  filter::format_fc_filter(&new_data, (int)num, (int)channel,  // NOLINT
-                           (int)height,                        // NOLINT
-                           (int)width, 1, max_value);          // NOLINT
+
+  int hw = height * width;
+  deconv_filter::deconv_NC_convert(&new_data, num, channel, hw);
+
+  num = dims[1];  // from here on num/channel are swapped relative to dims
+  channel = dims[0];
+  deconv_filter::deconv_format_filter(
+      &new_data, (int)num, (int)channel,          // NOLINT
+      (int)height,                                // NOLINT
+      (int)width, group_num, max_value, stride);  // NOLINT
+
+  framework::DDim dims_new =
+      framework::make_ddim({num, channel, height, width});
+  filter_tensor->Resize(dims_new);  // now {dims[1], dims[0], h, w}
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(type_id<int8_t>().hash_code());  // tagged int8_t
 }
 
-void format_bias_scale_array(float **bias_scale_array, int filter_num,
-                             int filter_channel) {
-  int num_after_alignment =
-      filter::calc_aligned_num(filter_channel, filter_channel);
-  bias_scale::format_bias_scale_array(bias_scale_array, filter_num,
-                                      num_after_alignment);
+void format_bias_scale_array(float **bias_scale_array,
+                             int element_num_per_division, int num) {
+  bias_scale::format_bias_scale_array(bias_scale_array,
+                                      element_num_per_division, num);
+}  // thin forwarder to bias_scale::format_bias_scale_array
+void format_bias_array(float **bias_array, int num) {
+  bias_scale::format_bias_array(bias_array, num);  // arguments forwarded as-is
 }
 
 void format_concat_output(framework::Tensor *out, int height, int width,
-                          uint32_t out_channel) {
-  auto data_ptr = fpga_malloc(out_channel * height * width * sizeof(half));
-  auto ddim = framework::make_ddim({1, out_channel, height, width});
+                          int image_num, uint32_t *channel_num) {
+  int sum_channel = 0, sum_cw = 0;
+  for (int i = 0; i < image_num; i++) {  // total channels over all inputs
+    sum_channel += channel_num[i];
+  }
+
+  sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT);  // one aligned row
+  auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half));
+  auto ddim = framework::make_ddim({1, sum_channel, height, width});
   out->Resize(ddim);
   out->reset_data_ptr(data_ptr);
+  out->set_type(type_id<half>().hash_code());  // output is tagged as half
 }
-
-int format_conv_data(framework::Tensor *filter_tensor,
-                     framework::Tensor *ofm_tensor, float **bs_ptr, int group) {
+void format_conv_data(framework::Tensor *filter_tensor,
+                      framework::Tensor *ofm_tensor, float **bs_ptr,
+                      int group) {  // returns void; the old version returned int
   float max_value = fpga::filter_find_max(filter_tensor);
   fpga::format_filter(filter_tensor, max_value, group);
-  int aligned_num = get_aligned_filter_num(filter_tensor);
-  fpga::format_bias_scale_array(bs_ptr,
-                                (int)filter_tensor->dims()[0],  // NOLINT
-                                aligned_num);
-  int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
-  fpga::format_fp16_ofm(ofm_tensor, aligned_channel);
-  DLOG << aligned_channel;
-  return aligned_channel;
-}
-
-int format_fc_data(framework::Tensor *filter_tensor,
-                   framework::Tensor *ofm_tensor, float **bs_ptr) {
-  float max_value = fpga::filter_find_max(filter_tensor);
-  fpga::format_fc_filter(filter_tensor, max_value);
-  int aligned_num = get_aligned_filter_num(filter_tensor);
-  fpga::format_bias_scale_array(bs_ptr,
-                                (int)filter_tensor->dims()[0],  // NOLINT
-                                aligned_num);
-  int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
-  fpga::format_fp16_ofm(ofm_tensor, aligned_channel);
-  DLOG << aligned_channel;
-  return aligned_channel;
+  int element_num_per_div = fpga::get_filter_num_per_div(filter_tensor, group);
+  fpga::format_bias_scale_array(bs_ptr, element_num_per_div,
+                                ofm_tensor->dims()[1]);  // ofm dims[1] as num
+  fpga::format_fp16_ofm(ofm_tensor);  // allocates the fp16 output buffer
+}  // format_conv_data
+void format_deconv_data(framework::Tensor *filter_tensor,
+                        framework::Tensor *ofm_tensor, float **bs_ptr,
+                        int group, int sub_conv_n) {
+  int channel = ofm_tensor->dims()[1];
+  float max_value = filter_find_max(filter_tensor);
+  format_deconv_filter(filter_tensor, max_value, group, sub_conv_n);
+  int element_num_per_div =  // read after format_deconv_filter resized dims
+      get_deconv_filter_num_per_div(filter_tensor, group, sub_conv_n);
+  format_bias_scale_array(bs_ptr, element_num_per_div, channel * sub_conv_n);
+  format_fp16_ofm(ofm_tensor);  // allocates the fp16 output buffer
+}  // format_deconv_data
+
+void format_dwconv_data(framework::Tensor *filter_tensor,
+                        framework::Tensor *ofm_tensor, float *scale_ptr,
+                        float **bias_ptr) {
+  auto channel = ofm_tensor->dims()[1];
+  format_dwconv_filter(filter_tensor, scale_ptr);
+  format_bias_array(bias_ptr, channel);  // ofm dims[1] entries
+  format_fp16_ofm(ofm_tensor);  // allocates the fp16 output buffer
+}  // format_dwconv_data
+void format_DWDeconv_data(framework::Tensor *filter_tensor,
+                          framework::Tensor *ofm_tensor, float **bs_ptr,
+                          int group, int sub_conv_n) {  // group is unused here
+  int channel = ofm_tensor->dims()[1];
+  // dw-deconv: the scale pointer is *bs_ptr offset by sub_conv_n * channel
+  format_DWDconv_filter(
+      filter_tensor,
+      (reinterpret_cast<float *>(*bs_ptr) + sub_conv_n * channel), sub_conv_n);
+  format_bias_array(bs_ptr, channel);
+  format_fp16_ofm(ofm_tensor);  // allocates the fp16 output buffer
 }
+void expand_conv_arg(ConvArgs *arg) {  // derives the arg->driver fields
+  ConvArgs args = *arg;  // inputs are read from this copy
+
+  auto fpga_bias_scale_len =
+      align_to_x(args.filter_num / args.group_num, 8) * args.group_num;
+
+  auto output_height =
+      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
+          args.kernel.stride_h +
+      1;  // (in + 2 * pad - kernel) / stride + 1
+  auto output_width =
+      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
+          args.kernel.stride_w +
+      1;  // (in + 2 * pad - kernel) / stride + 1
+
+  auto filter_per_group = args.filter_num / args.group_num;
+  auto channel_per_group = args.image.channels / args.group_num;
+
+  auto image_row_count = args.image.width * args.image.channels;
+  auto image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT);
+  auto image_one_pad_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT) +
+                               args.image.pad_width * args.image.channels;
+  auto filter_amount_all =
+      align_to_x(args.kernel.height * args.kernel.width * channel_per_group,
+                 FILTER_ELEMENT_ALIGNMENT);
+
+  auto output_amount_per_row = align_to_x(
+      (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num,
+      IMAGE_ALIGNMENT);
+
+  // Find the partition: scan res_win until the 2048 limit is exceeded.
+  uint64_t res_win;
+  uint64_t res_fit = 0;
+  for (res_win = 1; res_win <= output_width; res_win++) {
+    if ((align_to_x(
+             (args.image.channels *
+              (args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
+             IMAGE_ALIGNMENT) /
+             16 +
+         1) *
+            args.kernel.height >
+        2048) {
+      break;
+    }
+  }
+
+  if (res_win != output_width) {
+    res_win -= 1;  // NOTE(review): yields 0 if the loop broke at res_win == 1
+  }  // NOTE(review): no decrement when the break hit res_win == output_width
+
+  if (((res_win % 2) != 0) && (res_win != 1)) {  // odd values above 1 -> even
+    res_win = res_win - 1;
+  }
+  res_fit = res_win;
+
+  auto block_num = (output_width + res_fit - 1) / res_fit;  // ceil division
+  auto block_len = res_fit;
+  auto block_last = output_width - res_fit * (block_num - 1);  // remainder
+
+  auto res_amount_per_row =
+      (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num;
+  auto res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;
+
+  auto image_block_amount_per_row =
+      args.kernel.stride_w * res_fit * args.image.channels;
+  auto filter_pad_width_mul_channel =
+      args.image.pad_width * args.image.channels;
+  auto image_amount_per_row_multi_win_first =
+      image_amount_per_row *
+      (ROW_PARALLEL_NUM * args.kernel.stride_h - args.image.pad_height);
+  auto image_amount_per_row_multi_win =
+      image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h);
+
+  auto image_block_num = block_num;
+  auto image_block_len =
+      align_to_x((args.image.channels *
+                  (args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
+                 IMAGE_ALIGNMENT) /
+          16 +
+      1;  // same expression as the loop test, evaluated for block_len
+  auto image_block_len_last =
+      align_to_x(
+          (args.image.channels *
+           (args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
+          IMAGE_ALIGNMENT) /
+          16 +
+      1;  // same expression, evaluated for block_last
+  auto image_win_cnt = block_len;
+  auto image_win_cnt_last = block_last;
+  auto res_row_data_align4_pad = res_amount_per_row_pad / 8;
+  auto prog_full_cnt = 1024 / (filter_amount_all / 16 * 2) - 1;
+  if (prog_full_cnt == 511) {
+    prog_full_cnt--;
+  }
+  auto post_prog_full_cnt =
+      (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
+          ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
+          : 0;  // clamped to 0 instead of going below it
+  // auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
+  auto cmd = 0UL | USE_BIAS;  // USE_RELU is never set here
+
+  auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) |
+                      ((args.deconv_tx_param.sub_conv_num) << 8) |
+                      ((args.deconv_tx_param.omit_size) << 0);
+  (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
+  (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
+  (*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address);
+  (*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address) +
+                                     args.deconv_tx_param.out_addr_offset;
+  (*arg).driver.output_height = output_height;
+  (*arg).driver.output_width = output_width;
+  (*arg).driver.filter_per_group = filter_per_group;
+  (*arg).driver.channel_per_group = channel_per_group;
+  (*arg).driver.image_amount_per_row = image_amount_per_row;
+  (*arg).driver.image_one_pad_per_row = image_one_pad_per_row;
+  (*arg).driver.filter_amount_all = filter_amount_all;
+  (*arg).driver.output_amount_per_row = output_amount_per_row;
+  (*arg).driver.image_block_amount_per_row = image_block_amount_per_row;
+  (*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel;
+  (*arg).driver.image_amount_per_row_multi_win_first =
+      image_amount_per_row_multi_win_first;
+  (*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win;
+  (*arg).driver.image_block_num = image_block_num;
+  (*arg).driver.image_block_len = image_block_len;
+  (*arg).driver.image_block_len_last = image_block_len_last;
+  (*arg).driver.image_win_cnt = image_win_cnt;
+  (*arg).driver.image_win_cnt_last = image_win_cnt_last;
+  (*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad;
+  (*arg).driver.prog_full_cnt = prog_full_cnt;
+  (*arg).driver.post_prog_full_cnt = post_prog_full_cnt;
+  (*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len;
+  (*arg).driver.cmd = cmd;
+  (*arg).driver.deconv_param = deconv_param;
+}  // expand_conv_arg()
+
+void expand_EW_arg(EWAddArgs *arg) {  // derives the arg->driver fields
+  EWAddArgs args = *arg;  // inputs are read from this copy
+  // uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
+  uint64_t cmd = 0;  // USE_RELU is never set here
+  uint64_t datalen = (uint64_t)args.image0.width *
+                     (uint64_t)args.image0.height *
+                     (uint64_t)args.image0.channels;  // unaligned w * h * c
+  uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
+  uint64_t image0_address_phy = vaddr_to_paddr(args.image0.address);
+  uint64_t image1_address_phy = vaddr_to_paddr(args.image1.address);
+  uint64_t output_address_phy = vaddr_to_paddr(args.output.address);
+
+  uint64_t image_amount_per_row =
+      align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
+                 IMAGE_ALIGNMENT);
+  uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
+                               ((uint64_t)args.image0.width << 16) |
+                               (uint64_t)args.image0.height;  // c<<32|w<<16|h
+
+  (*arg).driver.image0_address_phy = image0_address_phy;
+  (*arg).driver.image1_address_phy = image1_address_phy;
+  (*arg).driver.datalen = datalen;
+  (*arg).driver.image_image_pixel = image_image_pixel;
+  (*arg).driver.image_amount_per_row = image_amount_per_row;
+  (*arg).driver.output_address_phy = output_address_phy;
+  (*arg).driver.coefficient = coefficient;
+  (*arg).driver.cmd = cmd;
+}  // expand_EW_arg
 
 void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                     framework::Tensor *out, framework::Tensor *filter,
-                    bool relu_enabled, int group_num, int stride_h,
-                    int stride_w, int padding_h, int padding_w, float *bs_ptr) {
-  auto input_ptr = input->data<float>();
-  auto filter_ptr = filter->data<float>();
-  auto out_ptr = out->data<float>();
+                    ActivationType activation_enable,
+                    int16_t leaky_relu_negative_slope, int group_num,
+                    int stride_h, int stride_w, int padding_h, int padding_w,
+                    float *bs_ptr) {  // fills arg per split; frees bs_ptr
+  auto input_ptr = input->data<half>();
+  auto filter_ptr = filter->data<int8_t>();
+  auto out_ptr = out->data<half>();
+  auto deleter = [](void *p) { fpga_free(p); };  // shared_ptr deleter
 
   arg->group_num = (uint32_t)group_num;
-  arg->split_num = 1;
+  // Filters are split only when ungrouped: group_num or split_num is 1.
+  arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
   arg->filter_num = (uint32_t)filter->dims()[0];
   arg->output.address = out_ptr;
   arg->output.scale_address = out->scale;
   arg->conv_arg =
       (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
 
+  arg->shared_conv_arg = std::shared_ptr<ConvArgs>(arg->conv_arg, deleter);
+  // conv_arg is now owned by shared_conv_arg; zero it before filling.
+  memset(arg->conv_arg, 0, arg->split_num * sizeof(struct ConvArgs));
+
   arg->concat_arg.image_num = arg->split_num;
   arg->concat_arg.image_out = out_ptr;
   arg->concat_arg.scale_out = out->scale;
@@ -183,43 +512,509 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
 
   int n = arg->split_num;
   arg->concat_arg.images_in =
-      (half **)fpga_malloc(n * sizeof(int *));  // NOLINT
+      static_cast<int16_t **>(fpga_malloc(n * sizeof(int16_t *)));
   arg->concat_arg.scales_in =
-      (float **)fpga_malloc(n * sizeof(float *));  // NOLINT
+      static_cast<float **>(fpga_malloc(n * sizeof(float *)));
   arg->concat_arg.channel_num =
-      (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
+      static_cast<uint32_t *>(fpga_malloc(n * sizeof(uint32_t)));
+  arg->vector_concat_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(arg->concat_arg.images_in), deleter));
+  arg->vector_concat_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(arg->concat_arg.scales_in), deleter));
+  arg->vector_concat_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(arg->concat_arg.channel_num), deleter));
+  // The concat arrays above are released through vector_concat_space.
+  auto channel = (int)out->dims()[1];  // NOLINT
+  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
+  int element_num = get_aligned_filter_element_num(
+      (int)(filter->dims()[1] * filter->dims()[2] *  // NOLINT
+            filter->dims()[3]));
 
   for (int i = 0; i < n; i++) {
-    arg->conv_arg[i].relu_enabled = relu_enabled;
-    arg->conv_arg[i].sb_address = bs_ptr;
-    arg->conv_arg[i].filter_address = (int8_t *)filter_ptr;  // NOLINT
-    arg->conv_arg[i].filter_scale_address = filter->scale;
-    arg->conv_arg[i].filter_num = arg->filter_num;
+    // Activation settings replace the former relu_enabled flag.
+    arg->conv_arg[i].output.activation.activation_type = activation_enable;
+    arg->conv_arg[i].output.activation.leaky_relu_negative_slope =
+        leaky_relu_negative_slope;
     arg->conv_arg[i].group_num = (uint32_t)group_num;
-
     arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
     arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
     arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
     arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];
-
     arg->conv_arg[i].image.address = input_ptr;
-    arg->conv_arg[i].image.scale_address = input->scale;
-    arg->conv_arg[i].image.channels =
-        (uint32_t)get_aligned_channel_num((int)(input->dims()[1]));  // NOLINT
+    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
     arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
     arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
+    arg->conv_arg[i].image.scale_address = input->scale;
     arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
     arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
+    arg->conv_arg[i].filter_scale_address = filter->scale;
+    arg->conv_arg[i].filter_num = (uint32_t)(
+        i == n - 1 ? channel - (n - 1) * filter_num_per_div  // NOLINT
+                   : filter_num_per_div);  // last split takes the remainder
 
-    arg->conv_arg[i].output.address = out_ptr;
-    arg->conv_arg[i].output.scale_address = out->scale;
+    size_t filter_size =
+        element_num *
+        align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) *
+        sizeof(int8_t);
+    auto filter_head = &(
+        (int8_t *)filter_ptr)[i * element_num * filter_num_per_div];  // NOLINT
+    arg->conv_arg[i].filter_address = fpga_malloc(filter_size);  // per-split copy
+    arg->vector_conv_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(arg->conv_arg[i].filter_address), deleter));
+    memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
+    fpga_flush(arg->conv_arg[i].filter_address, filter_size);
+    // for test
+    //    {
+    //    static int cnt = 0;
+    //    if(cnt == 4){
+    //        int8_t result = 0;
+    //        std::string str = "fc_filter";
+    //      fpga::savefile<int8_t>(str, arg->conv_arg[i].filter_address,
+    //      filter_size, result);
+    //
+    //    }
+    //    cnt++;
+    //}
 
-    int num_after_alignment = filter::calc_aligned_num(
-        arg->filter_num, (int)input->dims()[1]);  // NOLINT
-    arg->conv_arg[i].free_space =
-        fpga_malloc(num_after_alignment * 2 * sizeof(float));  // half
+    size_t bs_size = 2 *
+                     align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
+                     sizeof(float);
+    auto bs_head = &bs_ptr[i * filter_num_per_div * 2];  // 2 floats per filter
+    arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
+    arg->vector_conv_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(arg->conv_arg[i].sb_address), deleter));
+    memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
+    fpga_flush(arg->conv_arg[i].sb_address, bs_size);
+    // for test
+    /*{
+    static int cnt = 0;
+    if(cnt == 4){
+        float result = 0;
+        std::string str = "fc_bs";
+      fpga::savefile<float>(str, arg->conv_arg[i].sb_address, bs_size/4,
+result);
+
+    }
+    cnt++;
+}*/
+
+    if (n > 1) {  // splits write to scratch buffers listed in concat_arg
+      arg->conv_arg[i].output.scale_address =
+          static_cast<float *>(fpga_malloc(2 * sizeof(float)));
+      arg->conv_arg[i].output.address =
+          fpga_malloc(out->dims()[2] *
+                      align_to_x((int)(out->dims()[3] *  // NOLINT
+                                       arg->conv_arg[i].filter_num),
+                                 IMAGE_ALIGNMENT) *
+                      sizeof(half));
+      arg->vector_conv_space.push_back(std::shared_ptr<char>(
+          reinterpret_cast<char *>(arg->conv_arg[i].output.scale_address),
+          deleter));
+      arg->vector_conv_space.push_back(std::shared_ptr<char>(
+          reinterpret_cast<char *>(arg->conv_arg[i].output.address), deleter));
+    } else {  // a single conv writes straight into out
+      arg->conv_arg[i].output.scale_address = out->scale;
+      arg->conv_arg[i].output.address = out_ptr;
+    }
+
+    arg->concat_arg.images_in[i] =
+        (half *)arg->conv_arg[i].output.address;  // NOLINT
+    arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
+    arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
+
     expand_conv_arg(&arg->conv_arg[i]);
   }
-}
+  filter->reset_data_ptr(nullptr);  // bytes were copied per split above
+  fpga_free(bs_ptr);  // values were copied into each sb_address
+}  // fill_split_arg
+
+void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
+                     framework::Tensor *out, framework::Tensor *filter,
+                     ActivationType activation_enable,
+                     int16_t leaky_relu_negative_slope, int group_num,
+                     int stride_h, int stride_w, int padding_h, int padding_w,
+                     float *bs_ptr) {  // bs_ptr is freed before returning
+  auto input_ptr = input->data<half>();
+  auto filter_ptr = filter->data<int8_t>();
+  auto deleter = [](void *p) { fpga_free(p); };  // shared_ptr deleter
+  // Fills arg with one SplitConvArgs per sub convolution (sub_conv_num).
+  arg->group_num = (uint32_t)group_num;
+  arg->sub_conv_num = (uint32_t)stride_h;  // NOTE(review): rest uses stride_w
+  arg->filter_num = (uint32_t)filter->dims()[0];
+  uint32_t sub_conv_num = arg->sub_conv_num;
+  int sub_pad =
+      deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],  // NOLINT
+                                         padding_w, stride_w);
+  auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis(
+      (int)filter->dims()[3], stride_w);  // NOLINT
+
+  auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
+      (int)input->dims()[3], sub_pad, sub_filter_width);  // NOLINT
+  auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
+      (int)input->dims()[2], sub_pad, sub_filter_width);  // NOLINT
+
+  arg->sub_output_width = (uint32_t)sub_output_width;
+  arg->sub_output_height = (uint32_t)sub_output_height;
+  arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
+      stride_w, (int)filter->dims()[3], padding_w);  // NOLINT
+
+  auto sub_channels = (int)input->dims()[1];  // NOLINT
+  uint32_t omit_size = arg->omit_size;
+  int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
+  int sub_filter_num = sub_conv_num * (arg->filter_num);
+
+  framework::DDim dims_out_new = framework::make_ddim(
+      {1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width});
+  fpga::format_fp16_ofm(out, dims_out_new);
+  auto out_ptr = out->data<half>();
+  arg->output.address =
+      (half *)out_ptr +  // NOLINT
+      omit_size * sizeof(half) *  // NOTE(review): scaled twice on a half*?
+          (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
+  arg->output.scale_address = out->scale;
+
+  uint32_t conv_output_size =
+      (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) *
+      sub_output_height;
+  uint32_t split_num =
+      group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1;
+  // Pass 1: allocate the per-sub-conv arrays and register them for release.
+  for (int i = 0; i < sub_conv_num; ++i) {
+    arg->split_conv_args.push_back(std::make_shared<SplitConvArgs>());
+    arg->split_conv_args[i]->filter_num =
+        (arg->sub_conv_num) * (arg->filter_num);
+    arg->split_conv_args[i]->group_num = (uint32_t)group_num;
+    arg->split_conv_args[i]->split_num = split_num;
+    arg->split_conv_args[i]->concat_arg.height = sub_output_height;
+    arg->split_conv_args[i]->concat_arg.width = sub_output_width;
+    arg->split_conv_args[i]->concat_arg.image_num = split_num;
+
+    arg->split_conv_args[i]->conv_arg =
+        static_cast<ConvArgs *>(fpga_malloc(split_num * sizeof(ConvArgs)));
+    arg->split_conv_args[i]->concat_arg.images_in =
+        static_cast<int16_t **>(fpga_malloc(split_num * sizeof(int16_t *)));
+    arg->split_conv_args[i]->concat_arg.scales_in =
+        static_cast<float **>(fpga_malloc(split_num * sizeof(float *)));
+    arg->split_conv_args[i]->concat_arg.channel_num =
+        static_cast<uint32_t *>(fpga_malloc(split_num * sizeof(uint32_t)));
+    arg->split_conv_args[i]->shared_conv_arg =
+        std::shared_ptr<ConvArgs>(arg->split_conv_args[i]->conv_arg, deleter);
+    arg->split_conv_args[i]->vector_concat_space.push_back(
+        std::shared_ptr<char>(
+            reinterpret_cast<char *>(
+                arg->split_conv_args[i]->concat_arg.images_in),
+            deleter));
+    arg->split_conv_args[i]->vector_concat_space.push_back(
+        std::shared_ptr<char>(
+            reinterpret_cast<char *>(
+                arg->split_conv_args[i]->concat_arg.scales_in),
+            deleter));
+    arg->split_conv_args[i]->vector_concat_space.push_back(
+        std::shared_ptr<char>(
+            reinterpret_cast<char *>(
+                arg->split_conv_args[i]->concat_arg.channel_num),
+            deleter));
+  }
+
+  auto filter_num_per_div =
+      (uint32_t)get_deconv_filter_num_per_div(filter, group_num, stride_w);
+  int element_num = get_aligned_filter_element_num(
+      (int)(sub_channels * sub_filter_width * sub_filter_width));  // NOLINT
+
+  int chw = sub_channels * sub_filter_width * sub_filter_width;
+  int division_capacity = filter::calc_division_capacity(chw);
+  int num_per_div_before_alignment =
+      filter::calc_num_per_div(sub_filter_num, group_num, division_capacity);
+  int num_per_div_after_alignment =
+      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
+  int div_num = (sub_filter_num + num_per_div_before_alignment - 1) /
+                num_per_div_before_alignment;
+  int residual = sub_filter_num % num_per_div_before_alignment;
+  int num_after_alignment = num_per_div_after_alignment *
+                                ((residual == 0) ? div_num : (div_num - 1)) +
+                            align_to_x(residual, FILTER_NUM_ALIGNMENT);
+
+  int filter_sub_conv_offset = element_num * num_after_alignment;
+  uint32_t out_addr_offset = 0;
+  for (int i = 0; i < sub_conv_num; ++i) {
+    if (sub_conv_num == 1) {
+      arg->split_conv_args[i]->output.address = arg->output.address;
+      arg->split_conv_args[i]->output.scale_address = arg->output.scale_address;
+      out_addr_offset = 0;
+
+    } else {
+      out_addr_offset =
+          sizeof(int16_t) * (sub_conv_num - 1 - i) *  // in bytes
+          (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
+
+      arg->split_conv_args[i]->output.address = out_ptr;
+      arg->split_conv_args[i]->output.scale_address =
+          static_cast<float *>(fpga_malloc(2 * sizeof(float)));
+      arg->split_conv_args[i]->vector_conv_space.push_back(
+          std::shared_ptr<char>(
+              reinterpret_cast<char *>(
+                  arg->split_conv_args[i]->output.scale_address),
+              deleter));
+    }
+    // Pass 2: fill every split conv of sub conv i.
+    for (int j = 0; j < split_num; ++j) {
+      arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type =
+          activation_enable;
+      arg->split_conv_args[i]
+          ->conv_arg[j]
+          .output.activation.leaky_relu_negative_slope =
+          leaky_relu_negative_slope;
+      arg->split_conv_args[i]->conv_arg[j].group_num = (uint32_t)group_num;
+
+      arg->split_conv_args[i]->conv_arg[j].kernel.width =
+          (uint32_t)sub_filter_width;
+      arg->split_conv_args[i]->conv_arg[j].kernel.height =
+          (uint32_t)sub_filter_width;
+      arg->split_conv_args[i]->conv_arg[j].kernel.stride_w = 1;
+      arg->split_conv_args[i]->conv_arg[j].kernel.stride_h = 1;
+
+      arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.deconv_en = 1;
+      arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.sub_conv_num =
+          sub_conv_num;
+      arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.omit_size =
+          omit_size;
+      arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.out_addr_offset =
+          out_addr_offset;
+
+      arg->split_conv_args[i]->conv_arg[j].image.scale_address = input->scale;
+      arg->split_conv_args[i]->conv_arg[j].image.channels =
+          (uint32_t)sub_channels;
+      arg->split_conv_args[i]->conv_arg[j].image.width =
+          (uint32_t)input->dims()[3];
+      arg->split_conv_args[i]->conv_arg[j].image.height =
+          (uint32_t)input->dims()[2];
+      arg->split_conv_args[i]->conv_arg[j].image.pad_width = (uint32_t)sub_pad;
+      arg->split_conv_args[i]->conv_arg[j].image.pad_height = (uint32_t)sub_pad;
+      arg->split_conv_args[i]->conv_arg[j].image.address = input_ptr;
+
+      arg->split_conv_args[i]->conv_arg[j].filter_scale_address = filter->scale;
+      arg->split_conv_args[i]->conv_arg[j].filter_num =
+          (uint32_t)(j == split_num - 1
+                         ? sub_filter_num - (split_num - 1) * filter_num_per_div
+                         : filter_num_per_div);  // last split: remainder
+
+      size_t filter_size =
+          element_num *
+          align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num,
+                     FILTER_NUM_ALIGNMENT) *
+          sizeof(int8_t);
+      auto filter_head = &((
+          int8_t *)filter_ptr)[j * element_num * filter_num_per_div +  // NOLINT
+                               i * filter_sub_conv_offset];
+      arg->split_conv_args[i]->conv_arg[j].filter_address =
+          fpga_malloc(filter_size);
+      arg->split_conv_args[i]->vector_conv_space.push_back(
+          std::shared_ptr<char>(
+              reinterpret_cast<char *>(
+                  arg->split_conv_args[i]->conv_arg[j].filter_address),
+              deleter));
+
+      memcpy(arg->split_conv_args[i]->conv_arg[j].filter_address, filter_head,
+             filter_size);
+      fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address,
+                 filter_size);
+
+      size_t bs_align_num = align_to_x(
+          arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT);
+      size_t bs_size = 2 * bs_align_num * sizeof(float);
+      auto bs_head = &bs_ptr[j * filter_num_per_div * 2];  // independent of i
+
+      arg->split_conv_args[i]->conv_arg[j].sb_address = fpga_malloc(bs_size);
+      arg->split_conv_args[i]->vector_conv_space.push_back(
+          std::shared_ptr<char>(
+              reinterpret_cast<char *>(
+                  arg->split_conv_args[i]->conv_arg[j].sb_address),
+              deleter));
+
+      memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size);
+      fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size);
+
+      if (split_num == 1) {  // write directly to the sub conv output
+        arg->split_conv_args[i]->conv_arg[j].output.address =
+            arg->split_conv_args[i]->output.address;
+        arg->split_conv_args[i]->conv_arg[j].output.scale_address =
+            arg->split_conv_args[i]->output.scale_address;
+      } else {  // scratch buffers, listed in concat_arg below
+        arg->split_conv_args[i]->conv_arg[j].output.address =
+            fpga_malloc(conv_output_size * sizeof(int16_t));
+        arg->split_conv_args[i]->conv_arg[j].output.scale_address =
+            static_cast<float *>(fpga_malloc(2 * sizeof(float)));
+        arg->split_conv_args[i]->vector_conv_space.push_back(
+            std::shared_ptr<char>(
+                reinterpret_cast<char *>(
+                    arg->split_conv_args[i]->conv_arg[j].output.address),
+                deleter));
+        arg->split_conv_args[i]->vector_conv_space.push_back(
+            std::shared_ptr<char>(
+                reinterpret_cast<char *>(
+                    arg->split_conv_args[i]->conv_arg[j].output.scale_address),
+                deleter));
+      }
+      arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<half *>(
+          arg->split_conv_args[i]->conv_arg[j].output.address);
+      arg->split_conv_args[i]->concat_arg.scales_in[j] =
+          arg->split_conv_args[i]->conv_arg[j].output.scale_address;
+      arg->split_conv_args[i]->concat_arg.channel_num[j] =
+          arg->split_conv_args[i]->conv_arg[j].filter_num;
+
+      expand_conv_arg(&(arg->split_conv_args[i]->conv_arg[j]));
+    }
+
+    arg->split_conv_args[i]->concat_arg.image_out =
+        arg->split_conv_args[i]->output.address;
+    arg->split_conv_args[i]->concat_arg.scale_out =
+        arg->split_conv_args[i]->output.scale_address;
+  }
+  filter->reset_data_ptr(nullptr);  // bytes were copied per split above
+  fpga_free(bs_ptr);  // values were copied into each sb_address
+}  // fill_deconv_arg
+
+void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
+                     framework::Tensor *out, framework::Tensor *filter,
+                     ActivationType activation_enable,
+                     int16_t leaky_relu_negative_slope, int stride_h,
+                     int stride_w, int padding_h, int padding_w,
+                     float *bias_ptr) {
+  // arg keeps bias_ptr alive and hands it to fpga_free when it is released.
+  arg->vector_dwconv_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(bias_ptr), [](void *buf) { fpga_free(buf); }));
+  auto weights = filter->data<int16_t>();
+  auto ifm = input->data<half>();
+  auto ofm = out->mutable_data<half>();
+
+  // Input image: data, scale, geometry and padding.
+  arg->image.address = ifm;
+  arg->image.scale_address = input->scale;
+  arg->image.channels = static_cast<uint32_t>(input->dims()[1]);
+  arg->image.height = static_cast<uint32_t>(input->dims()[2]);
+  arg->image.width = static_cast<uint32_t>(input->dims()[3]);
+  arg->image.pad_height = static_cast<uint32_t>(padding_h);
+  arg->image.pad_width = static_cast<uint32_t>(padding_w);
+  arg->kernel.height = static_cast<uint32_t>(filter->dims()[2]);
+  arg->kernel.width = static_cast<uint32_t>(filter->dims()[3]);
+  arg->kernel.stride_h = static_cast<uint32_t>(stride_h);
+  arg->kernel.stride_w = static_cast<uint32_t>(stride_w);
+  arg->filter_address = weights;
+  arg->bias_address = bias_ptr;
+  arg->sub_conv_num = 1;
+  arg->output.address = ofm;
+  arg->output.scale_address = out->scale;
+  arg->output.activation.activation_type = activation_enable;
+  arg->output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
+}  // fill_dwconv_arg
+
+void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
+                       framework::Tensor *out, framework::Tensor *filter,
+                       ActivationType activation_enable,
+                       int16_t leaky_relu_negative_slope, int stride_h,
+                       int stride_w, int padding_h, int padding_w,
+                       float *bias_ptr) {  // NOTE(review): never freed here
+  auto filter_ptr = filter->data<int8_t>();  // re-read as half below
+  auto input_ptr = input->data<half>();
+  // Fills arg with one DWconvArgs per sub convolution (stride_w of them).
+  auto deleter = [](void *p) { fpga_free(p); };
+
+  arg->group_num = (uint32_t)filter->dims()[0];
+  arg->sub_conv_num = (uint32_t)stride_w;
+  arg->filter_num = (uint32_t)filter->dims()[0];
+
+  int sub_conv_num = stride_w;
+
+  int sub_pad =
+      deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],  // NOLINT
+                                         padding_w, stride_w);
+  auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis(
+      (int)filter->dims()[3], stride_w);  // NOLINT
+
+  auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
+      (int)input->dims()[3], sub_pad, sub_filter_width);  // NOLINT
+  auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
+      (int)input->dims()[2], sub_pad, sub_filter_width);  // NOLINT
+
+  arg->sub_output_width = (uint32_t)sub_output_width;
+  arg->sub_output_height = (uint32_t)sub_output_height;
+  arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
+      stride_w, (int)filter->dims()[3], padding_w);  // NOLINT
+
+  auto sub_channels = (int)input->dims()[1];  // NOLINT
+  uint32_t omit_size = arg->omit_size;
+  int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
+  int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
+  int sub_filter_num = sub_conv_num * (arg->filter_num);  // unused below
+
+  framework::DDim dims_out_new = framework::make_ddim(
+      {1, arg->filter_num, real_out_height, real_out_width});
+  fpga::format_fp16_ofm(out, dims_out_new);
+  auto out_ptr = out->data<half>();
+
+  /*====For Addition
+  arg->output.address =
+      (half *)out_ptr +  // NOLINT
+      omit_size * sizeof(half) *
+          (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
+          */
+  arg->output.address = out_ptr;
+  arg->output.scale_address = out->scale;
+
+  int filter_offset = sub_filter_width * sub_filter_width *
+                      align_to_x(sub_channels, FILTER_ELEMENT_ALIGNMENT) *
+                      arg->sub_conv_num;  // elements per sub conv filter
+
+  for (int i = 0; i < sub_conv_num; ++i) {
+    arg->dw_conv_args.push_back(std::make_shared<DWconvArgs>());
+
+    arg->dw_conv_args[i]->sub_conv_num = sub_conv_num;
+    // arg->dw_conv_args[i]->relu_enabled = relu_enabled;
+    arg->dw_conv_args[i]->output.activation.activation_type = activation_enable;
+    arg->dw_conv_args[i]->output.activation.leaky_relu_negative_slope =
+        leaky_relu_negative_slope;
+    arg->dw_conv_args[i]->bias_address = bias_ptr;  // shared by all sub convs
+    // NOTE(review): no fpga_flush after this copy, unlike fill_split_arg.
+    arg->dw_conv_args[i]->filter_address =
+        fpga_malloc(filter_offset * sizeof(int16_t));
+    memcpy(arg->dw_conv_args[i]->filter_address,
+           (reinterpret_cast<half *>(filter_ptr) + i * filter_offset),
+           filter_offset * sizeof(int16_t));
+    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(arg->dw_conv_args[i]->filter_address),
+        deleter));
+
+    arg->dw_conv_args[i]->kernel.height = (uint32_t)sub_filter_width;
+    arg->dw_conv_args[i]->kernel.width = (uint32_t)sub_filter_width;
+
+    arg->dw_conv_args[i]->kernel.stride_h = (uint32_t)1;
+    arg->dw_conv_args[i]->kernel.stride_w = (uint32_t)1;
+    arg->dw_conv_args[i]->image.address = input_ptr;
+    arg->dw_conv_args[i]->image.channels = (uint32_t)input->dims()[1];
+    arg->dw_conv_args[i]->image.height = (uint32_t)input->dims()[2];
+    arg->dw_conv_args[i]->image.width = (uint32_t)input->dims()[3];
+
+    arg->dw_conv_args[i]->image.pad_height = sub_pad;
+    arg->dw_conv_args[i]->image.pad_width = sub_pad;
+    arg->dw_conv_args[i]->image.scale_address = input->scale;
+
+    arg->dw_conv_args[i]->output.address =
+        fpga_malloc(sub_output_height *
+                    align_to_x(sub_output_width * sub_channels * sub_conv_num,
+                               IMAGE_ALIGNMENT) *
+                    sizeof(int16_t));
+    arg->dw_conv_args[i]->output.scale_address =
+        static_cast<float *>(fpga_malloc(2 * sizeof(float)));
+    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(arg->dw_conv_args[i]->output.address),
+        deleter));
+    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(arg->dw_conv_args[i]->output.scale_address),
+        deleter));
+  }
+
+  // arg->output.scale_address = out->scale;
+}  // fill_DWDeconv_arg
 
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h
index 7f87f158a8..33a5d3d33f 100644
--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include "fpga/common/fpga_common.h"
 #include "fpga/common/pe.h"
 #include "framework/tensor.h"
@@ -21,31 +22,81 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {
 
+void format_image(framework::Tensor* image_tensor);
+void format_ofm(framework::Tensor* ofm_tensor);
+void format_fp16_ofm(framework::Tensor* ofm_tensor);  // only allocate memory
+void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
+void format_fp32_ofm(framework::Tensor* ofm_tensor);
+
 float filter_find_max(framework::Tensor* filter_tensor);
-int get_aligned_channel_num(int channel_num);
-int get_aligned_filter_num(framework::Tensor* filter_tensor);
-int get_conv_output_channel(framework::Tensor* filter_tensor);
+int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num);
+int get_deconv_filter_num_per_div(framework::Tensor* filter_tensor,
+                                  int group_num, int stride);
 
-void format_image(framework::Tensor* image_tensor);
-void format_fp16_ofm(framework::Tensor* ofm_tensor,
-                     int aligned_channel);  // only allocate memory
-void format_fp32_ofm(framework::Tensor* ofm_tensor, int aligned_channel);
+int get_plit_num(framework::Tensor* filter_tensor);  // sic: the split count
+int get_deconv_plit_num(framework::Tensor* filter_tensor, int stride);  // sic
 
+int get_aligned_filter_element_num(int chw);  // chw: elements of one filter
 void format_filter(framework::Tensor* filter_tensor, float max_value,
                    int group_num);
 void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
-void format_bias_scale_array(float** bias_scale_array, int filter_num,
-                             int filter_channel);
+void format_bias_scale_array(float** bias_scale_array,
+                             int element_num_per_division, int num);
+void format_bias_array(float** bias_array, int num);
 void format_concat_output(framework::Tensor* out, int height, int width,
-                          uint32_t out_channel);
-int format_conv_data(framework::Tensor* filter_tensor,
-                     framework::Tensor* ofm_tensor, float** bs_ptr, int group);
-int format_fc_data(framework::Tensor* filter_tensor,
-                   framework::Tensor* ofm_tensor, float** bs_ptr);
+                          int image_num, uint32_t* channel_num);
+
 void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
                     framework::Tensor* out, framework::Tensor* filter,
-                    bool relu_enabled, int group_num, int stride_h,
-                    int stride_w, int padding_h, int padding_w, float* bs_ptr);
+                    ActivationType activation_enable,
+                    int16_t leaky_relu_negative_slope, int group_num,
+                    int stride_h, int stride_w, int padding_h, int padding_w,
+                    float* bs_ptr);  // bs_ptr is freed by the callee
+void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input,
+                     framework::Tensor* out, framework::Tensor* filter,
+                     ActivationType activation_enable,
+                     int16_t leaky_relu_negative_slope, int group_num,
+                     int stride_h, int stride_w, int padding_h, int padding_w,
+                     float* bs_ptr);  // bs_ptr is freed by the callee
+void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
+                     framework::Tensor* out, framework::Tensor* filter,
+                     ActivationType activation_enable,
+                     int16_t leaky_relu_negative_slope, int stride_h,
+                     int stride_w, int padding_h, int padding_w,
+                     float* bias_ptr);  // ownership of bias_ptr moves into arg
+void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input,
+                       framework::Tensor* out, framework::Tensor* filter,
+                       ActivationType activation_enable,
+                       int16_t leaky_relu_negative_slope, int stride_h,
+                       int stride_w, int padding_h, int padding_w,
+                       float* bias_ptr);  // stored in arg; not freed by callee
+
+void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
+                          int group_num, int stride);
+void format_dwconv_filter(framework::Tensor* filter_tensor, float* scale_ptr);
+void format_conv_data(framework::Tensor* filter_tensor,  // was int-returning
+                      framework::Tensor* ofm_tensor, float** bs_ptr, int group);
+void format_deconv_data(framework::Tensor* filter_tensor,
+                        framework::Tensor* ofm_tensor, float** bs_ptr,
+                        int group, int sub_conv_n);
+void format_dwconv_data(framework::Tensor* filter_tensor,
+                        framework::Tensor* ofm_tensor, float* scale_ptr,
+                        float** bias_ptr);  // NOTE(review): in/out? confirm
+void format_DWDeconv_data(framework::Tensor* filter_tensor,
+                          framework::Tensor* ofm_tensor, float** bs_ptr,
+                          int group, int sub_conv_n);
+
+template <typename Dtype>
+void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
+  // Debug helper: writes dataSize values read from buffer (viewed as a
+  // Dtype array) to filename as text, one per line. Each value is converted
+  // to float before printing. `tmp` is never read.
+  const Dtype* values = static_cast<const Dtype*>(buffer);
+  std::ofstream out(filename.c_str());
+  for (int idx = 0; idx < dataSize; ++idx) {
+    out << static_cast<float>(values[idx]) << std::endl;
+  }
+}  // the ofstream destructor closes the file
 
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/bias_scale.cpp b/src/fpga/V2/bias_scale.cpp
index c8f587da33..ca93fe17ca 100644
--- a/src/fpga/V2/bias_scale.cpp
+++ b/src/fpga/V2/bias_scale.cpp
@@ -20,26 +20,81 @@ namespace paddle_mobile {
 namespace fpga {
 namespace bias_scale {
 
-void align_element(float **data_in, int num, int num_after_alignment) {
+// Pads each division of a [bias x num | scale x num] array with zeros up to
+// align_to_x(n, BS_NUM_ALIGNMENT) entries; frees and replaces *data_in.
+void align_element(float **data_in, int num_per_div_before_alignment, int num) {
   float *ptr_unaligned = *data_in;
-  int total_element = 2 * num_after_alignment;  // including bias & scale
+  int div_num =
+      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
+  int num_per_div_after_alignment =
+      align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
+  int num_element =
+      2 * div_num * num_per_div_after_alignment;  // including bias & scale
   float *ptr_aligned =
-      (float *)fpga_malloc(total_element * sizeof(float));  // NOLINT
-  memset(ptr_aligned, 0, total_element * sizeof(float));
+      (float *)fpga_malloc(num_element * sizeof(float));  // NOLINT
 
-  for (int i = 0; i < num; i++) {
-    ptr_aligned[i * 2 + 0] = ptr_unaligned[i];
-    ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num];
+  memset(ptr_aligned, 0, num_element * sizeof(float));
+
+  for (int i = 0; i < div_num; i++) {
+    // Every division is full except the last, which holds the remainder
+    // (num % aligned size under-counts it when the size is not yet aligned).
+    int copynum = num_per_div_before_alignment;
+    if (i == div_num - 1) {
+      copynum = num - i * num_per_div_before_alignment;
+    }
+
+    memcpy(ptr_aligned + i * num_per_div_after_alignment,
+           ptr_unaligned + num_per_div_before_alignment * i,
+           copynum * sizeof(float));
+    memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
+           ptr_unaligned + num_per_div_before_alignment * i + num,
+           copynum * sizeof(float));
   }
 
   fpga_free(ptr_unaligned);
   *data_in = ptr_aligned;
 }
 
-void format_bias_scale_array(float **data_in, int num,
-                             int num_after_alignment) {
-  align_element(data_in, num, num_after_alignment);
-  fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float));
+// Rearranges [bias x n | scale x n] (n = num_after_alignment) into
+// alternating runs of 4 biases and 4 scales; the input buffer is freed.
+void interleave(float **data_in, int num_after_alignment) {
+  float *src_bias = *data_in;
+  float *src_scale = src_bias + num_after_alignment;
+  float *dst =
+      (float *)fpga_malloc(2 * num_after_alignment * sizeof(float));  // NOLINT
+  const int group_count = num_after_alignment / 4;
+  for (int g = 0; g < group_count; ++g) {
+    float *out = dst + 8 * g;
+    memcpy(out, src_bias + 4 * g, 4 * sizeof(float));
+    memcpy(out + 4, src_scale + 4 * g, 4 * sizeof(float));
+  }
+
+  fpga_free(src_bias);
+  *data_in = dst;
+}
+
+void format_bias_scale_array(float **bias_scale_array,
+                             int element_num_per_division, int num) {
+  align_element(bias_scale_array, element_num_per_division, num);
+  int div_num = (num + element_num_per_division - 1) / element_num_per_division;
+  // Aligned bias count over ALL divisions; the buffer holds 2x (bias + scale).
+  int total = div_num * align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
+  interleave(bias_scale_array, total);
+  fpga_flush(*bias_scale_array, 2 * total * sizeof(float));  // whole buffer
+}
+// Converts num float biases with fp32_2_fp16 into a zero-padded int16_t
+// buffer; *bias_array stays float* but then points at that int16_t data.
+void format_bias_array(float **bias_array, int num) {
+  float *src = *bias_array;
+  const int padded_num = align_to_x(num, BIAS_NUM_ALIGNMENT);
+  int16_t *dst =
+      (int16_t *)fpga_malloc(padded_num * sizeof(int16_t));  // NOLINT
+  memset(dst, 0, padded_num * sizeof(int16_t));
+  for (int idx = 0; idx < num; ++idx) {
+    dst[idx] = fp32_2_fp16(src[idx]);
+  }
+  *bias_array = (float *)dst;  // NOLINT
+  fpga_free(src);
 }
 
 }  // namespace bias_scale
diff --git a/src/fpga/V2/bias_scale.h b/src/fpga/V2/bias_scale.h
index 6040c0bef1..9ebdc71bce 100644
--- a/src/fpga/V2/bias_scale.h
+++ b/src/fpga/V2/bias_scale.h
@@ -18,8 +18,11 @@ namespace paddle_mobile {
 namespace fpga {
 namespace bias_scale {
 
-void align_element(float **data_in, int num, int num_after_alignment);
-void format_bias_scale_array(float **data_in, int num, int num_after_alignment);
+void align_element(float** data_in, int num_per_div_before_alignment, int num);
+void interleave(float** data_in, int num_after_alignment);
+void format_bias_scale_array(float** bias_scale_array,
+                             int element_num_per_division, int num);
+void format_bias_array(float** bias_array, int num);
 
 }  // namespace bias_scale
 }  // namespace fpga
diff --git a/src/fpga/V2/deconv_bias_scale.cpp b/src/fpga/V2/deconv_bias_scale.cpp
new file mode 100644
index 0000000000..f88e1a7738
--- /dev/null
+++ b/src/fpga/V2/deconv_bias_scale.cpp
@@ -0,0 +1,48 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/V2/deconv_bias_scale.h"
+// #include "deconv_bias_scale.h"
+#include "fpga/V2/bias_scale.h"
+// #include "bias_scale.h"
+// #include <memory.h>
+
+#include "fpga/V2/api.h"
+// #include "fpga_api.h"
+namespace paddle_mobile {
+namespace fpga {
+namespace deconv_bias_scale {
+
+// Repeats the num biases and the num scales sub_conv_n times each, giving
+// [bias x sub_conv_n*num | scale x sub_conv_n*num]; frees the input buffer.
+void deconv_bias_scale_expand(float** bias_scale_array, int num,
+                              int sub_conv_n) {
+  float* src = *bias_scale_array;
+  const int expanded_num = num * sub_conv_n;
+  float* dst_bias =
+      reinterpret_cast<float*>(fpga_malloc(sizeof(float) * expanded_num * 2));
+  float* dst_scale = dst_bias + expanded_num;
+  for (int rep = 0; rep < sub_conv_n; ++rep) {
+    const int shift = num * rep;
+    // biases first, then scales
+    fpga_copy(dst_bias + shift, src, num * sizeof(float));
+    fpga_copy(dst_scale + shift, src + num, num * sizeof(float));
+  }
+  *bias_scale_array = dst_bias;
+  fpga_free(src);
+}
+
+}  // namespace deconv_bias_scale
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/deconv_bias_scale.h b/src/fpga/V2/deconv_bias_scale.h
new file mode 100644
index 0000000000..820c6984d4
--- /dev/null
+++ b/src/fpga/V2/deconv_bias_scale.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle_mobile {
+namespace fpga {
+namespace deconv_bias_scale {
+
+void deconv_bias_scale_expand(float** bias_scale_array, int num,
+                              int sub_conv_n);
+
+}  // namespace deconv_bias_scale
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/deconv_filter.cpp b/src/fpga/V2/deconv_filter.cpp
new file mode 100644
index 0000000000..5ed9786f19
--- /dev/null
+++ b/src/fpga/V2/deconv_filter.cpp
@@ -0,0 +1,280 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/V2/deconv_filter.h"
+#include <memory.h>
+#include <algorithm>
+// #include "deconv_filter.h"
+#include "fpga/V2/filter.h"
+// #include "filter.h"
+#include "fpga/V2/api.h"
+
+namespace paddle_mobile {
+namespace fpga {
+namespace deconv_filter {
+
+/*
+Reverses the height*width weights of every (filter, channel) plane, i.e.
+flips each kernel in both axes; the input buffer is freed and replaced.
+*/
+void deconv_inverse_filter(float** data_in, int num, int channel, int width,
+                           int height) {
+  float* src = *data_in;
+  const int plane_len = height * width;
+  const int plane_count = num * channel;
+  float* dst = reinterpret_cast<float*>(
+      fpga_malloc(num * channel * width * height * sizeof(float)));
+  for (int p = 0; p < plane_count; ++p) {
+    const float* in_plane = src + p * plane_len;
+    float* out_plane = dst + p * plane_len;
+    for (int k = 0; k < plane_len; ++k) {
+      out_plane[k] = in_plane[plane_len - k - 1];
+    }
+  }
+  *data_in = dst;
+  fpga_free(src);
+}
+
+// Sub-convolution padding, (filter_axis - pad - 1) / stride. Enforces a
+// non-zero stride and a non-negative numerator.
+int deconv_calc_sub_pad(int filter_axis, int pad, int stride) {
+  const int span = filter_axis - pad - 1;
+  if (stride == 0 || span < 0) {
+    PADDLE_MOBILE_ENFORCE(false, "Wrong deconv parameters");
+  }
+  return span / stride;
+}
+int deconv_get_sub_filter_axis(int filter_axis, int stride) {
+  return filter_axis / stride;  // integer division; stride must be non-zero
+}
+
+int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) {
+  return image_axis + 2 * sub_pad - sub_filter_axis + 1;  // in + 2*pad - k + 1
+}
+
+/*
+    Finds the first idx in [1, stride] for which filter_width - pad equals
+    idx + k * stride for some k >= 0 (with idx + k * stride <= filter_width)
+    and returns stride - idx, the number of rows or columns to omit. When
+    stride >= 1 and nothing matches, idx ends at stride + 1 and -1 is returned.
+*/
+int deconv_get_omit(int stride, int filter_width, int pad) {
+  PADDLE_MOBILE_ENFORCE(filter_width > pad, "Wrong deconv parameters");
+  const int target = filter_width - pad;
+  int idx = 1;
+  for (; idx <= stride; ++idx) {
+    bool hit = false;
+    for (int j = idx; j <= filter_width; j += stride) {
+      if (j == target) {
+        hit = true;
+        break;
+      }
+    }
+    if (hit) {
+      break;
+    }
+  }
+  return stride - idx;
+}
+
+// Splits filters laid out as [kernel_num][height][width][channel] into
+// sub_conv_n groups of sub_num sub-filters, each sub_h x sub_w x channel.
+template <typename T>
+void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
+                           int kernel_num, int channel) {
+  T* ptr_tmp = *data_in;
+  int sub_num = kernel_num * sub_conv_n;
+  int sub_h = height / sub_conv_n;
+  int sub_w = width / sub_conv_n;
+  int sub_filter_size =
+      kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n;
+  T* ptr_sub_filter =
+      reinterpret_cast<T*>(fpga_malloc(sub_filter_size * sizeof(T)));
+  for (int idx = 0; idx < sub_conv_n; ++idx) {
+    for (int nn = 0; nn < sub_num; ++nn) {
+      int ni = nn % kernel_num;
+
+      int woff = sub_conv_n - 1 - (nn / kernel_num);  // column phase, reversed
+
+      for (int hh = 0; hh < sub_h; ++hh) {
+        int hi = hh * sub_conv_n + idx % sub_conv_n;
+        for (int ww = 0; ww < sub_w; ++ww) {
+          int wi = ww * sub_conv_n + woff;  // source column
+
+          int sidx = ((nn * sub_h + hh) * sub_w + ww) * channel;   // dst offset
+          int kidx = ((ni * height + hi) * width + wi) * channel;  // src offset
+
+          fpga_copy(
+              ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx,
+              (*data_in) + kidx, channel * sizeof(T));
+          // for (int cc =0; cc < channel; ++cc) {
+          //     ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] =
+          //     (*data_in)[kidx + cc];
+          // }
+        }
+      }
+    }
+  }
+  *data_in = ptr_sub_filter;
+  fpga_free(ptr_tmp);
+}
+
+// Transposes [kernel_num][channels][hw] into [channels][kernel_num][hw];
+// the input buffer is freed and *filter_in points at the new one.
+void deconv_NC_convert(float** filter_in, int kernel_num, int channels,
+                       int hw) {
+  float* src = *filter_in;
+  float* dst = reinterpret_cast<float*>(
+      fpga_malloc(hw * kernel_num * channels * sizeof(float)));
+  for (int c = 0; c < channels; ++c) {
+    for (int n = 0; n < kernel_num; ++n) {
+      fpga_copy(dst + (c * kernel_num + n) * hw, src + (n * channels + c) * hw,
+                hw * sizeof(float));
+    }
+  }
+  *filter_in = dst;
+  fpga_free(src);
+}
+
+// Formats a deconv filter: flips each kernel, quantizes it to 8-bit, converts
+// to HWC, splits it into `stride` sub-filter groups, then aligns, reorders and
+// interleaves each group into one contiguous, flushed buffer (*data_in).
+void deconv_format_filter(float** data_in, int num, int channel, int height,
+                          int width, int group_num, float max, int stride) {
+  int data_size = channel * height * width * num;
+  /*{
+       float result2 = (float)0;
+       string filename = "origin_filter_data";
+       api::savefile<float>(filename, (void *)*data_in, data_size, result2);
+    }*/
+  deconv_inverse_filter(data_in, num, channel, width, height);
+
+  /* {
+          float result2 = (float)0;
+          string filename = "inverse_filter_data";
+          api::savefile<float>(filename, (void *)*data_in, data_size, result2);
+   }*/
+  filter::quantize(data_in, data_size, max);
+  /* {
+        char result2 = (char)0;
+        string filename = "quantize_filter_data";
+        api::savefile<char>(filename, (void *)*data_in, data_size, result2);
+ }*/
+  char** quantize_data = (char**)data_in;  // NOLINT
+
+  filter::convert_to_hwc(quantize_data, num, channel, height, width);
+  /*{
+       char result2 = (char)0;
+       string filename = "convert_to_hwc_filter_data";
+       api::savefile<char>(filename, (void *)*quantize_data, data_size,
+  result2);
+  }*/
+
+  deconv_get_sub_filter<char>(quantize_data, height, width, stride, num,
+                              channel);
+  /*{
+     char result2 = (char)0;
+     string filename = "sub_filter_filter_data";
+     api::savefile<char>(filename, (void *)*quantize_data, data_size, result2);
+}*/
+
+  int sub_conv_n = stride;
+  int sub_h = height / sub_conv_n;
+  int sub_w = width / sub_conv_n;
+  int sub_chw = sub_h * sub_w * channel;
+  int sub_num = sub_conv_n * num;
+  int division_capacity = filter::calc_division_capacity(sub_chw);
+  int num_per_div_before_alignment =
+      filter::calc_num_per_div(sub_num, group_num, division_capacity);
+  int num_per_div_after_alignment =
+      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
+  int div_num = (sub_num + num_per_div_before_alignment - 1) /
+                num_per_div_before_alignment;
+  int residual = (sub_num) % num_per_div_before_alignment;
+  int num_after_alignment = num_per_div_after_alignment *
+                                ((residual == 0) ? div_num : (div_num - 1)) +
+                            align_to_x(residual, FILTER_NUM_ALIGNMENT);
+  // Copy each sub-convolution group into its own buffer before formatting it.
+  char** ptr_ptr_data =
+      reinterpret_cast<char**>(fpga_malloc(sub_conv_n * sizeof(char*)));
+  int origin_offset = sub_chw * sub_num;
+  for (int i = 0; i < sub_conv_n; ++i) {
+    (ptr_ptr_data)[i] =
+        reinterpret_cast<char*>(fpga_malloc(origin_offset * sizeof(char)));
+    fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i,
+              origin_offset * sizeof(char));
+
+    /* char result2 = (char)0;
+     string filename = "ptr_ptr_data" + to_string(i);
+     api::savefile<char>(filename, (void *)(ptr_ptr_data[i]), origin_offset,
+     result2);
+     */
+  }
+  // char result2 = (char)0;
+  //      string filename = "interleave";
+  //      api::savefile<char>(filename, (void *)*ptr_ptr_data, origin_offset,
+  //      result2);
+  fpga_free(*quantize_data);
+
+  int align_offset =
+      align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment;
+  char* ptr_space = reinterpret_cast<char*>(fpga_malloc(
+      sub_conv_n * align_offset * sizeof(char)));  // continuous space
+  for (int i = 0; i < sub_conv_n; ++i) {
+    char* ptr_tmp = (ptr_ptr_data)[i];
+
+    filter::align_element(&ptr_tmp, sub_num, sub_chw);
+    filter::align_num(&ptr_tmp, num_per_div_before_alignment, sub_num, sub_chw);
+
+    filter::reorder(&ptr_tmp, num_after_alignment, sub_chw);
+    filter::interleave(&ptr_tmp, num_after_alignment, sub_chw);
+
+    /*   char result2 = (char)0;
+       string filename = "interleave" + to_string(i);
+       api::savefile<char>(filename, (void *)ptr_tmp, align_offset, result2);
+*/
+    fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset);
+    fpga_free(ptr_tmp);
+  }
+  fpga_free(ptr_ptr_data);
+  *data_in = reinterpret_cast<float*>(ptr_space);
+
+  /*    {
+        char result2 = (char)0;
+         string filename = "ptr_space";
+         api::savefile<char>(filename, (void *)ptr_space, sub_conv_n *
+     align_offset, result2);
+      }*/
+  fpga_flush(ptr_space, sub_conv_n * align_offset * sizeof(char));
+}
+
+// Formats a DWDeconv filter in place: flip each kernel, quantize_to_fp16,
+// convert_to_hwn, split into sub-filters, align_element_n, then flush.
+void DWDconv_format_filter(float** data_in, int num, int channel, int height,
+                           int width, float* scale_ptr, int stride) {
+  deconv_inverse_filter(data_in, num, channel, width, height);
+  filter::quantize_to_fp16(data_in, channel, height, width, scale_ptr);
+  int16_t** quantize_data = (int16_t**)data_in;  // NOLINT
+  filter::convert_to_hwn(quantize_data, channel, height, width);
+
+  deconv_get_sub_filter<int16_t>(quantize_data, height, width, stride, num,
+                                 channel);
+  filter::align_element_n(quantize_data, channel, height, width);
+  fpga_flush(*quantize_data, align_to_x(channel, FILTER_ELEMENT_ALIGNMENT) *
+                                 height * width * sizeof(int16_t));
+}
+
+}  // namespace deconv_filter
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/deconv_filter.h b/src/fpga/V2/deconv_filter.h
new file mode 100644
index 0000000000..f1a50b95c5
--- /dev/null
+++ b/src/fpga/V2/deconv_filter.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle_mobile {
+namespace fpga {
+namespace deconv_filter {
+
+void deconv_inverse_filter(float** data_in, int num, int channel, int width,
+                           int height);
+int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
+int deconv_get_sub_filter_axis(int filter_axis, int stride);
+int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis);
+int deconv_get_omit(int stride, int filter_width, int pad);
+
+template <typename T>
+void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
+                           int kernel_num, int channel);
+void deconv_format_filter(float** data_in, int num, int channel, int height,
+                          int width, int group_num, float max, int stride);
+void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw);
+void DWDconv_format_filter(float** data_in, int num, int channel, int height,
+                           int width, float* scale_ptr, int stride);
+
+}  // namespace deconv_filter
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp
index 3b0692a99e..a281a7335c 100644
--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
@@ -16,44 +16,53 @@ limitations under the License. */
 #include <memory.h>
 #include <algorithm>
 #include "fpga/common/fpga_common.h"
+
 namespace paddle_mobile {
 namespace fpga {
 namespace filter {
 
-int calc_channel_parallelism(int channel) {
-  if (channel <= 16) {
-    return 16;
-  } else if (channel <= 32) {
-    return 32;
-  } else if (channel <= 64) {
-    return 64;
-  } else {
-    return 128;
-  }
-}
-int calc_aligned_channel(int channel) {
-  return align_to_x(channel, calc_channel_parallelism(channel));
+int calc_division_capacity(int chw) {
+  int n = 2048 / ((chw + 15) / 16) * 32;
+  return n < 2048 ? n : 2048;
 }
 
-int calc_num_parallelism(int channel) {
-  return FILTER_PARALLELISM / calc_channel_parallelism(channel);
+int calc_split_num(int num, int division_capacity) {
+  return (num + division_capacity - 1) / division_capacity;
 }
 
-int calc_aligned_num(int num, int channel) {
-  return align_to_x(num, calc_num_parallelism(channel));
+int calc_division_number(int num, int group_num, int division_capacity) {
+  //  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
+  //                        "Filter number should be divisible by group
+  //                        number");
+  int split_num = calc_split_num(num, division_capacity);
+  //  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
+  //                        "Split number or group number should be 1");
+  return group_num * split_num;
 }
 
-int calc_aligned_total_pixel_num(int num, int channel, int height, int width) {
-  int aligned_channel = calc_aligned_channel(channel);
-  int aligned_filter_num = calc_aligned_num(num, channel);
-  return aligned_filter_num * aligned_channel * height * width;
+// Number of filters placed in one division. With a single group a division
+// takes min(num, division_capacity) filters; otherwise it takes
+// ceil(num / group_num).
+int calc_num_per_div(int num, int group_num, int division_capacity) {
+  //  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
+  //                        "Filter number should be divisible by group
+  //                        number");
+  // split_num is only read by the disabled check below.
+  int split_num = calc_split_num(num, division_capacity);
+  //  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
+  //                        "Split number or group number should be 1");
+
+  if (group_num != 1) {
+    return (num + group_num - 1) / group_num;
+  }
+  return (num > division_capacity) ? division_capacity : num;
+}
 
-void convert_to_hwc(float **data_in, int num, int channel, int height,
+void convert_to_hwc(char **data_in, int num, int channel, int height,
                     int width) {
-  float *tmp = *data_in;
+  char *tmp = *data_in;
   int chw = channel * height * width;
-  float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float));  // NOLINT
+  char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char));  // NOLINT
   for (int n = 0; n < num; n++) {
     int64_t amount_per_row = width * channel;
     for (int c = 0; c < channel; c++) {
@@ -66,52 +75,170 @@ void convert_to_hwc(float **data_in, int num, int channel, int height,
       }
     }
   }
+
   *data_in = data_tmp;
   fpga_free(tmp);
 }
 
-void align_filter(float **data_in, int num, int channel, int height,
-                  int width) {
-  int aligned_channel = calc_aligned_channel(channel);
-  int hw = height * width;
-  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
-  float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float));  // NOLINT
-  float *temp = *data_in;
-  memset(new_data, 0, pixel_num * sizeof(float));
-  for (int i = 0; i < num; i++) {
-    for (int j = 0; j < hw; j++) {
-      memcpy(new_data + i * aligned_channel * hw + j * aligned_channel,
-             temp + i * channel * hw + j * channel, channel * sizeof(float));
-    }
+// Largest absolute value among the first data_size elements; 0 when empty.
+float find_max(float *data_in, int data_size) {
+  float largest = 0.0;
+  for (int idx = 0; idx < data_size; ++idx) {
+    const float magnitude = (data_in[idx] > 0) ? data_in[idx] : -data_in[idx];
+    if (largest < magnitude) largest = magnitude;
+  }
+  return largest;
+}
+
+signed char float_to_int8(float fdata) {
+  if (fdata < 0.0) {
+    fdata -= 0.5;
+  } else {
+    fdata += 0.5;
   }
-  *data_in = new_data;
-  fpga_free(temp);
+  return (signed char)fdata;
 }
-void convert_to_fp16(float **data_in, int data_size) {
+
+void quantize(float **data_in, int data_size, float max) {
   float *tmp = *data_in;
-  // half_float::half *tmp_data = (half_float::half *)fpga_malloc(data_size *
-  // sizeof(half_float::half));
-  int16_t *tmp_data =
-      (int16_t *)fpga_malloc(data_size * sizeof(int16_t));  // NOLINT
+  // Map [-max, max] onto [-127, 127]. With max == 0 the plain 127 / max is
+  // inf and 0 * inf is NaN (undefined when cast to int8), so use scale 0.
+  float scale = (max != 0.0f) ? 127.0f / max : 0.0f;
+  signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
   for (int i = 0; i < data_size; i++) {
-    // tmp_data[i] = (half_float::half)((*data_in)[i]);
-    tmp_data[i] = fp32_2_fp16((*data_in)[i]);
+    // float_to_int8 rounds half away from zero.
+    tmp_data[i] = float_to_int8((*data_in)[i] * scale);
   }
   *data_in = (float *)tmp_data;  // NOLINT
   fpga_free(tmp);
 }
+
+// Zero-pads each of the num filters from chw to
+// align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) bytes. When chw is already
+// aligned the buffer is left untouched (no reallocation).
+void align_element(char **data_in, int num, int chw) {
+  const int padded_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  if (padded_chw == chw) {
+    return;
+  }
+  char *src = *data_in;
+  char *dst = (char *)fpga_malloc(num * padded_chw * sizeof(char));  // NOLINT
+  memset(dst, 0, num * padded_chw);
+  for (int n = 0; n < num; ++n) {
+    memcpy(dst + n * padded_chw, src + n * chw, chw);
+  }
+  *data_in = dst;
+  fpga_free(src);
+}
+
+// Pads every division from num_per_div_before_alignment filters up to
+// align_to_x(that, FILTER_NUM_ALIGNMENT), zero-filled; replaces *data_in.
+void align_num(char **data_in, int num_per_div_before_alignment, int num,
+               int chw) {
+  int i = 0;
+  int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  int num_per_div_after_alignment =
+      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
+  char *tmp = *data_in;
+  int div_num =
+      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
+  int num_element = div_num * num_per_div_after_alignment * align_chw;
+  char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char));  // NOLINT
+  memset(data_tmp, 0, num_element * sizeof(char));
+
+  for (i = 0; i < div_num - 1; i++) {
+    memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
+           *data_in + num_per_div_before_alignment * align_chw * i,
+           num_per_div_before_alignment * align_chw);
+  }
+  // Last division (i keeps its value from the loop): copy only the leftovers.
+  memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
+         *data_in + num_per_div_before_alignment * align_chw * i,
+         (num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
+
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+
+// Permutes filters within each run of 32: output slot `index` receives the
+// filter at `new_index`. Frees the old buffer and replaces *data_in.
+void reorder(char **data_in, int num_after_alignment, int chw) {
+  int index = 0;
+  int new_index;
+  int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  char *data_tmp =
+      (char *)fpga_malloc(chw_align * num_after_alignment *  // NOLINT
+                          sizeof(char));
+  char *tmp = *data_in;
+  for (index = 0; index < num_after_alignment; index++) {
+    new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
+                (index / 16 % 2 * 4);
+    memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align,
+           chw_align);
+  }
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+
+// Interleaves the 16-byte chunks of each filter pair (i, i + 1).
+void interleave(char **data_in, int num_after_alignment, int chw) {
+  int i = 0;
+  int j = 0;
+  int k = 0;
+  int interleave_per_num = 16;
+  int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  char *data_tmp =
+      (char *)fpga_malloc(chw_align * num_after_alignment *  // NOLINT
+                          sizeof(char));
+  char *tmp = *data_in;
+  int interleave_num = chw_align * 2 / interleave_per_num;
+  for (i = 0; i < num_after_alignment; i += 2) {
+    for (j = 0, k = 0; j < interleave_num; j += 2, k++) {
+      memcpy(data_tmp + i * chw_align + interleave_per_num * j,
+             *data_in + i * chw_align + interleave_per_num * k,
+             interleave_per_num);
+      memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1),
+             *data_in + (i + 1) * chw_align + interleave_per_num * k,
+             interleave_per_num);
+    }
+  }
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+
 void format_filter(float **data_in, int num, int channel, int height, int width,
                    int group_num, float max) {
-  convert_to_hwc(data_in, num, channel, height, width);
-  align_filter(data_in, num, channel, height, width);
-  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
-  convert_to_fp16(data_in, pixel_num);
-  fpga_flush(*data_in, pixel_num * sizeof(float));
+  int data_size = channel * height * width * num;
+  int chw = channel * height * width;
+
+  int division_capacity = calc_division_capacity(chw);
+  int num_per_div_before_alignment =
+      calc_num_per_div(num, group_num, division_capacity);
+  int num_per_div_after_alignment =
+      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
+  int div_num =
+      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
+  int residual = num % num_per_div_before_alignment;
+  int num_after_alignment = num_per_div_after_alignment *
+                                ((residual == 0) ? div_num : (div_num - 1)) +
+                            align_to_x(residual, FILTER_NUM_ALIGNMENT);
+  quantize(data_in, data_size, max);
+  char **quantize_data = (char **)data_in;  // NOLINT
+  convert_to_hwc(quantize_data, num, channel, height, width);
+  align_element(quantize_data, num, chw);
+  if (num_after_alignment != num) {
+    align_num(quantize_data, num_per_div_before_alignment, num, chw);
+  }
+
+  reorder(quantize_data, num_after_alignment, chw);
+  interleave(quantize_data, num_after_alignment, chw);
+  fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
+                                 num_after_alignment * sizeof(char));
 }
 
-void convert_fc_filter(float **data_in, int num, int chw) {
-  float *tmp = *data_in;
-  float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float));  // NOLINT
+void convert_fc_filter(char **data_in, int num, int chw) {
+  char *tmp = *data_in;
+  char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char));  // NOLINT
   for (int n = 0; n < num; n++) {
     for (int c = 0; c < chw; c++) {
       data_tmp[n * chw + c] = (*data_in)[num * c + n];
@@ -123,47 +250,113 @@ void convert_fc_filter(float **data_in, int num, int chw) {
 
 void format_fc_filter(float **data_in, int num, int channel, int height,
                       int width, int group_num, float max) {
+  int data_size = channel * height * width * num;  // total elements in *data_in
   int chw = channel * height * width;
-  convert_fc_filter(data_in, num, chw);
-  align_filter(data_in, num, channel, height, width);
-  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
-  convert_to_fp16(data_in, pixel_num);
-  fpga_flush(*data_in, pixel_num * sizeof(float));
-}
 
-float find_max(float *data_in, int data_size) {
-  float max = 0.0;
-  for (int i = 0; i < data_size; ++i) {
-    float value = data_in[i];
-    float abs = value > 0 ? value : -value;
-    max = std::max(max, abs);
+  int division_capacity = calc_division_capacity(chw);
+  int num_per_div_before_alignment =
+      calc_num_per_div(num, group_num, division_capacity);
+  int num_per_div_after_alignment =
+      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
+  int div_num =  // ceil(num / num_per_div_before_alignment)
+      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
+  int residual = num % num_per_div_before_alignment;  // partial-division size
+  int num_after_alignment = num_per_div_after_alignment *
+                                ((residual == 0) ? div_num : (div_num - 1)) +
+                            align_to_x(residual, FILTER_NUM_ALIGNMENT);
+
+  quantize(data_in, data_size, max);  // presumably fp32 -> 8-bit, see cast
+  char **quantize_data = (char **)data_in;  // NOLINT
+  convert_fc_filter(quantize_data, num, chw);
+  convert_to_hwc(quantize_data, num, channel, height, width);
+  align_element(quantize_data, num, chw);
+  if (num_after_alignment != num) {  // num is not already aligned
+    align_num(quantize_data, num_per_div_before_alignment, num, chw);
   }
-  return max;
+  reorder(quantize_data, num_after_alignment, chw);
+  interleave(quantize_data, num_after_alignment, chw);
+  fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
+                                 num_after_alignment * sizeof(char));  // bytes
+}
+void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
+  int16_t *src = *data_in;  // n-major input (n, h, w); freed once copied
+  int16_t *dst =
+      (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t));  // NOLINT
+  for (int n = 0; n < num; n++) {
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        dst[(h * width + w) * num + n] = src[(n * height + h) * width + w];
+      }
+    }
+  }
+  *data_in = dst;  // caller now owns the (h, w, n)-ordered copy
+  fpga_free(src);
 }
 
-signed char float_to_int8(float fdata) {
-  if (fdata < 0.0) {
-    fdata -= 0.5;
+void align_element_n(int16_t **data_in, int num, int height, int width) {
+  int unalign_n = num;
+  int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);  // padded n extent
+  if (unalign_n == align_n) {
+    return;  // already aligned: *data_in is left untouched
   } else {
-    fdata += 0.5;
+    int16_t *tmp = *data_in;  // old buffer, freed after the copy
+
+    int num_element = height * width * align_n;
+    int16_t *data_tmp =
+        (int16_t *)fpga_malloc(num_element * sizeof(int16_t));  // NOLINT
+
+    memset(data_tmp, 0, num_element * sizeof(int16_t));  // padding stays zero
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        int offset_unalign = h * width * unalign_n + w * unalign_n;
+        int offset_align = h * width * align_n + w * align_n;
+        for (int n = 0; n < unalign_n; n++) {
+          data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n);
+        }
+      }
+    }
+
+    *data_in = data_tmp;
+    fpga_free(tmp);
   }
-  return (signed char)fdata;
 }
-
-void quantize(float **data_in, int data_size, float max) {
+void quantize_to_fp16(float **data_in, int num, int height, int width,
+                      float *scale_ptr) {  // scale_ptr: one factor per n
   float *tmp = *data_in;
-  float fix_range = 127;
-  float scale = fix_range / max;
+  int size = num * height * width;  // number of elements to convert
 
-  signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
-  for (int i = 0; i < data_size; i++) {
-    tmp_data[i] = float_to_int8(
-        (*data_in)[i] * scale);  // (signed char)((*data_in)[i] * scale);
+  int16_t *tmp_data = (int16_t *)fpga_malloc(size * sizeof(int16_t));  // NOLINT
+  for (int n = 0; n < num; n++) {
+    float scale_val = scale_ptr[n];  // applied to every element of slice n
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        int index = n * height * width + h * width + w;
+        tmp_data[index] = fp32_2_fp16((*data_in)[index] * scale_val);
+      }
+    }
   }
   *data_in = (float *)tmp_data;  // NOLINT
   fpga_free(tmp);
 }
+void format_dwconv_filter(float **data_in, int num, int height, int width,
+                          float *scale_ptr) {
+  quantize_to_fp16(data_in, num, height, width, scale_ptr);  // fp32 -> fp16
+  int16_t **fp16_data = (int16_t **)data_in;  // NOLINT
+  convert_to_hwn(fp16_data, num, height, width);
+  align_element_n(fp16_data, num, height, width);  // pads n to alignment
+  int aligned_num = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
+  fpga_flush(*fp16_data, aligned_num * height * width * sizeof(int16_t));
+}
 
+// Formats a depthwise-deconvolution filter for the FPGA. The steps are
+// exactly those of format_dwconv_filter, so delegate to it rather than
+// keeping a second copy of the pipeline in sync by hand.
+// data_in:   in/out; *data_in is replaced by the formatted buffer.
+// scale_ptr: one multiplier per filter (indexed by n in quantize_to_fp16).
+void format_DWDeconv_filter(float **data_in, int num, int height, int width,
+                            float *scale_ptr) {
+  format_dwconv_filter(data_in, num, height, width, scale_ptr);
+}
 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/filter.h b/src/fpga/V2/filter.h
index 08c758bca4..4812a75af2 100644
--- a/src/fpga/V2/filter.h
+++ b/src/fpga/V2/filter.h
@@ -13,25 +13,38 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
-#define FILTER_PARALLELISM 1024
+#include <cstdint>
 namespace paddle_mobile {
 namespace fpga {
 namespace filter {
 
-int calc_channel_parallelism(int channel);
-int calc_aligned_channel(int channel);
-int calc_num_parallelism(int channel);
-int calc_aligned_num(int num, int channel);
-int calc_aligned_total_pixel_num(int num, int channel, int height, int width);
-void convert_to_hwc(float** data_in, int num, int channel, int height,
+int calc_division_capacity(int chw);
+int calc_split_num(int num, int division_capacity);
+int calc_division_number(int num, int group_num, int division_capacity);
+int calc_num_per_div(int num, int group_num, int division_capacity);
+void convert_to_hwc(char** data_in, int num, int channel, int height,
                     int width);
+float find_max(float* data_in, int data_size);
+void quantize(float** data_in, int data_size, float max);
+void align_element(char** data_in, int num, int chw);
+void align_num(char** data_in, int num_per_div_before_alignment, int num,
+               int chw);
+void reorder(char** data_in, int num_after_alignment, int chw);
+void interleave(char** data_in, int num_after_alignment, int chw);
 void format_filter(float** data_in, int num, int channel, int height, int width,
                    int group_num, float max);
-void convert_fc_filter(float** data_in, int num, int chw);
+
+void convert_fc_filter(char** data_in, int num, int chw);
 void format_fc_filter(float** data_in, int num, int channel, int height,
                       int width, int group_num, float max);
-float find_max(float* data_in, int data_size);
+
+void convert_to_hwn(int16_t** data_in, int num, int height, int width);
+void align_element_n(int16_t** data_in, int num, int height, int width);
+void quantize_to_fp16(float** data_in, int num, int height, int width,
+                      float* scale_ptr);
+void format_dwconv_filter(float** data_in, int num, int height, int width,
+                          float* scale_ptr);
+
 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp
index 3d1ed95df2..928526e2b9 100644
--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
@@ -13,80 +13,124 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "fpga/V2/image.h"
-#include <memory.h>
-#include <algorithm>
-#include "fpga/common/fpga_common.h"
 
 namespace paddle_mobile {
 namespace fpga {
 namespace image {
 
-void convert_to_hwc(float **data_in, int channel, int height, int width) {
-  float *tmp = *data_in;
-  float *data_tmp =
-      (float *)fpga_malloc(channel * height * width * sizeof(float));  // NOLINT
+void convert_to_hwc(float **data_in, int channel, int height, int width,
+                    int num) {  // CHW -> HWC, repeated for `num` images
+  float *data_tmp = reinterpret_cast<float *>(  // output buffer
+      fpga_malloc(num * channel * height * width * sizeof(float)));
   int64_t amount_per_row = width * channel;
-  for (int c = 0; c < channel; c++) {
-    for (int h = 0; h < height; h++) {
-      int64_t offset_height = h * amount_per_row;
-      for (int w = 0; w < width; w++) {
-        *(data_tmp + offset_height + w * channel + c) = *((*data_in)++);
+  for (int n = 0; n < num; n++) {  // NOTE: input buffer is not freed here
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        int64_t offset_height = h * amount_per_row;
+        for (int w = 0; w < width; w++) {
+          *(data_tmp + n * channel * height * width + offset_height +
+            w * channel + c) = *((*data_in)++);  // advances *data_in
+        }
       }
     }
   }
   *data_in = data_tmp;
-  fpga_free(tmp);
 }
-void align_image(float **data_in, int channel, int height, int width,
-                 int aligned_channel) {
-  if (channel == aligned_channel) return;
-  float *tmp = *data_in;
-  float *new_data =
-      (float *)fpga_malloc(aligned_channel * height * width *  // NOLINT
-                           sizeof(float));                     // NOLINT
-  memset(new_data, 0, aligned_channel * height * width * sizeof(float));
 
-  for (int i = 0; i < height * width; i++) {
-    memcpy(new_data + i * aligned_channel, tmp + i * channel,
-           channel * sizeof(float));
+void convert_to_chw(float **data_in, int channel, int height, int width,
+                    int num) {  // HWC -> CHW, repeated for `num` images
+  float *data_tmp = (float *)fpga_malloc(  // NOLINT
+      num * channel * height * width * sizeof(float));  // room for all images
+  int64_t amount_per_side = width * height;
+  for (int n = 0; n < num; n++) {
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        for (int c = 0; c < channel; c++) {
+          *(data_tmp + n * height * width * channel + c * amount_per_side +
+            width * h + w) = *((*data_in)++);
+        }
+      }
+    }
   }
-  *data_in = new_data;
-  fpga_free(tmp);
-}
-
-void format_image(float **data_in, int channel, int height, int width,
-                  int aligned_channel) {
-  convert_to_hwc(data_in, channel, height, width);
-  align_image(data_in, channel, height, width, aligned_channel);
-  fpga_flush(*data_in, aligned_channel * height * width * sizeof(float));
+  *data_in = data_tmp;
 }
 
 void concat_images(int16_t **images_in, float **scales_in, void *image_out,
-                   float *scale_out, int image_num, const uint32_t *channel_num,
-                   int height, int width, const uint32_t *aligned_channel_num,
-                   int out_channel) {
-  int hw = height * width;
+                   float *scale_out, int image_num, uint32_t *channel_num,
+                   int height, int width) {
+  int i = 0;
+  int j = 0;
+  int k = 0;
+  int each_out_line_channel = 0;  // sum of channel_num[] over all inputs
+  int align_each_out_area_cw = 0;  // padded output row length in elements
+  int align_each_in_area_cw = 0;  // padded row length of the current input
+  int align_each_out_area_cw_differ = 0;  // padding elements per output row
+  int tmp_channel = 0;  // unpadded elements copied to the output so far
   scale_out[0] = 0.0;
   scale_out[1] = 0.0;
-  for (int i = 0; i < image_num; i++) {
+  for (i = 0; i < image_num; i++) {
+    each_out_line_channel += channel_num[i];
     scale_out[0] = std::max(*scale_out, scales_in[i][0]);
     fpga_invalidate(images_in[i],
-                    height * width * aligned_channel_num[i] * sizeof(int16_t));
+                    height *
+                        align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) *
+                        sizeof(int16_t));
   }
   scale_out[1] = 1 / scale_out[0];
+  align_each_out_area_cw =
+      align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT);
+  align_each_out_area_cw_differ =
+      align_each_out_area_cw - each_out_line_channel * width;
 
-  for (int j = 0; j < hw; j++) {
-    int tmp_channel_sum = 0;
-    for (int i = 0; i < image_num; i++) {
-      memcpy(
-          (int16_t *)image_out + j * out_channel + tmp_channel_sum,  // NOLINT
-          images_in[i] + j * aligned_channel_num[i],
-          channel_num[i] * sizeof(int16_t));
+  for (k = 0; k < height; k++) {  // k: row, j: column, i: input image
+    for (j = 0; j < width; j++) {
+      for (i = 0; i < image_num; i++) {
+        align_each_in_area_cw =
+            align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
+        memcpy((int16_t *)image_out + tmp_channel +  // NOLINT
+                   k * align_each_out_area_cw_differ,  // skip row padding
+               images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
+               channel_num[i] * sizeof(int16_t));
 
-      tmp_channel_sum += channel_num[i];
+        tmp_channel += channel_num[i];
+      }
     }
   }
-  fpga_flush(image_out, hw * out_channel * sizeof(int16_t));
+
+  fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
+}
+
+void split_image(int16_t *image_in, const float *scale_in, void **images_out,
+                 float **scales_out, int image_num,
+                 const uint32_t *channel_nums, int height, int width) {
+  int channel_sum = 0;  // channels per pixel of the combined input
+  for (int idx = 0; idx < image_num; ++idx) {
+    scales_out[idx][0] = scale_in[0];  // every output inherits the input scale
+    scales_out[idx][1] = scale_in[1];
+    channel_sum += channel_nums[idx];
+  }
+  const int in_row = align_to_x(width * channel_sum, IMAGE_ALIGNMENT);
+  fpga_invalidate(image_in, height * in_row * sizeof(int16_t));
+
+  // Walk the input pixel by pixel and hand each output its channel slice.
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      int16_t *src = image_in + row * in_row + col * channel_sum;
+      for (int idx = 0; idx < image_num; ++idx) {
+        int out_row = align_to_x(channel_nums[idx] * width, IMAGE_ALIGNMENT);
+        int dst_offset = row * out_row + col * channel_nums[idx];
+        memcpy(reinterpret_cast<int16_t *>(images_out[idx]) + dst_offset, src,
+               channel_nums[idx] * sizeof(int16_t));
+        src += channel_nums[idx];
+      }
+    }
+  }
+
+  // Write each output back, including its row padding.
+  for (int idx = 0; idx < image_num; ++idx) {
+    int count = height * align_to_x(width * channel_nums[idx], IMAGE_ALIGNMENT);
+    fpga_flush(images_out[idx], count * sizeof(int16_t));
+  }
 }
 
 }  // namespace image
diff --git a/src/fpga/V2/image.h b/src/fpga/V2/image.h
index df20e583fc..f5dc6ffe3e 100644
--- a/src/fpga/V2/image.h
+++ b/src/fpga/V2/image.h
@@ -14,23 +14,63 @@ limitations under the License. */
 
 #pragma once
 
-#include <stdint.h>
-
+#include <memory.h>
+#include <algorithm>
+#include <cstdint>
+#include "fpga/common/fpga_common.h"
 namespace paddle_mobile {
 namespace fpga {
 namespace image {
 
-void convert_to_hwc(float **data_in, int channel, int height, int width);
-void align_image(float **data_in, int channel, int height, int width,
-                 int aligned_channel);
-void format_image(float **data_in, int channel, int height, int width,
-                  int aligned_channel);
-void concat_images(
-    int16_t **images_in, float **scales_in, void *image_out, float *scale_out,
-    int image_num, const uint32_t *channel_num, int height, int width,
-    const uint32_t *aligned_channel_num,
-    int out_channel);  // Concat featuremaps along channel direction
+void convert_to_hwc(float** data_in, int channel, int height, int width,
+                    int num = 1);
+void convert_to_chw(float** data_in, int channel, int height, int width,
+                    int num = 1);
+// template <typename Dtype>
+// void align_element_conv(Dtype** data_in, int height, int cw);
+// template <typename T>
+// void format_image(T** data_in, int channel, int height, int width);
+template <typename Dtype>
+void align_element_conv(Dtype** data_in, int height, int cw);
+// Copies a height x cw buffer into a newly allocated, zero-filled buffer
+// whose rows are align_to_x(cw, IMAGE_ALIGNMENT) elements long, and points
+// *data_in at the copy. The source buffer is not freed here.
+template <typename Dtype>
+void align_element_conv(Dtype** data_in, int height, int cw) {
+  const int padded_cw = align_to_x(cw, IMAGE_ALIGNMENT);
+  const size_t total_bytes = height * padded_cw * sizeof(Dtype);
+  Dtype* padded = (Dtype*)fpga_malloc(total_bytes);  // NOLINT
+  memset(padded, 0, total_bytes);
+  Dtype* dst_row = padded;
+  Dtype* src_row = *data_in;
+  for (int row = 0; row < height; row++) {
+    memcpy((void*)dst_row, (void*)src_row, cw * sizeof(Dtype));  // NOLINT
+    dst_row += padded_cw;
+    src_row += cw;
+  }
+  *data_in = padded;
+}
+// Row-pads *data_in to IMAGE_ALIGNMENT (freeing the old buffer), then flushes.
+template <typename T>
+void format_image(T** data_in, int channel, int height, int width) {
+  const int row_len = channel * width;
+  const int padded_row_len = align_to_x(row_len, IMAGE_ALIGNMENT);
+  if (padded_row_len != row_len) {
+    T* unpadded = *data_in;
+    align_element_conv(data_in, height, row_len);
+    fpga_free(unpadded);
+  }
+  fpga_flush(*data_in, padded_row_len * height * sizeof(T));
+}
+// Concat featuremaps along channel direction
+void concat_images(int16_t** images_in, float** scales_in, void* image_out,
+                   float* scale_out, int image_num, uint32_t* channel_num,
+                   int height, int width);
 
+// Split featuremap along channel direction
+void split_image(int16_t* image_in, const float* scale_in, void** images_out,
+                 float** scales_out, int image_num,
+                 const uint32_t* channel_nums, int height, int width);
 }  // namespace image
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/pe.cpp b/src/fpga/V2/pe.cpp
index d22bd17175..0503a51910 100644
--- a/src/fpga/V2/pe.cpp
+++ b/src/fpga/V2/pe.cpp
@@ -13,73 +13,189 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "fpga/common/pe.h"
-#include "fpga/V2/api.h"
+#include "common/enforce.h"
+#include "common/types.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
 #include "fpga/common/config.h"
 #include "fpga/common/driver.h"
-
-using namespace std;                          // NOLINT
-using namespace paddle_mobile::fpga::driver;  // NOLINT
+#include "fpga/common/fpga_common.h"
+#ifdef COST_TIME_PRINT
+#include <sys/time.h>
+#include <time.h>
+#include <iomanip>
+#include <iostream>
+#endif
 
 namespace paddle_mobile {
 namespace fpga {
-#define MUL8(x) (x * 8)
-#define BYPASS_DONE 2
-#define CONV_DONE 1
-
-static inline int get_image_out_axis(int src_len, int pad, int kernel_len,
-                                     int kernel_step) {
-  if (kernel_step == 0) {
-    return 0;
+
+using namespace driver;  // NOLINT
+using namespace std;     // NOLINT
+#define USE_RELU 1
+#define USE_BIAS 2
+
+// bypass cmd: selects the source -> destination data-type conversion
+#define CMD_FP16_TO_FP16 0
+#define CMD_FP16_TO_FP32 1
+#define CMD_FP32_TO_FP16 2
+#define CMD_FP32_TO_FP32 3
+#define CMD_INT8_TO_FP16 4
+
+// bypass macro: presumably bytes per element of each data type
+#define SIZE_FP16 2
+#define SIZE_FP32 4
+#define SIZE_INT8 1
+
+#define PE_IRQ_TIMEOUT 1000000
+
+/* Interrupt bit-set offset*/
+#define INTERRUPT_RSVD 0x0001
+#define INTERRUPT_BYPASS 0x0002
+#define INTERRUPT_CONV 0x0004
+#define INTERRUPT_POOLING 0x0008
+#define INTERRUPT_EW 0x0010
+
+/* Register offset */
+#define REG_INTERRUPT 0x000
+#define REG_VERSION 0x008
+#define REG_TEMPERATURE 0x010
+#define REG_FPGA_RESET 0x018
+#define REG_TEST_REGISTER 0x048
+#define REG_HARDWARE_STATUS 0x050
+
+#define REG_TIMER_COUNTER 0x070
+
+#define REG_SCALE_PARAMETER 0x080
+#define REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR 0x090
+
+#define REG_FLASH_CMD 0x200
+#define REG_FLASH_DATA 0x208
+#define REG_FLASH_CONFIG 0x210
+#define REG_FLASH_STATUS 0x218
+#define REG_SN 0x220
+
+/*bypass*/
+#define REG_CONVERT_CMD 0x400
+#define REG_CONVERT_SRC_ADDR 0x408
+#define REG_CONVERT_DST_ADDR 0x410
+#define REG_CONVERT_LENGTH 0x418
+
+/*resize*/
+#define REG_RESIZE_CMD 0x600
+#define REG_RESIZE_CHANNEL_NUMBER 0x608
+#define REG_RESIZE_INPUT_IMAGE_PIXEL 0x610
+#define REG_RESIZE_OUTPUT_IMAGE_PIXEL 0x618
+#define REG_RESIZE_INPUT_BASE_ADDR 0x620
+#define REG_RESIZE_WEIGHT_BASE_ADDR 0x628
+#define REG_RESIZE_SRC_POS_BASE_ADDR 0x630
+#define REG_RESIZE_OUTPUT_BASE_ADDR 0x638
+
+/*pooling*/
+#define REG_POOLING_CMD 0x800
+#define REG_POOLING_IMAGE_BASE_ADDR 0x808
+#define REG_POOLING_RESULT_BASE_ADDR 0x810
+#define REG_POOLING_IMAGE_PIXEL 0x818
+#define REG_POOLING_WINDOW_SIZE 0x820
+#define REG_POOLING_RESULT_PIXEL 0x828
+#define REG_POOLING_PAD_PIXEL 0x830
+#define REG_POOLING_STEP_PIXEL 0x838
+#define REG_POOLING_CHANNEL_NUMBER 0x840
+#define REG_POOLING_IMAGE_AMOUNT_PER_ROW 0x848
+#define REG_POOLING_IMAGE_ONE_PAD_PER_ROW 0x850
+#define REG_POOLING_IMAGE_TWO_PAD_PER_ROW 0x858
+#define REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT 0x860
+#define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868
+#define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870
+#define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878
+#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880
+#define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888
+#define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898
+#define REG_POOLING_MODE_RECIPROCAL 0x890
+
+/*conv*/
+#define REG_CONV_CMD 0xC00
+#define REG_CONV_IMAGE_BASE_ADDR 0xC08
+#define REG_CONV_FILTER_BASE_ADDR 0xC10
+#define REG_CONV_SB_BASE_ADDR 0xC18
+#define REG_CONV_RESULT_BASE_ADDR 0xC20
+#define REG_CONV_IMAGE_PIXEL 0xC28
+#define REG_CONV_FILTER_PIXEL 0xC30
+#define REG_CONV_RESULT_PIXEL 0xC38
+#define REG_CONV_PAD_PIXEL 0xC40
+#define REG_CONV_STEP_PIXEL 0xC48
+#define REG_CONV_GROUP_NUMBER 0xC50
+#define REG_CONV_FILTER_NUMBER 0xC58
+#define REG_CONV_CHANNEL_NUMBER 0xC60
+#define REG_CONV_FILTER_PER_GROUP 0xC68
+#define REG_CONV_CHANNEL_PER_GROUP 0xC70
+#define REG_CONV_IMAGE_AMOUNT_PER_ROW 0xC78
+#define REG_CONV_IMAGE_ONE_PAD_PER_ROW 0xC80
+#define REG_CONV_IMAGE_TWO_PAD_PER_ROW 0xC88
+#define REG_CONV_FILTER_AMOUNT_ALL 0xC90
+#define REG_CONV_RESULT_AMOUNT_PER_ROW 0xC98
+#define REG_CONV_RESULT_LAST_VALID 0xCA0
+
+#define REG_CONV_BLOCK_AMOUNT_PER_ROW 0xCA8
+#define REG_CONV_FILTER_PAD_WIDTH_MUL_CH 0xCB0
+#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN_F 0xCB8
+#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN 0xCC0
+#define REG_CONV_IMAGE_BLOCK_NUM 0xCC8
+#define REG_CONV_IMAGE_BLOCK_LEN 0xCD0
+#define REG_CONV_IMAGE_BLOCK_LEN_LAST 0xCD8
+#define REG_CONV_IMAGE_WIN_CNT 0xCE0
+#define REG_CONV_IMAGE_WIN_CNT_LAST 0xCE8
+#define REG_CONV_RES_ROW_DATA_ALIGN4_PAD 0xCF8
+#define REG_CONV_PROG_FULL_CNT 0xD08
+#define REG_CONV_POST_PROG_FULL_CNT 0xD10
+#define REG_CONV_FPGA_BIAS_SCALE_LEN 0xD20
+
+#define REG_CONV_IMAGE_SCALE 0xD28
+#define REG_CONV_FILTER_SCALE 0xD30
+
+/*ew*/
+#define REG_EW_CMD 0x0F00
+#define REG_EW_IMAGE0_BASE_ADDR 0x0F08
+#define REG_EW_IMAGE1_BASE_ADDR 0x0F10
+#define REG_EW_RESULT_BASE_ADDR 0x0F18
+#define REG_EW_DATA_LEN 0x0F20
+#define REG_EW_COEFFICIENT 0x0F28
+#define REG_EW_IMAGE_PIXEL 0x0F30
+#define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38
+
+/*dwconv*/
+#define REG_DWCONV_FILTER_BASE_ADDR 0xe08
+#define REG_DWCONV_FILTER_SHAPE 0xe10
+#define REG_DWCONV_FILTER_N_ALIGN 0xe18
+#define REG_DWCONV_FILTER_SUBNUMBER 0xe20
+#define REG_DWCONV_CMD 0xe00
+
+int ComputeFpgaConv(const struct SplitConvArgs &args) {
+//  ComputeBasicConv(args.conv_arg[0]);
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFPGAConv===========";
+  DLOG << "   filter_num:" << args.filter_num
+       << "   group_num:" << args.group_num
+       << "   split_num:" << args.split_num;
+#endif
+  int ret = 0;  // bitwise OR of every ComputeBasicConv result
+  int split_num = args.split_num;
+  for (int i = 0; i < split_num; i++) {
+    ret |= ComputeBasicConv(args.conv_arg[i]);  // one call per split
   }
-  return (src_len + 2 * pad - kernel_len) / kernel_step + 1;
-}
 
-float Findfp16Max() {
-  uint16_t abs_vals[16];
-  uint64_t max_fp16;
-
-  max_fp16 = reg_readq(MUL8(49));
-  abs_vals[0] = (uint16_t)(0x0000007fff & (max_fp16));        // NOLINT
-  abs_vals[1] = (uint16_t)(0x0000007fff & (max_fp16 >> 16));  // NOLINT
-  abs_vals[2] = (uint16_t)(0x0000007fff & (max_fp16 >> 32));  // NOLINT
-  abs_vals[3] = (uint16_t)(0x0000007fff & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(50));
-  abs_vals[4] = (uint16_t)(0x0000007fff & (max_fp16));        // NOLINT
-  abs_vals[5] = (uint16_t)(0x0000007fff & (max_fp16 >> 16));  // NOLINT
-  abs_vals[6] = (uint16_t)(0x0000007fff & (max_fp16 >> 32));  // NOLINT
-  abs_vals[7] = (uint16_t)(0x0000007fff & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(51));
-  abs_vals[8] = (uint16_t)(0x0000007fff & (max_fp16));         // NOLINT
-  abs_vals[9] = (uint16_t)(0x0000007fff & (max_fp16 >> 16));   // NOLINT
-  abs_vals[10] = (uint16_t)(0x0000007fff & (max_fp16 >> 32));  // NOLINT
-  abs_vals[11] = (uint16_t)(0x0000007fff & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(52));
-  abs_vals[12] = (uint16_t)(0x0000007fff & (max_fp16));
-  abs_vals[13] = (uint16_t)(0x0000007fff & (max_fp16 >> 16));  // NOLINT
-  abs_vals[14] = (uint16_t)(0x0000007fff & (max_fp16 >> 32));  // NOLINT
-  abs_vals[15] = (uint16_t)(0x0000007fff & (max_fp16 >> 48));  // NOLINT
-
-  uint16_t tmp = 0;
-  for (int i = 0; i < 16; i++) {
-    if (tmp < abs_vals[i]) {
-      tmp = abs_vals[i];
-    }
+  if (split_num > 1) {  // concat step runs only when there are several splits
+    ComputeFPGAConcat(args.concat_arg);  // NOTE(review): result is ignored
   }
-  DLOG << "max value found: " << fp16_2_fp32(tmp);
-  return fp16_2_fp32(tmp) / 127.0f;
-}
 
-int ComputeFpgaConv(const struct SplitConvArgs &args) {
-  ComputeBasicConv(args.conv_arg[0]);
+  return ret;
 }
 
 int ComputeBasicConv(const struct ConvArgs &args) {
 #ifdef FPGA_PRINT_MODE
   DLOG << "======Compute Basic Conv======";
-  DLOG << "   relu_enabled:" << args.relu_enabled
-       << "   sb_address:" << args.sb_address
+  // DLOG << "   relu_enabled:" << args.relu_enabled
+  DLOG << "   sb_address:" << args.sb_address
        << "   filter_address:" << args.filter_address
        << "   filter_num:" << args.filter_num
        << "   group_num:" << args.group_num;
@@ -98,478 +214,112 @@ int ComputeBasicConv(const struct ConvArgs &args) {
        << "   out_scale_address:" << args.output.scale_address;
 #endif
 
-#ifndef PADDLE_MOBILE_ZU5
-  return 0;
-#endif
+#ifdef PADDLE_MOBILE_ZU5
+  int ret = 0;
+  uint64_t output_scale = 0;
 
-  uint64_t ifm_pixel_num =
-      ((args.image.width) * (args.image.height) * args.image.channels);
-  uint64_t ifm_memory_size = ifm_pixel_num * sizeof(short);          // NOLINT
-  uint64_t flt_pixel_num = (args.filter_num * (args.kernel.width) *  // NOLINT
-                            (args.kernel.height) * args.image.channels);
-  uint64_t filter_memory_size = flt_pixel_num * sizeof(short);  // NOLINT
-
-  uint64_t bn_pixel_num = (args.filter_num * 2);  // NOLINT
-  uint64_t bn_memory_size = bn_pixel_num * sizeof(float);
-
-  uint64_t ofm_width =
-      ((args.image.width) + 2 * args.image.pad_width - args.kernel.width) /
-          (args.kernel.stride_w) +
-      1;
-  uint64_t ofm_height = ((args.image.height) + 2 * (args.image.pad_height) -
-                         (args.kernel.height)) /
-                            (args.kernel.stride_h) +
-                        1;
-
-  uint32_t filter_num = args.filter_num;
-  uint32_t image_channels = args.image.channels;
-
-  DLOG << "filter_num: " << filter_num;
-  uint64_t ifm_src_paddr = vaddr_to_paddr((args.image.address));
-  uint64_t flt_src_paddr = vaddr_to_paddr((args.filter_address));
-  uint64_t sb_src_paddr = vaddr_to_paddr((args.free_space));
-  uint64_t ifm_dst_paddr = vaddr_to_paddr((args.output.address));
-  /**********BN******************/
-  float image_inv_scale = (args.image.scale_address)[0];
-  float filter_inv_scale = (args.filter_scale_address)[0];
-  float scale_tmp = image_inv_scale * filter_inv_scale;
-  int idx = 0;
-  float tmp = 0.0;
-  float *convert_sb_addr = (float *)(args.free_space);  // NOLINT
-  for (idx = 0; idx < args.filter_num * 2; idx++) {
-    if (idx % 2 == 1) {
-      tmp = ((float *)(args.sb_address))[idx] * scale_tmp;  // NOLINT
-    } else {
-      tmp = ((float *)(args.sb_address))[idx];  // NOLINT
-    }
-    convert_sb_addr[idx] = tmp;  // NOLINT
-  }
+  uint64_t reg_ActivationArgs = 0;
+  // activation function: {none, leakyrelu, sigmoid, tanh}
+  ActivationArgs active_args;
+  // active_args.activation_type = LEAKYRELU;
+
+  active_args.activation_type = args.output.activation.activation_type;
+
+  active_args.leaky_relu_negative_slope =
+      args.output.activation.leaky_relu_negative_slope;
+
+  reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
+                       active_args.leaky_relu_negative_slope;
+
+  DLOG << "   activation_type:" << active_args.activation_type
+       << "   leaky_relu_negative_slope:"
+       << active_args.leaky_relu_negative_slope;
+  DLOG << "   reg_ActivationArgs:" << reg_ActivationArgs;
 
-  fpga_flush(convert_sb_addr, args.filter_num * 2 * sizeof(float));
-  reg_writeq(1, MUL8(24));
-  usleep(1);
-  reg_writeq(0, MUL8(24));
-
-  reg_writeq(sb_src_paddr, MUL8(27));
-  reg_writeq(0, MUL8(0));
-
-  uint64_t bps_addr = 0x8c00000000000000;
-  bps_addr += bn_memory_size;
-  reg_writeq(bps_addr, MUL8(0));
-  int ret = -1;
-  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffff);
-  if (ret) {
-    DLOG << "conv bypass failed";
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
+    ret = -EIO;
+    DLOG << "Conv Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
     return ret;
   }
-  reg_readq(MUL8(63));
-
-  /*********configuring registers*************/
-  uint32_t cmd_image_vir_base_addr = (uint32_t)ifm_src_paddr;
-  uint32_t cmd_filter_vir_base_addr = (uint32_t)flt_src_paddr;
-  uint32_t cmd_scale_base_addr = (uint32_t)sb_src_paddr;
-  uint32_t conv_ofm_addr_base = (uint32_t)ifm_dst_paddr;
-  uint64_t cmd_group_num = args.group_num;
-  uint64_t cmd_filter_per_group = filter_num / cmd_group_num;
-
-  uint64_t cmd_flt_sqr_len = (args.kernel.width) * (args.kernel.height);
-  uint64_t cmd_ifm_pre_row_num = 0;
-
-  if (1 == args.image.height) {
-    cmd_ifm_pre_row_num = 1;
-  } else {
-    cmd_ifm_pre_row_num =
-        (args.kernel.height) - (args.image.pad_height) + (args.kernel.stride_h);
-  }
-  uint64_t cmd_flt_pre_batch_num = 1;
-  uint64_t cmd_ifm_pack_num_per_row_mns1 =
-      (uint64_t)(((args.image.channels) + 127) / 128) - 1;
-  uint64_t cmd_bn_num = filter_num;
-  uint64_t cmd_bias_num = filter_num;
-  uint64_t cmd_ifm_stride_row_length = args.image.width * args.kernel.stride_h;
-  uint64_t cmd_flt_pack_num_per_kernel_mns1 =
-      (uint64_t)(((args.image.channels) + 127) / 128) - 1;
-  uint64_t cmd_ofm_width_mns1 = (uint64_t)(
-      ((args.image.width) - (args.kernel.width) + 2 * (args.image.pad_width)) /
-      (args.kernel.stride_w));
-  uint64_t cmd_ofm_height =
-      (uint64_t)(((args.image.height) - (args.kernel.height) +
-                  2 * (args.image.pad_height)) /
-                 (args.kernel.stride_h)) +
-      1;
-
-  uint64_t cmd_channel_num = 0;
-  uint64_t cmd_ifm_pack_len = 0;
-  uint64_t cmd_channel_per_group = 0;
-  uint64_t cmd_flt_batch_num_mns1 = 0;
-  uint64_t cmd_flt_N_impl = 8;
-  uint64_t cmd_ifm_C_impl = 16;
-  uint64_t cmd_flt_pack_length = 0;
-  uint64_t cmd_step_h_mul_row_byte_len = 0;
-  uint64_t cmd_pad_h_mul_row_byte_len = 0;
-  uint64_t cmd_ifm_pack_byte_length = 16 * ((((args.image.width) + 7) / 8) * 8);
-  uint64_t row_len_align = args.image.width;
-  if (image_channels > 64) {
-    cmd_channel_num = (uint64_t)((((args.image.channels) + 127)) / 128) * 128;
-    cmd_ifm_pack_len = 128 * (args.image.width);
-    cmd_channel_per_group = 128;
-    cmd_flt_batch_num_mns1 = (uint64_t)(((args.filter_num + 7)) / 8 - 1);
-    cmd_flt_N_impl = 8;
-    cmd_ifm_C_impl = 128;
-    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 128;
-    cmd_step_h_mul_row_byte_len =
-        (args.kernel.stride_h) * cmd_channel_num * (args.image.width);
-    cmd_pad_h_mul_row_byte_len =
-        (args.image.pad_height) * cmd_channel_num * (args.image.width);
-    cmd_ifm_pack_byte_length = 128 * (args.image.width);
-    row_len_align = args.image.width * (cmd_ifm_pack_num_per_row_mns1 + 1);
-  } else if (image_channels > 32) {
-    cmd_channel_num = 64;
-    cmd_ifm_pack_len = 64 * (args.image.width);
-    cmd_channel_per_group = 64;
-    cmd_flt_batch_num_mns1 = (uint64_t)((((args.filter_num) + 15)) / 16 - 1);
-    cmd_flt_N_impl = 16;
-    cmd_ifm_C_impl = 64;
-    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 64;
-    cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num *
-                                  ((((args.image.width) + 1)) / 2) * 2;
-    cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num *
-                                 ((((args.image.width) + 1)) / 2) * 2;
-    cmd_ifm_pack_byte_length =
-        64 * (uint64_t)((((args.image.width) + 1)) / 2) * 2;
-    row_len_align = (uint64_t)((((args.image.width) + 1)) / 2);
-  } else if (image_channels > 16) {
-    cmd_channel_num = 32;
-    cmd_ifm_pack_len = 32 * (args.image.width);
-    cmd_channel_per_group = 32;
-    cmd_flt_batch_num_mns1 = (uint64_t)((((args.filter_num) + 31)) / 32 - 1);
-    cmd_flt_N_impl = 32;
-    cmd_ifm_C_impl = 32;
-    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 32;
-    cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num *
-                                  ((((args.image.width) + 3)) / 4) * 4;
-    cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num *
-                                 ((((args.image.width) + 3)) / 4) * 4;
-    cmd_ifm_pack_byte_length =
-        32 * (uint64_t)((((args.image.width) + 3)) / 4) * 4;
-    row_len_align = (uint64_t)((((args.image.width) + 3)) / 4);
-  } else {
-    cmd_channel_num = 16;
-    cmd_ifm_pack_len = 16 * (args.image.width);
-    cmd_channel_per_group = 16;
-    cmd_flt_batch_num_mns1 = (uint64_t)((((args.filter_num) + 63)) / 64 - 1);
-    cmd_flt_N_impl = 64;
-    cmd_ifm_C_impl = 16;
-    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 16;
-    cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num *
-                                  ((((args.image.width) + 7)) / 8) * 8;
-    cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num *
-                                 ((((args.image.width) + 7)) / 8) * 8;
-    cmd_ifm_pack_byte_length = 16 * ((((args.image.width) + 7)) / 8) * 8;
-    row_len_align = (uint64_t)((((args.image.width) + 7)) / 8);
-  }
-  uint64_t cmd_flt_length =
-      (args.kernel.width) * (args.kernel.height) * cmd_channel_num;
-  uint64_t cmd_ifm_row_byte_length = cmd_channel_num * (args.image.width);
-
-  uint64_t cmd_ifm_buf_col_len = 0;
-
-  uint64_t ifm_one_batch_len =
-      (1048576 / ((args.image.width) * cmd_channel_num));
-  uint64_t cmd_ifm_batch_num_tmp = (uint64_t)(
-      ((args.image.height) + ifm_one_batch_len - 1) / ifm_one_batch_len);
-  if (1 == cmd_ifm_batch_num_tmp) {
-    cmd_ifm_buf_col_len = args.image.height;
-  } else {
-    if (((args.image.height) / (cmd_ifm_batch_num_tmp) % 2) == 0) {
-      cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp;
-    } else {
-      cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp - 1;
-    }
-  }
-  uint64_t cmd_ifm_batch_num_mns1 =
-      (((args.image.height) + cmd_ifm_buf_col_len - 1) / cmd_ifm_buf_col_len) -
-      1;
-  uint64_t cmd_flt_cycle_num_mns1 = cmd_ifm_batch_num_mns1;
-  uint64_t cmd_flt_total_batch_num = filter_num / cmd_flt_N_impl;
-  uint64_t cmd_ifm_buf_col_len_rem =
-      (args.image.height) -
-      cmd_ifm_batch_num_mns1 * cmd_ifm_buf_col_len;  //= -4;
-  uint64_t cmd_flt_N_len = args.kernel.width * args.kernel.height *
-                           (cmd_flt_pack_num_per_kernel_mns1 + 1);
-
-  //-------- ofm batch number reg &&  initial URAM reading address
-  // logic-----------------
-  uint64_t cmd_init_raddr_cnt = 1;
-  uint64_t cmd_init_raddr_flag = 0;
-  int64_t cmd_init_raddr_index = -8;
-  int64_t cmd_init_raddr_col_0 = -4;
-  int64_t cmd_init_raddr_col_1 = -4;
-  uint64_t conv_ofm_buf_col_len = 0;
-  uint64_t conv_ofm_buf_col_len_rem = 0;
-
-  if (((args.image.pad_height) % (2 * (args.kernel.stride_h))) == 0) {
-    cmd_init_raddr_cnt = 0;
-    cmd_init_raddr_flag = 0;
-    cmd_init_raddr_index =
-        0 - (int64_t)row_len_align * (((args.image.pad_height) + 1) / 2);
-    cmd_init_raddr_col_0 = cmd_init_raddr_index;
-    cmd_init_raddr_col_1 = cmd_init_raddr_index;
-  } else if (((args.image.pad_height) -
-              2 * ((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <=
-             (args.kernel.stride_h)) {
-    cmd_init_raddr_cnt =
-        (args.kernel.stride_h) -
-        ((args.image.pad_height) -
-         ((args.image.pad_height) / (2 * (args.kernel.stride_h))));
-    cmd_init_raddr_flag = 1;
-    cmd_init_raddr_index =
-        0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) -
-        (int64_t)row_len_align *
-            ((args.image.pad_height) / (2 * args.kernel.stride_h));
-    cmd_init_raddr_col_0 =
-        0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) -
-        (int64_t)row_len_align *
-            ((args.image.pad_height) / (2 * (args.kernel.stride_h)));
-    cmd_init_raddr_col_1 = 0;
-  } else if (((args.image.pad_height) -
-              2 * ((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <=
-             2 * (args.kernel.stride_h)) {
-    cmd_init_raddr_cnt =
-        2 * (args.kernel.stride_h) *
-            (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
-             (2 * (args.kernel.stride_h))) -
-        (args.image.pad_height);
-    cmd_init_raddr_flag = 0;
-    cmd_init_raddr_index =
-        0 - (int64_t)row_len_align * (int64_t)(args.kernel.stride_h) *
-                (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
-                 (2 * (args.kernel.stride_h)));
-    cmd_init_raddr_col_0 =
-        0 -
-        (int64_t)row_len_align *
-            ((args.image.pad_height) / (2 * (args.kernel.stride_h))) -
-        (int64_t)row_len_align *
-            (2 * (args.kernel.stride_h) *
-                 (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
-                  (2 * (args.kernel.stride_h))) -
-             (args.image.pad_height));
-    cmd_init_raddr_col_1 = cmd_init_raddr_col_0;
-  }
 
-  if (cmd_ifm_batch_num_mns1 == 0) {
-    if ((args.kernel.height) <= (args.kernel.stride_h)) {
-      conv_ofm_buf_col_len = (args.image.height) + 2 * (args.image.pad_height) -
-                             3 * (args.kernel.stride_h);
-    } else {
-      conv_ofm_buf_col_len = (args.image.height) + 2 * (args.image.pad_height) -
-                             2 * (args.kernel.stride_h) - (args.kernel.height);
-    }
-    conv_ofm_buf_col_len_rem = conv_ofm_buf_col_len;
-  } else {
-    int N_rem = 0;
-    int row_rem = 0;
-
-    if ((args.kernel.height) <= (args.kernel.stride_h)) {
-      conv_ofm_buf_col_len = cmd_ifm_buf_col_len - 3 * (args.kernel.stride_h);
-      N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) /
-                  (args.kernel.stride_h) +
-              1;
-      row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem;
-      conv_ofm_buf_col_len_rem = cmd_ifm_buf_col_len_rem +
-                                 2 * (args.image.pad_height) + row_rem -
-                                 3 * (args.kernel.stride_h);
-    } else {
-      conv_ofm_buf_col_len = cmd_ifm_buf_col_len + 2 * (args.image.pad_height) -
-                             2 * (args.kernel.stride_h) - (args.kernel.height);
-      N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) /
-                  (args.kernel.stride_h) +
-              1;
-      row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem;
-      conv_ofm_buf_col_len_rem =
-          cmd_ifm_buf_col_len_rem + (args.image.pad_height) + row_rem -
-          2 * (args.kernel.stride_h) - (args.kernel.height);
-    }
-  }
-  //-----------------------  para functions --------------------------------
-  float filter_quant_scale_tmp = ((args.filter_scale_address)[1]);
-  float image_quant_scale_tmp = ((args.image.scale_address)[1]);
-
-  uint32_t cmd_filter_quant_scale =
-      *(uint32_t *)(&filter_quant_scale_tmp);  // NOLINT
-  uint32_t cmd_image_quant_scale =
-      *(uint32_t *)(&image_quant_scale_tmp);  // NOLINT
-
-  uint64_t wParallelsim = cmd_flt_N_impl >> 3;
-  uint64_t wParallelsim_num =
-      (uint64_t)(((args.filter_num) + cmd_flt_N_impl - 1) / cmd_flt_N_impl) - 1;
-  uint64_t win_size = (args.kernel.width) * (args.kernel.height) *
-                          (cmd_ifm_pack_num_per_row_mns1 + 1) -
-                      1;
-  uint64_t conv_ofm_width = (((args.image.width) - (args.kernel.width) +
-                              (args.image.pad_width) + (args.image.pad_width)) /
-                             (args.kernel.stride_w));
-  uint64_t conv_ofm_dma_length = cmd_flt_N_impl * sizeof(short);   // NOLINT
-  uint64_t conv_ofm_dma_stride = args.filter_num * sizeof(short);  // NOLINT
-  uint64_t conv_ofm_height_batch_tmp =
-      get_image_out_axis(args.image.height, args.image.pad_height,
-                         args.kernel.height, args.kernel.stride_h);
-  uint64_t conv_ofm_height_batch = (conv_ofm_height_batch_tmp + 1) / 2 - 1;
-  uint64_t o_ust_rst = 0;
-  uint64_t conv_ofm_dma_repeat =
-      (uint64_t)(((((args.image.width) - (args.kernel.width) +
-                    (args.image.pad_width) + (args.image.pad_width))) /
-                  (args.kernel.stride_w)) +
-                 1);
-  uint64_t conv_ofm_dma_offset =
-      args.filter_num * conv_ofm_dma_repeat * sizeof(short);  // NOLINT
-  uint64_t conv_ofm_inter_stride = conv_ofm_dma_offset * 2;
-  //----------------- register contation ------------------
-  uint64_t cmd_ifm_flt_base_addr = ((uint64_t)cmd_filter_vir_base_addr << 32) |
-                                   ((uint64_t)cmd_image_vir_base_addr);
-  uint64_t cmd_ifm_flt_dim = ((uint64_t)(args.kernel.height) << 48) |
-                             ((uint64_t)(args.kernel.width) << 32) |
-                             ((uint64_t)(args.image.height) << 16) |
-                             ((uint64_t)(args.image.width));
-  uint64_t cmd_pad_step_size = ((uint64_t)(args.kernel.stride_h) << 48) |
-                               ((uint64_t)(args.kernel.stride_w) << 32) |
-                               ((uint64_t)(args.image.pad_height) << 16) |
-                               ((uint64_t)(args.image.pad_width));
-  uint64_t cmd_param1 = ((uint64_t)cmd_filter_per_group << 48) |
-                        ((uint64_t)cmd_channel_num << 32) |
-                        ((uint64_t)filter_num << 16) |
-                        ((uint64_t)cmd_group_num);
-  uint64_t cmd_param2 =
-      ((uint64_t)cmd_flt_sqr_len << 48) | ((uint64_t)cmd_ifm_pack_len << 32) |
-      ((uint64_t)cmd_ifm_pre_row_num << 16) | ((uint64_t)cmd_channel_per_group);
-  uint64_t cmd_param3 = ((uint64_t)cmd_flt_batch_num_mns1 << 48) |
-                        ((uint64_t)cmd_flt_total_batch_num << 32) |
-                        ((uint64_t)cmd_flt_N_impl << 16) |
-                        ((uint64_t)cmd_flt_pre_batch_num);
-  uint64_t cmd_param4 = ((uint64_t)cmd_ifm_pack_num_per_row_mns1 << 48) |
-                        ((uint64_t)cmd_bn_num << 32) |
-                        ((uint64_t)cmd_bias_num << 16) |
-                        ((uint64_t)cmd_flt_N_len);
-  uint64_t cmd_param5 = ((uint64_t)cmd_ifm_stride_row_length << 48) |
-                        ((uint64_t)cmd_flt_pack_length << 32) |
-                        ((uint64_t)cmd_flt_cycle_num_mns1 << 16) |
-                        ((uint64_t)cmd_flt_pack_num_per_kernel_mns1);
-  uint64_t cmd_param6 = ((uint64_t)cmd_ofm_width_mns1 << 48) |
-                        ((uint64_t)cmd_ifm_batch_num_mns1 << 32) |
-                        ((uint64_t)cmd_ifm_buf_col_len << 16) |
-                        ((uint64_t)cmd_ifm_C_impl);
-  uint64_t cmd_param7 = ((uint64_t)conv_ofm_inter_stride << 32) |
-                        ((uint64_t)cmd_ifm_buf_col_len_rem << 16) |
-                        ((uint64_t)cmd_ofm_height);
-  uint64_t cmd_param8 =
-      ((uint64_t)cmd_flt_length << 32) | ((uint64_t)cmd_ifm_row_byte_length);
-  uint64_t cmd_ifm_flt_quant_scale =
-      (((uint64_t)cmd_filter_quant_scale) << 32) |
-      ((uint64_t)cmd_image_quant_scale);
-  uint64_t cmd_step_pad_mul_row_len =
-      ((uint64_t)cmd_pad_h_mul_row_byte_len << 32) |
-      ((uint64_t)cmd_step_h_mul_row_byte_len);
-  //---- ofm paras ----
-  uint64_t cmd_conv_param_reg = ((uint64_t)wParallelsim_num << 32) |
-                                ((uint64_t)wParallelsim << 16) |
-                                ((uint64_t)win_size);
-  uint64_t cmd_ofm_addr_width_reg =
-      ((uint64_t)conv_ofm_width << 32) | ((uint64_t)conv_ofm_addr_base);
-  uint64_t cmd_intra_stride_atoms_reg =
-      ((uint64_t)conv_ofm_dma_length << 32) | ((uint64_t)conv_ofm_dma_stride);
-  uint64_t cmd_ofm_height_batch_reg =
-      ((uint64_t)conv_ofm_buf_col_len_rem << 48) |
-      ((uint64_t)conv_ofm_buf_col_len << 32) |
-      ((uint64_t)conv_ofm_height_batch + 0x80000000);
-  uint64_t cmd_user_ctrl_reg = ((uint64_t)o_ust_rst);
-  uint64_t cmd_wdma_param_reg =
-      ((uint64_t)(conv_ofm_dma_repeat | 0x80000000) << 32) |
-      ((uint64_t)conv_ofm_dma_offset);
-
-  uint64_t cmd_init_raddr_reg = ((cmd_init_raddr_col_1 & 0xffff) << 48) |
-                                ((cmd_init_raddr_col_0 & 0xffff) << 32) |
-                                (((cmd_init_raddr_index & 0xffff) << 16)) |
-                                (cmd_init_raddr_flag & 0xffff) << 15 |
-                                ((cmd_init_raddr_cnt & 0xffff));
-
-  uint64_t cmd_para31 = (cmd_para31 & 0x1) | args.relu_enabled;
-
-  DLOG << "cmd_init_raddr_col_1 = " << hex << cmd_init_raddr_col_1;
-
-  DLOG << "cmd_init_raddr_col_0 = " << hex << cmd_init_raddr_col_0;
-  DLOG << "cmd_init_raddr_index = " << hex << cmd_init_raddr_index;  //
-  DLOG << "cmd_init_raddr_cnt = " << hex << cmd_init_raddr_cnt;
-  DLOG << "conv_ofm_height_batch = " << conv_ofm_height_batch;
-
-  DLOG << "cmd_ifm_flt_base_addr = " << hex << cmd_ifm_flt_base_addr;
-  DLOG << "cmd_scale_base_addr = " << hex << cmd_scale_base_addr;
-  DLOG << "cmd_ifm_flt_dim = " << hex << cmd_ifm_flt_dim;
-  DLOG << "cmd_pad_step_size = " << hex << cmd_pad_step_size;
-  DLOG << "cmd_param1 = " << hex << cmd_param1;
-  DLOG << "cmd_param2 = " << hex << cmd_param2;
-  DLOG << "cmd_param3 = " << hex << cmd_param3;
-  DLOG << "cmd_param4 = " << hex << cmd_param4;
-  DLOG << "cmd_param5 = " << hex << cmd_param5;
-  DLOG << "cmd_param6 = " << hex << cmd_param6;
-  DLOG << "cmd_param7 = " << hex << cmd_param7;
-  DLOG << "cmd_param8 =  " << hex << cmd_param8;
-  DLOG << "cmd_ifm_flt_quant_scale =  " << hex << cmd_ifm_flt_quant_scale;
-  DLOG << "cmd_step_pad_mul_row_len = " << hex << cmd_step_pad_mul_row_len;
-  DLOG << "cmd_ifm_pack_byte_length = " << hex << cmd_ifm_pack_byte_length;
-  DLOG << "cmd_conv_param_reg = " << hex << cmd_conv_param_reg;
-  DLOG << "cmd_ofm_addr_width_reg = " << hex << cmd_ofm_addr_width_reg;
-  DLOG << "cmd_intra_stride_atoms_reg = " << hex << cmd_intra_stride_atoms_reg;
-  DLOG << "cmd_init_raddr_reg = " << hex << cmd_init_raddr_reg;
-  DLOG << "cmd_ofm_height_batch_reg = " << hex << cmd_ofm_height_batch_reg;
-  DLOG << "cmd_wdma_param_reg = " << hex << cmd_wdma_param_reg;
-  DLOG << "cmd_para31 = " << hex << cmd_para31;
-
-  reg_writeq(cmd_ifm_flt_base_addr, MUL8(1));
-  reg_writeq(cmd_scale_base_addr, MUL8(2));
-  reg_writeq(cmd_ifm_flt_dim, MUL8(3));
-  reg_writeq(cmd_pad_step_size, MUL8(4));
-  reg_writeq(cmd_param1, MUL8(5));
-  reg_writeq(cmd_param2, MUL8(6));
-  reg_writeq(cmd_param3, MUL8(7));
-  reg_writeq(cmd_param4, MUL8(8));
-  reg_writeq(cmd_param5, MUL8(9));
-  reg_writeq(cmd_param6, MUL8(10));
-  reg_writeq(cmd_param7, MUL8(11));
-  reg_writeq(cmd_param8, MUL8(12));
-  reg_writeq(cmd_ifm_flt_quant_scale, MUL8(13));
-  reg_writeq(cmd_step_pad_mul_row_len, MUL8(14));
-  reg_writeq(cmd_ifm_pack_byte_length, MUL8(15));
-  reg_writeq(cmd_conv_param_reg, MUL8(16));
-  reg_writeq(cmd_ofm_addr_width_reg, MUL8(17));
-  reg_writeq(cmd_intra_stride_atoms_reg, MUL8(18));
-
-  reg_writeq(cmd_init_raddr_reg, MUL8(29));
-  reg_writeq(cmd_para31, MUL8(31));
-
-  reg_writeq(0, MUL8(19));
-  reg_writeq(cmd_ofm_height_batch_reg, MUL8(19));
-  reg_writeq(cmd_ofm_height_batch_reg & 0xffffffff00000000, MUL8(19));
-
-  reg_writeq(cmd_wdma_param_reg, MUL8(25));
-
-  reg_writeq(0, MUL8(0));
-  reg_writeq(0x4000000000000000, MUL8(0));
-
-  ret = fpga_regpoll(MUL8(48), CONV_DONE, 0xffffff);
-  if (ret == -1) {
-    DLOG << "fpga conv no interrupt!!";
-    return ret;
+  reg_writeq(reg_ActivationArgs,
+             REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);  // activation function
+
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);
+  reg_writeq(
+      ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
+      REG_CONV_IMAGE_PIXEL);
+  reg_writeq(
+      ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
+      REG_CONV_FILTER_PIXEL);
+
+  uint64_t output_height_fraction =
+      args.driver.output_height / ROW_PARALLEL_NUM;
+  uint64_t output_height_remainder =
+      args.driver.output_height % ROW_PARALLEL_NUM;
+  reg_writeq(args.driver.output_height | (output_height_fraction << 16) |
+                 (output_height_remainder << 26) |
+                 (args.driver.output_width << 32),
+             REG_CONV_RESULT_PIXEL);
+  reg_writeq(((uint64_t)args.image.pad_height) |
+                 (((uint64_t)args.image.pad_width) << 32),
+             REG_CONV_PAD_PIXEL);
+  reg_writeq(((uint64_t)args.kernel.stride_h) |
+                 (((uint64_t)args.kernel.stride_w) << 32),
+             REG_CONV_STEP_PIXEL);
+  reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
+  reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
+  reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
+  reg_writeq(*(uint64_t *)args.image.scale_address,  // NOLINT
+             REG_CONV_IMAGE_SCALE);
+  reg_writeq(*(uint64_t *)args.filter_scale_address,  // NOLINT
+             REG_CONV_FILTER_SCALE);
+  reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
+  reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
+  reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
+  reg_writeq(args.driver.output_address_phy, REG_CONV_RESULT_BASE_ADDR);
+  reg_writeq(args.driver.filter_per_group, REG_CONV_FILTER_PER_GROUP);
+  reg_writeq(args.driver.channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
+  reg_writeq(args.driver.image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
+  reg_writeq(args.driver.image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
+  reg_writeq(args.driver.filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
+  reg_writeq(args.driver.output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
+  reg_writeq(args.driver.image_block_amount_per_row, 0xca8);
+  reg_writeq(args.driver.filter_pad_width_mul_channel, 0xcb0);
+  reg_writeq(args.driver.image_amount_per_row_multi_win_first, 0xcb8);
+  reg_writeq(args.driver.image_amount_per_row_multi_win, 0xcc0);
+  reg_writeq(args.driver.image_block_num, 0xcc8);
+  reg_writeq(args.driver.image_block_len, 0xcd0);
+  reg_writeq(args.driver.image_block_len_last, 0xcd8);
+  reg_writeq(args.driver.image_win_cnt, 0xce0);
+  reg_writeq(args.driver.image_win_cnt_last, 0xce8);
+  reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8);
+  reg_writeq(args.driver.prog_full_cnt, 0xd08);
+  reg_writeq(args.driver.post_prog_full_cnt, 0xd10);
+  reg_writeq(args.driver.deconv_param, 0xd18);
+  reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
+  reg_writeq(args.driver.cmd, REG_CONV_CMD);
+  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
+    g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
+    ret = -EIO;
+    DLOG << "Conv Wait Irq Timeout!";
+    PADDLE_MOBILE_ENFORCE(0, "Conv Wait Irq Timeout");
   }
-  reg_readq(MUL8(63));
-
-  usleep(10);
-  float scale = Findfp16Max();
-  (args.output.scale_address)[0] = scale;                 // NOLINT
-  (args.output.scale_address)[1] = (float)(1.0 / scale);  // NOLINT
-  DLOG << "Findfp16Max scale = " << scale;
-  DLOG << "ret=" << ret;
+  // Read the scale register, swap its 32-bit halves, store as two floats.
+  output_scale = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = (output_scale << 32) | (output_scale >> 32);
+  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
+  // Clear the activation mode back to NONE, keeping the slope bits.
+  reg_writeq((uint64_t(NONE) << 32) | active_args.leaky_relu_negative_slope,
+             REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+
   return ret;
-}
+#endif
+  return 0;
+}  // ComputeBasicConv
 
 int ComputeFpgaPool(const struct PoolingArgs &args) {
 #ifdef FPGA_PRINT_MODE
@@ -577,6 +327,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
   DLOG << "   mode:" << args.mode
        << "   kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
   DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
        << "   image_channels:" << args.image.channels
        << "   image_height:" << args.image.height
        << "   image_width:" << args.image.width
@@ -586,996 +337,223 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
        << "   kernel_width:" << args.kernel.width
        << "   stride_h:" << args.kernel.stride_h
        << "   stride_w:" << args.kernel.stride_w;
-  DLOG << "   out_address:" << args.output.address;
-#endif
-#ifndef PADDLE_MOBILE_ZU5
-  return 0;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
 #endif
-
-  uint32_t filter_num_align = 0;
-  filter_num_align = args.image.channels;
-
-  DLOG << "______db_______: begin to set registers. ";
-  uint64_t ifm_pixel_num =
-      ((args.image.width) * (args.image.height) * args.image.channels);
-  uint64_t ifm_memory_size = ifm_pixel_num * sizeof(short);  // NOLINT
-  uint64_t flt_pixel_num = 0;
-  uint64_t filter_memory_size = 0;
-  //!! ???
-  uint64_t bn_pixel_num = (filter_num_align * 2);
-  uint64_t bn_memory_size = bn_pixel_num * sizeof(uint16_t);
-
-  uint64_t ofm_width =
-      ((args.image.width) + 2 * args.image.pad_width - args.kernel.width) /
-          (args.kernel.stride_w) +
-      1;
-  uint64_t ofm_height = ((args.image.height) + 2 * (args.image.pad_height) -
-                         (args.kernel.height)) /
-                            (args.kernel.stride_h) +
-                        1;
-
-  uint32_t filter_num = filter_num_align;
-  uint32_t image_channels = args.image.channels;
-
-  uint64_t ifm_src_paddr = vaddr_to_paddr((args.image.address));
-  uint64_t flt_src_paddr = 0;
-  uint64_t sb_src_paddr = 0;
-  uint64_t ifm_dst_paddr = vaddr_to_paddr((args.output.address));
-
-  /**********BN******************/
-  float image_inv_scale = 0;
-  float filter_inv_scale = 0;
-  int idx = 0;
-  DLOG << "______db_______: reset registers. ";
-  reg_writeq(1, MUL8(24));
-  usleep(1);
-  reg_writeq(0, MUL8(24));
-  /*********configuring registers*************/
-  uint32_t cmd_image_vir_base_addr = (uint32_t)ifm_src_paddr;
-  uint32_t cmd_filter_vir_base_addr = (uint32_t)flt_src_paddr;
-  uint32_t cmd_scale_base_addr = (uint32_t)sb_src_paddr;
-  uint32_t conv_ofm_addr_base = (uint32_t)ifm_dst_paddr;
-  uint64_t cmd_group_num = 1;  // args.group_num;
-  uint64_t cmd_filter_per_group = filter_num / cmd_group_num;
-
-  uint64_t cmd_flt_sqr_len = (args.kernel.width) * (args.kernel.height);
-  uint64_t cmd_ifm_pre_row_num = args.kernel.height;
-  if ((args.kernel.height == args.image.height) &&
-      (0 == args.image.pad_height)) {
-    cmd_ifm_pre_row_num = (args.kernel.height);
-  } else {
-    cmd_ifm_pre_row_num =
-        (args.kernel.height) - (args.image.pad_height) + (args.kernel.stride_h);
-  }
-  uint64_t cmd_flt_pre_batch_num = 1;
-  uint64_t cmd_ifm_pack_num_per_row_mns1 =
-      (uint64_t)(((args.image.channels) + 63) / 64) - 1;
-  uint64_t cmd_bn_num = filter_num;
-  uint64_t cmd_bias_num = filter_num;
-  uint64_t cmd_ifm_stride_row_length = args.image.width * args.kernel.stride_h;
-  uint64_t cmd_flt_pack_num_per_kernel_mns1 =
-      (uint64_t)(((args.image.channels) + 63) / 64) - 1;
-  uint64_t cmd_ofm_width_mns1 = (uint64_t)(
-      ((args.image.width) - (args.kernel.width) + 2 * (args.image.pad_width)) /
-      (args.kernel.stride_w));
-  uint64_t cmd_ofm_height =
-      (uint64_t)(((args.image.height) - (args.kernel.height) +
-                  2 * (args.image.pad_height)) /
-                 (args.kernel.stride_h)) +
-      1;
-
-  uint64_t cmd_channel_num = 0;
-  uint64_t cmd_ifm_pack_len = 0;
-  uint64_t cmd_channel_per_group = 0;
-  uint64_t cmd_flt_batch_num_mns1 = 0;
-  uint64_t cmd_flt_N_impl = 8;
-  uint64_t cmd_ifm_C_impl = 16;
-  uint64_t cmd_flt_pack_length = 0;
-  uint64_t cmd_step_h_mul_row_byte_len = 0;
-  uint64_t cmd_pad_h_mul_row_byte_len = 0;
-  uint64_t cmd_ifm_pack_byte_length = 16 * ((((args.image.width) + 7) / 8) * 8);
-  uint64_t row_len_align = args.image.width;
-  uint64_t cmd_flt_cycle_num_mns1 = 0;
-  if (image_channels > 32) {
-    cmd_channel_num = (uint64_t)((((args.image.channels) + 63)) / 64) * 64;
-    cmd_ifm_pack_len = 64 * (args.image.width);
-    cmd_channel_per_group = 64;
-    cmd_flt_batch_num_mns1 = (uint64_t)(((filter_num + 7)) / 8 - 1);
-    cmd_flt_N_impl = 8;
-    cmd_ifm_C_impl = 64;
-    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 64;
-    cmd_step_h_mul_row_byte_len =
-        (args.kernel.stride_h) * cmd_channel_num * args.image.width;
-    cmd_pad_h_mul_row_byte_len =
-        (args.image.pad_height) * cmd_channel_num * args.image.width;
-    cmd_ifm_pack_byte_length = 64 * args.image.width;
-    row_len_align = args.image.width * (cmd_ifm_pack_num_per_row_mns1 + 1);
-    cmd_flt_cycle_num_mns1 = (cmd_channel_num / 64) - 1;
-  } else if (image_channels > 16) {
-    cmd_channel_num = 32;
-    cmd_ifm_pack_len = 32 * (args.image.width);
-    cmd_channel_per_group = 32;
-    cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1);
-    cmd_flt_N_impl = 16;
-    cmd_ifm_C_impl = 32;
-    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 32;
-    cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num *
-                                  ((((args.image.width) + 1)) / 2) * 2;
-    cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num *
-                                 ((((args.image.width) + 1)) / 2) * 2;
-    cmd_ifm_pack_byte_length =
-        32 * (uint64_t)((((args.image.width) + 1)) / 2) * 2;
-    row_len_align = (uint64_t)((((args.image.width) + 1)) / 2);
-    cmd_flt_cycle_num_mns1 = 0;
-  } else if (image_channels > 8) {
-    cmd_channel_num = 16;
-    cmd_ifm_pack_len = 16 * (args.image.width);
-    cmd_channel_per_group = 16;
-    cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1);
-    cmd_flt_N_impl = 32;
-    cmd_ifm_C_impl = 16;
-    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 16;
-    cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num *
-                                  ((((args.image.width) + 3)) / 4) * 4;
-    cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num *
-                                 ((((args.image.width) + 3)) / 4) * 4;
-    cmd_ifm_pack_byte_length =
-        16 * (uint64_t)((((args.image.width) + 3)) / 4) * 4;
-    row_len_align = (uint64_t)((((args.image.width) + 3)) / 4);
-    cmd_flt_cycle_num_mns1 = 0;
-  }
-
-  cmd_flt_N_impl = 16;
-  cmd_flt_batch_num_mns1 = 0;
-  cmd_flt_pack_length = 64;
-  uint64_t cmd_flt_N_len = 0;
-  uint64_t cmd_flt_length = 64;
-
-  uint64_t cmd_ifm_row_byte_length = cmd_channel_num * (args.image.width);
-
-  uint64_t cmd_ifm_buf_col_len = 0;
-
-  uint64_t ifm_one_batch_len =
-      (1048576 / ((args.image.width) * cmd_channel_num));
-  uint64_t cmd_ifm_batch_num_tmp = (uint64_t)(
-      ((args.image.height) + ifm_one_batch_len - 1) / ifm_one_batch_len);
-  if (1 == cmd_ifm_batch_num_tmp) {
-    cmd_ifm_buf_col_len = args.image.height;
-  } else {
-    if (((args.image.height) / (cmd_ifm_batch_num_tmp) % 2) == 0) {
-      cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp;
-    } else {
-      cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp - 1;
-    }
-  }
-  uint64_t cmd_ifm_batch_num_mns1 =
-      (((args.image.height) + cmd_ifm_buf_col_len - 1) / cmd_ifm_buf_col_len) -
-      1;
-
-  uint64_t cmd_flt_total_batch_num = 1;
-  uint64_t cmd_ifm_buf_col_len_rem =
-      (args.image.height) -
-      cmd_ifm_batch_num_mns1 * cmd_ifm_buf_col_len;  //= -4;
-
-  //-------- ofm batch number reg &&  initial URAM reading address
-  uint64_t cmd_init_raddr_cnt = 1;
-  uint64_t cmd_init_raddr_flag = 0;
-  int64_t cmd_init_raddr_index = -8;
-  int64_t cmd_init_raddr_col_0 = -4;
-  int64_t cmd_init_raddr_col_1 = -4;
-  int64_t conv_ofm_buf_col_len = 0;
-  int64_t conv_ofm_buf_col_len_rem = 0;
-
-  if (((args.image.pad_height) % (2 * (args.kernel.stride_h))) == 0) {
-    cmd_init_raddr_cnt = 0;
-    cmd_init_raddr_flag = 0;
-    cmd_init_raddr_index =
-        0 - (int64_t)row_len_align * (((args.image.pad_height) + 1) / 2);
-    cmd_init_raddr_col_0 = cmd_init_raddr_index;
-    cmd_init_raddr_col_1 = cmd_init_raddr_index;
-  } else if (((args.image.pad_height) -
-              2 * ((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <=
-             (args.kernel.stride_h)) {
-    cmd_init_raddr_cnt =
-        (args.kernel.stride_h) -
-        ((args.image.pad_height) -
-         ((args.image.pad_height) / (2 * (args.kernel.stride_h))));
-    cmd_init_raddr_flag = 1;
-    cmd_init_raddr_index =
-        0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) -
-        (int64_t)row_len_align *
-            ((args.image.pad_height) / (2 * args.kernel.stride_h));
-    cmd_init_raddr_col_0 =
-        0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) -
-        (int64_t)row_len_align *
-            ((args.image.pad_height) / (2 * (args.kernel.stride_h)));
-    cmd_init_raddr_col_1 =
-        cmd_init_raddr_col_0 + args.kernel.stride_h * (int64_t)row_len_align;
-  } else if (((args.image.pad_height) -
-              2 * ((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <=
-             2 * (args.kernel.stride_h)) {
-    cmd_init_raddr_cnt =
-        2 * (args.kernel.stride_h) *
-            (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
-             (2 * (args.kernel.stride_h))) -
-        (args.image.pad_height);
-    cmd_init_raddr_flag = 0;
-    cmd_init_raddr_index =
-        0 - (int64_t)row_len_align * (int64_t)(args.kernel.stride_h) *
-                (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
-                 (2 * (args.kernel.stride_h)));
-    cmd_init_raddr_col_0 =
-        0 -
-        (int64_t)row_len_align *
-            ((args.image.pad_height) / (2 * (args.kernel.stride_h))) -
-        (int64_t)row_len_align *
-            (2 * (args.kernel.stride_h) *
-                 (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
-                  (2 * (args.kernel.stride_h))) -
-             (args.image.pad_height));
-    cmd_init_raddr_col_1 = cmd_init_raddr_col_0;
-  }
-
-  if (cmd_ifm_batch_num_mns1 == 0) {
-    if ((args.kernel.height) <= (args.kernel.stride_h)) {
-      conv_ofm_buf_col_len = (args.image.height) + 2 * (args.image.pad_height) -
-                             3 * (args.kernel.stride_h);
-    } else {
-      conv_ofm_buf_col_len = (args.image.height) + 2 * (args.image.pad_height) -
-                             2 * (args.kernel.stride_h) - (args.kernel.height);
-    }
-    conv_ofm_buf_col_len_rem = conv_ofm_buf_col_len;
-  } else {
-    int N_rem = 0;
-    int row_rem = 0;
-
-    if ((args.kernel.height) <= (args.kernel.stride_h)) {
-      conv_ofm_buf_col_len = cmd_ifm_buf_col_len - 3 * (args.kernel.stride_h);
-      N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) /
-                  (args.kernel.stride_h) +
-              1;
-      row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem;
-      conv_ofm_buf_col_len_rem = cmd_ifm_buf_col_len_rem +
-                                 2 * (args.image.pad_height) + row_rem -
-                                 3 * (args.kernel.stride_h);
-    } else {
-      conv_ofm_buf_col_len = cmd_ifm_buf_col_len + 2 * (args.image.pad_height) -
-                             2 * (args.kernel.stride_h) - (args.kernel.height);
-      N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) /
-                  (args.kernel.stride_h) +
-              1;
-      row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem;
-      conv_ofm_buf_col_len_rem =
-          cmd_ifm_buf_col_len_rem + (args.image.pad_height) + row_rem -
-          2 * (args.kernel.stride_h) - (args.kernel.height);
-    }
-  }
-
-  //-----------------------  para functions --------------------------------
-  uint64_t cmd_filter_quant_scale = 0x3c00;
-  uint64_t cmd_image_quant_scale = 0x3c00;
-  uint64_t wParallelsim = cmd_ifm_C_impl >> 3;
-  uint64_t wParallelsim_num = cmd_flt_cycle_num_mns1;
-  uint64_t win_size = (args.kernel.width) * (args.kernel.height) *
-                          (cmd_ifm_pack_num_per_row_mns1 + 1) -
-                      1;  //
-  uint64_t conv_ofm_width = (((args.image.width) - (args.kernel.width) +
-                              (args.image.pad_width) + (args.image.pad_width)) /
-                             (args.kernel.stride_w));
-  uint64_t conv_ofm_dma_length = cmd_channel_num * sizeof(short);  // NOLINT
-  uint64_t conv_ofm_dma_stride = conv_ofm_dma_length;
-  uint64_t conv_ofm_height_batch_tmp =
-      (args.image.height + 2 * args.image.pad_height - args.kernel.height) /
+#ifdef PADDLE_MOBILE_ZU5
+  DLOG << "Polling";
+  // return 0;
+  uint64_t output_scale = 0;
+  uint64_t timer_cnt = 0;
+  int ret = 0;
+  uint64_t cmd = 0;
+  uint64_t image_physical_address = 0;
+  uint64_t output_physical_address = 0;
+
+  uint64_t reg_ActivationArgs = 0;
+  // activation function: {none, leaky_relu, sigmoid, tanh}
+  ActivationArgs active_args;
+  // active_args.activation_type = LEAKYRELU;
+  active_args.activation_type = args.output.activation.activation_type;
+
+  active_args.leaky_relu_negative_slope =
+      args.output.activation.leaky_relu_negative_slope;
+
+  reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
+                       active_args.leaky_relu_negative_slope;
+
+  DLOG << "   activation_type:" << active_args.activation_type
+       << "   leaky_relu_negative_slope:"
+       << active_args.leaky_relu_negative_slope;
+  DLOG << "   reg_ActivationArgs:" << reg_ActivationArgs;
+
+  image_physical_address = vaddr_to_paddr_driver(args.image.address);
+  output_physical_address = vaddr_to_paddr_driver(args.output.address);
+  uint32_t output_height = (uint32_t)(
+      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
           args.kernel.stride_h +
-      1;
-
-  uint64_t conv_ofm_height_batch = (conv_ofm_height_batch_tmp + 1) / 2 - 1;
-  uint64_t o_ust_rst = 0;
-  uint64_t conv_ofm_dma_repeat =
-      (uint64_t)(((((args.image.width) - (args.kernel.width) +
-                    (args.image.pad_width) + (args.image.pad_width))) /
-                  (args.kernel.stride_w)) +
-                 1);
-  uint64_t conv_ofm_dma_offset =
-      args.image.channels * conv_ofm_dma_repeat * sizeof(short);  // NOLINT
-  uint64_t conv_ofm_inter_stride = conv_ofm_dma_offset * 2;
-  //----------------- register contation ------------------
-  uint64_t cmd_ifm_flt_base_addr = ((uint64_t)cmd_filter_vir_base_addr << 32) |
-                                   ((uint64_t)cmd_image_vir_base_addr);
-  uint64_t cmd_ifm_flt_dim = ((uint64_t)(args.kernel.height) << 48) |
-                             ((uint64_t)(args.kernel.width) << 32) |
-                             ((uint64_t)(args.image.height) << 16) |
-                             ((uint64_t)(args.image.width));
-  uint64_t cmd_pad_step_size = ((uint64_t)(args.kernel.stride_h) << 48) |
-                               ((uint64_t)(args.kernel.stride_w) << 32) |
-                               ((uint64_t)(args.image.pad_height) << 16) |
-                               ((uint64_t)(args.image.pad_width));
-  uint64_t cmd_param1 = ((uint64_t)cmd_filter_per_group << 48) |
-                        ((uint64_t)cmd_channel_num << 32) |
-                        ((uint64_t)filter_num << 16) |
-                        ((uint64_t)cmd_group_num);
-  uint64_t cmd_param2 =
-      ((uint64_t)cmd_flt_sqr_len << 48) | ((uint64_t)cmd_ifm_pack_len << 32) |
-      ((uint64_t)cmd_ifm_pre_row_num << 16) | ((uint64_t)cmd_channel_per_group);
-  uint64_t cmd_param3 = ((uint64_t)cmd_flt_batch_num_mns1 << 48) |
-                        ((uint64_t)cmd_flt_total_batch_num << 32) |
-                        ((uint64_t)cmd_flt_N_impl << 16) |
-                        ((uint64_t)cmd_flt_pre_batch_num);
-  uint64_t cmd_param4 = ((uint64_t)cmd_ifm_pack_num_per_row_mns1 << 48) |
-                        ((uint64_t)cmd_bn_num << 32) |
-                        ((uint64_t)cmd_bias_num << 16) |
-                        ((uint64_t)cmd_flt_N_len);
-  uint64_t cmd_param5 = ((uint64_t)cmd_ifm_stride_row_length << 48) |
-                        ((uint64_t)cmd_flt_pack_length << 32) |
-                        ((uint64_t)cmd_flt_cycle_num_mns1 << 16) |
-                        ((uint64_t)cmd_flt_pack_num_per_kernel_mns1);
-  uint64_t cmd_param6 = ((uint64_t)cmd_ofm_width_mns1 << 48) |
-                        ((uint64_t)cmd_ifm_batch_num_mns1 << 32) |
-                        ((uint64_t)cmd_ifm_buf_col_len << 16) |
-                        ((uint64_t)cmd_ifm_C_impl);
-  uint64_t cmd_param7 = ((uint64_t)conv_ofm_inter_stride << 32) |
-                        ((uint64_t)cmd_ifm_buf_col_len_rem << 16) |
-                        ((uint64_t)cmd_ofm_height);
-  uint64_t cmd_param8 =
-      ((uint64_t)cmd_flt_length << 32) | ((uint64_t)cmd_ifm_row_byte_length);
-  uint64_t cmd_ifm_flt_quant_scale = ((uint64_t)cmd_filter_quant_scale << 32) |
-                                     ((uint64_t)cmd_image_quant_scale);
-  uint64_t cmd_step_pad_mul_row_len =
-      ((uint64_t)cmd_pad_h_mul_row_byte_len << 32) |
-      ((uint64_t)cmd_step_h_mul_row_byte_len);
-  //---- ofm paras ----
-  uint64_t cmd_conv_param_reg = ((uint64_t)wParallelsim_num << 32) |
-                                ((uint64_t)wParallelsim << 16) |
-                                ((uint64_t)win_size);
-  uint64_t cmd_ofm_addr_width_reg =
-      ((uint64_t)conv_ofm_width << 32) | ((uint64_t)conv_ofm_addr_base);
-  uint64_t cmd_intra_stride_atoms_reg =
-      ((uint64_t)conv_ofm_dma_length << 32) | ((uint64_t)conv_ofm_dma_stride);
-  uint64_t cmd_ofm_height_batch_reg =
-      ((uint64_t)(conv_ofm_buf_col_len_rem & 0xffff) << 48) |
-      ((uint64_t)(conv_ofm_buf_col_len & 0xffff) << 32) |
-      ((uint64_t)conv_ofm_height_batch + 0x80000000);
-  uint64_t cmd_user_ctrl_reg = ((uint64_t)o_ust_rst);
-  uint64_t cmd_wdma_param_reg =
-      ((uint64_t)(conv_ofm_dma_repeat | 0x80000000) << 32) |
-      ((uint64_t)conv_ofm_dma_offset);
-  uint64_t cmd_init_raddr_reg = ((cmd_init_raddr_col_1 & 0xffff) << 48) |
-                                ((cmd_init_raddr_col_0 & 0xffff) << 32) |
-                                (((cmd_init_raddr_index & 0xffff) << 16)) |
-                                (cmd_init_raddr_flag & 0xffff) << 15 |
-                                ((cmd_init_raddr_cnt & 0xffff));
-
-  DLOG << "cmd_init_raddr_col_1 = " << hex << cmd_init_raddr_col_1;
-
-  DLOG << "cmd_init_raddr_col_0 = " << hex << cmd_init_raddr_col_0;
-  DLOG << "cmd_init_raddr_index = " << hex << cmd_init_raddr_index;  //
-  DLOG << "cmd_init_raddr_cnt = " << hex << cmd_init_raddr_cnt;
-  DLOG << "conv_ofm_buf_col_len = " << hex << conv_ofm_buf_col_len;
-  DLOG << "conv_ofm_buf_col_len_rem = " << hex << conv_ofm_buf_col_len_rem;
-  DLOG << "cmd_ifm_flt_base_addr = " << hex << cmd_ifm_flt_base_addr;
-  DLOG << "cmd_scale_base_addr = " << hex << cmd_scale_base_addr;
-  DLOG << "cmd_ifm_flt_dim = " << hex << cmd_ifm_flt_dim;
-  DLOG << "cmd_pad_step_size = " << hex << cmd_pad_step_size;
-  DLOG << "cmd_param1 = " << hex << cmd_param1;
-  DLOG << "cmd_param2 = " << hex << cmd_param2;
-  DLOG << "cmd_param3 = " << hex << cmd_param3;
-  DLOG << "cmd_param4 = " << hex << cmd_param4;
-  DLOG << "cmd_param5 = " << hex << cmd_param5;
-  DLOG << "cmd_param6 = " << hex << cmd_param6;
-  DLOG << "cmd_param7 = " << hex << cmd_param7;
-  DLOG << "cmd_param8 =  " << hex << cmd_param8;
-  DLOG << "cmd_ifm_flt_quant_scale =  " << hex << cmd_ifm_flt_quant_scale;
-  DLOG << "cmd_step_pad_mul_row_len = " << hex << cmd_step_pad_mul_row_len;
-  DLOG << "cmd_ifm_pack_byte_length = " << hex << cmd_ifm_pack_byte_length;
-  DLOG << "cmd_conv_param_reg = " << hex << cmd_conv_param_reg;
-  DLOG << "cmd_ofm_addr_width_reg = " << hex << cmd_ofm_addr_width_reg;
-  DLOG << "cmd_intra_stride_atoms_reg = " << hex << cmd_intra_stride_atoms_reg;
-  DLOG << "cmd_init_raddr_reg = " << hex << cmd_init_raddr_reg;
-  DLOG << "cmd_ofm_height_batch_reg = " << hex << cmd_ofm_height_batch_reg;
-  DLOG << "cmd_wdma_param_reg = " << hex << cmd_wdma_param_reg;
-  DLOG << "pooling_mode = " << hex << args.mode;
-
-  reg_writeq(cmd_ifm_flt_base_addr, MUL8(1));
-  reg_writeq(cmd_scale_base_addr, MUL8(2));
-  reg_writeq(cmd_ifm_flt_dim, MUL8(3));
-  reg_writeq(cmd_pad_step_size, MUL8(4));
-  reg_writeq(cmd_param1, MUL8(5));
-  reg_writeq(cmd_param2, MUL8(6));
-  reg_writeq(cmd_param3, MUL8(7));
-  reg_writeq(cmd_param4, MUL8(8));
-  reg_writeq(cmd_param5, MUL8(9));
-  reg_writeq(cmd_param6, MUL8(10));
-  reg_writeq(cmd_param7, MUL8(11));
-  reg_writeq(cmd_param8, MUL8(12));
-  reg_writeq(cmd_ifm_flt_quant_scale, MUL8(13));
-  reg_writeq(cmd_step_pad_mul_row_len, MUL8(14));
-  reg_writeq(cmd_ifm_pack_byte_length, MUL8(15));
-  reg_writeq(cmd_conv_param_reg, MUL8(16));
-  reg_writeq(cmd_ofm_addr_width_reg, MUL8(17));
-  reg_writeq(cmd_intra_stride_atoms_reg, MUL8(18));
-
-  reg_writeq(cmd_init_raddr_reg, MUL8(29));
-
-  reg_writeq(0, MUL8(19));
-  reg_writeq(cmd_ofm_height_batch_reg, MUL8(19));
-  reg_writeq(cmd_ofm_height_batch_reg & 0xffffffff00000000, MUL8(19));
-
-  reg_writeq(cmd_wdma_param_reg, MUL8(25));
-
-  /******************************************************************/
-  uint64_t cmd_mult_factor = ((uint64_t)args.kernel_reciprocal) |
-                             ((uint64_t)args.kernel_reciprocal << 16);
-  reg_writeq(cmd_mult_factor, MUL8(30));
-  /******************************************************************/
-
-  reg_writeq(0, MUL8(0));
-  if (args.mode == 0) {  // max pooling
-    reg_writeq(0x2200000000000000, MUL8(0));
-  } else {  // average pooling
-    reg_writeq(0x2400000000000000, MUL8(0));
-  }
-  int ret = -1;
-  ret = fpga_regpoll(MUL8(48), CONV_DONE, 0x00ffff);
-  if (ret == -1) {
-    DLOG << "fpga pooling no interrupt!!";
+      1);
+  uint32_t output_width = (uint32_t)(
+      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
+          args.kernel.stride_w +
+      1);
+  uint64_t image_amount_per_row =
+      align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
+                 IMAGE_ALIGNMENT);
+  uint64_t image_one_pad_per_row =
+      align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
+                 FILTER_ELEMENT_ALIGNMENT) +
+      (uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
+  uint64_t image_two_pad_per_row = align_to_x(
+      ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) *
+          (uint64_t)args.image.channels,
+      IMAGE_ALIGNMENT);
+  uint64_t image_row_mul_pooling_hight =
+      image_amount_per_row * (uint64_t)args.kernel.height;
+  uint64_t image_row_mul_pad_hight =
+      image_amount_per_row * (uint64_t)args.image.pad_height;
+  uint64_t image_row_mul_step_hight =
+      image_amount_per_row * (uint64_t)args.kernel.stride_h;
+  uint64_t result_amount_align_32 =
+      align_to_x((uint64_t)output_width * (uint64_t)args.image.channels,
+                 FILTER_ELEMENT_ALIGNMENT);
+  uint64_t result_amount_align_64 = align_to_x(
+      (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT);
+  uint64_t image_calcu_height =
+      (uint64_t)args.kernel.height +
+      ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h;
+  uint64_t image_pad_left = args.image.channels * args.image.pad_width;
+  uint64_t image_skip_window = args.image.channels * args.kernel.stride_w;
+  uint64_t image_padleft_skipwindow =
+      (image_skip_window << 32) | image_pad_left;
+  uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 |
+                             (((uint64_t)args.kernel_reciprocal));
+
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
+    ret = -EIO;
+    DLOG << "Conv Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
     return ret;
   }
-  reg_readq(MUL8(63));
-  usleep(10);
-  // get max value
-  float scale = Findfp16Max();
-  (args.output.scale_address)[0] = scale;                 // NOLINT
-  (args.output.scale_address)[1] = (float)(1.0 / scale);  // NOLINT
-  DLOG << "Findfp16Max scale = " << scale;
-  DLOG << "ret=" << ret;
-  return ret;
-}
 
-int get_ofm_batch_size(int width, int channel) {
-  int pad_channel, row_size;
-
-  if (64 < channel) {
-    pad_channel = (int)((channel + 127) / 128) * 128;  // NOLINT
-  } else if (32 < channel && channel <= 64) {
-    pad_channel = ((channel + 63) / (64)) * 64;
-  } else if (16 < channel && channel <= 32) {
-    pad_channel = ((channel + 31) / (32)) * 32;
-  } else if (channel <= 16) {
-    pad_channel = ((channel + 15) / (16)) * 16;
+  reg_writeq(reg_ActivationArgs,
+             REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);  // activation function
+
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);
+  reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
+  reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
+  reg_writeq(
+      ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
+      REG_POOLING_IMAGE_PIXEL);
+  reg_writeq(
+      ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
+      REG_POOLING_WINDOW_SIZE);
+  reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32),
+             REG_POOLING_RESULT_PIXEL);
+  reg_writeq(((uint64_t)args.image.pad_height) |
+                 (((uint64_t)args.image.pad_width) << 32),
+             REG_POOLING_PAD_PIXEL);
+  reg_writeq(((uint64_t)args.kernel.stride_h) |
+                 (((uint64_t)args.kernel.stride_w) << 32),
+             REG_POOLING_STEP_PIXEL);
+  reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER);
+  reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW);
+  reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW);
+  reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW);
+  reg_writeq(image_row_mul_pooling_hight,
+             REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT);
+  reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT);
+  reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT);
+  reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32);
+  reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64);
+  reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT);
+  reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW);
+  reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL);
+  reg_writeq(cmd, REG_POOLING_CMD);
+
+  DLOG << "before reg poll";
+  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
+    g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
+    ret = -EIO;
+    DLOG << "Pooling Wait Irq Timeout!";
+    PADDLE_MOBILE_ENFORCE(0, "Pooling Wait Irq Timeout!");
   }
+  DLOG << "after reg poll";
 
-  row_size = pad_channel * width;
+  // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = (output_scale << 32) | (output_scale >> 32);
+  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
 
-  return row_size;
-}
+  active_args.activation_type = NONE;
+  reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
+
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+
+  return ret;
+#endif
+  return 0;
+}  // ComputeFpgaPool
 
 int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
 #ifdef FPGA_PRINT_MODE
   DLOG << "=============ComputeFpgaEWAdd===========";
-  DLOG << "   relu_enabled:" << args.relu_enabled
-       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
+  // DLOG << "   relu_enabled:" << args.relu_enabled
+  DLOG << "   const0:" << fp16_2_fp32(int16_t(args.const0))
        << "   const1:" << fp16_2_fp32(int16_t(args.const1));
   DLOG << "   image0_address:" << args.image0.address
+       << "   image0_scale_address:" << args.image0.scale_address
        << "   image0_channels:" << args.image0.channels
        << "   image0_height:" << args.image0.height
-       << "   image0_width:" << args.image0.width;
+       << "   image0_width:" << args.image0.width
+       << "   pad0_height:" << args.image0.pad_height
+       << "   pad0_width:" << args.image0.pad_width;
   DLOG << "   image1_address:" << args.image1.address
+       << "   image1_scale_address:" << args.image1.scale_address
        << "   image1_channels:" << args.image1.channels
        << "   image1_height:" << args.image1.height
-       << "   image1_width:" << args.image1.width;
-  DLOG << "   out_address:" << args.output.address;
-#endif
-#ifndef PADDLE_MOBILE_ZU5
-  return 0;
+       << "   image1_width:" << args.image1.width
+       << "   pad1_height:" << args.image1.pad_height
+       << "   pad_width:" << args.image1.pad_width;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
 #endif
-  uint32_t filter_num_align = args.image0.channels;
-
-  uint32_t const_kernel_width_1 = 1;
-  uint32_t const_stride_width_1 = 1;
-  uint32_t const_kernel_height_2 = 2;
-  uint32_t const_stride_height_2 = 2;
-  uint32_t const_pad_height_0 = 0;
-  uint32_t const_pad_width_0 = 0;
-  uint32_t ew_image_height = args.image0.height * 2;
-
-  DLOG << "______db_______: begin to set registers. ";
-  uint64_t ifm_pixel_num =
-      ((args.image0.width) * (args.image0.height) * args.image0.channels);
-  uint64_t ifm_memory_size = ifm_pixel_num * sizeof(short);  // NOLINT
-  uint64_t flt_pixel_num = 0;
-  uint64_t filter_memory_size = 0;
-  uint64_t bn_pixel_num = (filter_num_align * 2);
-  uint64_t bn_memory_size = bn_pixel_num * sizeof(uint16_t);
-
-  uint64_t ofm_width =
-      ((args.image0.width) + 2 * const_pad_width_0 - const_kernel_width_1) /
-          (const_stride_width_1) +
-      1;
-  uint64_t ofm_height =
-      ((ew_image_height) + 2 * (const_pad_height_0) - (const_kernel_height_2)) /
-          (const_stride_height_2) +
-      1;
-
-  uint32_t filter_num = filter_num_align;
-  uint32_t image_channels = args.image0.channels;
-
-  uint64_t ifm_src_paddr = vaddr_to_paddr((args.image0.address));
-  uint64_t flt_src_paddr = vaddr_to_paddr((args.image1.address));
-  uint64_t ifm_dst_paddr = vaddr_to_paddr((args.output.address));
-  float image_inv_scale = 0;
-  float filter_inv_scale = 0;
-  int idx = 0;
-
-  DLOG << "______db_______: reset registers. ";
-
-  reg_writeq(1, MUL8(24));
-  usleep(1);
-  reg_writeq(0, MUL8(24));
-
-  /*********configuring registers*************/
-  uint32_t cmd_image_vir_base_addr = (uint32_t)ifm_src_paddr;
-  uint32_t cmd_filter_vir_base_addr = (uint32_t)flt_src_paddr;
-  uint32_t cmd_scale_base_addr = 0;
-  uint32_t conv_ofm_addr_base = (uint32_t)ifm_dst_paddr;
-  uint64_t cmd_group_num = 1;
-  uint64_t cmd_filter_per_group = filter_num / cmd_group_num;
-
-  uint64_t cmd_flt_sqr_len = (const_kernel_width_1) * (const_kernel_height_2);
-  uint64_t cmd_ifm_pre_row_num = const_kernel_height_2;
-  if ((const_kernel_height_2 == ew_image_height) && (0 == const_pad_height_0)) {
-    cmd_ifm_pre_row_num = (const_kernel_height_2);
-  } else {
-    cmd_ifm_pre_row_num = (const_kernel_height_2) - (const_pad_height_0) +
-                          (const_stride_height_2);
-  }
-  uint64_t cmd_flt_pre_batch_num = 1;
-  uint64_t cmd_ifm_pack_num_per_row_mns1 =
-      (uint64_t)(((args.image0.channels) + 63) / 64) - 1;
-  uint64_t cmd_bn_num = filter_num;
-  uint64_t cmd_bias_num = filter_num;
-  uint64_t cmd_ifm_stride_row_length =
-      args.image0.width * const_stride_height_2;
-  uint64_t cmd_flt_pack_num_per_kernel_mns1 =
-      (uint64_t)(((args.image0.channels) + 63) / 64) - 1;
-  uint64_t cmd_ofm_width_mns1 = (uint64_t)(
-      ((args.image0.width) - (const_kernel_width_1) + 2 * (const_pad_width_0)) /
-      (const_stride_width_1));
-  uint64_t cmd_ofm_height =
-      (uint64_t)(((args.image0.height) * 2 - (const_kernel_height_2) +
-                  2 * (const_pad_height_0)) /
-                 (const_stride_height_2)) +
-      1;
-
-  uint64_t cmd_channel_num = 0;
-  uint64_t cmd_ifm_pack_len = 0;
-  uint64_t cmd_channel_per_group = 0;
-  uint64_t cmd_flt_batch_num_mns1 = 0;
-  uint64_t cmd_flt_N_impl = 8;
-  uint64_t cmd_ifm_C_impl = 16;
-  uint64_t cmd_flt_pack_length = 0;
-  uint64_t cmd_step_h_mul_row_byte_len = 0;
-  uint64_t cmd_pad_h_mul_row_byte_len = 0;
-  uint64_t cmd_ifm_pack_byte_length =
-      16 * ((((args.image0.width) + 7) / 8) * 8);
-  uint64_t row_len_align = args.image0.width;
-  uint64_t cmd_flt_cycle_num_mns1 = 0;
-  if (image_channels > 32) {
-    cmd_channel_num = (uint64_t)((((args.image0.channels) + 63)) / 64) * 64;
-    cmd_ifm_pack_len = 64 * (args.image0.width);
-    cmd_channel_per_group = 64;
-    cmd_flt_batch_num_mns1 = (uint64_t)(((filter_num + 7)) / 8 - 1);
-    cmd_flt_N_impl = 8;
-    cmd_ifm_C_impl = 64;
-    cmd_flt_pack_length = (const_kernel_width_1) * (const_kernel_height_2)*64;
-    cmd_step_h_mul_row_byte_len =
-        (const_stride_height_2)*cmd_channel_num * args.image0.width;
-    cmd_pad_h_mul_row_byte_len =
-        (const_pad_height_0)*cmd_channel_num * args.image0.width;
-    cmd_ifm_pack_byte_length = 64 * args.image0.width;
-    row_len_align = args.image0.width;
-    cmd_flt_cycle_num_mns1 = (cmd_channel_num / 64) - 1;
-  } else if (image_channels > 16) {
-    cmd_channel_num = 32;
-    cmd_ifm_pack_len = 32 * (args.image0.width);
-    cmd_channel_per_group = 32;
-    cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1);
-    cmd_flt_N_impl = 16;
-    cmd_ifm_C_impl = 32;
-    cmd_flt_pack_length = (const_kernel_width_1) * (const_kernel_height_2)*32;
-    cmd_step_h_mul_row_byte_len = (const_stride_height_2)*cmd_channel_num *
-                                  ((((args.image0.width) + 1)) / 2) * 2;
-    cmd_pad_h_mul_row_byte_len = (const_pad_height_0)*cmd_channel_num *
-                                 ((((args.image0.width) + 1)) / 2) * 2;
-    cmd_ifm_pack_byte_length =
-        32 * (uint64_t)((((args.image0.width) + 1)) / 2) * 2;
-    row_len_align = (uint64_t)((((args.image0.width) + 1)) / 2);
-    cmd_flt_cycle_num_mns1 = 0;
-  } else if (image_channels > 8) {
-    cmd_channel_num = 16;
-    cmd_ifm_pack_len = 16 * (args.image0.width);
-    cmd_channel_per_group = 16;
-    cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1);
-    cmd_flt_N_impl = 32;
-    cmd_ifm_C_impl = 16;
-    cmd_flt_pack_length = (const_kernel_width_1) * (const_kernel_height_2)*16;
-    cmd_step_h_mul_row_byte_len = (const_stride_height_2)*cmd_channel_num *
-                                  ((((args.image0.width) + 3)) / 4) * 4;
-    cmd_pad_h_mul_row_byte_len = (const_pad_height_0)*cmd_channel_num *
-                                 ((((args.image0.width) + 3)) / 4) * 4;
-    cmd_ifm_pack_byte_length =
-        16 * (uint64_t)((((args.image0.width) + 3)) / 4) * 4;
-    row_len_align = (uint64_t)((((args.image0.width) + 3)) / 4);
-    cmd_flt_cycle_num_mns1 = 0;
-  }
-
-  cmd_flt_N_impl = 16;
-  cmd_flt_batch_num_mns1 = 0;
-  cmd_flt_pack_length = 64;
-  uint64_t cmd_flt_N_len = 0;
-  uint64_t cmd_flt_length = 64;
-  uint64_t cmd_ifm_row_byte_length = cmd_channel_num * (args.image0.width);
-  uint64_t cmd_ifm_buf_col_len = 0;
-  uint64_t ifm_one_batch_len =
-      (1048576 / ((2 * row_len_align) * cmd_channel_num));
-  uint64_t cmd_ifm_batch_num_tmp = (uint64_t)(
-      ((ew_image_height) + ifm_one_batch_len - 1) / ifm_one_batch_len);
-  DLOG << "ifm_one_batch_len = " << hex << ifm_one_batch_len;
-  DLOG << "cmd_ifm_batch_num_tmp = " << hex << cmd_ifm_batch_num_tmp;
-
-  if (1 == cmd_ifm_batch_num_tmp) {
-    cmd_ifm_buf_col_len = ew_image_height;
-  } else {
-    cmd_ifm_buf_col_len = ifm_one_batch_len;
-  }
-  uint64_t cmd_ifm_batch_num_mns1 =
-      (((ew_image_height) + cmd_ifm_buf_col_len - 1) / cmd_ifm_buf_col_len) - 1;
-  DLOG << "___db____ew____:cmd_ifm_batch_num_mns1 = " << hex
-       << cmd_ifm_batch_num_mns1;
-
-  uint64_t cmd_flt_total_batch_num = 1;
-  uint64_t cmd_ifm_buf_col_len_rem =
-      (ew_image_height)-cmd_ifm_batch_num_mns1 * cmd_ifm_buf_col_len;
-  //-------- ofm batch number reg &&  initial URAM reading address
-  // logic-----------------
-  uint64_t cmd_init_raddr_cnt = 1;
-  uint64_t cmd_init_raddr_flag = 0;
-  int64_t cmd_init_raddr_index = -8;
-  int64_t cmd_init_raddr_col_0 = -4;
-  int64_t cmd_init_raddr_col_1 = -4;
-  int64_t conv_ofm_buf_col_len = 0;
-  int64_t conv_ofm_buf_col_len_rem = 0;
-
-  if (((const_pad_height_0) % (2 * (const_stride_height_2))) == 0) {
-    cmd_init_raddr_cnt = 0;
-    cmd_init_raddr_flag = 0;
-    cmd_init_raddr_index =
-        0 - (int64_t)row_len_align * (((const_pad_height_0) + 1) / 2);
-    cmd_init_raddr_col_0 = cmd_init_raddr_index;
-    cmd_init_raddr_col_1 = cmd_init_raddr_index;
-  } else if (((const_pad_height_0)-2 *
-              ((const_pad_height_0) / (2 * (const_stride_height_2)))) <=
-             (const_stride_height_2)) {
-    cmd_init_raddr_cnt =
-        (const_stride_height_2) -
-        ((const_pad_height_0) -
-         ((const_pad_height_0) / (2 * (const_stride_height_2))));
-    cmd_init_raddr_flag = 1;
-    cmd_init_raddr_index =
-        0 - (int64_t)row_len_align * (int64_t)(const_pad_height_0) -
-        (int64_t)row_len_align *
-            ((const_pad_height_0) / (2 * const_stride_height_2));
-    cmd_init_raddr_col_0 =
-        0 - (int64_t)row_len_align * (int64_t)(const_pad_height_0) -
-        (int64_t)row_len_align *
-            ((const_pad_height_0) / (2 * (const_stride_height_2)));
-    cmd_init_raddr_col_1 =
-        cmd_init_raddr_col_0 +
-        const_stride_height_2 * (int64_t)row_len_align;  // 0;
-  } else if (((const_pad_height_0)-2 *
-              ((const_pad_height_0) / (2 * (const_stride_height_2)))) <=
-             2 * (const_stride_height_2)) {
-    cmd_init_raddr_cnt =
-        2 * (const_stride_height_2) *
-            (((const_pad_height_0) + 2 * (const_stride_height_2)-1) /
-             (2 * (const_stride_height_2))) -
-        (const_pad_height_0);
-    cmd_init_raddr_flag = 0;
-    cmd_init_raddr_index =
-        0 - (int64_t)row_len_align * (int64_t)(const_stride_height_2) *
-                (((const_pad_height_0) + 2 * (const_stride_height_2)-1) /
-                 (2 * (const_stride_height_2)));
-    cmd_init_raddr_col_0 =
-        0 -
-        (int64_t)row_len_align *
-            ((const_pad_height_0) / (2 * (const_stride_height_2))) -
-        (int64_t)row_len_align *
-            (2 * (const_stride_height_2) *
-                 (((const_pad_height_0) + 2 * (const_stride_height_2)-1) /
-                  (2 * (const_stride_height_2))) -
-             (const_pad_height_0));
-    cmd_init_raddr_col_1 = cmd_init_raddr_col_0;
-  }
-
-  if (cmd_ifm_batch_num_mns1 == 0) {
-    if ((const_kernel_height_2) <= (const_stride_height_2)) {
-      conv_ofm_buf_col_len = cmd_ifm_buf_col_len + 2 * (const_pad_height_0)-3 *
-                                                       (const_stride_height_2);
-    } else {
-      conv_ofm_buf_col_len =
-          cmd_ifm_buf_col_len +
-          2 * (const_pad_height_0)-3 * (const_stride_height_2) -
-          (const_kernel_height_2);
-    }
-    conv_ofm_buf_col_len_rem = conv_ofm_buf_col_len;
-  } else {
-    int N_rem = 0;
-    int row_rem = 0;
-
-    if ((const_kernel_height_2) <= (const_stride_height_2)) {
-      conv_ofm_buf_col_len = cmd_ifm_buf_col_len - 3 * (const_stride_height_2);
-      N_rem = (cmd_ifm_buf_col_len - (const_kernel_height_2)) /
-                  (const_stride_height_2) +
-              1;
-      row_rem = cmd_ifm_buf_col_len - (const_stride_height_2)*N_rem;
-      conv_ofm_buf_col_len_rem = cmd_ifm_buf_col_len_rem +
-                                 2 * (const_pad_height_0) + row_rem -
-                                 3 * (const_stride_height_2);
-    } else {
-      conv_ofm_buf_col_len =
-          cmd_ifm_buf_col_len +
-          2 * (const_pad_height_0)-3 * (const_stride_height_2) -
-          (const_kernel_height_2);
-      N_rem = (cmd_ifm_buf_col_len - (const_kernel_height_2)) /
-                  (const_stride_height_2) +
-              1;
-      row_rem = cmd_ifm_buf_col_len - (const_stride_height_2)*N_rem;
-      conv_ofm_buf_col_len_rem =
-          cmd_ifm_buf_col_len_rem + (const_pad_height_0) + row_rem -
-          3 * (const_stride_height_2) - (const_kernel_height_2);
-    }
-  }
-
-  //*************************
-  uint64_t ifm_height_raw_batch = 0;
-  uint64_t cmd_ofm_height_batch_reg;
-  uint64_t conv_ofm_height_batch_tmp = 0;
-  uint64_t conv_ofm_height_batch[16];
-  int ofm_height_norm_batch;
-  int height_batch_num;
-
-  int row_norm_size = get_ofm_batch_size(args.image0.width, cmd_channel_num);
-  int ifm_norm_size =
-      ew_image_height * row_norm_size * sizeof(short);  // NOLINT
-
-  if (ifm_norm_size <= (1024 * 1024)) {
-    conv_ofm_height_batch[0] =
-        get_image_out_axis(ew_image_height, const_pad_height_0,
-                           const_kernel_height_2, const_stride_height_2);
-    height_batch_num = 0;
-  } else if (row_norm_size < (1024 * 1024)) {
-    // raw ifm batch ,should make ofm be 2*N
-    ifm_height_raw_batch =
-        (int)(((double)(1024 * 1024) - row_norm_size + 1) /  // NOLINT
-              (double)(2 * row_norm_size));                  // NOLINT
-    ofm_height_norm_batch = get_image_out_axis(
-        ifm_height_raw_batch, 0, const_kernel_height_2, const_stride_height_2);
-    if (ofm_height_norm_batch % 2 == 0) {
-      ofm_height_norm_batch = ofm_height_norm_batch;
-    } else {
-      ofm_height_norm_batch = ofm_height_norm_batch - 1;
-    }
-
-    DLOG << "ofm_height_norm_batch = " << hex << ofm_height_norm_batch;
-    int ofm_height_rems = cmd_ofm_height;
-    int i = 0;
-    for (i = 0; 0 < ofm_height_rems; i++) {
-      if (ofm_height_norm_batch <= ofm_height_rems) {
-        ofm_height_rems = ofm_height_rems - ofm_height_norm_batch;
-        conv_ofm_height_batch[i] = ofm_height_norm_batch;
-        DLOG << "ofm_height_norm_batch[i] = " << hex
-             << conv_ofm_height_batch[i];
-      } else {
-        conv_ofm_height_batch[i] = ofm_height_rems;
-        break;
-      }
-    }
-    height_batch_num = i;
-  }
-  //*************************
-
-  //-----------------------  para functions --------------------------------
-  uint64_t cmd_filter_quant_scale = 0x3c00;
-  uint64_t cmd_image_quant_scale = 0x3c00;
-  uint64_t wParallelsim = cmd_ifm_C_impl >> 3;
-  uint64_t wParallelsim_num = cmd_flt_cycle_num_mns1;
-  uint64_t win_size = (const_kernel_width_1) * (const_kernel_height_2) *
-                          (cmd_ifm_pack_num_per_row_mns1 + 1) -
-                      1;  //
-  uint64_t conv_ofm_width = (((args.image0.width) - (const_kernel_width_1) +
-                              (const_pad_width_0) + (const_pad_width_0)) /
-                             (const_stride_width_1));
-  uint64_t conv_ofm_dma_length = cmd_channel_num * sizeof(short);  // NOLINT
-  uint64_t conv_ofm_dma_stride = cmd_channel_num * sizeof(short);  // NOLINT
-  uint64_t cmd_image_addr_low = 0;
-  uint64_t cmd_image_addr_high = 0;
-  uint64_t cmd_image_addr_diff = 0;
-
-  if (cmd_filter_vir_base_addr < cmd_image_vir_base_addr) {
-    cmd_image_addr_low = (uint64_t)cmd_filter_vir_base_addr;
-    cmd_image_addr_high = (uint64_t)cmd_image_vir_base_addr;
-  } else {
-    cmd_image_addr_low = (uint64_t)cmd_image_vir_base_addr;
-    cmd_image_addr_high = (uint64_t)cmd_filter_vir_base_addr;
+#ifdef PADDLE_MOBILE_ZU5
+  int ret = 0;
+  uint64_t output_scale = 0;
+
+  uint64_t reg_ActivationArgs = 0;
+  ActivationArgs active_args;
+  active_args.activation_type = args.output.activation.activation_type;
+  active_args.leaky_relu_negative_slope =
+      args.output.activation.leaky_relu_negative_slope;
+  reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
+                       active_args.leaky_relu_negative_slope;
+  DLOG << "    activation_type:" << active_args.activation_type
+       << "    leaky_relu_negative_slope:"
+       << active_args.leaky_relu_negative_slope;
+  DLOG << "    reg_ActivationArgs:" << reg_ActivationArgs;
+
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
+    ret = -EIO;
+    DLOG << "EW Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+    return ret;
   }
 
-  cmd_image_addr_diff = cmd_image_addr_high - cmd_image_addr_low;
-  uint64_t o_ust_rst = 0;
-  uint64_t conv_ofm_dma_repeat =
-      (uint64_t)(((((args.image0.width) - (const_kernel_width_1) +
-                    (const_pad_width_0) + (const_pad_width_0))) /
-                  (const_stride_width_1)) +
-                 1);
-  uint64_t conv_ofm_dma_offset =
-      cmd_channel_num * conv_ofm_dma_repeat * sizeof(short);  // NOLINT
-  uint64_t conv_ofm_inter_stride = conv_ofm_dma_offset * 2;
-  //----------------- register contation ------------------
-  uint64_t cmd_ifm_flt_base_addr =
-      (cmd_image_addr_high << 32) | (cmd_image_addr_low);
-
-  uint64_t cmd_ifm_flt_dim = ((uint64_t)(const_kernel_height_2) << 48) |
-                             ((uint64_t)(const_kernel_width_1) << 32) |
-                             ((uint64_t)(ew_image_height) << 16) |
-                             ((uint64_t)(args.image0.width));
-  uint64_t cmd_pad_step_size = ((uint64_t)(const_stride_height_2) << 48) |
-                               ((uint64_t)(const_stride_width_1) << 32) |
-                               ((uint64_t)(const_pad_height_0) << 16) |
-                               ((uint64_t)(const_pad_width_0));
-  uint64_t cmd_param1 = ((uint64_t)cmd_filter_per_group << 48) |
-                        ((uint64_t)cmd_channel_num << 32) |
-                        ((uint64_t)filter_num << 16) |
-                        ((uint64_t)cmd_group_num);
-  uint64_t cmd_param2 =
-      ((uint64_t)cmd_flt_sqr_len << 48) | ((uint64_t)cmd_ifm_pack_len << 32) |
-      ((uint64_t)cmd_ifm_pre_row_num << 16) | ((uint64_t)cmd_channel_per_group);
-  uint64_t cmd_param3 = ((uint64_t)cmd_flt_batch_num_mns1 << 48) |
-                        ((uint64_t)cmd_flt_total_batch_num << 32) |
-                        ((uint64_t)cmd_flt_N_impl << 16) |
-                        ((uint64_t)cmd_flt_pre_batch_num);
-  uint64_t cmd_param4 = ((uint64_t)cmd_ifm_pack_num_per_row_mns1 << 48) |
-                        ((uint64_t)cmd_bn_num << 32) |
-                        ((uint64_t)cmd_bias_num << 16) |
-                        ((uint64_t)cmd_flt_N_len);
-  uint64_t cmd_param5 = ((uint64_t)cmd_ifm_stride_row_length << 48) |
-                        ((uint64_t)cmd_flt_pack_length << 32) |
-                        ((uint64_t)cmd_flt_cycle_num_mns1 << 16) |
-                        ((uint64_t)cmd_flt_pack_num_per_kernel_mns1);
-  uint64_t cmd_param6 = ((uint64_t)cmd_ofm_width_mns1 << 48) |
-                        ((uint64_t)cmd_ifm_batch_num_mns1 << 32) |
-                        ((uint64_t)cmd_ifm_buf_col_len << 16) |
-                        ((uint64_t)cmd_ifm_C_impl);
-  uint64_t cmd_param7 = ((uint64_t)conv_ofm_inter_stride << 32) |
-                        ((uint64_t)cmd_ifm_buf_col_len_rem << 16) |
-                        ((uint64_t)cmd_ofm_height);
-  uint64_t cmd_param8 =
-      ((uint64_t)cmd_flt_length << 32) | ((uint64_t)cmd_ifm_row_byte_length);
-  uint64_t cmd_ifm_flt_quant_scale = ((uint64_t)cmd_filter_quant_scale << 32) |
-                                     ((uint64_t)cmd_image_quant_scale);
-  uint64_t cmd_step_pad_mul_row_len =
-      ((uint64_t)cmd_pad_h_mul_row_byte_len << 32) |
-      ((uint64_t)cmd_step_h_mul_row_byte_len);
-  //---- ofm paras ----
-  uint64_t cmd_conv_param_reg = ((uint64_t)wParallelsim_num << 32) |
-                                ((uint64_t)wParallelsim << 16) |
-                                ((uint64_t)win_size);
-  uint64_t cmd_ofm_addr_width_reg =
-      ((uint64_t)conv_ofm_width << 32) | ((uint64_t)conv_ofm_addr_base);
-  uint64_t cmd_intra_stride_atoms_reg =
-      ((uint64_t)conv_ofm_dma_length << 32) | ((uint64_t)conv_ofm_dma_stride);
-  uint64_t cmd_user_ctrl_reg = ((uint64_t)o_ust_rst);
-  uint64_t cmd_wdma_param_reg =
-      ((uint64_t)(conv_ofm_dma_repeat | 0x80000000) << 32) |
-      ((uint64_t)conv_ofm_dma_offset);
-  uint64_t cmd_init_raddr_reg = ((cmd_init_raddr_col_1 & 0xffff) << 48) |
-                                ((cmd_init_raddr_col_0 & 0xffff) << 32) |
-                                (((cmd_init_raddr_index & 0xffff) << 16)) |
-                                (cmd_init_raddr_flag & 0xffff) << 15 |
-                                ((cmd_init_raddr_cnt & 0xffff));
-  uint64_t cmd_mult_factor =
-      ((uint64_t)args.const0) | ((uint64_t)args.const1 << 16);
-  uint64_t cmd_para31 = (cmd_para31 & 0x1) | args.relu_enabled;
-
-  DLOG << "cmd_init_raddr_col_1 = " << hex << cmd_init_raddr_col_1;
-  DLOG << "cmd_init_raddr_col_0 = " << hex << cmd_init_raddr_col_0;
-  DLOG << "cmd_init_raddr_index = " << hex << cmd_init_raddr_index;  //
-  DLOG << "cmd_init_raddr_cnt = " << hex << cmd_init_raddr_cnt;
-  DLOG << "cmd_ifm_buf_col_len = " << hex << cmd_ifm_buf_col_len;
-  DLOG << "cmd_ifm_buf_col_len_rem = " << hex << cmd_ifm_buf_col_len_rem;
-  DLOG << "conv_ofm_buf_col_len = " << hex << conv_ofm_buf_col_len;
-  DLOG << "conv_ofm_buf_col_len_rem = " << hex << conv_ofm_buf_col_len_rem;
-  DLOG << "cmd_ifm_flt_base_addr = " << hex << cmd_ifm_flt_base_addr;
-  DLOG << "cmd_scale_base_addr = " << hex << cmd_scale_base_addr;
-  DLOG << "cmd_ifm_flt_dim = " << hex << cmd_ifm_flt_dim;
-  DLOG << "cmd_pad_step_size = " << hex << cmd_pad_step_size;
-  DLOG << "cmd_param1 = " << hex << cmd_param1;
-  DLOG << "cmd_param2 = " << hex << cmd_param2;
-  DLOG << "cmd_param3 = " << hex << cmd_param3;
-  DLOG << "cmd_param4 = " << hex << cmd_param4;
-  DLOG << "cmd_param5 = " << hex << cmd_param5;
-  DLOG << "cmd_param6 = " << hex << cmd_param6;
-  DLOG << "cmd_param7 = " << hex << cmd_param7;
-  DLOG << "cmd_param8 =  " << hex << cmd_param8;
-  DLOG << "cmd_ifm_flt_quant_scale =  " << hex << cmd_ifm_flt_quant_scale;
-  DLOG << "cmd_step_pad_mul_row_len = " << hex << cmd_step_pad_mul_row_len;
-  DLOG << "cmd_ifm_pack_byte_length = " << hex << cmd_ifm_pack_byte_length;
-  DLOG << "cmd_conv_param_reg = " << hex << cmd_conv_param_reg;
-  DLOG << "cmd_ofm_addr_width_reg = " << hex << cmd_ofm_addr_width_reg;
-  DLOG << "cmd_intra_stride_atoms_reg = " << hex << cmd_intra_stride_atoms_reg;
-  DLOG << "cmd_init_raddr_reg = " << hex << cmd_init_raddr_reg;
-  DLOG << "cmd_mult_factor = " << hex << cmd_mult_factor;
-  DLOG << "cmd_wdma_param_reg = " << hex << cmd_wdma_param_reg;
-  DLOG << "cmd_para31 = " << hex << cmd_para31;
-
-  reg_writeq(cmd_ifm_flt_base_addr, MUL8(1));
-  reg_writeq(cmd_scale_base_addr, MUL8(2));
-  reg_writeq(cmd_ifm_flt_dim, MUL8(3));
-  reg_writeq(cmd_pad_step_size, MUL8(4));
-  reg_writeq(cmd_param1, MUL8(5));
-  reg_writeq(cmd_param2, MUL8(6));
-  reg_writeq(cmd_param3, MUL8(7));
-  reg_writeq(cmd_param4, MUL8(8));
-  reg_writeq(cmd_param5, MUL8(9));
-  reg_writeq(cmd_param6, MUL8(10));
-  reg_writeq(cmd_param7, MUL8(11));
-  reg_writeq(cmd_param8, MUL8(12));
-  reg_writeq(cmd_ifm_flt_quant_scale, MUL8(13));
-  reg_writeq(cmd_step_pad_mul_row_len, MUL8(14));
-  reg_writeq(cmd_ifm_pack_byte_length, MUL8(15));
-  reg_writeq(cmd_conv_param_reg, MUL8(16));
-  reg_writeq(cmd_ofm_addr_width_reg, MUL8(17));
-  reg_writeq(cmd_intra_stride_atoms_reg, MUL8(18));
-
-  reg_writeq(cmd_init_raddr_reg, MUL8(29));
-  reg_writeq(cmd_para31, MUL8(31));
-
-  reg_writeq(0, MUL8(19));
-  for (int i = 0; i < height_batch_num + 1; i++) {
-    conv_ofm_height_batch_tmp =
-        int((conv_ofm_height_batch[i] + 1) / 2) - 1;  // NOLINT
-    cmd_ofm_height_batch_reg =
-        ((uint64_t)(conv_ofm_buf_col_len_rem & 0xffff) << 48) |
-        ((uint64_t)(conv_ofm_buf_col_len & 0xffff) << 32) |
-        ((uint64_t)conv_ofm_height_batch_tmp + 0x80000000);
-    reg_writeq(cmd_ofm_height_batch_reg, MUL8(19));
-    reg_writeq(cmd_ofm_height_batch_reg & 0xffffffff00000000, MUL8(19));
-    usleep(1);
+  reg_writeq(reg_ActivationArgs,
+             REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);  // activation function
+
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);
+  reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
+  reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR);
+  reg_writeq(args.driver.datalen, REG_EW_DATA_LEN);
+  reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL);
+  reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW);
+  reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR);
+  reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT);
+  reg_writeq(args.driver.cmd, REG_EW_CMD);
+
+  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
+    g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR;
+    ret = -EIO;
+    DLOG << "EW Wait Irq Timeout!";
+    PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!");
   }
-  reg_writeq(cmd_wdma_param_reg, MUL8(25));
-  DLOG << "cmd_ofm_height_batch_reg = " << hex << cmd_ofm_height_batch_reg;
-
-  /******************************************************************/
-  reg_writeq(cmd_mult_factor, MUL8(30));
-  /******************************************************************/
 
-  reg_writeq(0, MUL8(0));
+  output_scale = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = (output_scale << 32) | (output_scale >> 32);
+  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
+  active_args.activation_type = NONE;
+  reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
 
-  reg_writeq(0x2100000000000000, MUL8(0));
-
-  int ret = fpga_regpoll(MUL8(48), CONV_DONE, 0xffffff);
-  if (ret == -1) {
-    DLOG << "fpga EW no interrupt!!";
-    return ret;
-  }
-  reg_readq(MUL8(63));
-  usleep(10);
-  // get max value
-  float scale = Findfp16Max();
-  (args.output.scale_address)[0] = scale;                 // NOLINT
-  (args.output.scale_address)[1] = (float)(1.0 / scale);  // NOLINT
-  DLOG << "Findfp16Max scale = " << scale;
-
-  DLOG << "ret=" << ret;
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
   return ret;
-}
+#endif
+  return 0;
+}  // ComputeFpgaEWAdd
 
 int PerformBypass(const struct BypassArgs &args) {
 #ifdef FPGA_PRINT_MODE
@@ -1594,75 +572,159 @@ int PerformBypass(const struct BypassArgs &args) {
   DLOG << "   out_address:" << args.output.address
        << "   out_scale_address:" << args.output.scale_address;
 #endif
-#ifndef PADDLE_MOBILE_ZU5
-  return 0;
-#endif
+#ifdef PADDLE_MOBILE_ZU5
+  uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
+  uint64_t output_scale = 0;
+  uint64_t timer_cnt = 0;
+  uint64_t cmd = 0;
+  uint64_t datalen = 0;
+  uint64_t input_address_phy = 0;
+  uint64_t output_address_phy = 0;
+  uint8_t data_cell_in = 0;
+  uint8_t data_cell_out = 0;
+  int ret = 0;
+
+  uint64_t reg_ActivationArgs = 0;
+  ActivationArgs active_args;
+  active_args.activation_type = args.output.activation.activation_type;
+
+  active_args.leaky_relu_negative_slope =
+      args.output.activation.leaky_relu_negative_slope;
+
+  reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
+                       active_args.leaky_relu_negative_slope;
+
+  datalen = (uint64_t)args.image.width * (uint64_t)args.image.height *
+            (uint64_t)args.image.channels;
+  datalen = align_to_x(datalen, 16);
+  input_address_phy = vaddr_to_paddr_driver(args.image.address);
+  output_address_phy = vaddr_to_paddr_driver(args.output.address);
+  DLOG << "input_phy:" << input_address_phy;
+  DLOG << "output_phy:" << output_address_phy;
+
+  switch (args.input_data_type) {
+    case DATA_TYPE_FP16: {
+      switch (args.output_data_type) {
+        case DATA_TYPE_FP16:
+          data_cell_in = SIZE_FP16;
+          data_cell_out = SIZE_FP16;
+          cmd = CMD_FP16_TO_FP16;
+          break;
+
+        case DATA_TYPE_FP32:
+          data_cell_in = SIZE_FP16;
+          data_cell_out = SIZE_FP32;
+          cmd = CMD_FP16_TO_FP32;
+          break;
+
+        default:
+          break;
+      }
+    } break;
 
-  uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
-  uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
-  uint64_t bp_enable;
-  int64_t length;
-  uint64_t pixels;
-
-  // fp32->fp16
-  if ((args.input_data_type) && (!args.output_data_type)) {
-    DLOG << "fp32-fp16";
-    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
-    length = pixels * sizeof(float);
-    bp_enable = 0x8800000000000000UL + (uint64_t)length;
+    case DATA_TYPE_INT8: {
+      if (args.output_data_type != DATA_TYPE_FP16) {
+        DLOG << "error:Output Datetype error,not DATA_TYPE_FP16: "
+             << args.output_data_type;
+      }
+      data_cell_in = SIZE_INT8;
+      data_cell_out = SIZE_FP16;
+      cmd = CMD_INT8_TO_FP16;
+    } break;
+
+    case DATA_TYPE_FP32: {
+      switch (args.output_data_type) {
+        case DATA_TYPE_FP16:
+          data_cell_in = SIZE_FP32;
+          data_cell_out = SIZE_FP16;
+          cmd = CMD_FP32_TO_FP16;
+          break;
+
+        case DATA_TYPE_FP32:
+          data_cell_in = SIZE_FP32;
+          data_cell_out = SIZE_FP32;
+          cmd = CMD_FP32_TO_FP32;
+          break;
+
+        default:
+          break;
+      }
+    } break;
+
+    default:
+      break;
   }
-  // fp16->fp32
-  else if ((!args.input_data_type) && (args.output_data_type)) {  // NOLINT
-    DLOG << "fp16-fp32";
-    pixels = filter::calc_aligned_channel((args.image.channels)) *
-             (args.image.width) * (args.image.height);
-    length = pixels * sizeof(short);       // NOLINT
-    length = align_to_x((int)length, 64);  // NOLINT
-    bp_enable = 0x8a00000000000000UL + length;
+  if (cmd != CMD_FP16_TO_FP16 && cmd != CMD_FP16_TO_FP32 &&
+      cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32 &&
+      cmd != CMD_INT8_TO_FP16) {
+    //   std::cout<< " err back Error1!" <<std::endl;
+    return -EFAULT;
   }
-  // fp16->fp16 findmax
-  else if ((!args.input_data_type) && (!args.output_data_type)) {  // NOLINT
-    DLOG << "16-16";
-    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
-    length = pixels * sizeof(short);  // NOLINT
-    bp_enable = 0x8900000000000000 + length;
-  } else {
-    return -1;
+  if ((data_cell_in != SIZE_FP16 && data_cell_in != SIZE_FP32 &&
+       data_cell_in != SIZE_INT8) ||
+      (data_cell_out != SIZE_FP16 && data_cell_out != SIZE_FP32)) {
+    return -EFAULT;
   }
-  // start bypass
-  reg_writeq(0, MUL8(0));
-  reg_writeq(ifm_src_paddr, MUL8(27));
-  reg_writeq(ifm_dst_paddr, MUL8(28));
-  reg_writeq(bp_enable, MUL8(0));
-  int ret = -1;
-  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffff);
-
-  if (ret != -1) {
-    DLOG << "test done";
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status) {
+    ret = -EIO;
+    DLOG << "Bypass Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+    return ret;
+  }
+  reg_writeq(reg_ActivationArgs,
+             REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);  // activation function
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);
+  reg_writeq(input_address_phy, REG_CONVERT_SRC_ADDR);
+  reg_writeq(output_address_phy, REG_CONVERT_DST_ADDR);
+  reg_writeq(datalen, REG_CONVERT_LENGTH);
+  reg_writeq(cmd, REG_CONVERT_CMD);
+  DLOG << "before reg poll";
+  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) {
+    g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR;
+    ret = -EIO;
+    DLOG << "BYPASS Wait Irq Timeout!";
+    PADDLE_MOBILE_ENFORCE(0, "BYPASS Wait Irq Timeout!");
   }
-  reg_readq(MUL8(63));
-  usleep(10);
-  // get max value
-  float scale = Findfp16Max();
-  args.output.scale_address[0] = scale;                 // NOLINT
-  args.output.scale_address[1] = (float)(1.0 / scale);  // NOLINT
-  DLOG << "ret=" << ret;
+  DLOG << "after reg poll";
+
+  output_scale = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = (output_scale << 32) | (output_scale >> 32);
+  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
+  reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
   return ret;
-}
+#endif
+  return 0;
+}  // PerformBypass
+
+uint64_t FPGAVersion() {  // returns REG_HARDWARE_STATUS, or 0 without ZU5
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============FPGAVersion===========";
+#endif
+#ifdef PADDLE_MOBILE_ZU5
+  uint64_t fpga_ver = 0;
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);  // shared with EW/bypass
+  fpga_ver = reg_readq(REG_HARDWARE_STATUS);
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+  return fpga_ver;
+#endif
+  return 0;  // reached only when PADDLE_MOBILE_ZU5 is not defined
+}  // FPGAVersion
 
 int ComputeFPGAConcat(const struct ConcatArgs &args) {
 #ifdef FPGA_PRINT_MODE
   DLOG << "=============ComputeFpgaConcat===========";
   DLOG << "   Image_num: " << args.image_num
-
        << "   out_address:" << args.image_out
        << "   out_scale_address:" << args.scale_out
        << "   out_channel:" << args.out_channel;
   DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
   for (int i = 0; i < args.image_num; i++) {
     DLOG << "   " << i << "th:        ";
-    DLOG << "   channel_num:" << args.channel_num[i]
-         << "   aligned_channel_num:" << args.aligned_channel_num[i]
+    DLOG << "   channel_num:"
+         << args.channel_num[i]
+         //<< "   aligned_channel_num:" << args.aligned_channel_num[i]
          << "   image_address:" << args.images_in[i]
          << "   image_scale_address:" << args.scales_in[i];
   }
@@ -1670,10 +732,429 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
 
   image::concat_images(args.images_in, args.scales_in, args.image_out,
                        args.scale_out, args.image_num, args.channel_num,
-                       args.height, args.width, args.aligned_channel_num,
-                       args.out_channel);
+                       args.height, args.width);
   return 0;
+}  // ComputeFPGAConcat
+
+void deconv_post_process(const struct DeconvArgs &args) {  // merge sub-conv rows
+  int sub_conv_n = args.sub_conv_num;
+  int sub_height = args.sub_output_height;
+  int sub_width = args.sub_output_width;
+  int omit_size = args.omit_size;  // border dropped on every side
+  int channel = args.filter_num;
+  int num = 1;  // bound of the outer (nn) loop, fixed to 1
+  int origin_h = sub_height * sub_conv_n;  // rows before cropping
+  int origin_w = sub_width * sub_conv_n;
+  int align_origin_w = align_to_x(origin_w * channel, 16);  // source row stride
+  int deconv_h = origin_h - 2 * omit_size;  // rows kept after cropping
+  int deconv_w = origin_w - 2 * omit_size;
+  int deconv_row_len = deconv_w * channel;  // elements copied per row
+  int align_deconv_row_len = align_to_x(deconv_row_len, 16);  // output stride
+
+  for (int idx = 0; idx < sub_conv_n; ++idx) {
+    paddle_mobile::fpga::fpga_invalidate(
+        args.split_conv_args[idx]->output.address,
+        align_origin_w * origin_h * sizeof(int16_t));  // presumably cache sync
+  }
+
+  int deconv_idx = 0;  // write position in args.output, in int16 elements
+  for (int nn = 0; nn < num; ++nn) {
+    for (int hh = 0; hh < origin_h; ++hh) {
+      int hx = (hh % sub_conv_n);  // picks sub-conv sub_conv_n - hx - 1
+      auto sub_t =
+          (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1]  // NOLINT
+                          ->output.address);
+      int hi = (hh / sub_conv_n);  // row index used inside that sub output
+      if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
+      int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
+                  omit_size * channel);  // skips omit_size * channel elements
+      fpga_copy((int16_t *)(args.output.address) + deconv_idx,    // NOLINT
+                sub_t + sidx, sizeof(int16_t) * deconv_row_len);  // NOLINT
+      deconv_idx += align_deconv_row_len;  // advance one aligned output row
+    }
+  }
+  fpga_flush(args.output.address,
+             num * align_deconv_row_len * deconv_h * sizeof(int16_t));
+}
+void DWDeconv_post_process(const struct DWDeconvArgs &args) {  // merge DW rows
+  int sub_conv_n = args.sub_conv_num;
+  int sub_height = args.sub_output_height;
+  int sub_width = args.sub_output_width;
+  int omit_size = args.omit_size;  // border dropped on every side
+  int channel = args.filter_num;
+  int num = 1;  // bound of the outer (nn) loop, fixed to 1
+  int origin_h = sub_height * sub_conv_n;  // rows before cropping
+  int origin_w = sub_width * sub_conv_n;
+  int align_origin_w = align_to_x(origin_w * channel, IMAGE_ALIGNMENT);
+  int deconv_h = origin_h - 2 * omit_size;  // rows kept after cropping
+  int deconv_w = origin_w - 2 * omit_size;
+  int deconv_row_len = deconv_w * channel;  // elements copied per row
+  int align_deconv_row_len = align_to_x(deconv_row_len, IMAGE_ALIGNMENT);
+
+  for (int idx = 0; idx < sub_conv_n; ++idx) {
+    paddle_mobile::fpga::fpga_invalidate(
+        args.dw_conv_args[idx]->output.address,
+        align_origin_w * origin_h * sizeof(int16_t));  // presumably cache sync
+  }
+
+  int deconv_idx = 0;  // write position in args.output, in int16 elements
+  for (int nn = 0; nn < num; ++nn) {
+    for (int hh = 0; hh < origin_h; ++hh) {
+      int hx = (hh % sub_conv_n);  // picks sub-conv sub_conv_n - hx - 1
+      auto sub_t = (int16_t *)(args.dw_conv_args[sub_conv_n - hx - 1]  // NOLINT
+                                   ->output.address);
+      int hi = (hh / sub_conv_n);  // row index used inside that sub output
+      if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
+      int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
+                  omit_size * channel);  // skips omit_size * channel elements
+      fpga_copy((int16_t *)(args.output.address) + deconv_idx,    // NOLINT
+                sub_t + sidx, sizeof(int16_t) * deconv_row_len);  // NOLINT
+      deconv_idx += align_deconv_row_len;  // advance one aligned output row
+    }
+  }
+  fpga_flush(args.output.address,
+             num * align_deconv_row_len * deconv_h * sizeof(int16_t));
 }
 
+// Runs every sub-convolution of a split deconvolution via ComputeFpgaConv().
+// When there is more than one sub-convolution, the scale pair of the one
+// with the largest scale[0] is copied to args.output.scale_address.
+// deconv_post_process() is NOT invoked here (see the note near the end).
+// The results of ComputeFpgaConv() are not checked.
+// @return always 0.
+int ComputeFpgaDeconv(const struct DeconvArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFPGADeConv===========";
+  DLOG << "   filter_num:" << args.filter_num
+       << "   group_num:" << args.group_num << "omit_size:" << args.omit_size
+       << "sub_output_width: " << args.sub_output_width
+       << "sub_output_height: " << args.sub_output_height
+       << "   sub_conv_num:" << args.sub_conv_num;
+  DLOG << "args.output.address: " << args.output.address
+       << "args.output.scale_address: " << args.output.scale_address;
+
+#endif
+
+  int sub_conv_num = args.sub_conv_num;
+
+#ifdef COST_TIME_PRINT
+  timeval start, end;
+  long dif_sec, dif_usec;  // NOLINT
+#endif
+
+  for (int i = 0; i < sub_conv_num; i++) {
+#ifdef COST_TIME_PRINT
+    gettimeofday(&start, NULL);
+#endif
+
+    ComputeFpgaConv(*args.split_conv_args[i]);
+#ifdef COST_TIME_PRINT
+    gettimeofday(&end, NULL);
+    dif_sec = end.tv_sec - start.tv_sec;
+    dif_usec = end.tv_usec - start.tv_usec;
+    std::cout << "deconv basic_conv: " << i << " times:  "
+              << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+              << std::endl;
+#endif
+  }
+
+  if (sub_conv_num > 1) {
+    float max_scale = -1.0f;
+#ifdef COST_TIME_PRINT
+    gettimeofday(&start, NULL);
+#endif
+    // Keep the scale pair of the sub-conv with the largest scale[0].
+    for (int i = 0; i < sub_conv_num; i++) {
+      paddle_mobile::fpga::fpga_invalidate(
+          args.split_conv_args[i]->output.scale_address, 2 * sizeof(float));
+      float ptr_scale = (args.split_conv_args[i]->output.scale_address)[0];
+      if (ptr_scale > max_scale) {
+        max_scale = ptr_scale;  // without this the last sub-conv always won
+        args.output.scale_address[0] = ptr_scale;
+        args.output.scale_address[1] =
+            (args.split_conv_args[i]->output.scale_address)[1];
+      }
+    }
+
+#ifdef COST_TIME_PRINT
+    gettimeofday(&end, NULL);
+    dif_sec = end.tv_sec - start.tv_sec;
+    dif_usec = end.tv_usec - start.tv_usec;
+    std::cout << "deconv scale  "
+              << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+              << std::endl;
+#endif
+
+    // NOTE: the merged scale is not flushed with fpga_flush() and
+    // deconv_post_process(args) is not called; both calls were left
+    // disabled (commented out) in the original version of this function.
+  }
+
+  return 0;
+}  // ComputeFpgaDeconv
+
+int ComputeFPGASplit(const struct SplitArgs &args) {  // wraps split_image
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFpgaSplit===========";
+  DLOG << "   Image_num: " << args.image_num
+       << "   in_address:" << args.image_in
+       << "   in_scale_address:" << args.scale_in;
+  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
+  for (int i = 0; i < args.image_num; i++) {  // one log entry per output image
+    DLOG << "   " << i << "th:        ";
+    DLOG << "   channel_num:" << args.out_channel_nums[i]
+         << "   image_address:" << args.images_out[i]
+         << "   image_scale_address:" << args.scales_out[i];
+  }
+#endif
+  image::split_image(args.image_in, args.scale_in, args.images_out,
+                     args.scales_out, args.image_num, args.out_channel_nums,
+                     args.height, args.width);
+  return 0;  // always 0
+}  // ComputeFPGASplit
+int ComputeDWConv(const struct DWconvArgs &args) {  // one DWConv pass on FPGA
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeDWConv===========";
+  // DLOG << "   mode:" << args.relu_enabled;
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   filter_address:" << args.filter_address
+       << "   bias_address:" << args.bias_address;
+  DLOG << "   kernel_height:" << args.kernel.height
+       << "   kernel_width:" << args.kernel.width
+       << "   stride_h:" << args.kernel.stride_h
+       << "   stride_w:" << args.kernel.stride_w;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+#ifdef PADDLE_MOBILE_ZU5
+  DLOG << "DWConv";
+  // return 0;
+  uint64_t output_scale = 0;
+  uint64_t timer_cnt = 0;  // not used below
+  int ret = 0;  // 0 on success, -EIO on PE error / poll timeout
+  // uint64_t cmd = args.relu_enabled;
+  uint64_t cmd = 0;
+  uint64_t image_physical_address = 0;
+  uint64_t output_physical_address = 0;
+  uint64_t filter_physical_address = 0;
+  uint64_t bias_physical_address = 0;
+
+  image_physical_address = vaddr_to_paddr(args.image.address);
+  output_physical_address = vaddr_to_paddr(args.output.address);
+  filter_physical_address = vaddr_to_paddr(args.filter_address);
+  bias_physical_address = vaddr_to_paddr(args.bias_address);
+  uint64_t filter_N_align =
+      align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT);
+  uint64_t filter_amount_per_row_align =
+      filter_N_align * (uint64_t)args.kernel.width;
+  uint64_t sub_filter_amount_align = filter_N_align *
+                                     (uint64_t)args.kernel.width *
+                                     (uint64_t)args.kernel.height;
+  uint64_t filter_amount_align =
+      sub_filter_amount_align * (uint64_t)args.sub_conv_num;
+
+  uint32_t output_height = (uint32_t)(
+      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
+          args.kernel.stride_h +
+      1);  // (H + 2 * pad - kernel) / stride + 1
+  uint32_t output_width = (uint32_t)(
+      ((args.image.width + args.image.pad_width * 2 - args.kernel.width) /
+           args.kernel.stride_w +
+       1) *
+      args.sub_conv_num);  // per-sub-conv width times sub_conv_num
+
+  uint64_t image_amount_per_row =
+      align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
+                 IMAGE_ALIGNMENT);
+  uint64_t image_one_pad_per_row =
+      align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
+                 FILTER_ELEMENT_ALIGNMENT) +
+      (uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
+  uint64_t image_two_pad_per_row = align_to_x(
+      ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) *
+          (uint64_t)args.image.channels,
+      IMAGE_ALIGNMENT);
+  uint64_t image_row_mul_pooling_hight =
+      image_amount_per_row * (uint64_t)args.kernel.height;
+  uint64_t image_row_mul_pad_hight =
+      image_amount_per_row * (uint64_t)args.image.pad_height;
+  uint64_t image_row_mul_step_hight =
+      image_amount_per_row * (uint64_t)args.kernel.stride_h;
+  uint64_t result_amount_align_32 =
+      align_to_x((uint64_t)output_width * (uint64_t)args.image.channels,
+                 FILTER_ELEMENT_ALIGNMENT);
+  uint64_t result_amount_align_64 = align_to_x(
+      (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT);
+  uint64_t image_calcu_height =
+      (uint64_t)args.kernel.height +
+      ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h;
+  uint64_t image_pad_left = args.image.channels * args.image.pad_width;
+  uint64_t image_skip_window = args.image.channels * args.kernel.stride_w;
+
+  uint64_t image_padleft_skipwindow =
+      (image_skip_window << 32) | image_pad_left;  // high: skip, low: left pad
+
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);  // held until register I/O ends
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
+    ret = -EIO;  // the pooling PE slot is already marked ERROR
+    DLOG << "Conv Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+    return ret;
+  }
+
+  /*restart scale*/
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);  // output_scale is still 0
+
+  reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
+  reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
+  reg_writeq((bias_physical_address << 32 | filter_physical_address),
+             REG_DWCONV_FILTER_BASE_ADDR);  // bias in high word, filter in low
+  reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32),
+             REG_DWCONV_FILTER_SHAPE);
+  reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32),
+             REG_DWCONV_FILTER_SUBNUMBER);
+  reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN);
+
+  reg_writeq(
+      ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
+      REG_POOLING_IMAGE_PIXEL);
+  reg_writeq(
+      ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
+      REG_POOLING_WINDOW_SIZE);
+
+  reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32),
+             REG_POOLING_RESULT_PIXEL);
+
+  reg_writeq(((uint64_t)args.image.pad_height) |
+                 (((uint64_t)args.image.pad_width) << 32),
+             REG_POOLING_PAD_PIXEL);
+  reg_writeq(((uint64_t)args.kernel.stride_h) |
+                 (((uint64_t)args.kernel.stride_w) << 32),
+             REG_POOLING_STEP_PIXEL);
+
+  reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER);
+
+  reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW);
+  reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW);
+  reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW);
+
+  reg_writeq(image_row_mul_pooling_hight,
+             REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT);
+  reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT);
+  reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT);
+
+  reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32);
+  reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64);
+
+  reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT);
+
+  reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW);
+
+  /* The SDK flushes the cache to guarantee data consistency */
+
+  reg_writeq(cmd, REG_DWCONV_CMD);  // cmd == 0; presumably starts the PE
+
+  DLOG << "before reg poll";
+  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
+    g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;  // seen by next call
+    ret = -EIO;
+    DLOG << "Pooling Wait Irq Timeout!";
+    PADDLE_MOBILE_ENFORCE(0, "DWConv Wait Irq Timeout");
+  }
+  DLOG << "after reg poll";
+
+  // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = (output_scale << 32) | (output_scale >> 32);  // swap 32-bit halves
+  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
+  DLOG << "output_scale:" << output_scale;
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+  return ret;
+#endif
+  return 0;  // reached only when PADDLE_MOBILE_ZU5 is not defined
+}
+// Runs each depthwise sub-convolution; with several of them, publishes the
+// scale pair with the largest scale[0]; then calls DWDeconv_post_process().
+int ComputeDWDeconv(const struct DWDeconvArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeDWDeconv===========";
+  DLOG << "   filter_num:" << args.filter_num
+       << "   group_num:" << args.group_num << "omit_size:" << args.omit_size
+       << "sub_output_width: " << args.sub_output_width
+       << "sub_output_height: " << args.sub_output_height
+       << "   sub_conv_num:" << args.sub_conv_num;
+  DLOG << "args.output.address: " << args.output.address
+       << "args.output.scale_address: " << args.output.scale_address;
+#endif
+
+  int sub_conv_num = args.sub_conv_num;
+
+#ifdef COST_TIME_PRINT
+  timeval start, end;
+  long dif_sec, dif_usec;  // NOLINT
+#endif
+
+  for (int i = 0; i < sub_conv_num; i++) {
+#ifdef COST_TIME_PRINT
+    gettimeofday(&start, NULL);
+#endif
+    ComputeDWConv(*args.dw_conv_args[i]);  // return value is not checked
+#ifdef COST_TIME_PRINT
+    gettimeofday(&end, NULL);
+    dif_sec = end.tv_sec - start.tv_sec;
+    dif_usec = end.tv_usec - start.tv_usec;
+    std::cout << "deconv basic_conv: " << i << " times:  "
+              << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+              << std::endl;
+#endif
+  }
+
+  if (sub_conv_num > 1) {
+    float max_scale = -1.0f;
+#ifdef COST_TIME_PRINT
+    gettimeofday(&start, NULL);
+#endif
+    for (int i = 0; i < sub_conv_num; i++) {
+      paddle_mobile::fpga::fpga_invalidate(
+          args.dw_conv_args[i]->output.scale_address, 2 * sizeof(float));
+      float ptr_scale = (args.dw_conv_args[i]->output.scale_address)[0];
+      if (ptr_scale > max_scale) {
+        max_scale = ptr_scale;  // without this the last sub-conv always won
+        args.output.scale_address[0] = ptr_scale;
+        args.output.scale_address[1] =
+            (args.dw_conv_args[i]->output.scale_address)[1];
+      }
+    }
+#ifdef COST_TIME_PRINT
+    gettimeofday(&end, NULL);
+    dif_sec = end.tv_sec - start.tv_sec;
+    dif_usec = end.tv_usec - start.tv_usec;
+    std::cout << "deconv scale  "
+              << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+              << std::endl;
+#endif
+  }
+
+#ifdef COST_TIME_PRINT
+  gettimeofday(&start, NULL);
+#endif
+  DWDeconv_post_process(args);  // runs even when sub_conv_num is 1
+#ifdef COST_TIME_PRINT
+  gettimeofday(&end, NULL);
+  dif_sec = end.tv_sec - start.tv_sec;
+  dif_usec = end.tv_usec - start.tv_usec;
+  std::cout << "deconv_post_process  "
+            << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+            << std::endl;
+#endif
+  return 0;  // always 0
+}  // ComputeDWDeconv
+
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h
index bc882bed5e..8711f239f5 100644
--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -27,6 +27,14 @@ limitations under the License. */
 #define BIAS_NUM_ALIGNMENT (16)
 #define ROW_PARALLEL_NUM (3)
 #endif
+#ifdef PADDLE_MOBILE_FPGA_V2
+#define IMAGE_ALIGNMENT (32)           // Aligned to 32
+#define FILTER_NUM_ALIGNMENT (32)      // Filter number aligned to 32
+#define FILTER_ELEMENT_ALIGNMENT (16)  // Filter element number aligned to 16
+#define BS_NUM_ALIGNMENT (8)
+#define BIAS_NUM_ALIGNMENT (16)
+#define ROW_PARALLEL_NUM (3)
+#endif
 
 namespace paddle_mobile {
 namespace fpga {
@@ -80,7 +88,8 @@ struct ImageOutputArgs {
       activation;  // To select activation and specify (Leaky)Relu parameter.
 };
 
-#ifdef PADDLE_MOBILE_FPGA_V1
+// #ifdef PADDLE_MOBILE_FPGA_V1
+#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
 struct ConvDriverParam {
   uint64_t image_address_phy;
   uint64_t filter_address_phy;
@@ -146,11 +155,8 @@ struct ConvArgs {
   struct ImageInputArgs image;  // input image;
   struct ImageOutputArgs output;
 
-#ifdef PADDLE_MOBILE_FPGA_V2
-  void* free_space;  // used by FPGA logic
-#endif
-
-#ifdef PADDLE_MOBILE_FPGA_V1
+// #ifdef PADDLE_MOBILE_FPGA_V1
+#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
   struct DeconvTxParm deconv_tx_param;
   struct ConvDriverParam driver;
 #endif
@@ -208,7 +214,10 @@ struct EWAddArgs {
   struct ImageInputArgs image0;
   struct ImageInputArgs image1;
   struct ImageOutputArgs output;
-#ifdef PADDLE_MOBILE_FPGA_V1
+  std::vector<float> image_in_quantVal;
+  std::vector<float> image_out_quantVal;
+// #ifdef PADDLE_MOBILE_FPGA_V1
+#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
   struct EWAddDriverParam driver;
 #endif
 };
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index f1139ae4b9..2364dfdb19 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -68,7 +68,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
   // resize feed and fetch list
   // should init feed and fetch variables before infer shape
   InitFeedFetchList();
-
   const auto &blocks = program_desc_->Blocks();
   std::shared_ptr<BlockDesc> block_desc = blocks[0];
   std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
@@ -86,6 +85,9 @@ Executor<Device, T>::Executor(const Program<Device> &program,
     }
     ops_of_block0_.push_back(op_handler);
   }
+#ifdef PADDLE_MOBILE_FPGA_V2
+  InitQuantMemory();
+#endif
   if (program_.combined) {
     InitCombineMemory();
   } else {
@@ -626,8 +628,74 @@ template <typename Device, typename T>
 void Executor<Device, T>::Predict_To(int end) {
   Predict_From_To(0, end);
 }
-#endif
+#ifdef PADDLE_MOBILE_FPGA_V2
+std::map<std::string, float> LoadQuantValFromFile(std::string filename) {
+  // Parses "<name> : <value>" lines of `filename` into a name -> value map.
+  std::map<std::string, float> quantValList;
+  std::ifstream in(filename, std::ios::in);
+  if (!in.is_open()) {
+    std::cout << "open File Failed." << std::endl;
+    exit(-1);
+  }
 
+  const std::string splitStr = " : ";
+  std::string line;
+  while (getline(in, line)) {
+    const std::string::size_type pos = line.find(splitStr);
+    // Skip blank/malformed lines: with npos the old substr() offset wrapped
+    // around and could throw std::out_of_range or insert a garbage entry.
+    if (pos == std::string::npos) continue;
+    const std::string name = line.substr(0, pos);
+    const std::string value = line.substr(pos + splitStr.size());
+    quantValList.insert(std::make_pair(name, atof(value.c_str())));
+  }
+  return quantValList;  // the stream is closed by its destructor
+}
+
+template <typename Device, typename T>
+void Executor<Device, T>::InitQuantMemory() {  // sets tensor->scale[0] per var
+  std::string quantValFilePath;
+  if (program_.combined) {
+    quantValFilePath = program_.para_path;
+    quantValFilePath =
+        quantValFilePath.substr(0, (quantValFilePath.length() - 6));
+    quantValFilePath = quantValFilePath + "scale";  // last 6 chars -> "scale"
+  } else {
+    quantValFilePath = program_.model_path + "/scale";
+  }
+  std::map<std::string, float> quantValList =
+      LoadQuantValFromFile(quantValFilePath);
+  auto ops = ops_of_block0_;  // by-value copy of the op list
+  for (int id = 0; id < ops.size(); id++) {
+    auto op = ops[id];
+    auto input_keys = op->GetInputKeys();
+    auto inputs = op->Inputs();
+    for (auto key = input_keys.begin(); key != input_keys.end(); key++) {
+      auto inputs_vars = inputs[*key];  // variable names bound to this key
+      int count = inputs_vars.size();
+      for (int i = 0; i < count; i++) {
+        auto tensor = GetTensorByName(inputs_vars[i]);
+        tensor->scale[0] = quantValList[inputs_vars[i]];  // 0 when absent
+        std::cout << "input variance name : " << inputs_vars[i]
+                  << ", scale value : " << tensor->scale[0] << std::endl;
+      }
+    }
+    auto output_keys = op->GetOutKeys();
+    auto outputs = op->Outputs();
+    for (auto key = output_keys.begin(); key != output_keys.end(); key++) {
+      auto outputs_vars = outputs[*key];  // variable names bound to this key
+      int count = outputs_vars.size();
+      for (int i = 0; i < count; i++) {
+        auto tensor = GetTensorByName(outputs_vars[i]);
+        tensor->scale[0] = quantValList[outputs_vars[i]];  // 0 when absent
+        std::cout << "output variance name : " << outputs_vars[i]
+                  << ", scale value : " << tensor->scale[0] << std::endl;
+      }
+    }
+  }
+}
+#endif
+#endif
 #ifdef PADDLE_MOBILE_CL
 template <>
 void Executor<GPU_CL, float>::InitNoPersistableMemory(
diff --git a/src/framework/executor.h b/src/framework/executor.h
index fa589880c1..78a4bd61dd 100644
--- a/src/framework/executor.h
+++ b/src/framework/executor.h
@@ -64,6 +64,9 @@ class Executor {
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
   void Predict_To(int end);
+#ifdef PADDLE_MOBILE_FPGA_V2
+  void InitQuantMemory();
+#endif
 #endif
 
  protected:
diff --git a/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp b/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp
new file mode 100644
index 0000000000..6046b3d2f0
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp
@@ -0,0 +1,87 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ANCHOR_GENERATOR_OP
+
+#include <string.h>
+#include <iostream>
+#include <utility>
+#include <vector>
+#include "operators/kernel/detection_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool AnchorGeneratorKernel<FPGA, float>::Init(
+    AnchorGeneratorParam<FPGA> *param) {  // fills output_anchors_ once
+  auto input = param->input_;
+  auto anchors = param->output_anchors_;
+  auto anchor_ptr = anchors->mutable_data<float>();
+  auto stride = param->stride_;
+  auto feature_width = input->dims()[3], feature_height = input->dims()[2];
+  auto stride_width = stride[0], stride_height = stride[1];
+  auto offset = param->offset_;  // only used to choose the offset table
+
+  int anchors_offset[] = {-2,  -2,   18,   18,  -10, -9,   26,   25,   -23,
+                          -20, 39,   36,   -43, -34, 59,   49,   -63,  -54,
+                          79,  69,   -96,  -77, 112, 93,   -137, -118, 153,
+                          134, -204, -188, 220, 204, -281, -395, 296,  441};
+
+  int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103,
+                           0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58,
+                           0, 0, 34, 68, 0, 0, 24, 28, 0, 0, 19, 46};
+
+  if (offset > 0.6) {  // switch to the second table (both hold 36 ints)
+    memcpy(anchors_offset, anchors_offset2, sizeof(anchors_offset));
+    std::cout << "anchor generator marker" << std::endl;
+  } else {
+    std::cout << "anchor generator rfcn" << std::endl;
+  }
+  int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4);  // 36 / 4 = 9
+
+  //  DLOG << "feature_height: " << feature_height;
+  //  DLOG << "feature_width: " << feature_width;
+  //  DLOG << "num_anchors: " << num_anchors;
+  //  DLOG << "stride_width: " << stride_width;
+  //  DLOG << "stride_height: " << stride_height;
+
+  for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
+    int offset0 = h_idx * feature_width * num_anchors * 4;  // start of row h_idx
+    for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
+      int offset1 = w_idx * num_anchors * 4;  // start of cell w_idx in the row
+      for (int idx = 0; idx < num_anchors; idx++) {
+        int offset = offset0 + offset1 + idx * 4;  // shadows the outer offset
+        anchor_ptr[offset + 0] =
+            anchors_offset[idx * 4 + 0] + w_idx * stride_width;
+        anchor_ptr[offset + 1] =
+            anchors_offset[idx * 4 + 1] + h_idx * stride_height;
+        anchor_ptr[offset + 2] =
+            anchors_offset[idx * 4 + 2] + w_idx * stride_width;
+        anchor_ptr[offset + 3] =
+            anchors_offset[idx * 4 + 3] + h_idx * stride_height;
+      }
+    }
+  }
+  return true;  // no failure path in this function
+}
+
+template <>
+void AnchorGeneratorKernel<FPGA, float>::Compute(
+    const AnchorGeneratorParam<FPGA> &param) {}  // intentionally empty body
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // ANCHOR_GENERATOR_OP
diff --git a/src/operators/kernel/fpga/V2/concat_kernel.cpp b/src/operators/kernel/fpga/V2/concat_kernel.cpp
index 7f9ab66d48..7690f41ad3 100644
--- a/src/operators/kernel/fpga/V2/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/concat_kernel.cpp
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef CONCAT_OP
 
 #include "operators/kernel/concat_kernel.h"
-#include "fpga/V2/api.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -31,45 +30,36 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
       (float **)fpga::fpga_malloc(image_num * sizeof(float *));  // NOLINT
   auto channel_num =
       (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t));  // NOLINT
-  auto aligned_channel_num =
-      (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t));  // NOLINT
 
   auto height = inputs[0]->dims()[2];
   auto width = inputs[0]->dims()[3];
-  auto out_channel =
-      (uint32_t)fpga::get_aligned_channel_num((int)out->dims()[1]);  // NOLINT
   for (int i = 0; i < image_num; i++) {
     auto input = inputs[i];
     PADDLE_MOBILE_ENFORCE(
         input->dims()[2] == height && input->dims()[3] == width,
         "Image height & width should be unified");
-    images_in[i] = (half *)input->data<float>();  // NOLINT
-    channel_num[i] = (uint32_t)inputs[i]->dims()[1];
-    aligned_channel_num[i] =
-        (uint32_t)fpga::get_aligned_channel_num(channel_num[i]);
+    images_in[i] = input->data<half>();
+    channel_num[i] = (uint32_t)inputs[i]->dims()[1];  // NOLINT
     scales_in[i] = input->scale;
   }
-  fpga::format_concat_output(out, (int)height, (int)width,  // NOLINT
-                             out_channel);
+  fpga::format_concat_output(out, height, width, image_num, channel_num);
 
   fpga::ConcatArgs concatArgs = {0};
-  concatArgs.image_num = (uint32_t)image_num;
+  concatArgs.image_num = image_num;
   concatArgs.images_in = images_in;
   concatArgs.scales_in = scales_in;
-  concatArgs.image_out = (half *)out->data<float>();  // NOLINT
+  concatArgs.image_out = out->data<half>();
   concatArgs.scale_out = out->scale;
   concatArgs.channel_num = channel_num;
-  concatArgs.aligned_channel_num = aligned_channel_num;
-  concatArgs.out_channel = out_channel;
-  concatArgs.height = (uint32_t)height;
-  concatArgs.width = (uint32_t)width;
+  concatArgs.height = height;
+  concatArgs.width = width;
   param->SetFpgaArgs(concatArgs);
   return true;
 }
 
 template <>
 void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) {
-  fpga::ComputeFPGAConcat(param.FpgaArgs());
+  ComputeFPGAConcat(param.FpgaArgs());
 }
 template class ConcatKernel<FPGA, float>;
 
diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
index a529a98719..c052805dfd 100644
--- a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
@@ -22,12 +22,15 @@ namespace operators {
 
 template <>
 bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
-  bool relu_enabled = false;
-  auto input = const_cast<Tensor *>(param->Input());
+  // bool relu_enabled = false;
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::NONE;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->Input());
 
   auto bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
 
   auto out = param->Output();
 
@@ -56,18 +59,18 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
     bs_ptr[i + channel] = new_scale_ptr[i];
     bs_ptr[i] = new_bias_ptr[i];
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);
 
   fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
-
   fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
-                       param->Groups(), param->Strides()[0],
-                       param->Strides()[1], param->Paddings()[0],
-                       param->Paddings()[1], bs_ptr);
+  fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
+                       leaky_relu_negative_slope, param->Groups(),
+                       param->Strides()[0], param->Strides()[1],
+                       param->Paddings()[0], param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
 
+  delete new_scale;
+  delete new_bias;
+
   return true;
 }
 
diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
index c9cf04fe67..a7a93de9ba 100644
--- a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
@@ -23,12 +23,18 @@ namespace operators {
 template <>
 bool ConvAddBNReluKernel<FPGA, float>::Init(
     FusionConvAddBNReluParam<FPGA> *param) {
-  bool relu_enabled = true;
-  auto input = const_cast<Tensor *>(param->Input());
-  const Tensor *bias = param->Bias();
+  // bool relu_enabled = true;
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::LEAKYRELU;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->Input());
+  auto bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
+
+  vector<int> paddings = param->Paddings();
+  vector<int> strides = param->Strides();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
   auto bn_scale_ptr = param->InputScale()->data<float>();
@@ -40,7 +46,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
 
   const int channel = out->dims()[1];
   auto bs_ptr =
-      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+      reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
   auto new_scale = new Tensor();
   auto new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
@@ -51,27 +57,41 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
                        static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
     new_bias_ptr[i] =
         bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
-    bs_ptr[i + 2] = new_scale_ptr[i];
+    bs_ptr[i + channel] = new_scale_ptr[i];
     bs_ptr[i] = new_bias_ptr[i];
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);
-
-  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
 
-  fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
-                       param->Groups(), param->Strides()[0],
-                       param->Strides()[1], param->Paddings()[0],
-                       param->Paddings()[1], bs_ptr);
-  param->SetFpgaArgs(conv_arg);
+  const int groups = param->Groups();
+  if (groups == channel) {
+    fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
+    fpga::DWconvArgs dwconv_arg = {0};
+    fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable,
+                          leaky_relu_negative_slope, strides[0], strides[1],
+                          paddings[0], paddings[1], new_bias_ptr);
+    param->SetFpgaArgs(dwconv_arg);
+    fpga::fpga_free(new_scale_ptr);
+    fpga::fpga_free(bs_ptr);
+  } else {
+    fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
+    fpga::SplitConvArgs conv_arg = {0};
+    fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
+                         leaky_relu_negative_slope, param->Groups(), strides[0],
+                         strides[1], paddings[0], paddings[1], bs_ptr);
+    param->SetFpgaArgs(conv_arg);
+    delete new_scale;
+    delete new_bias;
+  }
   return true;
 }
 
 template <>
 void ConvAddBNReluKernel<FPGA, float>::Compute(
     const FusionConvAddBNReluParam<FPGA> &param) {
-  fpga::ComputeFpgaConv(param.FpgaArgs());
+  if (param.Groups() == param.Output()->dims()[1]) {  // same test as Init()
+    fpga::ComputeDWConv(param.FpgaDwconvArgs());
+  } else {
+    fpga::ComputeFpgaConv(param.FpgaArgs());
+  }
 }
 
 }  // namespace operators
diff --git a/src/operators/kernel/fpga/V2/conv_add_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
index e9c5032779..da16af58f1 100644
--- a/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
@@ -21,11 +21,14 @@ namespace operators {
 
 template <>
 bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
-  bool relu_enabled = false;
-  auto input = const_cast<Tensor *>(param->Input());
+  // bool relu_enabled = false;
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::NONE;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
 
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
@@ -39,12 +42,11 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
   }
 
   fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
-
   fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
-                       param->Groups(), param->Strides()[0],
-                       param->Strides()[1], param->Paddings()[0],
-                       param->Paddings()[1], bs_ptr);
+  fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
+                       leaky_relu_negative_slope, param->Groups(),
+                       param->Strides()[0], param->Strides()[1],
+                       param->Paddings()[0], param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
   return true;
 }
diff --git a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
index 1002a35843..f1f61da421 100644
--- a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
@@ -21,11 +21,14 @@ namespace operators {
 
 template <>
 bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
-  bool relu_enabled = true;
-  auto input = const_cast<Tensor *>(param->Input());
+  // bool relu_enabled = true;
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::LEAKYRELU;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
 
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
@@ -39,12 +42,11 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
   }
 
   fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
-
   fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
-                       param->Groups(), param->Strides()[0],
-                       param->Strides()[1], param->Paddings()[0],
-                       param->Paddings()[1], bs_ptr);
+  fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
+                       leaky_relu_negative_slope, param->Groups(),
+                       param->Strides()[0], param->Strides()[1],
+                       param->Paddings()[0], param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
   return true;
 }
diff --git a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
index 77b6a9a535..431a9f9ac5 100644
--- a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
@@ -22,10 +22,16 @@ namespace operators {
 
 template <>
 bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
-  bool relu_enabled = false;
-  auto input = const_cast<Tensor *>(param->Input());
-  auto filter = const_cast<Tensor *>(param->Filter());
+  // bool relu_enabled = false;
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::NONE;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->Input());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
   auto bn_scale_ptr = param->InputScale()->data<float>();
@@ -45,20 +51,21 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
     new_scale_ptr[i] = bn_scale_ptr[i] /
                        static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
     new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
-    bs_ptr[i + channel] = new_scale_ptr[i];
-    bs_ptr[i] = new_bias_ptr[i];
+    // bs_ptr[i + channel] = new_scale_ptr[i];
+    // bs_ptr[i] = new_bias_ptr[i];
+    bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
+    bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);
 
   fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
-
   fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
-                       param->Groups(), param->Strides()[0],
-                       param->Strides()[1], param->Paddings()[0],
-                       param->Paddings()[1], bs_ptr);
+  fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
+                       leaky_relu_negative_slope, param->Groups(),
+                       param->Strides()[0], param->Strides()[1],
+                       param->Paddings()[0], param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
+  delete new_scale;
+  delete new_bias;
   return true;
 }
 
diff --git a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
index c20dfb8911..856b23ac38 100644
--- a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
@@ -16,17 +16,20 @@ limitations under the License. */
 
 #include "operators/kernel/conv_bn_relu_kernel.h"
 #include <cmath>
-#include "fpga/V2/filter.h"
-
 namespace paddle_mobile {
 namespace operators {
-
 template <>
 bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
-  bool relu_enabled = true;
-  auto input = const_cast<Tensor *>(param->Input());
-  auto filter = const_cast<Tensor *>(param->Filter());
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::LEAKYRELU;
+  int16_t leaky_relu_negative_slope = 0;
+  const int groups = param->Groups();
+  auto input = const_cast<LoDTensor *>(param->Input());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
   auto bn_scale_ptr = param->InputScale()->data<float>();
@@ -41,32 +44,49 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   auto new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
   auto new_bias_ptr = new_bias->mutable_data<float>({channel});
-
   for (int i = 0; i < channel; i++) {
     new_scale_ptr[i] = bn_scale_ptr[i] /
                        static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
     new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
-    bs_ptr[i + channel] = new_scale_ptr[i];
-    bs_ptr[i] = new_bias_ptr[i];
+    // bs_ptr[i + channel] = new_scale_ptr[i];
+    // bs_ptr[i] = new_bias_ptr[i];
+    bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
+    bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
+    if (groups == channel) {
+      new_scale_ptr[i] = new_scale_ptr[i] * Si / So;
+      new_bias_ptr[i] = new_bias_ptr[i] * 127.0 / So;
+    }
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);
-
-  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
-
-  fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
-                       param->Groups(), param->Strides()[0],
-                       param->Strides()[1], param->Paddings()[0],
-                       param->Paddings()[1], bs_ptr);
-  param->SetFpgaArgs(conv_arg);
+  if (groups == channel) {
+    fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
+    fpga::DWconvArgs dwconv_arg = {0};
+    fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable,
+                          leaky_relu_negative_slope, param->Strides()[0],
+                          param->Strides()[1], param->Paddings()[0],
+                          param->Paddings()[1], new_bias_ptr);
+    param->SetFpgaArgs(dwconv_arg);
+  } else {
+    fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
+    fpga::SplitConvArgs conv_arg = {0};
+    fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
+                         leaky_relu_negative_slope, param->Groups(),
+                         param->Strides()[0], param->Strides()[1],
+                         param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(conv_arg);
+  }
+  delete new_scale;
+  delete new_bias;
   return true;
 }
 
 template <>
 void ConvBNReluKernel<FPGA, float>::Compute(
     const FusionConvBNReluParam<FPGA> &param) {
-  fpga::ComputeFpgaConv(param.FpgaArgs());
+  if (param.Groups() == param.Output()->dims()[1]) {  // DW path, see Init()
+    fpga::ComputeDWConv(param.FpgaDwconvArgs());
+  } else {
+    fpga::ComputeFpgaConv(param.FpgaArgs());
+  }
 }
 
 }  // namespace operators
diff --git a/src/operators/kernel/fpga/V2/conv_kernel.cpp b/src/operators/kernel/fpga/V2/conv_kernel.cpp
new file mode 100644
index 0000000000..c981c38b23
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/conv_kernel.cpp
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONV_OP
+
+#include "operators/kernel/conv_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) {
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::NONE;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->Input());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
+  auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
+  int channel = out->dims()[1];
+  auto bs_ptr =
+      reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
+  // bs_ptr layout: [0, channel) is the bias, [channel, 2 * channel) the scale.
+  const float quant_scale = Si / So * Sf / 127.0;  // same for all channels
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i + channel] = quant_scale;
+    bs_ptr[i] = 0;  // a plain conv carries no bias
+  }
+
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
+                       leaky_relu_negative_slope, param->Groups(),
+                       param->Strides()[0], param->Strides()[1],
+                       param->Paddings()[0], param->Paddings()[1], bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+
+template <>
+void ConvKernel<FPGA, float>::Compute(const ConvParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());  // presumably set in Init()
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
new file mode 100644
index 0000000000..1597885e43
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
@@ -0,0 +1,89 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONV_TRANSPOSE_OP
+
+#include "operators/kernel/conv_transpose_kernel.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
+  // No activation is fused and no bias is applied (bias entries stay 0).
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::NONE;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->Input());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
+  auto out = param->Output();
+
+  // Validate the shape constraints up front, before any FPGA memory is
+  // allocated for the bias/scale buffer.
+  PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
+                        "stride_width should be equal to stride_height ");
+  PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
+                        "filter width should be equal to filter height ");
+  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
+                        "filter axis should be the multiple of stride axis ");
+
+  int channel = out->dims()[1];
+  int sub_conv_n = param->Strides()[0];
+  const int bs_len = channel * sub_conv_n;
+  // bs_ptr holds bs_len bias entries (all 0 here) followed by bs_len scale
+  // entries (all 1 here).
+  auto bs_ptr =
+      reinterpret_cast<float *>(fpga::fpga_malloc(2 * bs_len * sizeof(float)));
+  for (int i = 0; i < bs_len; i++) {
+    bs_ptr[i + bs_len] = 1;
+    bs_ptr[i] = 0;
+  }
+
+  if (param->Groups() == channel) {
+    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
+                               sub_conv_n);
+    fpga::DWDeconvArgs DWDeconv_arg = {0};
+    fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
+                            activation_enable, leaky_relu_negative_slope,
+                            param->Strides()[0], param->Strides()[1],
+                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(DWDeconv_arg);
+  } else {
+    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+    fpga::DeconvArgs deconv_arg = {0};
+    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
+                          leaky_relu_negative_slope, param->Groups(),
+                          param->Strides()[0], param->Strides()[1],
+                          param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(deconv_arg);
+  }
+  return true;
+}
+
+template <>
+void ConvTransposeKernel<FPGA, float>::Compute(
+    const ConvTransposeParam<FPGA> &param) {
+  if (param.Groups() == param.Output()->dims()[1]) {  // same test as Init()
+    fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
+  } else {
+    fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp
new file mode 100644
index 0000000000..a8205df3c9
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp
@@ -0,0 +1,90 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVADDBN_OP
+
+#include "operators/kernel/deconv_add_bn_kernel.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
+  // NOTE(review): no BN input is read here and the scale stays 1 -- confirm.
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::NONE;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->Input());
+  const Tensor *bias = param->InputBias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<LoDTensor *>(param->Filter());
+  auto out = param->Output();
+
+  // Run every shape check before the bias/scale buffer is allocated.
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
+                        "stride_width should be equal to stride_height ");
+  PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
+                        "filter width should be equal to filter height ");
+  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
+                        "filter axis should be the multiple of stride axis ");
+  int channel = out->dims()[1];
+  int sub_conv_n = param->Strides()[0];
+  const int bs_len = channel * sub_conv_n;  // entries per half of bs_ptr
+  auto bs_ptr =
+      reinterpret_cast<float *>(fpga::fpga_malloc(2 * bs_len * sizeof(float)));
+  for (int i = 0; i < bs_len; i++) {
+    bs_ptr[i + bs_len] = 1;               // scale half
+    bs_ptr[i] = bias_ptr[i % (channel)];  // bias half, cycled per channel
+  }
+
+  if (param->Groups() == channel) {
+    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
+                               sub_conv_n);
+    fpga::DWDeconvArgs DWDeconv_arg = {0};
+    fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
+                            activation_enable, leaky_relu_negative_slope,
+                            param->Strides()[0], param->Strides()[1],
+                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(DWDeconv_arg);
+  } else {
+    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+    fpga::DeconvArgs deconv_arg = {0};
+    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
+                          leaky_relu_negative_slope, param->Groups(),
+                          param->Strides()[0], param->Strides()[1],
+                          param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(deconv_arg);
+  }
+  return true;
+}
+
+template <>
+void DeconvAddBNKernel<FPGA, float>::Compute(
+    const FusionDeconvAddBNParam<FPGA> &param) {
+  // Dispatch on the same groups == output-channel test that Init() used.
+  if (param.Groups() == param.Output()->dims()[1]) {
+    fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
+  } else {
+    fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp
new file mode 100644
index 0000000000..b27f5cf870
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp
@@ -0,0 +1,91 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVADDBNRELU_OP
+
+#include "operators/kernel/deconv_add_bn_relu_kernel.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DeconvAddBNReluKernel<FPGA, float>::Init(
+    FusionDeconvAddBNReluParam<FPGA> *param) {
+  // NOTE(review): no BN input is read here and the scale stays 1 -- confirm.
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::LEAKYRELU;
+  int16_t leaky_relu_negative_slope = 0;  // zero slope, i.e. plain ReLU
+  auto input = const_cast<LoDTensor *>(param->Input());
+  const Tensor *bias = param->InputBias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<LoDTensor *>(param->Filter());
+  auto out = param->Output();
+
+  // Run every shape check before the bias/scale buffer is allocated.
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
+                        "stride_width should be equal to stride_height ");
+  PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
+                        "filter width should be equal to filter height ");
+  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
+                        "filter axis should be the multiple of stride axis ");
+  int channel = out->dims()[1];
+  int sub_conv_n = param->Strides()[0];
+  const int bs_len = channel * sub_conv_n;  // entries per half of bs_ptr
+  auto bs_ptr =
+      reinterpret_cast<float *>(fpga::fpga_malloc(2 * bs_len * sizeof(float)));
+  for (int i = 0; i < bs_len; i++) {
+    bs_ptr[i + bs_len] = 1;               // scale half
+    bs_ptr[i] = bias_ptr[i % (channel)];  // bias half, cycled per channel
+  }
+
+  if (param->Groups() == channel) {
+    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
+                               sub_conv_n);
+    fpga::DWDeconvArgs DWDeconv_arg = {0};
+    fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
+                            activation_enable, leaky_relu_negative_slope,
+                            param->Strides()[0], param->Strides()[1],
+                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(DWDeconv_arg);
+  } else {
+    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+    fpga::DeconvArgs deconv_arg = {0};
+    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
+                          leaky_relu_negative_slope, param->Groups(),
+                          param->Strides()[0], param->Strides()[1],
+                          param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(deconv_arg);
+  }
+  return true;
+}
+
+template <>
+void DeconvAddBNReluKernel<FPGA, float>::Compute(
+    const FusionDeconvAddBNReluParam<FPGA> &param) {
+  // Dispatch on the same groups == output-channel test that Init() used.
+  if (param.Groups() == param.Output()->dims()[1]) {
+    fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
+  } else {
+    fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp
index 39d7e81897..41844d008b 100644
--- a/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp
@@ -23,12 +23,66 @@ namespace operators {
 
 template <>
 bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
+  // Deconv with a per-channel bias; no activation is fused into this op.
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::NONE;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->Input());
+  const Tensor *bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<LoDTensor *>(param->Filter());
+  auto out = param->Output();
+
+  // Run every shape check before the bias/scale buffer is allocated.
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
+                        "stride_width should be equal to stride_height ");
+  PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
+                        "filter width should be equal to filter height ");
+  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
+                        "filter axis should be the multiple of stride axis ");
+  int channel = out->dims()[1];
+  int sub_conv_n = param->Strides()[0];
+  const int bs_len = channel * sub_conv_n;  // entries per half of bs_ptr
+  auto bs_ptr =
+      reinterpret_cast<float *>(fpga::fpga_malloc(2 * bs_len * sizeof(float)));
+  for (int i = 0; i < bs_len; i++) {
+    bs_ptr[i + bs_len] = 1;               // scale half
+    bs_ptr[i] = bias_ptr[i % (channel)];  // bias half, cycled per channel
+  }
+
+  if (param->Groups() == channel) {
+    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
+                               sub_conv_n);
+    fpga::DWDeconvArgs DWDeconv_arg = {0};
+    fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
+                            activation_enable, leaky_relu_negative_slope,
+                            param->Strides()[0], param->Strides()[1],
+                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(DWDeconv_arg);
+  } else {
+    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+    fpga::DeconvArgs deconv_arg = {0};
+    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
+                          leaky_relu_negative_slope, param->Groups(),
+                          param->Strides()[0], param->Strides()[1],
+                          param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(deconv_arg);
+  }
+
   return true;
 }
 
 template <>
 void DeconvAddKernel<FPGA, float>::Compute(
-    const FusionDeconvAddParam<FPGA> &param) {}
+    const FusionDeconvAddParam<FPGA> &param) {
+  if (param.Groups() == param.Output()->dims()[1]) {  // same test as Init()
+    fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
+  } else {
+    fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  }
+}
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp
index ef2556208a..c6fc9d1955 100644
--- a/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp
@@ -24,12 +24,66 @@ namespace operators {
 template <>
 bool DeconvAddReluKernel<FPGA, float>::Init(
     FusionDeconvAddReluParam<FPGA> *param) {
+  // Formerly relu_enabled = true; now LEAKYRELU with a zero negative slope.
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::LEAKYRELU;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->Input());
+  const Tensor *bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<LoDTensor *>(param->Filter());
+  auto out = param->Output();
+
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  int channel = out->dims()[1];
+
+  int sub_conv_n = param->Strides()[0];
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *  // NOLINT
+                                           sizeof(float));             // NOLINT
+
+  for (int i = 0; i < channel * sub_conv_n; i++) {
+    bs_ptr[i + sub_conv_n * channel] = 1;  // second half: all ones
+    bs_ptr[i] = bias_ptr[i % (channel)];  // first half: per-channel bias
+  }
+
+  PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
+                        "stride_width should be equal to stride_height ");
+  PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
+                        "filter width should be equal to filter height ");
+  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
+                        "filter axis should be the multiple of stride axis ");
+  if (param->Groups() == channel) {  // groups == output channels: DWDeconv
+    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
+                               sub_conv_n);
+    fpga::DWDeconvArgs DWDeconv_arg = {0};
+    fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
+                            activation_enable, leaky_relu_negative_slope,
+                            param->Strides()[0], param->Strides()[1],
+                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(DWDeconv_arg);
+  } else {  // otherwise: regular deconv args
+    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+    fpga::DeconvArgs deconv_arg = {0};
+    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
+                          leaky_relu_negative_slope, param->Groups(),
+                          param->Strides()[0], param->Strides()[1],
+                          param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(deconv_arg);
+  }
   return true;
 }
 
 template <>
 void DeconvAddReluKernel<FPGA, float>::Compute(
-    const FusionDeconvAddReluParam<FPGA> &param) {}
+    const FusionDeconvAddReluParam<FPGA> &param) {
+  // Run whichever args Init stored: DWDeconv when groups == output channels.
+  if (param.Groups() == param.Output()->dims()[1]) {
+    fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
+  } else {
+    fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  }
+}
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp
new file mode 100644
index 0000000000..75597f0ecd
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp
@@ -0,0 +1,108 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVBNRELU_OP
+
+#include "operators/kernel/deconv_bn_relu_kernel.h"
+#include <cmath>
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DeconvBNReluKernel<FPGA, float>::Init(
+    FusionDeconvBNReluParam<FPGA> *param) {
+  // Folds batch norm into per-channel bias/scale and stores the FPGA deconv
+  // args on param. ReLU is requested as LEAKYRELU with a zero negative slope.
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::LEAKYRELU;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->Input());
+  const Tensor *bias = param->InputBias();
+  auto filter = const_cast<LoDTensor *>(param->Filter());
+  auto out = param->Output();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = bias->data<float>();
+  const float epsilon = param->Epsilon();
+  // Validate first so a failing check cannot leak the buffers below.
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
+                        "stride_width should be equal to stride_height ");
+  PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
+                        "filter width should be equal to filter height ");
+  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
+                        "filter axis should be the multiple of stride axis ");
+  int channel = out->dims()[1];
+
+  // Stack-owned scratch tensors: released on every exit path, no delete.
+  Tensor new_scale;
+  Tensor new_bias;
+  auto new_scale_ptr = new_scale.mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias.mutable_data<float>({channel});
+  for (int i = 0; i < channel; i++) {
+    // scale' = scale / sqrt(var + eps); bias' = bias - mean * scale'
+    new_scale_ptr[i] = bn_scale_ptr[i] / std::sqrt(bn_var_ptr[i] + epsilon);
+    new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
+  }
+
+  int sub_conv_n = param->Strides()[0];
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *  // NOLINT
+                                           sizeof(float));             // NOLINT
+  // First half: bias; second half: scale; both cycle through the channels.
+  for (int i = 0; i < channel * sub_conv_n; i++) {
+    bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel];
+    bs_ptr[i] = new_bias_ptr[i % (channel)];
+  }
+
+  if (param->Groups() == channel) {
+    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
+                               sub_conv_n);
+    fpga::DWDeconvArgs DWDeconv_arg = {0};
+    fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
+                            activation_enable, leaky_relu_negative_slope,
+                            param->Strides()[0], param->Strides()[1],
+                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(DWDeconv_arg);
+  } else {
+    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+    fpga::DeconvArgs deconv_arg = {0};
+    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
+                          leaky_relu_negative_slope, param->Groups(),
+                          param->Strides()[0], param->Strides()[1],
+                          param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(deconv_arg);
+  }
+  return true;
+}
+
+template <>
+void DeconvBNReluKernel<FPGA, float>::Compute(
+    const FusionDeconvBNReluParam<FPGA> &param) {
+  // Run whichever args Init stored: DWDeconv when groups == output channels.
+  if (param.Groups() == param.Output()->dims()[1]) {
+    fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
+  } else {
+    fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
index 4b5085f261..7878cc743e 100644
--- a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
@@ -15,49 +15,176 @@ limitations under the License. */
 
 #include "operators/kernel/elementwise_add_kernel.h"
 
+#include <string>
+#include "fpga/V1/api.h"
+
 namespace paddle_mobile {
 namespace operators {
 
 template <>
 bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
-  bool relu_enabled = false;
-  auto *input_x = const_cast<LoDTensor *>(param->InputX());
   auto *input_y = const_cast<LoDTensor *>(param->InputY());
   auto *out = param->Out();
-  auto input_x_ptr = input_x->data<float>();
-  auto input_y_ptr = input_y->data<float>();
-  int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]);
-  fpga::format_fp16_ofm(out, aligned_channel_num);
-  auto out_ptr = out->mutable_data<float>();
-
-  fpga::EWAddArgs ewaddArgs = {0};
-  ewaddArgs.relu_enabled = relu_enabled;
-  ewaddArgs.const0 = 0x3c00;  // =1
-  ewaddArgs.const1 = 0x3c00;  // =1
-  ewaddArgs.image0.address = input_x_ptr;
-  ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
-  ewaddArgs.image0.scale_address = input_x->scale;
-  ewaddArgs.image0.height = (uint32_t)input_x->dims()[2];
-  ewaddArgs.image0.width = (uint32_t)input_x->dims()[3];
-  ewaddArgs.image0.pad_height = 0;
-  ewaddArgs.image0.pad_width = 0;
-  ewaddArgs.image1.address = input_y_ptr;
-  ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1];
-  ewaddArgs.image1.scale_address = input_y->scale;
-  ewaddArgs.image1.height = (uint32_t)input_y->dims()[2];
-  ewaddArgs.image1.width = (uint32_t)input_y->dims()[3];
-  ewaddArgs.image1.pad_height = 0;
-  ewaddArgs.image1.pad_width = 0;
-  ewaddArgs.output.scale_address = out->scale;
-  ewaddArgs.output.address = out_ptr;
-  param->SetFpgaArgs(ewaddArgs);
+  if (input_y->type() != type_id<float>()) {  // Y not fp32: set up FPGA EW add
+    paddle_mobile::fpga::ActivationType activation_enable =
+        paddle_mobile::fpga::NONE;
+    int16_t leaky_relu_negative_slope = 0;
+    auto *input_x = const_cast<LoDTensor *>(param->InputX());
+    auto input_x_ptr = input_x->data<half>();
+    auto input_y_ptr = input_y->data<half>();
+    fpga::format_fp16_ofm(out);
+    auto out_ptr = out->mutable_data<half>();
+
+    fpga::EWAddArgs ewaddArgs = {0};
+    // The former relu_enabled flag is replaced by the activation fields below.
+    ewaddArgs.output.activation.activation_type = activation_enable;
+    ewaddArgs.output.activation.leaky_relu_negative_slope =
+        leaky_relu_negative_slope;
+    ewaddArgs.const0 = 0x3c00;  // =1 (0x3c00 is 1.0 in fp16)
+    ewaddArgs.const1 = 0x3c00;  // =1 (0x3c00 is 1.0 in fp16)
+    ewaddArgs.image0.address = input_x_ptr;
+    ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
+    ewaddArgs.image0.scale_address = input_x->scale;
+    ewaddArgs.image0.height = (uint32_t)input_x->dims()[2];
+    ewaddArgs.image0.width = (uint32_t)input_x->dims()[3];
+    ewaddArgs.image0.pad_height = 0;
+    ewaddArgs.image0.pad_width = 0;
+    ewaddArgs.image1.address = input_y_ptr;
+    ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1];
+    ewaddArgs.image1.scale_address = input_y->scale;
+    ewaddArgs.image1.height = (uint32_t)input_y->dims()[2];
+    ewaddArgs.image1.width = (uint32_t)input_y->dims()[3];
+    ewaddArgs.image1.pad_height = 0;
+    ewaddArgs.image1.pad_width = 0;
+    ewaddArgs.output.scale_address = out->scale;
+    ewaddArgs.output.address = out_ptr;
+    fpga::expand_EW_arg(&ewaddArgs);
+    param->SetFpgaArgs(ewaddArgs);
+  } else {  // Y is fp32: prepare fp32 staging tensors for the host-side add
+    param->float_input_x.Resize(param->InputX()->dims());
+    param->float_input_x.init(type_id<float>().hash_code());
+    fpga::format_fp32_ofm(&(param->float_input_x));
+
+    param->float_out.Resize(param->InputX()->dims());
+    param->float_out.mutable_data<float>(param->InputX()->dims());
+    fpga::format_fp32_ofm(&(param->float_out));
+
+    fpga::format_fp16_ofm(out);
+  }
   return true;
 }
 
+inline void ElementwiseAddCompute(const ElementwiseAddParam<FPGA> &param) {
+  auto input_x = param.float_input_x;  // Tensor object copied by value
+  auto input_y = param.InputY();
+  auto Out = param.float_out;  // NOTE(review): copy assumed to alias float_out
+  int axis = param.Axis();
+
+  const auto &x_dims = input_x.dims();
+  const auto &y_dims = input_y->dims();
+  /// axis == -1 aligns Y with the trailing dimensions of X.
+  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  size_t batch = 1;  // product of the X dims before axis
+  size_t channels = 1;  // product of all Y dims
+  size_t elementwise_num = 1;  // product of the X dims after Y's span
+  for (int i = 0; i < axis; ++i) {
+    batch *= x_dims[i];
+  }
+  for (int i = 0; i < y_dims.size(); ++i) {
+    channels *= y_dims[i];
+  }
+  for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
+    elementwise_num *= x_dims[i];
+  }
+  const float *bias_data = input_y->data<float>();
+  const float *input_data = input_x.data<float>();
+  float *output_data = Out.mutable_data<float>();
+
+  for (int i = 0; i < batch; ++i) {
+    for (int j = 0; j < channels; ++j) {
+      size_t offset = (i * channels + j) * elementwise_num;
+      const float *input = input_data + offset;
+      const float bias = bias_data[j];
+      float *output = output_data + offset;
+      // Add Y[j] to each of the elementwise_num values in slice (i, j).
+      for (int k = 0; k < elementwise_num; ++k) {
+        output[k] = input[k] + bias;
+        // DLOG << "output[" << k << "]= " << output[k] ;
+      }
+    }
+  }
+}
 template <>
 void ElementwiseAddKernel<FPGA, float>::Compute(
     const ElementwiseAddParam<FPGA> &param) {
-  fpga::ComputeFpgaEWAdd(param.FpgaArgs());
+  auto input_y = const_cast<LoDTensor *>(param.InputY());
+  if (input_y->type() != type_id<float>()) {  // Y not fp32: FPGA EW add
+    fpga::ComputeFpgaEWAdd(param.FpgaArgs());
+  } else {  // Y is fp32: fp16 -> fp32, host-side add, fp32 -> fp16
+    auto input_x = const_cast<LoDTensor *>(param.InputX());
+    auto intput_x_float = const_cast<Tensor *>(&(param.float_input_x));
+    fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+    args.input_data_type = fpga::DATA_TYPE_FP16;
+    args.output_data_type = fpga::DATA_TYPE_FP32;
+    args.input_layout_type = fpga::LAYOUT_CHW;
+    args.output_layout_type = fpga::LAYOUT_HWC;
+    args.image.address = input_x->data<half>();
+    args.image.channels = (uint32_t)(input_x->fpga_data_num);
+    args.image.height = 1;
+    args.image.width = 1;
+    args.image.pad_height = 0;
+    args.image.pad_width = 0;
+    args.output.address = intput_x_float->data<float>();
+    args.output.scale_address = intput_x_float->scale;
+
+    // fpga::fpga_flush(input_x->data<half>(),input_x->fpga_data_num *
+    // sizeof(half));
+    fpga::PerformBypass(args);
+    fpga::fpga_invalidate(args.output.address,
+                          input_x->fpga_data_num * sizeof(float));
+
+    // just for test
+    /*    {
+           static int cnt = 0;
+           if(cnt == 0){
+               std::string str= "first_bypass_data";
+               float rslt = 0.0f;
+               fpga::savefile(str, args.output.address, input_x->fpga_data_num,
+       rslt); cnt++;
+           }
+       }*/
+    ElementwiseAddCompute(param);  // float_out = float_input_x + Y
+
+    auto out_float = const_cast<Tensor *>(&(param.float_out));
+    DLOG << "out float: " << out_float->data<float>();
+    fpga::fpga_flush(out_float->data<float>(),
+                     input_x->fpga_data_num * sizeof(float));
+    // just for test
+    /*{
+       static int cnt = 0;
+       if(cnt == 0){
+           std::string str= "ew_output_data";
+           float rslt = 0.0f;
+
+           fpga::savefile(str, out_float->data<float>(), input_x->fpga_data_num,
+   rslt); cnt++;
+       }
+   }*/
+    auto Out = param.Out();
+    args.input_data_type = fpga::DATA_TYPE_FP32;
+    args.output_data_type = fpga::DATA_TYPE_FP16;
+    args.input_layout_type = fpga::LAYOUT_CHW;
+    args.output_layout_type = fpga::LAYOUT_HWC;
+    args.image.address = out_float->data<float>();
+    args.image.channels = (uint32_t)(input_x->fpga_data_num);
+    args.image.height = 1;
+    args.image.width = 1;
+    args.image.pad_height = 0;
+    args.image.pad_width = 0;
+    args.output.address = Out->data<half>();
+    args.output.scale_address = Out->scale;
+    fpga::PerformBypass(args);
+  }
 }
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
index f74b188b56..f36206a8a1 100644
--- a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
@@ -21,18 +21,23 @@ namespace operators {
 template <>
 bool ElementwiseAddReluKernel<FPGA, float>::Init(
     ElementwiseAddReluParam<FPGA> *param) {
-  bool relu_enabled = true;
+  // bool relu_enabled = true;
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::LEAKYRELU;
+  int16_t leaky_relu_negative_slope = 0;
   auto *input_x = const_cast<LoDTensor *>(param->InputX());
   auto *input_y = const_cast<LoDTensor *>(param->InputY());
   auto *out = param->Out();
-  auto input_x_ptr = input_x->data<float>();
-  auto input_y_ptr = input_y->data<float>();
-  int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]);
-  fpga::format_fp16_ofm(out, aligned_channel_num);
-  auto out_ptr = out->mutable_data<float>();
+  auto input_x_ptr = input_x->data<half>();
+  auto input_y_ptr = input_y->data<half>();
+  fpga::format_fp16_ofm(out);
+  auto out_ptr = out->mutable_data<half>();
 
   fpga::EWAddArgs ewaddArgs = {0};
-  ewaddArgs.relu_enabled = relu_enabled;
+  // ewaddArgs.relu_enabled = relu_enabled;
+  ewaddArgs.output.activation.activation_type = activation_enable;
+  ewaddArgs.output.activation.leaky_relu_negative_slope =
+      leaky_relu_negative_slope;
   ewaddArgs.const0 = 0x3c00;  // =1
   ewaddArgs.const1 = 0x3c00;  // =1
   ewaddArgs.image0.address = input_x_ptr;
@@ -51,6 +56,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
   ewaddArgs.image1.pad_width = 0;
   ewaddArgs.output.scale_address = out->scale;
   ewaddArgs.output.address = out_ptr;
+  fpga::expand_EW_arg(&ewaddArgs);
   param->SetFpgaArgs(ewaddArgs);
   return true;
 }
diff --git a/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp
new file mode 100644
index 0000000000..d744ae2c07
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp
@@ -0,0 +1,93 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISEMUL_OP
+
+#include "operators/kernel/elementwise_mul_kernel.h"
+#include "operators/math/elementwise_op_function.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename T>
+struct MulFunctor {  // binary functor returning the product of its arguments
+  inline T operator()(T a, T b) const { return a * b; }
+};
+template <>
+bool ElementwiseMulKernel<FPGA, float>::Init(ElementwiseMulParam<FPGA> *param) {
+  param->float_input_x.Resize(param->InputX()->dims());  // fp32 copy of X
+  param->float_input_x.init(type_id<float>().hash_code());
+  fpga::format_fp32_ofm(&(param->float_input_x));
+
+  param->float_out.Resize(param->InputX()->dims());  // fp32 result buffer
+  param->float_out.init(type_id<float>().hash_code());
+  fpga::format_fp32_ofm(&(param->float_out));
+
+  auto *out = param->Out();
+  fpga::format_fp16_ofm(out);  // Compute writes Out as half data
+  return true;
+}
+
+template <>
+void ElementwiseMulKernel<FPGA, float>::Compute(
+    const ElementwiseMulParam<FPGA> &param) {
+  auto input_x = const_cast<LoDTensor *>(param.InputX());
+  auto intput_x_float = const_cast<Tensor *>(&(param.float_input_x));
+  // Converts X from fp16 to fp32 through the bypass, multiplies it by Y with
+  // ElementwiseComputeEx, then converts the fp32 product back to fp16 in Out.
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.input_layout_type = fpga::LAYOUT_CHW;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.image.address = input_x->data<half>();
+  args.image.channels = (uint32_t)(input_x->fpga_data_num);
+  args.image.height = 1;
+  args.image.width = 1;
+  args.image.pad_height = 0;
+  args.image.pad_width = 0;
+  args.output.address = intput_x_float->data<float>();
+  args.output.scale_address = intput_x_float->scale;
+  fpga::PerformBypass(args);
+  fpga::fpga_invalidate(args.output.address,
+                        input_x->fpga_data_num * sizeof(float));
+
+  auto input_y = param.InputY();
+  int axis = param.Axis();
+  auto out_float = const_cast<Tensor *>(&(param.float_out));
+  ElementwiseComputeEx<MulFunctor<float>, float>(
+      intput_x_float, input_y, axis, MulFunctor<float>(), out_float);
+  fpga::fpga_flush(out_float->data<float>(),
+                   input_x->fpga_data_num * sizeof(float));
+
+  Tensor *Out = param.Out();
+  args.input_data_type = fpga::DATA_TYPE_FP32;  // reuse args for the way back
+  args.output_data_type = fpga::DATA_TYPE_FP16;
+  args.input_layout_type = fpga::LAYOUT_CHW;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.image.address = out_float->data<float>();
+  args.image.channels = (uint32_t)(Out->fpga_data_num);
+  args.image.height = 1;
+  args.image.width = 1;
+  args.image.pad_height = 0;
+  args.image.pad_width = 0;
+  args.output.address = Out->data<half>();
+  args.output.scale_address = Out->scale;
+  fpga::PerformBypass(args);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/feed_kernel.cpp b/src/operators/kernel/fpga/V2/feed_kernel.cpp
index 7c4b999e7c..28559b2b4b 100644
--- a/src/operators/kernel/fpga/V2/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp
@@ -13,44 +13,94 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "operators/kernel/feed_kernel.h"
-#include "fpga/V2/filter.h"
+
 namespace paddle_mobile {
 namespace operators {
 
 template <>
 bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
-  Tensor *output = param->Out();
-  int aligned_channel = fpga::get_aligned_channel_num(output->dims()[1]);
-  fpga::format_fp16_ofm(output, aligned_channel);
+  auto output = param->Out();
+  int col = param->Col();  // index of this feed's tensor in InputX()
+  DLOG << "col = " << col;
+  auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
+  input->init(type_id<float>().hash_code());
+  input->Resize(output->dims());  // the fed tensor takes the output's shape
+
+  if (output->dims().size() != 4) {  // non-4D: no fp16 formatting
+    return true;
+  }
+
+  fpga::format_fp16_ofm(output);
   return true;
 }
+
 template <>
 void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
-  auto input =
-      reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
-  fpga::format_image(input);
-  auto input_ptr = input->data<float>();
-  Tensor *output = param.Out();
-  auto output_ptr = output->data<float>();
-  auto channel = input->dims()[1];
-  uint32_t aligned_channels =
-      fpga::filter::calc_aligned_channel((int)channel);  // NOLINT
+  auto output = param.Out();
+  int col = param.Col();
+  auto input = const_cast<LoDTensor *>(&param.InputX()->at(col));
+  kTypeId_t input_type = input->type();
 
+  if (input_type == type_id<float>()) {
+    input->init(type_id<float>().hash_code());
+  } else {  // every non-float input is treated as int8
+    input->init(type_id<int8_t>().hash_code());
+  }
+  input->Resize(output->dims());
+
+  if (output->dims().size() != 4) {  // non-4D: plain memcpy, no bypass
+    size_t size = output->numel() * sizeof(float);  // NOTE(review): fp32 only?
+    auto output_ptr = output->data<float>();
+    auto input_ptr = input->data<float>();
+    auto external_ptr = reinterpret_cast<float *>(input->external_data);
+    float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
+    memcpy(output_ptr, p_data, size);
+    input->external_data = nullptr;  // external buffer is used only once
+    return;
+  }
+
+  fpga::format_image(input);
+  auto output_ptr = output->data<half>();
   fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
+  if (input_type == type_id<float>()) {  // fp32 -> fp16 bypass
+    auto input_ptr = input->data<float>();
+    auto external_ptr = reinterpret_cast<float *>(input->external_data);
+    float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
+
+    args.input_data_type = fpga::DATA_TYPE_FP32;
+    args.output_data_type = fpga::DATA_TYPE_FP16;
+    args.input_layout_type = fpga::LAYOUT_CHW;
+    args.output_layout_type = fpga::LAYOUT_HWC;
+    args.image.address = p_data;
+    args.image.channels = (uint32_t)input->dims()[1];
+    args.image.height = (uint32_t)input->dims()[2];
+    args.image.width = (uint32_t)input->dims()[3];
+    args.image.pad_height = 0;
+    args.image.pad_width = 0;
+    args.output.address = output_ptr;
+    args.output.scale_address = output->scale;
+    fpga::PerformBypass(args);
+    input->external_data = nullptr;
+  } else {  // int8 -> fp16 bypass
+    auto input_ptr = input->data<int8_t>();
+    auto external_ptr = reinterpret_cast<int8_t *>(input->external_data);
+    int8_t *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
 
-  args.input_data_type = fpga::DATA_TYPE_FP32;
-  args.output_data_type = fpga::DATA_TYPE_FP16;
-  args.input_layout_type = fpga::LAYOUT_CHW;
-  args.output_layout_type = fpga::LAYOUT_HWC;
-  args.image.address = reinterpret_cast<void *>(input_ptr);
-  args.image.channels = aligned_channels;
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
-  args.image.pad_height = 0;
-  args.image.pad_width = 0;
-  args.output.address = output_ptr;
-  args.output.scale_address = output->scale;
-  fpga::PerformBypass(args);
+    args.input_data_type = fpga::DATA_TYPE_INT8;
+    args.output_data_type = fpga::DATA_TYPE_FP16;
+    args.input_layout_type = fpga::LAYOUT_CHW;
+    args.output_layout_type = fpga::LAYOUT_HWC;
+    args.image.address = p_data;
+    args.image.channels = (uint32_t)input->dims()[1];
+    args.image.height = (uint32_t)input->dims()[2];
+    args.image.width = (uint32_t)input->dims()[3];
+    args.image.pad_height = 0;
+    args.image.pad_width = 0;
+    args.output.address = output_ptr;
+    args.output.scale_address = output->scale;
+    fpga::PerformBypass(args);
+    input->external_data = nullptr;
+  }
 }
 template class FeedKernel<FPGA, float>;
 
diff --git a/src/operators/kernel/fpga/V2/fetch_kernel.cpp b/src/operators/kernel/fpga/V2/fetch_kernel.cpp
index e6e4591168..87ede2af1a 100644
--- a/src/operators/kernel/fpga/V2/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fetch_kernel.cpp
@@ -11,22 +11,116 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "operators/kernel/fetch_kernel.h"
-
 namespace paddle_mobile {
 namespace operators {
 
 template <>
 bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
+  auto input = const_cast<LoDTensor *>(param->InputX());
+  int col = param->Col();
+  DLOG << "col = " << col;
+  auto output = &(param->Out()->at(col));
+  if (input->type() == type_id<float>()) {  // fp32 input: nothing to prepare
+    return true;
+  }
+  output->init(type_id<float>().hash_code());
+  output->Resize(input->dims());
+  fpga::format_fp32_ofm(output);
+  int outC = 1;
+  int outH = 1;
+  int outW = 1;
+  if (output->dims().size() == 4) {
+    outC = output->dims()[1];
+    outH = output->dims()[2];
+    outW = output->dims()[3];
+  } else {  // otherwise treated as 2-D
+    outC = output->dims()[1];
+  }
+  int unalignedCW = outC * outW;
+  int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
+  if (alignedCW != unalignedCW) {  // C*W not aligned: allocate aligned_out
+    param->aligned_out.Resize(input->dims());
+    param->aligned_out.mutable_data<float>(input->dims());
+    fpga::fpga_flush(param->aligned_out.data<float>(),
+                     outH * unalignedCW * sizeof(float));
+  }
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};  // fp16 -> fp32 bypass
+
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.input_layout_type = fpga::LAYOUT_CHW;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.image.address = input->data<half>();
+  args.image.channels = (uint32_t)(input->fpga_data_num);
+  args.image.height = 1;
+  args.image.width = 1;
+  args.image.pad_height = 0;
+  args.image.pad_width = 0;
+  args.output.address = output->data<float>();
+  args.output.scale_address = output->scale;
+  param->fpga_bypass_args = args;  // reused by Compute
+
   return true;
 }
-
+void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
+  // Copies input_h rows of input_c * input_w floats from src (row stride
+  // alignCW) into the tightly packed dst (row stride dealignCW).
+  const int dealignCW = input_c * input_w;
+  const int alignCW =
+      paddle_mobile::fpga::align_to_x(dealignCW, IMAGE_ALIGNMENT);
+  for (int h = 0; h < input_h; ++h) {
+    memcpy(dst + h * dealignCW, src + h * alignCW, dealignCW * sizeof(float));
+  }
+}
 template <>
 void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
-  param.Out()->ShareDataWith(*(param.InputX()));
-}
+  auto input = const_cast<LoDTensor *>(param.InputX());
+  int col = param.Col();
+  auto output = &param.Out()->at(col);
+  if (input->type() == type_id<float>()) {  // fp32 input: just share data
+    output->ShareDataWith(*input);
+    return;
+  }
+
+  fpga::BypassArgs args = param.fpga_bypass_args;
+  auto input_address = (input->data<half>());
+  args.image.address = static_cast<void *>(input_address);
+  float *outdata_ptr =
+      reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
+  const int num_th = 32;  // below this count, convert element by element
+  if (output->fpga_data_num < num_th) {
+    fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half));
+
+    for (int idx = 0; idx < product(input->dims()); ++idx) {
+      outdata_ptr[idx] = fpga::fp16_2_fp32(input_address[idx]);
+    }
+    return;
+  }
 
+  fpga::PerformBypass(args);  // fp16 -> fp32 bypass prepared by Init
+  int outC = 1;
+  int outH = 1;
+  int outW = 1;
+  if (output->dims().size() == 4) {
+    outC = output->dims()[1];
+    outH = output->dims()[2];
+    outW = output->dims()[3];
+  } else {  // otherwise treated as 2-D
+    outC = output->dims()[1];
+  }
+
+  fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
+                        output->fpga_data_num * sizeof(float));
+  int unalignedCW = outC * outW;
+  int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
+  if (unalignedCW != alignedCW) {  // repack the rows through aligned_out
+    auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
+    dealign(outdata_ptr, aligned_ptr, outC, outH, outW);
+    memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
+    fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
+  }
+}
 template class FetchKernel<FPGA, float>;
 
 }  // namespace operators
diff --git a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
index 8d24c44340..3a29104d0f 100644
--- a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
@@ -20,15 +20,18 @@ namespace operators {
 
 template <>
 bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
-  bool relu_enabled = false;
+  // bool relu_enabled = false;
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::NONE;
+  int16_t leaky_relu_negative_slope = 0;
   auto input_x = const_cast<LoDTensor *>(param->InputX());
-  auto filter = const_cast<Tensor *>(param->InputY());
+  auto filter = const_cast<LoDTensor *>(param->InputY());
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
   auto out = param->Out();
 
-  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
-                        "Image channel should be equal to weight number");
+  // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
+  //                     "Image channel should be equal to weight number");
   int channel = (uint32_t)out->dims()[1];
   auto bs_ptr =
       (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
@@ -47,11 +50,16 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
 
   out->Resize(framework::make_ddim({1, channel, 1, 1}));
   filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
-  fpga::format_fc_data(filter, out, &bs_ptr);
+  float max_value = fpga::filter_find_max(filter);
+  fpga::format_fc_filter(filter, max_value);
+
+  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
+  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
+  fpga::format_fp16_ofm(out);
 
   fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
-                       0, 0, bs_ptr);
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable,
+                       leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr);
   param->SetFpgaArgs(conv_arg);
   return true;
 }
diff --git a/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp b/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp
new file mode 100644
index 0000000000..fef370515e
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp
@@ -0,0 +1,75 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_FCRELU_OP
+
+#include "operators/kernel/fc_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
+  // The activation handed to fill_split_arg is LEAKYRELU with slope 0.
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::LEAKYRELU;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input_x = const_cast<LoDTensor *>(param->InputX());
+  auto filter = const_cast<LoDTensor *>(param->InputY());
+  const Tensor *input_z = param->InputZ();
+  auto input_z_ptr = input_z->data<float>();
+  auto out = param->Out();
+
+  // bs_ptr holds 2 * channel floats: [0, channel) is a copy of InputZ and
+  // [channel, 2 * channel) is filled with 1 (presumably bias, then scale).
+  int channel = (uint32_t)out->dims()[1];
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i + channel] = 1;
+    bs_ptr[i] = input_z_ptr[i];
+  }
+  int num = (uint32_t)filter->dims()[1];
+  int chw = (uint32_t)filter->dims()[0];
+  PADDLE_MOBILE_ENFORCE(
+      chw == input_x->numel(),
+      "Filter element num should be equal to IFM element num");
+  int height = (uint32_t)input_x->dims()[2];
+  int width = (uint32_t)input_x->dims()[3];
+  int filter_channel = chw / height / width;
+
+  out->Resize(framework::make_ddim({1, channel, 1, 1}));
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
+  float max_value = fpga::filter_find_max(filter);
+  fpga::format_fc_filter(filter, max_value);
+
+  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
+  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
+  fpga::format_fp16_ofm(out);
+
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable,
+                       leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+
+template <>
+void FusionFcReluKernel<FPGA, float>::Compute(
+    const FusionFcReluParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());  // runs the args built in Init()
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/pad2d_kernel.cpp b/src/operators/kernel/fpga/V2/pad2d_kernel.cpp
new file mode 100644
index 0000000000..e5328dc319
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/pad2d_kernel.cpp
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PAD2D_OP
+#include "operators/kernel/pad2d_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool Pad2DKernel<FPGA, float>::Init(Pad2DParam<FPGA> *param) {
+  Tensor *output = param->output_;
+  fpga::format_fp16_ofm(output);  // presumably sets up the fp16 output buffer
+  return true;
+}
+// Row-wise CPU copy from `input` to `output`. Each of the input's dims()[2]
+// rows holds dims()[1] * dims()[3] half values packed back to back; row h is
+// copied to the start of output row h, whose stride is
+// align_to_x(output dims()[1] * dims()[3], IMAGE_ALIGNMENT). Elements of
+// `output` outside the copied ranges are not written.
+void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) {
+  auto src = (input->data<half>());
+  auto dst = (output->data<half>());
+  auto rows = input->dims()[2];
+  auto row_elems = input->dims()[1] * input->dims()[3];
+  auto dst_stride = paddle_mobile::fpga::align_to_x(
+      output->dims()[1] * output->dims()[3], IMAGE_ALIGNMENT);
+  for (int h = 0; h < rows; ++h) {
+    memcpy((dst + h * dst_stride), (src + h * row_elems),
+           row_elems * sizeof(half));
+  }
+}
+template <>
+void Pad2DKernel<FPGA, float>::Compute(const Pad2DParam<FPGA> &param) {
+  auto in_x = param.input_;
+  auto out = param.output_;
+  fpga::fpga_invalidate((void *)in_x->data<half>(),  // NOLINT
+                        in_x->numel() * sizeof(half));
+  pad2dFunc(in_x, out);  // CPU row copy into the aligned output rows
+  (out->scale)[0] = (in_x->scale)[0];  // output reuses the input scale pair
+  (out->scale)[1] = (in_x->scale)[1];
+  DLOG << (out->scale)[0];
+  DLOG << (out->scale)[1];
+  size_t outputSize =  // bytes: rows * aligned(channels * width) * half
+      out->dims()[2] *
+      paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]),
+                                      IMAGE_ALIGNMENT) *
+      sizeof(half);
+  fpga::fpga_flush(out->data<half>(), outputSize);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif  // PAD2D_OP
diff --git a/src/operators/kernel/fpga/V2/pool_kernel.cpp b/src/operators/kernel/fpga/V2/pool_kernel.cpp
index 480aca4eb3..7c8dba1696 100644
--- a/src/operators/kernel/fpga/V2/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/pool_kernel.cpp
@@ -21,18 +21,30 @@ namespace operators {
 
 template <>
 bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
-  auto *input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
-  Tensor *output = param->Output();
-  int aligned_channel_num =
-      fpga::get_aligned_channel_num((int)output->dims()[1]);  // NOLINT
-  fpga::format_fp16_ofm(output, aligned_channel_num);
-  auto output_ptr = output->mutable_data<float>();
+  auto *input = const_cast<LoDTensor *>(param->Input());
+  auto *output = param->Output();
   vector<int> ksize = param->Ksize();
   vector<int> strides = param->Strides();
   vector<int> paddings = param->Paddings();
   std::string pooling_type = param->PoolingType();
 
+  if (input->type() == type_id<float>()) {
+    int channels = input->dims()[1];
+    int height = input->dims()[2];
+    int width = input->dims()[3];
+    int num = input->dims()[0];
+    int out_width = (width + 2 * paddings[1] - ksize[1]) / strides[1] + 1;
+    int out_height = (height + 2 * paddings[0] - ksize[0]) / strides[0] + 1;
+    framework::DDim dim =
+        framework::make_ddim({num, channels, out_height, out_width});
+    output->mutable_data<float>(dim);
+    return true;
+  }
+
+  auto input_ptr = input->data<half>();
+  fpga::format_fp16_ofm(output);
+  auto output_ptr = output->mutable_data<half>();
+
   fpga::PoolingArgs poolArgs = {0};
   poolArgs.mode = pooling_type == "max" ? 0 : 1;  // max:0, avg:1
   poolArgs.kernel_reciprocal =
@@ -56,6 +68,34 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
 
 template <>
 void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
+  auto *input = const_cast<LoDTensor *>(param.Input());
+  // Float input: each channel is averaged over all H * W positions on the CPU.
+  if (input->type() == type_id<float>()) {
+    auto *output = param.Output();
+    auto in = input->data<float>();
+    auto N = input->dims()[0];
+    output->Resize(
+        {N, output->dims()[1], output->dims()[2], output->dims()[3]});
+    auto len = output->numel();
+    auto out = output->mutable_data<float>();
+    int C = input->dims()[1], H = input->dims()[2],  // N is read above
+        W = input->dims()[3];
+    int HW = H * W, CHW = C * H * W, WC = W * C;
+    // `in` is indexed as NHWC; `out` receives one value per (n, c) pair.
+    for (int n = 0; n < N; n++) {
+      for (int c = 0; c < C; c++) {
+        out[n * C + c] = 0;
+        for (int h = 0; h < H; h++) {
+          for (int w = 0; w < W; w++) {
+            out[n * C + c] += in[n * CHW + h * WC + w * C +
+                                 c];  // NCHW form: n*CHW + c*HW + h*W + w
+          }
+        }
+        out[n * C + c] /= HW;
+      }
+    }
+    return;
+  }
   fpga::ComputeFpgaPool(param.FpgaArgs());
 }
 }  // namespace operators
diff --git a/src/operators/kernel/fpga/V2/proposal_kernel.cpp b/src/operators/kernel/fpga/V2/proposal_kernel.cpp
new file mode 100644
index 0000000000..bd6703bb81
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/proposal_kernel.cpp
@@ -0,0 +1,567 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PROPOSAL_OP
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+#include "operators/kernel/detection_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
+
+template <>
+bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
+  int post_nms_top_n = param->post_nms_topn_;
+  int64_t batch = param->scores_->dims()[0];
+  auto total = post_nms_top_n * batch;
+  param->rpn_rois_->mutable_data<float>({total, 4});
+  param->rpn_probs_->mutable_data<float>({total, 1});
+
+  // float_bbox / float_score are fp32 tensors with the dims of bbox_deltas_ /
+  // scores_. They are the output buffers of the two bypass descriptors
+  // (bbox_arg, score_arg) assembled further down.
+  param->float_bbox = std::make_shared<Tensor>();
+  param->float_bbox->Resize(param->bbox_deltas_->dims());
+  param->float_bbox->init(type_id<float>().hash_code());
+  fpga::format_fp32_ofm(param->float_bbox.get());
+  param->float_score = std::make_shared<Tensor>();
+  param->float_score->Resize(param->scores_->dims());
+  param->float_score->init(type_id<float>().hash_code());
+  fpga::format_fp32_ofm(param->float_score.get());
+
+  auto input = param->bbox_deltas_;
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = input->data<half>();
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.output.address = param->float_bbox->mutable_data<float>();
+  args.output.scale_address = param->float_bbox->scale;
+  param->bbox_arg = args;  // bbox_deltas_ (fp16) -> float_bbox (fp32)
+
+  input = param->scores_;
+  args.image.address = input->data<half>();
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.output.address = param->float_score->mutable_data<float>();
+  args.output.scale_address = param->float_score->scale;
+  param->score_arg = args;  // same settings, scores_ -> float_score
+
+  param->score_index_ = std::make_shared<Tensor>();
+  param->score_index_->mutable_data<int32_t>({input->numel()});
+  auto score_index = param->score_index_->data<int32_t>();
+  for (int i = 0; i < input->numel(); ++i) {
+    score_index[i] = i;  // identity permutation 0 .. numel - 1
+  }
+
+  return true;
+}
+// Row gather: output row i = src row index[i], where a row is everything past
+// dim 0 of src. `output` is read through data<T>(), so it must already own
+// enough memory; the index values themselves are not range-checked.
+template <typename T>
+void CPUGather(const Tensor &src, const Tensor &index, Tensor *output) {
+  PADDLE_MOBILE_ENFORCE(index.dims().size() == 1 ||
+                            (index.dims().size() == 2 && index.dims()[1] == 1),
+                        "Dim not correct");
+  int64_t index_size = index.dims()[0];
+  auto src_dims = src.dims();
+  const T *p_src = src.data<T>();
+  const int *p_index = index.data<int>();
+  T *p_output = output->data<T>();
+
+  // Elements per row: product of src dims 1 .. rank - 1.
+  int slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+  const size_t slice_bytes = slice_size * sizeof(T);
+
+  for (int64_t i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
+  }
+}
+
+void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
+  auto *out_data = dst->data<void>();
+  auto *to_add_data = src.data<void>();
+  size_t size_of_t = framework::SizeOfType(src.type());
+  offset *= size_of_t;  // element offset -> byte offset into dst
+  std::memcpy(
+      reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
+      to_add_data, src.numel() * size_of_t);  // copies all of src
+}
+
+// Turns each anchor row plus its delta row into an [x1, y1, x2, y2] proposal.
+// The variance-scaled and kBBoxClipDefault-clipped variants are disabled.
+template <class T>
+static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
+                            Tensor *variances, Tensor *proposals) {
+  T *proposals_data = proposals->mutable_data<T>();
+  int64_t row = all_anchors->dims()[0];
+  int64_t len = all_anchors->dims()[1];
+  auto *bbox_deltas_data = bbox_deltas->data<T>();
+  auto *anchor_data = all_anchors->data<T>();
+  const T *variances_data = nullptr;  // only used by the disabled code below
+  if (variances) {
+    variances_data = variances->data<T>();
+  }
+
+  for (int64_t i = 0; i < row; ++i) {
+    T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
+    T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
+
+    T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
+    T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
+
+    T bbox_center_x = 0, bbox_center_y = 0;
+    T bbox_width = 0, bbox_height = 0;
+
+    /* Disabled: variance-scaled decoding (with kBBoxClipDefault clipping).
+        if (variances) {
+          bbox_center_x =
+              variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width
+       + anchor_center_x; bbox_center_y = variances_data[i * len + 1] *
+                              bbox_deltas_data[i * len + 1] * anchor_height +
+                          anchor_center_y;
+          bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
+                                                bbox_deltas_data[i * len + 2],
+                                            kBBoxClipDefault)) *
+                       anchor_width;
+          bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
+                                                 bbox_deltas_data[i * len + 3],
+                                             kBBoxClipDefault)) *
+                        anchor_height;
+        } else {
+    */
+    bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
+    bbox_center_y =
+        bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
+
+    /* Disabled: width/height with the exponent clipped to kBBoxClipDefault.
+          bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
+                                            kBBoxClipDefault)) *
+                       anchor_width;
+          bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
+                                             kBBoxClipDefault)) *
+                        anchor_height;
+    */
+    bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
+    bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
+    //    }
+
+    proposals_data[i * len] = bbox_center_x - bbox_width / 2;
+    proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
+    /* Disabled: variant that subtracts 1 from the x2 / y2 corner.
+        //wong
+        proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
+        proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
+        //wong
+    */
+    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
+    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
+  }
+  // return proposals;
+}
+
+// Clamps every coordinate of `boxes` in place. Coordinates are visited as a
+// flat array: even offsets are limited to [0, im_info[1] - 1] and odd offsets
+// to [0, im_info[0] - 1].
+template <class T>
+static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) {
+  T *coords = boxes->mutable_data<T>();
+  const T *info = im_info.data<T>();
+  const T lower(0);
+  const int64_t total = boxes->numel();
+
+  for (int64_t k = 0; k < total; ++k) {
+    // Offsets 0 and 2 of each 4-value box share the im_info[1] bound,
+    // offsets 1 and 3 share the im_info[0] bound.
+    const bool even_offset = (k % 2 == 0);
+    T upper = info[0] - 1;
+    if (even_offset) {
+      upper = info[1] - 1;
+    }
+    coords[k] = std::max(std::min(coords[k], upper), lower);
+  }
+}
+
+// Stores in `keep` the indices of the boxes that pass the tests in the loop.
+template <class T>
+static inline void FilterBoxes(Tensor *boxes, float min_size,
+                               const Tensor &im_info, Tensor *keep) {
+  const T *im_info_data = im_info.data<T>();
+  T *boxes_data = boxes->mutable_data<T>();
+  T im_scale = im_info_data[2];  // divisor for the *_origin_scale sizes
+  keep->Resize({boxes->dims()[0]});  // room for the case where all boxes pass
+  min_size = std::max(min_size, 1.0f);
+  int *keep_data = keep->mutable_data<int>();
+  int keep_len = 0;
+  for (int i = 0; i < boxes->dims()[0]; ++i) {
+    T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
+    T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
+    T ws_origin_scale =
+        (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
+    T hs_origin_scale =
+        (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
+    T x_ctr = boxes_data[4 * i] + ws / 2;
+    T y_ctr = boxes_data[4 * i + 1] + hs / 2;
+    if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
+        x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
+      keep_data[keep_len++] = i;
+    }
+  }
+  keep->Resize({keep_len});  // shrink to the number of kept boxes
+}
+
+template <class T>
+static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
+    const std::vector<T> &scores) {
+  std::vector<std::pair<T, int>> sorted_indices;
+  sorted_indices.reserve(scores.size());
+  for (size_t i = 0; i < scores.size(); ++i) {
+    sorted_indices.emplace_back(scores[i], i);
+  }
+  // Stable sort in ASCENDING score order: the best score ends up at back().
+  std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
+                   [](const std::pair<T, int> &a, const std::pair<T, int> &b) {
+                     return a.first < b.first;
+                   });
+  return sorted_indices;
+}
+
+// Area of box = [xmin, ymin, xmax, ymax].
+//   * inverted box (xmax < xmin or ymax < ymin): 0
+//   * normalized coordinates:                    w * h
+//   * otherwise:                                 (w + 1) * (h + 1)
+template <class T>
+static inline T BBoxArea(const T *box, bool normalized) {
+  const bool inverted = box[2] < box[0] || box[3] < box[1];
+  if (inverted) {
+    return static_cast<T>(0.);
+  }
+  const T width = box[2] - box[0];
+  const T height = box[3] - box[1];
+  if (!normalized) {
+    return (width + 1) * (height + 1);
+  }
+  return width * height;
+}
+
+template <typename T>
+static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
+                                    int selected_num) {
+  Tensor keep_nms;
+  keep_nms.Resize({selected_num});  // 1-D tensor of selected_num elements
+  auto *keep_data = keep_nms.mutable_data<T>();
+  for (int i = 0; i < selected_num; ++i) {
+    keep_data[i] = selected_indices[i];  // copies the first selected_num items
+  }
+  return keep_nms;
+}
+
+template <class T>
+static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
+  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
+      box2[3] < box1[1]) {
+    return static_cast<T>(0.);  // the boxes do not intersect
+  } else {
+    const T inter_xmin = std::max(box1[0], box2[0]);
+    const T inter_ymin = std::max(box1[1], box2[1]);
+    const T inter_xmax = std::min(box1[2], box2[2]);
+    const T inter_ymax = std::min(box1[3], box2[3]);
+    const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
+    const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
+    const T inter_area = inter_w * inter_h;  // +1 sizes, whatever the flag
+    const T bbox1_area = BBoxArea<T>(box1, normalized);
+    const T bbox2_area = BBoxArea<T>(box2, normalized);
+    return inter_area / (bbox1_area + bbox2_area - inter_area);  // IoU
+  }
+}
+
+// Greedy NMS over `bbox` ([num_boxes, box_size] rows, 4: [xmin ymin xmax
+// ymax]) ranked by `scores`. Returns the kept row indices, best score first.
+// At most `post_nms_num` boxes are kept; a non-positive value means no limit,
+// which matches how ProposalForOneImage treats post_nms_top_n <= 0.
+template <class T>
+static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
+                         float eta, int post_nms_num = 100) {
+  const int64_t num_boxes = bbox->dims()[0];
+  const int64_t box_size = bbox->dims()[1];
+  std::vector<T> scores_data(num_boxes);
+  std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
+  // Ascending by score, so the best remaining candidate is always at back().
+  std::vector<std::pair<T, int>> sorted_indices =
+      GetSortedScoreIndex<T>(scores_data);
+
+  std::vector<int> selected_indices;
+  int selected_num = 0;
+  T adaptive_threshold = nms_threshold;
+  const T *bbox_data = bbox->data<T>();
+  const bool unlimited = post_nms_num <= 0;
+  while (!sorted_indices.empty() &&
+         (unlimited || selected_num < post_nms_num)) {
+    const int idx = sorted_indices.back().second;
+    sorted_indices.pop_back();
+    // Accept the candidate only if its overlap with every kept box is small.
+    const T *candidate = bbox_data + idx * box_size;
+    const bool keep = std::all_of(
+        selected_indices.begin(), selected_indices.end(), [&](int kept_idx) {
+          return JaccardOverlap<T>(candidate, bbox_data + kept_idx * box_size,
+                                   false) <= adaptive_threshold;
+        });
+    if (keep) {
+      selected_indices.push_back(idx);
+      ++selected_num;
+      if (eta < 1 && adaptive_threshold > 0.5) adaptive_threshold *= eta;
+    }
+  }
+  return VectorToTensor(selected_indices, selected_num);
+}
+
+// Builds the proposals of one image: picks the pre_nms_top_n best scores,
+// decodes the matching anchors/deltas, clips and size-filters the boxes and
+// runs NMS. Returns {boxes [K, 4], scores [K, 1]}. `variances` is kept in the
+// signature but never read (BoxCoder is called with a null variance tensor).
+template <typename T>
+std::pair<Tensor, Tensor> ProposalForOneImage(
+    const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances,
+    const Tensor &bbox_deltas_slice,  // [M, 4]
+    const Tensor &scores_slice,       // [N, 1]
+    const Tensor &score_index, int pre_nms_top_n, int post_nms_top_n,
+    float nms_thresh, float min_size, float eta) {
+  auto *scores_data = scores_slice.data<T>();
+
+  // Candidate indices start as a copy of score_index and are then sorted (or
+  // just partitioned) so that the best scores come first.
+  Tensor index_t;
+  index_t.Resize({scores_slice.numel()});
+  int *index = index_t.mutable_data<int>();
+  std::memcpy(index, score_index.data<int32_t>(),
+              scores_slice.numel() * sizeof(int));
+
+  auto compare = [scores_data](int i, int j) {
+    return scores_data[i] > scores_data[j];
+  };
+
+  if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
+    std::sort(index, index + scores_slice.numel(), compare);
+  } else {
+    // Partition only: the first pre_nms_top_n entries are the best ones, in
+    // unspecified order.
+    std::nth_element(index, index + pre_nms_top_n, index + scores_slice.numel(),
+                     compare);
+    index_t.Resize({pre_nms_top_n});
+  }
+
+  Tensor scores_sel, bbox_sel, anchor_sel;
+  scores_sel.mutable_data<T>({index_t.numel(), 1});
+  bbox_sel.mutable_data<T>({index_t.numel(), 4});
+  anchor_sel.mutable_data<T>({index_t.numel(), 4});
+
+  CPUGather<T>(scores_slice, index_t, &scores_sel);
+  CPUGather<T>(bbox_deltas_slice, index_t, &bbox_sel);
+  CPUGather<T>(anchors, index_t, &anchor_sel);
+  Tensor proposals;
+  proposals.mutable_data<T>({index_t.numel(), 4});
+  BoxCoder<T>(&anchor_sel, &bbox_sel, nullptr, &proposals);
+
+  ClipTiledBoxes<T>(im_info_slice, &proposals);
+
+  Tensor keep;
+  FilterBoxes<T>(&proposals, min_size, im_info_slice, &keep);
+
+  // bbox_sel is reused from here on to hold the filtered proposals.
+  Tensor scores_filter;
+  bbox_sel.mutable_data<T>({keep.numel(), 4});
+  scores_filter.mutable_data<T>({keep.numel(), 1});
+
+  CPUGather<T>(proposals, keep, &bbox_sel);
+  CPUGather<T>(scores_sel, keep, &scores_filter);
+  if (nms_thresh <= 0) {
+    return std::make_pair(bbox_sel, scores_filter);
+  }
+
+  Tensor keep_nms =
+      NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta, post_nms_top_n);
+
+  if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
+    keep_nms.Resize({post_nms_top_n});
+  }
+
+  proposals.mutable_data<T>({keep_nms.numel(), 4});
+  scores_sel.mutable_data<T>({keep_nms.numel(), 1});
+  CPUGather<T>(bbox_sel, keep_nms, &proposals);
+  CPUGather<T>(scores_filter, keep_nms, &scores_sel);
+  return std::make_pair(proposals, scores_sel);
+}
+
+// Converts scores_ / bbox_deltas_ to fp32 through the bypass args prepared in
+// Init, runs ProposalForOneImage on the CPU for every batch item and packs the
+// results, together with their LoD offsets, into rpn_rois_ / rpn_probs_.
+template <>
+void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
+  auto input_score = param.scores_;
+  auto input_score_data = input_score->data<half>();
+  auto input_score_data_tmp = input_score->data<half>();
+  uint32_t score_n, score_height, score_width, score_channels;
+  auto input_bbox = param.bbox_deltas_;
+  auto input_bbox_data = input_bbox->data<half>();
+  auto input_bbox_data_tmp = input_bbox->data<half>();
+  uint32_t bbox_n, bbox_height, bbox_width, bbox_channels;
+
+  score_n = (uint32_t)(input_score->dims()[0]);
+  score_channels = (uint32_t)(input_score->dims()[1]);
+  score_height = (uint32_t)(input_score->dims()[2]);
+  score_width = (uint32_t)(input_score->dims()[3]);
+  bbox_n = (uint32_t)(input_bbox->dims()[0]);
+  bbox_channels = (uint32_t)(input_bbox->dims()[1]);
+  bbox_height = (uint32_t)(input_bbox->dims()[2]);
+  bbox_width = (uint32_t)(input_bbox->dims()[3]);
+
+  std::shared_ptr<Tensor> score_tmp = std::make_shared<Tensor>();
+  score_tmp->Resize(param.scores_->dims());
+  score_tmp->mutable_data<half>();
+  std::shared_ptr<Tensor> bbox_tmp = std::make_shared<Tensor>();
+  bbox_tmp->Resize(param.bbox_deltas_->dims());
+  bbox_tmp->mutable_data<half>();
+
+  auto score_tmp_data = score_tmp->data<half>();
+  auto bbox_tmp_data = bbox_tmp->data<half>();
+  int64_t amount_per_side = score_width * score_height;
+  int idx = 0;
+  fpga::fpga_invalidate(
+      input_score_data_tmp,
+      score_height * score_width * score_channels * sizeof(half));
+  for (int h = 0; h < score_height; h++) {
+    for (int w = 0; w < score_width; w++) {
+      for (int c = 0; c < score_channels; c++) {
+        idx++;
+        // HWC -> CHW copy into score_tmp. NOTE(review): score_tmp is only
+        // referenced by temp_score_arg, which is never passed to the bypass.
+        *(score_tmp_data + c * amount_per_side + score_width * h + w) =
+            (*(input_score_data_tmp++));
+      }
+    }
+  }
+  amount_per_side = bbox_width * bbox_height;
+  fpga::fpga_invalidate(input_bbox_data_tmp, bbox_height * bbox_width *
+                                                 bbox_channels * sizeof(half));
+  for (int h = 0; h < bbox_height; h++) {
+    for (int w = 0; w < bbox_width; w++) {
+      for (int c = 0; c < bbox_channels; c++) {
+        idx++;
+        // Same HWC -> CHW copy for the deltas; bbox_tmp is likewise only
+        // referenced by the unused temp_bbox_arg.
+        *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
+            (*(input_bbox_data_tmp++));
+      }
+    }
+  }
+  struct paddle_mobile::fpga::BypassArgs temp_score_arg;
+  struct paddle_mobile::fpga::BypassArgs temp_bbox_arg;
+  temp_score_arg = param.score_arg;
+  temp_score_arg.image.address = score_tmp->data<half>();
+
+  temp_bbox_arg = param.bbox_arg;
+  temp_bbox_arg.image.address = bbox_tmp->data<half>();
+  auto score_tensor = param.float_score.get();
+  fpga::PerformBypass(param.score_arg);
+  fpga::fpga_invalidate(score_tensor->data<float>(),
+                        score_tensor->numel() * sizeof(float));
+
+  auto bbox_tensor = param.float_bbox.get();
+  fpga::PerformBypass(param.bbox_arg);
+  fpga::fpga_invalidate(bbox_tensor->data<float>(),
+                        bbox_tensor->numel() * sizeof(float));
+
+  auto *scores = param.float_score.get();
+  auto *bbox_deltas = param.float_bbox.get();
+  auto *im_info = param.im_info_;
+  auto anchors = *param.anchors_;
+  auto variances = *param.variances_;
+
+  auto *rpn_rois = param.rpn_rois_;
+  auto *rpn_roi_probs = param.rpn_probs_;
+
+  auto score_index = *(param.score_index_.get());
+
+  int pre_nms_top_n = param.pre_nms_topn_;
+  int post_nms_top_n = param.post_nms_topn_;
+
+  // NOTE(review): the configured NMS threshold is halved here - confirm.
+  float nms_thresh = param.nms_thresh_ / 2.0f;
+  float min_size = param.min_size_;
+  float eta = param.eta_;
+
+  auto &scores_dim = scores->dims();
+  int64_t num = scores_dim[0];
+  int64_t c_score = scores_dim[1];
+  int64_t h_score = scores_dim[2];
+  int64_t w_score = scores_dim[3];
+
+  auto &bbox_dim = bbox_deltas->dims();
+  int64_t c_bbox = bbox_dim[1];
+  int64_t h_bbox = bbox_dim[2];
+  int64_t w_bbox = bbox_dim[3];
+
+  // Worst-case output: one proposal per anchor, i.e. numel / 4 boxes of 4.
+  rpn_rois->mutable_data<float>({bbox_deltas->numel() / 4, 4});
+  rpn_roi_probs->mutable_data<float>({scores->numel(), 1});
+
+  framework::LoD lod;
+  lod.resize(1);
+  auto &lod0 = lod[0];
+  lod0.push_back(0);
+  anchors.Resize({anchors.numel() / 4, 4});
+  variances.Resize({variances.numel() / 4, 4});
+
+  int64_t num_proposals = 0;
+  for (int64_t i = 0; i < num; ++i) {
+    Tensor im_info_slice = im_info->Slice(i, i + 1);
+    Tensor bbox_deltas_slice = (*bbox_tensor).Slice(i, i + 1);
+    Tensor scores_slice = (*score_tensor).Slice(i, i + 1);
+
+    bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
+    scores_slice.Resize({h_score * w_score * c_score, 1});
+
+    std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
+        im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
+        score_index, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
+    Tensor &proposals = tensor_pair.first;
+    Tensor &scores = tensor_pair.second;
+
+    AppendProposals(rpn_rois, 4 * num_proposals, proposals);
+    AppendProposals(rpn_roi_probs, num_proposals, scores);
+    num_proposals += proposals.dims()[0];
+    lod0.push_back(num_proposals);
+  }
+  rpn_rois->set_lod(lod);
+  rpn_roi_probs->set_lod(lod);
+  rpn_rois->Resize({num_proposals, 4});
+  rpn_roi_probs->Resize({num_proposals, 1});
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // PROPOSAL_OP
diff --git a/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
new file mode 100644
index 0000000000..7e0852ca4b
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
@@ -0,0 +1,284 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PSROI_POOL_OP
+
+#include <cmath>
+#include <vector>
+#include "operators/kernel/detection_kernel.h"
+
+#include "fpga/V1/api.h"
+#include "fpga/V1/image.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
+  auto dims = param->input_x_->dims();
+  PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
+                        "data not aligned");
+
+  param->float_input = std::make_shared<Tensor>();
+  param->float_input->mutable_data<float>(param->input_x_->dims());
+  // param->float_output = std::make_shared<Tensor>();
+
+  auto input = param->input_x_;
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = input->data<half>();
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.output.address = param->float_input->mutable_data<float>();
+  args.output.scale_address = param->float_input->scale;
+  param->input_arg = args;
+
+  auto* rois = param->input_rois_;
+  int rois_num = rois->dims()[0];
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, param->output_->dims()[1], param->output_->dims()[2],
+       param->output_->dims()[3]});
+  param->output_->Resize(dims_out_new);
+  // fpga::format_fp16_ofm(param->output_);
+
+  param->output_->mutable_data<float>(dims_out_new);
+  //  auto output = param->float_output.get();
+  // param->output_ = output;
+  /* args.input_data_type = fpga::DATA_TYPE_FP32;
+   args.output_data_type = fpga::DATA_TYPE_FP16;
+   args.image.address = output->data<float>();
+   args.image.height = (uint32_t)output->dims()[2];
+   args.image.width = (uint32_t)output->dims()[3];
+   args.image.channels = (uint32_t)output->dims()[1]  ;
+   args.output.address = param->output_->mutable_data<half>();
+   args.output.scale_address = param->output_->scale;
+   param->output_arg = args;*/
+
+  return true;
+}
+
+/*
+    template <typename Dtype>
+    void PSROIPoolingForward(
+    const Dtype* bottom_data,
+    const int height, const int width, const int input_channel,
+    Dtype* top_data,
+    const int pooled_height, const int pooled_width, const int output_channel,
+    const Dtype* bottom_rois,
+    const Dtype Bin_size_h, const Dtype Bin_size_w, const Dtype roi_start_h,
+   const Dtype roi_start_w, const int pw, const int ph, const int roi_batch_ind)
+    {
+
+      int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
+      int wstart = floor(static_cast<Dtype>(pw)* Bin_size_w + roi_start_w);
+      int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
+      int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
+
+      hstart = std::min(std::max(hstart, 0), height);
+      hend = std::min(std::max(hend, 0), height);
+      wstart = std::min(std::max(wstart, 0), width);
+      wend = std::min(std::max(wend, 0), width);
+      bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+      float32x4_t sum_pixels_low_c= vdupq_n_f32(0);
+      float32x4_t sum_pixels_high_c= vdupq_n_f32(0);
+
+      if(!is_empty){
+          Dtype bin_area = (hend - hstart) * (wend - wstart);
+          float rev_bin_area = 1 / bin_area;
+          float32x4_t q_bin_area = vdupq_n_f32(rev_bin_area);
+   //static_cast<float>(bin_area) float pixels_c[output_channel];
+
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+                int pixel_offset = (h * width + w) * input_channel;
+                for(int output_c = 0; output_c < output_channel; output_c++){
+                    int input_channel_offset = output_c * pooled_height *
+   pooled_width; int input_bias = pixel_offset + input_channel_offset + ph *
+   pooled_width + pw; pixels_c[output_c] = bottom_data[input_bias];
+                }
+                float32x4_t pixel_low_c = vld1q_f32(pixels_c);
+                float32x4_t pixel_high_c = vld1q_f32(pixels_c + 4);
+                sum_pixels_low_c = vaddq_f32(sum_pixels_low_c, pixel_low_c);
+                sum_pixels_high_c = vaddq_f32(sum_pixels_high_c, pixel_high_c);
+            }
+          }
+          sum_pixels_low_c = vmulq_f32(sum_pixels_low_c, q_bin_area);
+          sum_pixels_high_c = vmulq_f32(sum_pixels_high_c, q_bin_area);
+        }
+
+      int output_index_base = (ph * pooled_width + pw) * output_channel;
+      top_data += output_index_base;
+      vst1q_f32(top_data, sum_pixels_low_c);
+      top_data += 4;
+      vst1q_f32(top_data, sum_pixels_high_c);
+    }*/
+
+// Average-pools one (ph, pw) bin of a position-sensitive ROI from an HWC
+// feature map and writes output_channel values into that bin's slot of
+// top_data. bottom_rois and roi_batch_ind are accepted but not read here.
+template <typename Dtype>
+void PSROIPoolingForward(const Dtype* bottom_data, const int height,
+                         const int width, const int input_channel,
+                         Dtype* top_data, const int pooled_height,
+                         const int pooled_width, const int output_channel,
+                         const Dtype* bottom_rois, const Dtype Bin_size_h,
+                         const Dtype Bin_size_w, const Dtype roi_start_h,
+                         const Dtype roi_start_w, const int pw, const int ph,
+                         const int roi_batch_ind) {
+  int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
+  int wstart = floor(static_cast<Dtype>(pw) * Bin_size_w + roi_start_w);
+  int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
+  int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
+
+  // Add roi offsets and clip to input boundaries
+  hstart = std::min(std::max(hstart, 0), height);
+  hend = std::min(std::max(hend, 0), height);
+  wstart = std::min(std::max(wstart, 0), width);
+  wend = std::min(std::max(wend, 0), width);
+  bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+  // std::vector instead of the former variable-length arrays (non-standard
+  // C++); an empty bin leaves every channel at zero.
+  std::vector<Dtype> sum_pixels_c(output_channel, static_cast<Dtype>(0));
+  if (!is_empty) {
+    Dtype bin_area = (hend - hstart) * (wend - wstart);
+    Dtype rec_bin_area = 1 / bin_area;
+    const int bin_offset = ph * pooled_width + pw;  // same for every channel
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int pixel_offset = (h * width + w) * input_channel;
+        for (int output_c = 0; output_c < output_channel; output_c++) {
+          int input_channel_offset = output_c * pooled_height * pooled_width;
+          int input_bias = pixel_offset + input_channel_offset + bin_offset;
+          sum_pixels_c[output_c] += bottom_data[input_bias];
+        }
+      }
+    }
+    // Turn the per-channel sums into averages over the bin area.
+    for (int output_c = 0; output_c < output_channel; output_c++) {
+      sum_pixels_c[output_c] *= rec_bin_area;
+    }
+  }
+
+  // Output is indexed (ph, pw, channel); size the copy by the element type.
+  top_data += (ph * pooled_width + pw) * output_channel;
+  memcpy(top_data, sum_pixels_c.data(), output_channel * sizeof(Dtype));
+}
+
+template <>
+void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
+  auto input_tensor = param.float_input.get();
+  fpga::PerformBypass(param.input_arg);
+  fpga::fpga_invalidate(input_tensor->data<float>(),
+                        input_tensor->numel() * sizeof(float));
+
+  auto* in = input_tensor;
+  auto* rois = param.input_rois_;
+  auto* out = param.output_;  // param.float_output.get();
+
+  auto pooled_height = param.pooled_height_;
+  auto pooled_width = param.pooled_width_;
+  auto spatial_scale = param.spatial_scale_;
+  auto output_channels = param.output_channels_;
+
+  auto in_dims = in->dims();
+  int batch_size = in_dims[0];
+  int input_channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  int rois_num = rois->dims()[0];
+
+  auto data_nhwc = in->mutable_data<float>();
+
+  //  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
+       (param.output_)->dims()[3]});
+
+  (param.output_)->Resize(dims_out_new);
+
+  const float* input_data = data_nhwc;  // in->data<float>();
+  framework::Tensor rois_batch_id_list;
+  rois_batch_id_list.Resize({rois_num});
+  auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
+
+  PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");
+
+  auto rois_lod = rois->lod().back();
+  int rois_batch_size = rois_lod.size() - 1;
+  PADDLE_MOBILE_ENFORCE(
+      rois_batch_size == batch_size,
+      "the rois_batch_size and input(X) batch_size should be the same.");
+  int rois_num_with_lod = rois_lod[rois_batch_size];
+  PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num,
+                        "the rois_num from input and lod must be the same");
+
+  PADDLE_MOBILE_ENFORCE(
+      input_channels == output_channels * pooled_height * pooled_width,
+      "the channels of input X should equal the product of "
+      "output_channels x pooled_height x pooled_width");
+
+  // calculate batch id index for each roi according to LoD
+  for (int n = 0; n < rois_batch_size; ++n) {
+    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+      rois_batch_id_data[i] = n;
+    }
+  }
+  auto output_data = out->mutable_data<float>();
+  auto input_rois = rois->data<float>();
+
+  for (int n = 0; n < rois_num; ++n) {
+    auto offset_input_rois = input_rois + n * 4;
+    auto offset_output_data =
+        output_data + pooled_height * pooled_width * output_channels * n;
+
+    auto roi_start_w =
+        static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
+    auto roi_start_h =
+        static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
+    auto roi_end_w =
+        static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+    auto roi_end_h =
+        static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+    // Force too small rois to be 1 x 1
+    auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f);  // avoid 0
+    auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
+
+    // Compute bin size w and h at input feature map
+    auto bin_size_h = roi_height / static_cast<float>(pooled_height);
+    auto bin_size_w = roi_width / static_cast<float>(pooled_width);
+
+    int roi_batch_ind = rois_batch_id_data[n];
+
+    for (int ph = 0; ph < pooled_height; ph++) {
+      for (int pw = 0; pw < pooled_width; pw++) {
+        PSROIPoolingForward<float>(input_data, height, width, input_channels,
+                                   offset_output_data, pooled_height,
+                                   pooled_width, output_channels, input_rois,
+                                   bin_size_h, bin_size_w, roi_start_h,
+                                   roi_start_w, pw, ph, roi_batch_ind);
+      }
+    }
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // PSROI_POOL_OP
diff --git a/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp b/src/operators/kernel/fpga/V2/relu_kernel.cpp
similarity index 70%
rename from src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp
rename to src/operators/kernel/fpga/V2/relu_kernel.cpp
index bf3556609a..6fff10f620 100644
--- a/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/relu_kernel.cpp
@@ -12,24 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef FUSION_DECONVRELU_OP
+#ifdef RELU_OP
 
-#include "operators/kernel/deconv_relu_kernel.h"
-#include "framework/operator.h"
-#include "operators/op_param.h"
+#include "operators/kernel/activation_kernel.h"
 
 namespace paddle_mobile {
 namespace operators {
 
 template <>
-bool DeconvReluKernel<FPGA, float>::Init(FusionDeconvReluParam<FPGA> *param) {
+bool ReluKernel<FPGA, float>::Init(ReluParam<FPGA> *param) {
+  param->Out()->ShareDataWith(*param->InputX());
   return true;
 }
 
 template <>
-void DeconvReluKernel<FPGA, float>::Compute(
-    const FusionDeconvReluParam<FPGA> &param) {}
-
+void ReluKernel<FPGA, float>::Compute(const ReluParam<FPGA> &param) {}
 }  // namespace operators
 }  // namespace paddle_mobile
 
diff --git a/src/operators/kernel/fpga/V2/reshape2_kernel.cpp b/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
new file mode 100644
index 0000000000..647ecb5a65
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
@@ -0,0 +1,127 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE2_OP
+
+#include "operators/kernel/reshape2_kernel.h"
+#include "framework/ddim.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
+  auto input = const_cast<LoDTensor *>(param->InputX());
+  auto output = param->Out();
+  auto shape = param->Shape();
+
+  auto num_in = framework::product(input->dims());
+  auto num_shape = framework::product(framework::make_ddim(shape));
+  PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");
+
+  for (int i = 0; i < shape.size(); i++) {
+    if (shape[i] == -1) {
+      shape[i] = static_cast<int>(-num_in / num_shape);
+      break;
+    }
+  }
+  output->Resize(framework::make_ddim(shape));
+  output->set_type(input->type());
+  fpga::format_ofm(output);
+  DLOG << "input: " << input;
+  DLOG << "output: " << output;
+
+  return true;
+}
+
+void reshape(LoDTensor *input, LoDTensor *output) {
+  // Subscript r means after reshape
+
+  auto input_ptr = input->data<half>();
+  auto output_ptr = output->data<half>();
+  output->scale[0] = input->scale[0];
+  output->scale[1] = input->scale[1];
+
+  auto C = static_cast<int>(input->dims()[1]);
+  auto H = static_cast<int>(input->dims()[2]);
+  auto W = static_cast<int>(input->dims()[3]);
+  auto Cr = static_cast<int>(output->dims()[1]);
+  auto Hr = static_cast<int>(output->dims()[2]);
+  auto Wr = static_cast<int>(output->dims()[3]);
+  PADDLE_MOBILE_ENFORCE(C * H * W == Cr * Hr * Wr, "Dims don't match");
+  auto WC = W * C;
+  auto WC_align = fpga::align_to_x(WC, IMAGE_ALIGNMENT);
+  auto HW = H * W;
+  auto WCr = Wr * Cr;
+  auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT);
+  auto HWr = Hr * Wr;
+
+  fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(half));
+
+  int offset_align = 0;
+  int offset_r = 0, offset_align_r = 0;
+  int cr = 0, hr = 0, wr = 0;
+
+  for (int h = 0; h < H; h++) {
+    int offset0 = h * WC_align;
+    for (int w = 0; w < W; w++) {
+      int offset1 = w * C + offset0;
+      for (int c = 0; c < C; c++) {
+        offset_align = offset1 + c;
+        offset_r = c * HW + h * W + w;
+        cr = offset_r / HWr;
+        hr = offset_r % HWr / Wr;
+        wr = offset_r % Wr;
+        offset_align_r = hr * WCr_align + wr * Cr + cr;
+        output_ptr[offset_align_r] = input_ptr[offset_align];
+      }
+    }
+  }
+
+  fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(half));
+}
+
+template <>
+void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
+  auto input = const_cast<LoDTensor *>(param.InputX());
+  auto output = param.Out();
+  auto shape = param.Shape();
+
+  auto num_in = framework::product(input->dims());
+  auto num_shape = framework::product(framework::make_ddim(shape));
+  PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");
+
+  for (int i = 0; i < shape.size(); i++) {
+    if (shape[i] == -1) {
+      shape[i] = static_cast<int>(-num_in / num_shape);
+      break;
+    }
+  }
+  output->Resize(framework::make_ddim(shape));
+  if (output->dims() == input->dims()) {
+    DLOG << "No need to reshape";
+    output->ShareDataWith(*input);
+    framework::LoD lod = input->lod();
+    output->set_lod(lod);
+    return;
+  }
+
+  reshape(input, output);
+  //
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/reshape_kernel.cpp b/src/operators/kernel/fpga/V2/reshape_kernel.cpp
new file mode 100644
index 0000000000..5e01bb74ba
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/reshape_kernel.cpp
@@ -0,0 +1,40 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE_OP
+
+#include "operators/kernel/reshape_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ReshapeKernel<FPGA, float>::Init(ReshapeParam<FPGA> *param) {
+  param->Out()->ShareDataWith(*param->InputX());
+  const int in_n = param->InputX()->dims()[0];
+  const int in_c = param->InputX()->dims()[1];
+  const int in_h = param->InputX()->dims()[2];
+  const int in_w = param->InputX()->dims()[3];
+  auto out = param->Out();
+  out->Resize(framework::make_ddim({in_n, in_c * in_h * in_w}));
+  return true;
+}
+
+template <>
+void ReshapeKernel<FPGA, float>::Compute(const ReshapeParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp b/src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp
new file mode 100644
index 0000000000..ec8d19db80
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp
@@ -0,0 +1,296 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ROIALIGN_POOL_OP
+
+#include <cmath>
+#include <vector>
+#include "operators/kernel/detection_kernel.h"
+
+#include "fpga/V1/api.h"
+#include "fpga/V1/image.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool RoiAlignPoolKernel<FPGA, float>::Init(RoiAlignPoolParam<FPGA>* param) {
+  auto dims = param->input_x_->dims();
+  PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
+                        "data not aligned");
+
+  param->float_input = std::make_shared<Tensor>();
+  param->float_input->mutable_data<float>(param->input_x_->dims());
+
+  auto input = param->input_x_;
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = input->data<half>();
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.output.address = param->float_input->mutable_data<float>();
+  args.output.scale_address = param->float_input->scale;
+  param->input_arg = args;
+
+  auto* rois = param->input_rois_;
+  int rois_num = rois->dims()[0];
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, param->output_->dims()[1], param->output_->dims()[2],
+       param->output_->dims()[3]});
+  param->output_->Resize(dims_out_new);
+
+  param->output_->mutable_data<float>(dims_out_new);
+
+  return true;
+}
+
+template <typename T>
+struct PreCalc {
+  int pos1;
+  int pos2;
+  int pos3;
+  int pos4;
+  T w1;
+  T w2;
+  T w3;
+  T w4;
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int iy_upper, const int ix_upper,
+    T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
+    int roi_bin_grid_h, int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc) {  // NOLINT
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        const T yy = roi_start_h + ph * bin_size_h +
+                     static_cast<T>(iy + .5f) * bin_size_h /
+                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
+        for (int ix = 0; ix < ix_upper; ix++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+                       static_cast<T>(ix + .5f) * bin_size_w /
+                           static_cast<T>(roi_bin_grid_w);
+
+          T x = xx;
+          T y = yy;
+          // deal with: inverse elements are out of feature map boundary
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc[pre_calc_index] = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = static_cast<int>(y);
+          int x_low = static_cast<int>(x);
+          int y_high;
+          int x_high;
+
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indeces
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc[pre_calc_index] = pc;
+
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void ROIAlignForward(const int nthreads, const T* bottom_data,
+                     const T& spatial_scale, const int channels,
+                     const int height, const int width, const int pooled_height,
+                     const int pooled_width, const int sampling_ratio,
+                     const T* bottom_rois, T* top_data) {
+  int n_rois = nthreads / channels / pooled_width / pooled_height;
+
+  for (int n = 0; n < n_rois; n++) {
+    int index_n = n * channels * pooled_width * pooled_height;
+
+    // each roi is read as 4 consecutive values; the batch index stays 0
+    const T* offset_bottom_rois = bottom_rois + n * 4;
+    int roi_batch_ind = 0;
+    // if (roi_cols == 5) {
+    // roi_batch_ind = offset_bottom_rois[0];
+    // offset_bottom_rois++;
+    // }
+
+    // Do not use rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+    // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
+    // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
+    // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
+    // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+                             ? sampling_ratio
+                             : ceil(roi_height / pooled_height);  // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
+
+    // we want to precalculate indices and weights shared by all channels,
+    // this is the key point of optimization
+    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
+                                     pooled_width * pooled_height);
+    pre_calc_for_bilinear_interpolate(
+        height, width, pooled_height, pooled_width, roi_bin_grid_h,
+        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
+        roi_bin_grid_h, roi_bin_grid_w, pre_calc);
+
+    for (int c = 0; c < channels; c++) {
+      int index_n_c = index_n + c * pooled_width * pooled_height;
+      const T* offset_bottom_data =
+          bottom_data + (roi_batch_ind * channels + c) * height * width;
+      int pre_calc_index = 0;
+
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          int index = index_n_c + ph * pooled_width + pw;
+
+          T output_val = 0.;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              PreCalc<T> pc = pre_calc[pre_calc_index];
+              output_val += pc.w1 * offset_bottom_data[pc.pos1] +
+                            pc.w2 * offset_bottom_data[pc.pos2] +
+                            pc.w3 * offset_bottom_data[pc.pos3] +
+                            pc.w4 * offset_bottom_data[pc.pos4];
+
+              pre_calc_index += 1;
+            }
+          }
+          output_val /= count;
+
+          top_data[index] = output_val;
+        }  // for pw
+      }    // for ph
+    }      // for c
+  }        // for n
+}
+
+// Converts the FP16 input to FP32 via the FPGA bypass configured in Init, runs
+// ROI Align on the CPU over CHW data and converts the result back to HWC.
+template <>
+void RoiAlignPoolKernel<FPGA, float>::Compute(
+    const RoiAlignPoolParam<FPGA>& param) {
+  auto input_tensor = param.float_input.get();
+  fpga::PerformBypass(param.input_arg);
+  fpga::fpga_invalidate(input_tensor->data<float>(),
+                        input_tensor->numel() * sizeof(float));
+
+  auto* in = input_tensor;
+  auto* rois = param.input_rois_;
+  auto* out = param.output_;  // param.float_output.get();
+  auto pooled_height = param.pooled_height_;
+  auto pooled_width = param.pooled_width_;
+  auto spatial_scale = param.spatial_scale_;
+  auto sampe_ratio = param.sampling_ratio_;
+
+  auto in_dims = in->dims();
+  int batch_size = in_dims[0];
+  int input_channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  int rois_num = rois->dims()[0];
+
+  auto data_nhwc = in->mutable_data<float>();
+
+  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
+       (param.output_)->dims()[3]});
+  (param.output_)->Resize(dims_out_new);
+  const int index = input_channels * pooled_height * pooled_width * rois_num;
+  auto rois_data = rois->data<float>();
+  auto top_data = param.output_->mutable_data<float>();
+  // ROIAlignForward walks every ROI, channel and bin by itself, so one call
+  // fills the whole output; `index` is the total output element count.
+  ROIAlignForward<float>(index, data_nhwc, spatial_scale, input_channels,
+                         height, width, pooled_height, pooled_width,
+                         sampe_ratio, rois_data, top_data);
+
+  fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height,
+                              pooled_width, rois_num);
+  out->reset_data_ptr(top_data);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // ROIALIGN_POOL_OP
diff --git a/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp b/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp
new file mode 100644
index 0000000000..e61f00a09a
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp
@@ -0,0 +1,55 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SIGMOID_OP
+
+#include "operators/kernel/activation_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::SIGMOID;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<LoDTensor *>(param->InputX());
+  auto input_ptr = input->data<half>();
+  auto out = param->Out();
+  fpga::format_fp16_ofm(out);
+
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP16;
+  args.image.address = input_ptr;
+  args.image.height = 1;
+  args.image.width = 1;
+  args.image.channels = input->fpga_data_num;
+  args.output.address = out->data<half>();
+  args.output.scale_address = out->scale;
+  args.output.activation.activation_type = activation_enable;
+  args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
+  param->SetFpgaArgs(args);
+  return true;
+}
+
+template <>
+void SigmoidKernel<FPGA, float>::Compute(const SigmoidParam<FPGA> &param) {
+  fpga::PerformBypass(param.FpgaArgs());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/slice_kernel.cpp b/src/operators/kernel/fpga/V2/slice_kernel.cpp
index bc3fbfd796..2fd6ef542e 100644
--- a/src/operators/kernel/fpga/V2/slice_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp
@@ -18,13 +18,46 @@ limitations under the License. */
 
 namespace paddle_mobile {
 namespace operators {
+
 template <>
 bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
+  fpga::format_fp16_ofm(param->output_);
+  DLOG << "input: " << param->input_;
+  DLOG << "output: " << param->output_;
+  // A non-half input is only reported in the debug log; Init still succeeds.
+  if (param->input_->type() != type_id<half>()) {
+    DLOG << "wrong type";
+  }
   return true;
 }
 template <>
-void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
+void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
+  // Copies channels [start, end) of every pixel. Only channel-dimension
+  // slicing of half data is supported and W must be aligned to 16. Negative
+  // bounds count from the end; bounds are clamped to [0, channel].
+
+  auto input = param.input_;
+  auto output = param.output_;
+  int HW = input->dims()[2] * input->dims()[3];
+  int channel = input->dims()[1];
+  auto input_ptr = input->data<half>();
+  auto output_ptr = output->data<half>();
+
+  output->scale[0] = input->scale[0];
+  output->scale[1] = input->scale[1];
 
+  int start = param.starts_[0], end = param.ends_[0];
+  start = start < 0 ? start + channel : start;
+  end = end < 0 ? end + channel : end;
+  start = start < 0 ? 0 : (start > channel ? channel : start);
+  end = end < 0 ? 0 : (end > channel ? channel : end);
+  int len = end - start;
+  if (len <= 0) return;  // a negative len would wrap to a huge size_t below
+  size_t size = len * sizeof(half);
+  for (int i = 0; i < HW; i++) {
+    memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
+  }
+}
 }  // namespace operators
 }  // namespace paddle_mobile
 #endif
diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
index 5232364ac2..ba86787c64 100755
--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
@@ -16,45 +16,100 @@ limitations under the License. */
 
 #include "operators/kernel/softmax_kernel.h"
 #include "operators/kernel/central-arm-func/softmax_arm_func.h"
+
 namespace paddle_mobile {
 namespace operators {
 
 template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
-  auto input = const_cast<Tensor *>(param->InputX());
-  auto input_ptr = input->data<float>();
-  auto float_input = new Tensor;
-  float_input->mutable_data<float>({1, input->dims()[1]});
-  fpga::format_fp32_ofm(float_input, 1024);
-
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-  args.input_layout_type = fpga::LAYOUT_HWC;
-  args.output_layout_type = fpga::LAYOUT_CHW;
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP32;
-  args.image.address = input_ptr;
-  args.image.height = 1;
-  args.image.width = 1;
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = float_input->data<float>();
-  args.output.scale_address = float_input->scale;
-  param->SetFloatInput(float_input);
-  param->SetFpgaArgs(args);
+  // Configures the bypass pass: channel == 2 requests SOFTMAX on the FPGA, any
+  // other channel count only converts fp16 -> fp32 for a CPU softmax.
+  auto input = const_cast<LoDTensor *>(param->InputX());
+  auto dims = framework::vectorize(input->dims());
+  half *input_ptr = nullptr;  // stays null when the input is float
+  auto out = param->Out();
+  if (input->type() == type_id<float>()) {
+    out->Resize(framework::make_ddim(dims));
+    out->mutable_data<float>(framework::make_ddim(dims));
+  } else {
+    input_ptr = input->data<half>();
+  }
+
+  PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
+                        "Softmax should have 4-order input");
+
+  auto channel = dims[3];
+  if (channel == 1) {  // This input is generated by FC op, dims = [N C 1 1]
+    PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op");
+    dims[3] = dims[1];
+    dims[1] = 1;
+  }
+  input->Resize(framework::make_ddim(dims));
+
+  if (channel != 2) {  // Use CPU
+    // fp32 staging tensor: created only on the path that passes it to param.
+    auto float_input = new LoDTensor;
+    float_input->Resize(framework::make_ddim(dims));
+    out->Resize(framework::make_ddim(dims));
+    out->mutable_data<float>(framework::make_ddim(dims));
+    float_input->init(type_id<float>().hash_code());
+    float_input->mutable_data<float>(framework::make_ddim(dims));
+
+    fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+    args.input_layout_type = fpga::LAYOUT_HWC;
+    args.output_layout_type = fpga::LAYOUT_CHW;
+    args.input_data_type = fpga::DATA_TYPE_FP16;
+    args.output_data_type = fpga::DATA_TYPE_FP32;
+    args.image.address = input_ptr;
+    args.image.height = (uint32_t)dims[1] * dims[0];
+    args.image.width = (uint32_t)dims[2];
+    args.image.channels = (uint32_t)dims[3];
+    args.output.address = float_input->data<float>();
+    args.output.scale_address = float_input->scale;
+    param->SetFloatInput(float_input);
+    param->SetFpgaArgs(args);
+  } else {  // Use FPGA
+    fpga::format_fp16_ofm(out);
+    fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+    args.input_layout_type = fpga::LAYOUT_HWC;
+    args.output_layout_type = fpga::LAYOUT_CHW;
+    args.input_data_type = fpga::DATA_TYPE_FP16;
+    args.output_data_type = fpga::DATA_TYPE_FP16;
+    args.image.address = input_ptr;
+    args.image.height = (uint32_t)input->dims()[1];
+    args.image.width = (uint32_t)input->dims()[2];
+    args.image.channels = (uint32_t)input->dims()[3];
+    args.output.address = out->data<half>();
+    args.output.scale_address = out->scale;
+    args.output.activation.activation_type = fpga::SOFTMAX;
+    param->SetFpgaArgs(args);
+  }
+
   return true;
 }
 
 template <>
 void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
-  Tensor *in_x = param.FloatInput();
-  Tensor *out = param.Out();
-
-  fpga::PerformBypass(param.FpgaArgs());
-  fpga::fpga_invalidate(
-      (void *)in_x->data<float>(),                           // NOLINT
-      fpga::get_aligned_channel_num((int)in_x->dims()[1]) *  // NOLINT
-          sizeof(float));
-  math::SoftmaxFuntor<CPU, float>()(in_x, out);
-  fpga::fpga_flush(out->data<float>(), out->memory_size());
+  // Half input is bypassed on the FPGA; CPU softmax runs unless SOFTMAX is set.
+  auto *in_x = (param.InputX());
+  const bool softmax_on_cpu =
+      param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX;
+  if (in_x->type() == type_id<half>()) {
+    fpga::PerformBypass(param.FpgaArgs());
+    if (!softmax_on_cpu) return;
+    Tensor *out = param.Out();
+    Tensor *staged = param.FloatInput();
+    fpga::fpga_invalidate(staged->data<float>(),
+                          staged->numel() * sizeof(float));
+    math::SoftmaxFuntor<CPU, float>()(staged, out);
+    fpga::fpga_flush(out->data<float>(), out->memory_size());
+    return;
+  }
+  if (!softmax_on_cpu) return;
+  Tensor *out = param.Out();
+  out->Resize(
+      {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
+  math::SoftmaxFuntor<CPU, float>()(in_x, out);
 }
 
 }  // namespace operators
diff --git a/src/operators/kernel/fpga/V2/split_kernel.cpp b/src/operators/kernel/fpga/V2/split_kernel.cpp
index faa1da9186..584cb41fb3 100644
--- a/src/operators/kernel/fpga/V2/split_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/split_kernel.cpp
@@ -19,11 +19,55 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 template <>
-bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA>* param) {
+bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
+  // Builds SplitArgs: one fp16 output image per section, split on channels.
+  auto *in = const_cast<LoDTensor *>(param->InputX());
+  auto outs = param->Outs();
+  auto sections = param->Sections();
+  int axis = param->Axis();
+  PADDLE_MOBILE_ENFORCE(axis == 1, "Only support split in channel dimension");
+  PADDLE_MOBILE_ENFORCE(outs.size() == sections.size(),
+                        "Output number should be equal to section number");
+  auto image_num = (uint32_t)outs.size();
+  auto out_images =
+      reinterpret_cast<void **>(fpga::fpga_malloc(image_num * sizeof(void *)));
+  auto out_scales = reinterpret_cast<float **>(
+      fpga::fpga_malloc(image_num * sizeof(float *)));
+  auto out_channel_nums = reinterpret_cast<uint32_t *>(
+      fpga::fpga_malloc(image_num * sizeof(uint32_t)));
+  DLOG << "input: " << in;
+  for (uint32_t idx = 0; idx < image_num; ++idx) {
+    fpga::format_fp16_ofm(outs[idx]);
+    DLOG << "output: " << outs[idx];
+    out_images[idx] = outs[idx]->mutable_data<half>();
+    out_scales[idx] = outs[idx]->scale;
+    out_channel_nums[idx] = (uint32_t)sections[idx];
+  }
+
+  fpga::SplitArgs arg = {0};
+  arg.image_num = image_num;
+  arg.image_in = in->data<half>();
+  arg.scale_in = in->scale;
+  arg.images_out = out_images;
+  arg.scales_out = out_scales;
+  arg.out_channel_nums = out_channel_nums;
+  arg.height = (uint32_t)in->dims()[2];
+  arg.width = (uint32_t)in->dims()[3];
+  // Each table goes into arg with fpga_free as the shared_ptr deleter.
+  auto keep_alive = [&arg](void *table) {
+    arg.vector_split_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(table), [](void *p) { fpga::fpga_free(p); }));
+  };
+  keep_alive(out_images);
+  keep_alive(out_scales);
+  keep_alive(out_channel_nums);
+  param->SetFpgaArgs(arg);
   return true;
 }
 template <>
-void SplitKernel<FPGA, float>::Compute(const SplitParam<FPGA>& param) {}
+void SplitKernel<FPGA, float>::Compute(const SplitParam<FPGA> &param) {
+  fpga::ComputeFPGASplit(param.FpgaArgs());  // runs the args stored by Init()
+}
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/V2/tanh_kernel.cpp b/src/operators/kernel/fpga/V2/tanh_kernel.cpp
index 46dd3a0f6f..670689e083 100644
--- a/src/operators/kernel/fpga/V2/tanh_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/tanh_kernel.cpp
@@ -15,17 +15,63 @@ limitations under the License. */
 #ifdef TANH_OP
 
 #include "operators/kernel/tanh_kernel.h"
-
+#include <math.h>
 namespace paddle_mobile {
 namespace operators {
 
 template <>
 bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
+  auto src = const_cast<LoDTensor *>(param->InputX());
+  DLOG << "input: " << src;
+  auto src_data = src->data<half>();
+  const auto &shape = src->dims();
+  // fp32 tensor the bypass writes into; it is registered as the float input.
+  auto staging = new LoDTensor;
+  staging->mutable_data<float>({1, shape[1], shape[2], shape[3]});
+  fpga::format_fp32_ofm(staging);
+
+  fpga::BypassArgs bypass = {fpga::DATA_TYPE_FP16};
+  bypass.input_layout_type = fpga::LAYOUT_HWC;
+  bypass.output_layout_type = fpga::LAYOUT_CHW;
+  bypass.input_data_type = fpga::DATA_TYPE_FP16;
+  bypass.output_data_type = fpga::DATA_TYPE_FP32;
+  bypass.image.address = src_data;
+  bypass.image.height = (uint32_t)shape[2];
+  bypass.image.width = (uint32_t)shape[3];
+  bypass.image.channels = (uint32_t)shape[1];
+  bypass.output.address = staging->data<float>();
+  bypass.output.scale_address = staging->scale;
+  param->SetFloatInput(staging);
+  param->SetFpgaArgs(bypass);
   return true;
 }
 
+#define EXP_MAX_INPUT 40.0  // upper bound for the argument passed to exp()
+template <typename T>
+T Tanh(const T a) {  // evaluates 2 / (1 + exp(-2a)) - 1
+  T tmp = -2.0 * a;
+  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;  // clamped from above only
+  return (2.0 / (1.0 + exp(tmp))) - 1.0;
+}
+template <typename T>  // Applies Tanh<T> element-wise from input to output.
+void tanhFuntor(Tensor *input, Tensor *output) {
+  const T *src = input->data<T>();
+  T *dst = output->mutable_data<T>();
+  for (int idx = 0; idx < input->numel(); ++idx) {
+    dst[idx] = Tanh<T>(src[idx]);
+  }
+}
 template <>
-void TanhKernel<FPGA, float>::Compute(const TanhParam<FPGA> &param) {}
+void TanhKernel<FPGA, float>::Compute(const TanhParam<FPGA> &param) {
+  // The bypass produces the fp32 input; tanh itself is evaluated on the CPU.
+  fpga::PerformBypass(param.FpgaArgs());
+  Tensor *staged = param.FloatInput();
+  auto *staged_data = reinterpret_cast<void *>(staged->data<float>());
+  fpga::fpga_invalidate(staged_data, staged->numel() * sizeof(float));
+  Tensor *result = param.Out();
+  tanhFuntor<float>(staged, result);
+  fpga::fpga_flush(result->data<float>(), result->memory_size());
+}
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/V2/transpose2_kernel.cpp b/src/operators/kernel/fpga/V2/transpose2_kernel.cpp
index 585cc52947..cc839a971e 100644
--- a/src/operators/kernel/fpga/V2/transpose2_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/transpose2_kernel.cpp
@@ -14,13 +14,27 @@ limitations under the License. */
 #ifdef TRANSPOSE2_OP
 
 #include "operators/kernel/transpose2_kernel.h"
-#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
 
 template <>
 bool Transpose2Kernel<FPGA, float>::Init(Transpose2Param<FPGA> *param) {
+  // No element is rearranged here: the output shares the input's data via
+  // ShareDataWith and only its dims are permuted according to axis.
+  auto input = param->InputX();
+  auto output = param->Out();
+  output->ShareDataWith(*input);
+
+  auto perm = param->Axis();
+  auto in_dims = input->dims();
+  auto out_dims = vectorize(in_dims);
+  for (int pos = 0; pos < perm.size(); ++pos) {
+    out_dims[pos] = in_dims[perm[pos]];
+  }
+  output->Resize(framework::make_ddim(out_dims));
+  DLOG << "input: " << input;
+  DLOG << "output: " << output;
   return true;
 }
 
@@ -28,6 +42,11 @@ template <>
 void Transpose2Kernel<FPGA, float>::Compute(
     const Transpose2Param<FPGA> &param) {
   // Transpose2Compute<float>(param);
+  // Only the leading dim is refreshed from the input; no data is touched here.
+  auto output = param.Out();
+  auto batch = param.InputX()->dims()[0];
+  output->Resize({batch, output->dims()[1], output->dims()[2],
+                  output->dims()[3]});
 }
 
 }  // namespace operators
-- 
GitLab