Commit 59bdbff9 authored by qnqinan

add static quantization code and update FPGA V2(V3) related files

Parent dad903fe
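Note on the static quantization data: judging from the `LoadQuantValFromFile` parser added to the Executor below, per-variable quantization scales are read from a plain-text `scale` file whose lines have the form `<variable name> : <scale value>`. A hypothetical two-line example (variable names are illustrative only, not from this commit):

    conv2d_0.tmp_0 : 0.0125
    fc_0.tmp_1 : 0.0078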
This diff is collapsed.
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include "fpga/common/fpga_common.h"
#include "fpga/common/pe.h"
#include "framework/tensor.h"
......@@ -21,31 +22,81 @@ limitations under the License. */
namespace paddle_mobile {
namespace fpga {
void format_image(framework::Tensor* image_tensor);
void format_ofm(framework::Tensor* ofm_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
void format_fp32_ofm(framework::Tensor* ofm_tensor);
float filter_find_max(framework::Tensor* filter_tensor);
int get_aligned_channel_num(int channel_num);
int get_aligned_filter_num(framework::Tensor* filter_tensor);
int get_conv_output_channel(framework::Tensor* filter_tensor);
int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num);
int get_deconv_filter_num_per_div(framework::Tensor* filter_tensor,
int group_num, int stride);
void format_fp16_ofm(framework::Tensor* ofm_tensor,
int aligned_channel); // only allocate memory
void format_fp32_ofm(framework::Tensor* ofm_tensor, int aligned_channel);
int get_plit_num(framework::Tensor* filter_tensor);
int get_deconv_plit_num(framework::Tensor* filter_tensor, int stride);
int get_aligned_filter_element_num(int chw);
void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num);
void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
void format_bias_scale_array(float** bias_scale_array, int filter_num,
int filter_channel);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
void format_concat_output(framework::Tensor* out, int height, int width,
int image_num, uint32_t* channel_num);
int format_fc_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr);
void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float* bs_ptr);
void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float* bs_ptr);
void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float* bias_ptr);
void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float* bs_ptr);
void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
int group_num, int stride);
void format_dwconv_filter(framework::Tensor* filter_tensor, float* scale_ptr);
void format_conv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr, int group);
void format_deconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr,
int group, int sub_conv_n);
void format_dwconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float* scale_ptr,
float** bias_ptr);
void format_DWDeconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr,
int group, int sub_conv_n);
template <typename Dtype>
void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
float data;
std::ofstream out(filename.c_str());
for (int i = 0; i < dataSize; ++i) {
data = (((Dtype*)buffer)[i]); // NOLINT
out << data << std::endl;
}
out.close();
return;
}
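// Usage sketch (illustrative, not part of the original sources): dump a raw
// buffer for offline inspection; the last argument only fixes the template
// type Dtype, mirroring the debug calls commented out in deconv_filter.cpp:
//   float dummy = 0;
//   savefile<float>("ofm_data.txt", buffer, data_size, dummy);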
} // namespace fpga
} // namespace paddle_mobile
......@@ -20,26 +20,81 @@ namespace paddle_mobile {
namespace fpga {
namespace bias_scale {
void align_element(float **data_in, int num_per_div_before_alignment, int num) {
int copynum = 0;
float *ptr_unaligned = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned =
(float *)fpga_malloc(num_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, num_element * sizeof(float));
for (int i = 0; i < div_num; i++) {
if (i == div_num - 1) {
copynum = (num_per_div_after_alignment * div_num > num)
? (num % num_per_div_after_alignment)
: (num_per_div_before_alignment);
} else {
copynum = num_per_div_before_alignment;
}
memcpy(ptr_aligned + i * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i,
copynum * sizeof(float));
memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i + num,
copynum * sizeof(float));
}
fpga_free(ptr_unaligned);
*data_in = ptr_aligned;
}
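// Worked example (BS_NUM_ALIGNMENT == 8): with num = 10 and
// num_per_div_before_alignment = 4, div_num = 3 and
// num_per_div_after_alignment = 8, so 2 * 3 * 8 = 48 floats are allocated;
// the three divisions hold 4, 4 and 10 % 8 = 2 valid entries, zero-padded.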
void interleave(float **data_in, int num_after_alignment) {
// num_after_alignment: number of bias after alignment
float *ptr_uninterleaved = *data_in;
float *ptr_interleaved =
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
int num = num_after_alignment / 4;
for (int i = 0; i < num; i++) {
memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
4 * sizeof(float));
memcpy(ptr_interleaved + 8 * i + 4,
ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float));
}
fpga_free(ptr_uninterleaved);
*data_in = ptr_interleaved;
}
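// After interleave the buffer alternates 4-float groups of bias and scale:
// [b0..b3, s0..s3, b4..b7, s4..s7, ...], presumably the block layout the
// FPGA DMA consumes.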
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
align_element(bias_scale_array, element_num_per_division, num);
int div_num = (num + element_num_per_division - 1) / element_num_per_division;
int element_num_after_division =
align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
interleave(bias_scale_array, div_num * element_num_after_division);
fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
}
void format_bias_array(float **bias_array, int num) {
float *ptr_unaligned = *bias_array;
int num_before_align = num;
int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT);
int16_t *ptr_aligned =
(int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT
memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
for (int i = 0; i < num_before_align; i++) {
ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
}
*bias_array = (float *)ptr_aligned; // NOLINT
fpga_free(ptr_unaligned);
}
} // namespace bias_scale
......
......@@ -18,8 +18,11 @@ namespace paddle_mobile {
namespace fpga {
namespace bias_scale {
void align_element(float** data_in, int num_per_div_before_alignment, int num);
void interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
} // namespace bias_scale
} // namespace fpga
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/deconv_bias_scale.h"
// #include "deconv_bias_scale.h"
#include "fpga/V2/bias_scale.h"
// #include "bias_scale.h"
// #include <memory.h>
#include "fpga/V2/api.h"
// #include "fpga_api.h"
namespace paddle_mobile {
namespace fpga {
namespace deconv_bias_scale {
void deconv_bias_scale_expand(float** bias_scale_array, int num,
int sub_conv_n) {
int sub_num = num * sub_conv_n;
float* ptr_tmp = *bias_scale_array;
float* ptr_bias_scale_expand =
reinterpret_cast<float*>(fpga_malloc(sizeof(float) * sub_num * 2));
int scale_base_offset = sub_num;
for (int i = 0; i < sub_conv_n; ++i) {
int offset = num * i;
// copy bias
fpga_copy(ptr_bias_scale_expand + offset, ptr_tmp, num * sizeof(float));
// copy scale
fpga_copy(ptr_bias_scale_expand + scale_base_offset + offset, ptr_tmp + num,
num * sizeof(float));
}
*bias_scale_array = ptr_bias_scale_expand;
fpga_free(ptr_tmp);
}
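// Example: for num = 3 and sub_conv_n = 2, [b0 b1 b2 | s0 s1 s2] expands to
// [b0 b1 b2 b0 b1 b2 | s0 s1 s2 s0 s1 s2]; every sub-convolution reuses the
// same bias and scale values.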
} // namespace deconv_bias_scale
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle_mobile {
namespace fpga {
namespace deconv_bias_scale {
void deconv_bias_scale_expand(float** bias_scale_array, int num,
int sub_conv_n);
} // namespace deconv_bias_scale
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/deconv_filter.h"
#include <memory.h>
#include <algorithm>
// #include "deconv_filter.h"
#include "fpga/V2/filter.h"
// #include "filter.h"
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace fpga {
namespace deconv_filter {
/*
inverse kernel weights of each channel for every filter
*/
void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height) {
float* tmp = *data_in;
int data_size = num * channel * width * height;
int hw_len = height * width;
auto tmp_data =
reinterpret_cast<float*>(fpga_malloc(data_size * sizeof(float)));
for (int i = 0; i < num; ++i) {
for (int j = 0; j < channel; ++j) {
for (int k = 0; k < hw_len; ++k) {
tmp_data[i * channel * hw_len + j * hw_len + k] =
(*data_in)[i * channel * hw_len + j * hw_len + hw_len - k - 1];
}
}
}
*data_in = tmp_data;
fpga_free(tmp);
}
/*
calculate sub padding number
*/
int deconv_calc_sub_pad(int filter_axis, int pad, int stride) {
if (stride == 0 || ((filter_axis - pad - 1) < 0)) {
PADDLE_MOBILE_ENFORCE(false, "Wrong deconv parameters");
}
return (filter_axis - pad - 1) / stride;
}
int deconv_get_sub_filter_axis(int filter_axis, int stride) {
return (filter_axis / stride);
}
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) {
return ((image_axis + 2 * sub_pad - sub_filter_axis) + 1);
}
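// Worked example for a 4x4 kernel, pad = 1, stride = 2:
// sub_pad = (4 - 1 - 1) / 2 = 1, sub_filter_axis = 4 / 2 = 2, and a
// 16-pixel input axis yields sub_out = (16 + 2 * 1 - 2) + 1 = 17.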
/*
(filter_width-pad, filter_width-pad) is the position of the first pixel of the
sub-pixel image, so the number of omitted rows or columns is (stride - idx).
*/
int deconv_get_omit(int stride, int filter_width, int pad) {
PADDLE_MOBILE_ENFORCE(filter_width > pad, "Wrong deconv parameters");
int idx;
bool flag = false;
for (idx = 1; idx <= stride; ++idx) {
int j = idx;
for (; j <= filter_width;) {
if (j == filter_width - pad) {
flag = true;
break;
}
j = j + stride;
}
if (flag) {
break;
}
}
return (stride - idx);
}
template <typename T>
void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
int kernel_num, int channel) {
T* ptr_tmp = *data_in;
int sub_num = kernel_num * sub_conv_n;
int sub_h = height / sub_conv_n;
int sub_w = width / sub_conv_n;
int sub_filter_size =
kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n;
T* ptr_sub_filter =
reinterpret_cast<T*>(fpga_malloc(sub_filter_size * sizeof(T)));
for (int idx = 0; idx < sub_conv_n; ++idx) {
for (int nn = 0; nn < sub_num; ++nn) {
int ni = nn % kernel_num;
int woff = sub_conv_n - 1 - (nn / kernel_num); //
for (int hh = 0; hh < sub_h; ++hh) {
int hi = hh * sub_conv_n + idx % sub_conv_n;
for (int ww = 0; ww < sub_w; ++ww) {
int wi = ww * sub_conv_n + woff; // 1 0
int sidx = ((nn * sub_h + hh) * sub_w + ww) * channel; //
int kidx = ((ni * height + hi) * width + wi) * channel; //
fpga_copy(
ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx,
(*data_in) + kidx, channel * sizeof(T));
// for (int cc =0; cc < channel; ++cc) {
// ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] =
// (*data_in)[kidx + cc];
// }
}
}
}
}
*data_in = ptr_sub_filter;
fpga_free(ptr_tmp);
}
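// Net effect: each HxW filter of an s-stride deconv is split into s * s
// sub-filters of size (H/s)x(W/s), so the deconv can run as s * s ordinary
// convolutions whose outputs are recombined on the sub-pixel grid.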
void deconv_NC_convert(float** filter_in, int kernel_num, int channels,
int hw) {
float* tmp = *filter_in;
float* ptr_filter = reinterpret_cast<float*>(paddle_mobile::fpga::fpga_malloc(
hw * kernel_num * channels * sizeof(float)));
for (int c = 0; c < channels; ++c) {
for (int n = 0; n < kernel_num; ++n) {
paddle_mobile::fpga::fpga_copy(ptr_filter + n * hw + kernel_num * hw * c,
tmp + n * channels * hw + c * hw,
hw * sizeof(float));
}
}
*filter_in = ptr_filter;
paddle_mobile::fpga::fpga_free(tmp);
}
void deconv_format_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max, int stride) {
int data_size = channel * height * width * num;
/*{
float result2 = (float)0;
string filename = "origin_filter_data";
api::savefile<float>(filename, (void *)*data_in, data_size, result2);
}*/
deconv_inverse_filter(data_in, num, channel, width, height);
/* {
float result2 = (float)0;
string filename = "inverse_filter_data";
api::savefile<float>(filename, (void *)*data_in, data_size, result2);
}*/
filter::quantize(data_in, data_size, max);
/* {
char result2 = (char)0;
string filename = "quantize_filter_data";
api::savefile<char>(filename, (void *)*data_in, data_size, result2);
}*/
char** quantize_data = (char**)data_in; // NOLINT
filter::convert_to_hwc(quantize_data, num, channel, height, width);
/*{
char result2 = (char)0;
string filename = "convert_to_hwc_filter_data";
api::savefile<char>(filename, (void *)*quantize_data, data_size,
result2);
}*/
deconv_get_sub_filter<char>(quantize_data, height, width, stride, num,
channel);
/*{
char result2 = (char)0;
string filename = "sub_filter_filter_data";
api::savefile<char>(filename, (void *)*quantize_data, data_size, result2);
}*/
int sub_conv_n = stride;
int sub_h = height / sub_conv_n;
int sub_w = width / sub_conv_n;
int sub_chw = sub_h * sub_w * channel;
int sub_num = sub_conv_n * num;
int division_capacity = filter::calc_division_capacity(sub_chw);
int num_per_div_before_alignment =
filter::calc_num_per_div(sub_num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num = (sub_num + num_per_div_before_alignment - 1) /
num_per_div_before_alignment;
int residual = (sub_num) % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
char** ptr_ptr_data =
reinterpret_cast<char**>(fpga_malloc(sub_conv_n * sizeof(char*)));
int origin_offset = sub_chw * sub_num;
for (int i = 0; i < sub_conv_n; ++i) {
(ptr_ptr_data)[i] =
reinterpret_cast<char*>(fpga_malloc(origin_offset * sizeof(char)));
fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i,
origin_offset * sizeof(char));
/* char result2 = (char)0;
string filename = "ptr_ptr_data" + to_string(i);
api::savefile<char>(filename, (void *)(ptr_ptr_data[i]), origin_offset,
result2);
*/
}
// char result2 = (char)0;
// string filename = "interleave";
// api::savefile<char>(filename, (void *)*ptr_ptr_data, origin_offset,
// result2);
fpga_free(*quantize_data);
int align_offset =
align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment;
char* ptr_space = reinterpret_cast<char*>(fpga_malloc(
sub_conv_n * align_offset * sizeof(char))); // continuous space
for (int i = 0; i < sub_conv_n; ++i) {
char* ptr_tmp = (ptr_ptr_data)[i];
filter::align_element(&ptr_tmp, sub_num, sub_chw);
filter::align_num(&ptr_tmp, num_per_div_before_alignment, sub_num, sub_chw);
filter::reorder(&ptr_tmp, num_after_alignment, sub_chw);
filter::interleave(&ptr_tmp, num_after_alignment, sub_chw);
/* char result2 = (char)0;
string filename = "interleave" + to_string(i);
api::savefile<char>(filename, (void *)ptr_tmp, align_offset, result2);
*/
fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset);
fpga_free(ptr_tmp);
}
fpga_free(ptr_ptr_data);
*data_in = reinterpret_cast<float*>(ptr_space);
/* {
char result2 = (char)0;
string filename = "ptr_space";
api::savefile<char>(filename, (void *)ptr_space, sub_conv_n *
align_offset, result2);
}*/
fpga_flush(ptr_space, sub_conv_n * align_offset * sizeof(char));
}
void DWDconv_format_filter(float** data_in, int num, int channel, int height,
int width, float* scale_ptr, int stride) {
deconv_inverse_filter(data_in, num, channel, width, height);
filter::quantize_to_fp16(data_in, channel, height, width, scale_ptr);
int16_t** quantize_data = (int16_t**)data_in; // NOLINT
filter::convert_to_hwn(quantize_data, channel, height, width);
deconv_get_sub_filter<int16_t>(quantize_data, height, width, stride, num,
channel);
filter::align_element_n(quantize_data, channel, height, width);
fpga_flush(*quantize_data, align_to_x(channel, FILTER_ELEMENT_ALIGNMENT) *
height * width * sizeof(int16_t));
}
} // namespace deconv_filter
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle_mobile {
namespace fpga {
namespace deconv_filter {
void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height);
int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
int deconv_get_sub_filter_axis(int filter_axis, int stride);
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis);
int deconv_get_omit(int stride, int filter_width, int pad);
template <typename T>
void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
int kernel_num, int channel);
void deconv_format_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max, int stride);
void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw);
void DWDconv_format_filter(float** data_in, int num, int channel, int height,
int width, float* scale_ptr, int stride);
} // namespace deconv_filter
} // namespace fpga
} // namespace paddle_mobile
......@@ -16,44 +16,53 @@ limitations under the License. */
#include <memory.h>
#include <algorithm>
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
namespace filter {
int calc_division_capacity(int chw) {
int n = 2048 / ((chw + 15) / 16) * 32;
return n < 2048 ? n : 2048;
}
int calc_split_num(int num, int division_capacity) {
return (num + division_capacity - 1) / division_capacity;
}
int calc_division_number(int num, int group_num, int division_capacity) {
// PADDLE_MOBILE_ENFORCE(num % group_num == 0,
//                       "Filter number should be divisible by group number");
int split_num = calc_split_num(num, division_capacity);
// PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
//                       "Split number or group number should be 1");
return group_num * split_num;
}
int calc_num_per_div(int num, int group_num, int division_capacity) {
// PADDLE_MOBILE_ENFORCE(num % group_num == 0,
// "Filter number should be divisible by group
// number");
int split_num = calc_split_num(num, division_capacity);
// PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
// "Split number or group number should be 1");
if (group_num == 1) {
if (num > division_capacity) {
return division_capacity;
} else {
return num;
}
} else {
return (num + group_num - 1) / group_num;
}
}
void convert_to_hwc(char **data_in, int num, int channel, int height,
int width) {
char *tmp = *data_in;
int chw = channel * height * width;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
......@@ -66,52 +75,170 @@ void convert_to_hwc(float **data_in, int num, int channel, int height,
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
float find_max(float *data_in, int data_size) {
float max = 0.0;
for (int i = 0; i < data_size; ++i) {
float value = data_in[i];
float abs = value > 0 ? value : -value;
max = std::max(max, abs);
}
return max;
}
signed char float_to_int8(float fdata) {
if (fdata < 0.0) {
fdata -= 0.5;
} else {
fdata += 0.5;
}
return (signed char)fdata;
}
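// float_to_int8 rounds half away from zero before the cast:
// 2.6f -> 3 and -2.6f -> -3 (the +/-0.5 offset plus C truncation toward zero).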
void quantize(float **data_in, int data_size, float max) {
float *tmp = *data_in;
float fix_range = 127;
float scale = fix_range / max;
signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
for (int i = 0; i < data_size; i++) {
tmp_data[i] = float_to_int8(
(*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale);
}
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
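// quantize maps [-max, max] linearly onto [-127, 127]; e.g. with max = 0.5
// the scale is 254, so a weight of 0.25 becomes float_to_int8(63.5) = 64.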
void align_element(char **data_in, int num, int chw) {
int i = 0;
int j = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (align_chw != chw) {
char *tmp = *data_in;
char *data_tmp =
(char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT
memset(data_tmp, 0, num * align_chw);
for (j = 0; j < num; j++) {
memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw);
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void align_num(char **data_in, int num_per_div_before_alignment, int num,
int chw) {
int i = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(char));
for (i = 0; i < div_num - 1; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
(num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
*data_in = data_tmp;
fpga_free(tmp);
}
void reorder(char **data_in, int num_after_alignment, int chw) {
int index = 0;
int new_index;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
for (index = 0; index < num_after_alignment; index++) {
new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
(index / 16 % 2 * 4);
memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align,
chw_align);
}
*data_in = data_tmp;
fpga_free(tmp);
}
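// The index shuffle swaps 4-filter blocks between the two 16-filter halves of
// every 32-filter group: destination slots 0-7 read source filters 0-3 and
// 8-11, slots 8-15 read 16-19 and 24-27, and so on, apparently pre-arranging
// row pairs for interleave() below.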
void interleave(char **data_in, int num_after_alignment, int chw) {
int i = 0;
int j = 0;
int k = 0;
int interleave_per_num = 16;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
int interleave_num = chw_align * 2 / interleave_per_num;
for (i = 0; i < num_after_alignment; i += 2) {
for (j = 0, k = 0; j < interleave_num; j += 2, k++) {
memcpy(data_tmp + i * chw_align + interleave_per_num * j,
*data_in + i * chw_align + interleave_per_num * k,
interleave_per_num);
memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1),
*data_in + (i + 1) * chw_align + interleave_per_num * k,
interleave_per_num);
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void format_filter(float **data_in, int num, int channel, int height, int width,
int group_num, float max) {
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
}
void convert_fc_filter(char **data_in, int num, int chw) {
char *tmp = *data_in;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) {
for (int c = 0; c < chw; c++) {
data_tmp[n * chw + c] = (*data_in)[num * c + n];
......@@ -123,47 +250,113 @@ void convert_fc_filter(float **data_in, int num, int chw) {
void format_fc_filter(float **data_in, int num, int channel, int height,
int width, int group_num, float max) {
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_fc_filter(quantize_data, num, chw);
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
}
void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
int16_t *tmp = *data_in;
int16_t *data_tmp =
(int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
*(data_tmp + h * width * num + w * num + n) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_element_n(int16_t **data_in, int num, int height, int width) {
int unalign_n = num;
int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
if (unalign_n == align_n) {
return;
} else {
int16_t *tmp = *data_in;
int num_element = height * width * align_n;
int16_t *data_tmp =
(int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(int16_t));
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int offset_unalign = h * width * unalign_n + w * unalign_n;
int offset_align = h * width * align_n + w * align_n;
for (int n = 0; n < unalign_n; n++) {
data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void quantize_to_fp16(float **data_in, int num, int height, int width,
float *scale_ptr) {
float *tmp = *data_in;
int size = num * height * width;
int16_t *tmp_data = (int16_t *)fpga_malloc(size * sizeof(int16_t)); // NOLINT
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int index = n * height * width + h * width + w;
tmp_data[index] = fp32_2_fp16((*data_in)[index] * scale_val);
}
}
}
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
void format_dwconv_filter(float **data_in, int num, int height, int width,
float *scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t **quantize_data = (int16_t **)data_in; // NOLINT
convert_to_hwn(quantize_data, num, height, width);
align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
height * width * sizeof(int16_t));
}
void format_DWDeconv_filter(float **data_in, int num, int height, int width,
float *scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t **quantize_data = (int16_t **)data_in; // NOLINT
convert_to_hwn(quantize_data, num, height, width);
align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
height * width * sizeof(int16_t));
}
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
......@@ -13,25 +13,38 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define FILTER_PARALLELISM 1024
#include <cstdint>
namespace paddle_mobile {
namespace fpga {
namespace filter {
int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
void convert_to_hwc(char** data_in, int num, int channel, int height,
int width);
float find_max(float* data_in, int data_size);
void quantize(float** data_in, int data_size, float max);
void align_element(char** data_in, int num, int chw);
void align_num(char** data_in, int num_per_div_before_alignment, int num,
int chw);
void reorder(char** data_in, int num_after_alignment, int chw);
void interleave(char** data_in, int num_after_alignment, int chw);
void format_filter(float** data_in, int num, int channel, int height, int width,
int group_num, float max);
void convert_fc_filter(char** data_in, int num, int chw);
void format_fc_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max);
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
void align_element_n(int16_t** data_in, int num, int height, int width);
void quantize_to_fp16(float** data_in, int num, int height, int width,
float* scale_ptr);
void format_dwconv_filter(float** data_in, int num, int height, int width,
float* scale_ptr);
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
......@@ -13,80 +13,124 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width,
int num) {
float *data_tmp = reinterpret_cast<float *>(
fpga_malloc(num * channel * height * width * sizeof(float)));
int64_t amount_per_row = width * channel;
for (int n = 0; n < num; n++) {
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + n * channel * height * width + offset_height +
w * channel + c) = *((*data_in)++);
}
}
}
}
*data_in = data_tmp;
}
void convert_to_chw(float **data_in, int channel, int height, int width,
int num) {
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_side = width * height;
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
for (int c = 0; c < channel; c++) {
*(data_tmp + n * height * width * channel + c * amount_per_side +
width * h + w) = *((*data_in)++);
}
}
}
}
*data_in = data_tmp;
}
void concat_images(int16_t **images_in, float **scales_in, void *image_out,
float *scale_out, int image_num, uint32_t *channel_num,
int height, int width) {
int i = 0;
int j = 0;
int k = 0;
int each_out_line_channel = 0;
int align_each_out_area_cw = 0;
int align_each_in_area_cw = 0;
int align_each_out_area_cw_differ = 0;
int tmp_channel = 0;
scale_out[0] = 0.0;
scale_out[1] = 0.0;
for (i = 0; i < image_num; i++) {
each_out_line_channel += channel_num[i];
scale_out[0] = std::max(*scale_out, scales_in[i][0]);
fpga_invalidate(images_in[i],
height *
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) *
sizeof(int16_t));
}
scale_out[1] = 1 / scale_out[0];
align_each_out_area_cw =
align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT);
align_each_out_area_cw_differ =
align_each_out_area_cw - each_out_line_channel * width;
for (k = 0; k < height; k++) {
for (j = 0; j < width; j++) {
for (i = 0; i < image_num; i++) {
align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy((int16_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int16_t));
tmp_channel += channel_num[i];
}
}
}
fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
}
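// Each output pixel concatenates the channels of all inputs in order, and
// every output row of channel * width int16 values is padded up to
// IMAGE_ALIGNMENT, hence the flush of height * align_each_out_area_cw values.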
void split_image(int16_t *image_in, const float *scale_in, void **images_out,
float **scales_out, int image_num,
const uint32_t *channel_nums, int height, int width) {
int total_channel = 0;
for (int i = 0; i < image_num; i++) {
scales_out[i][0] = scale_in[0];
scales_out[i][1] = scale_in[1];
total_channel += channel_nums[i];
}
int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT);
fpga_invalidate(image_in, element_num * sizeof(int16_t));
int src_offset = 0, des_offset = 0;
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) +
w * total_channel;
for (int i = 0; i < image_num; i++) {
des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
w * channel_nums[i];
memcpy(reinterpret_cast<int16_t *>(images_out[i]) + des_offset,
image_in + src_offset, channel_nums[i] * sizeof(int16_t));
src_offset += channel_nums[i];
}
}
}
for (int i = 0; i < image_num; i++) {
element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT);
fpga_flush(images_out[i], element_num * sizeof(int16_t));
}
}
} // namespace image
......
......@@ -14,23 +14,63 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include <memory.h>
#include <algorithm>
#include <cstdint>
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float** data_in, int channel, int height, int width,
int num = 1);
void convert_to_chw(float** data_in, int channel, int height, int width,
int num = 1);
// template <typename Dtype>
// void align_element_conv(Dtype** data_in, int height, int cw);
// template <typename T>
// void format_image(T** data_in, int channel, int height, int width);
template <typename Dtype>
void align_element_conv(Dtype** data_in, int height, int cw);
template <typename Dtype>
void align_element_conv(Dtype** data_in, int height, int cw) {
int h = 0;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
Dtype* data_tmp =
(Dtype*)fpga_malloc(height * align_cw * sizeof(Dtype)); // NOLINT
memset(data_tmp, 0, height * align_cw * sizeof(Dtype));
for (h = 0; h < height; h++) {
memcpy((void*)(data_tmp + h * align_cw), // NOLINT
(void*)(*data_in + h * cw), // NOLINT
cw * sizeof(Dtype));
}
*data_in = data_tmp;
}
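// align_element_conv zero-pads each image row from cw = channel * width
// elements to align_to_x(cw, IMAGE_ALIGNMENT); e.g. with cw = 20 and
// IMAGE_ALIGNMENT = 32, every row occupies 32 elements, the last 12 zeroed.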
template <typename T>
void format_image(T** data_in, int channel, int height, int width) {
int cw = channel * width;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
T* hwc_temp = *data_in;
align_element_conv(data_in, height, channel * width);
fpga_free(hwc_temp);
}
fpga_flush(*data_in,
align_to_x(channel * width, IMAGE_ALIGNMENT) * height * sizeof(T));
}
// Concat featuremaps along channel direction
void concat_images(int16_t** images_in, float** scales_in, void* image_out,
float* scale_out, int image_num, uint32_t* channel_num,
int height, int width);
// Split featuremap along channel direction
void split_image(int16_t* image_in, const float* scale_in, void** images_out,
float** scales_out, int image_num,
const uint32_t* channel_nums, int height, int width);
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
This diff is collapsed.
......@@ -27,6 +27,14 @@ limitations under the License. */
#define BIAS_NUM_ALIGNMENT (16)
#define ROW_PARALLEL_NUM (3)
#endif
#ifdef PADDLE_MOBILE_FPGA_V2
#define IMAGE_ALIGNMENT (32) // Aligned to 32
#define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT (8)
#define BIAS_NUM_ALIGNMENT (16)
#define ROW_PARALLEL_NUM (3)
#endif
namespace paddle_mobile {
namespace fpga {
......@@ -80,7 +88,8 @@ struct ImageOutputArgs {
activation; // To select activation and specify (Leaky)Relu parameter.
};
// #ifdef PADDLE_MOBILE_FPGA_V1
#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
struct ConvDriverParam {
uint64_t image_address_phy;
uint64_t filter_address_phy;
......@@ -146,11 +155,8 @@ struct ConvArgs {
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
#ifdef PADDLE_MOBILE_FPGA_V2
void* free_space; // used by FPGA logic
#endif
// #ifdef PADDLE_MOBILE_FPGA_V1
#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
struct DeconvTxParm deconv_tx_param;
struct ConvDriverParam driver;
#endif
......@@ -208,7 +214,10 @@ struct EWAddArgs {
struct ImageInputArgs image0;
struct ImageInputArgs image1;
struct ImageOutputArgs output;
std::vector<float> image_in_quantVal;
std::vector<float> image_out_quantVal;
// #ifdef PADDLE_MOBILE_FPGA_V1
#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
struct EWAddDriverParam driver;
#endif
};
......
......@@ -68,7 +68,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
// resize feed and fetch list
// should init feed and fetch variables before infer shape
InitFeedFetchList();
const auto &blocks = program_desc_->Blocks();
std::shared_ptr<BlockDesc> block_desc = blocks[0];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
......@@ -86,6 +85,9 @@ Executor<Device, T>::Executor(const Program<Device> &program,
}
ops_of_block0_.push_back(op_handler);
}
#ifdef PADDLE_MOBILE_FPGA_V2
InitQuantMemory();
#endif
if (program_.combined) {
InitCombineMemory();
} else {
......@@ -626,8 +628,74 @@ template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
Predict_From_To(0, end);
}
#endif
#ifdef PADDLE_MOBILE_FPGA_V2
std::map<std::string, float> LoadQuantValFromFile(std::string filename) {
std::map<std::string, float> quantValList;
std::ifstream in;
in.open(filename, std::ios::in);
if (!in.is_open()) {
std::cout << "open File Failed." << std::endl;
exit(-1);
}
std::string line;
while (getline(in, line)) {
std::string splitStr = " : ";
std::string::size_type pos;
pos = line.find(splitStr);
std::string subStr[2];
subStr[0] = line.substr(0, pos);
subStr[1] = line.substr(pos + splitStr.size(), line.size());
quantValList.insert(std::make_pair(subStr[0], atof(subStr[1].c_str())));
}
in.close();
return quantValList;
}
template <typename Device, typename T>
void Executor<Device, T>::InitQuantMemory() {
std::string quantValFilePath;
if (program_.combined) {
quantValFilePath = program_.para_path;
quantValFilePath =
quantValFilePath.substr(0, (quantValFilePath.length() - 6));
quantValFilePath = quantValFilePath + "scale";
} else {
quantValFilePath = program_.model_path + "/scale";
}
std::map<std::string, float> quantValList =
LoadQuantValFromFile(quantValFilePath);
auto ops = ops_of_block0_;
for (int id = 0; id < ops.size(); id++) {
auto op = ops[id];
auto input_keys = op->GetInputKeys();
auto inputs = op->Inputs();
for (auto key = input_keys.begin(); key != input_keys.end(); key++) {
auto inputs_vars = inputs[*key];
int count = inputs_vars.size();
for (int i = 0; i < count; i++) {
auto tensor = GetTensorByName(inputs_vars[i]);
tensor->scale[0] = quantValList[inputs_vars[i]];
std::cout << "input variance name : " << inputs_vars[i]
<< ", scale value : " << tensor->scale[0] << std::endl;
}
}
auto output_keys = op->GetOutKeys();
auto outputs = op->Outputs();
for (auto key = output_keys.begin(); key != output_keys.end(); key++) {
auto outputs_vars = outputs[*key];
int count = outputs_vars.size();
for (int i = 0; i < count; i++) {
auto tensor = GetTensorByName(outputs_vars[i]);
tensor->scale[0] = quantValList[outputs_vars[i]];
std::cout << "output variance name : " << outputs_vars[i]
<< ", scale value : " << tensor->scale[0] << std::endl;
}
}
}
}
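// Caveat (C++ std::map semantics): quantValList[name] default-inserts 0.0f
// for a variable missing from the scale file, silently zeroing that tensor's
// scale, so the scale file needs an entry for every input/output name.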
#endif
#endif
#ifdef PADDLE_MOBILE_CL
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
......
......@@ -64,6 +64,9 @@ class Executor {
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
void Predict_To(int end);
#ifdef PADDLE_MOBILE_FPGA_V2
void InitQuantMemory();
#endif
#endif
protected:
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ANCHOR_GENERATOR_OP
#include <string.h>
#include <iostream>
#include <utility>
#include <vector>
#include "operators/kernel/detection_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool AnchorGeneratorKernel<FPGA, float>::Init(
AnchorGeneratorParam<FPGA> *param) {
auto input = param->input_;
auto anchors = param->output_anchors_;
auto anchor_ptr = anchors->mutable_data<float>();
auto stride = param->stride_;
auto feature_width = input->dims()[3], feature_height = input->dims()[2];
auto stride_width = stride[0], stride_height = stride[1];
auto offset = param->offset_;
int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23,
-20, 39, 36, -43, -34, 59, 49, -63, -54,
79, 69, -96, -77, 112, 93, -137, -118, 153,
134, -204, -188, 220, 204, -281, -395, 296, 441};
int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103,
0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58,
0, 0, 34, 68, 0, 0, 24, 28, 0, 0, 19, 46};
if (offset > 0.6) {
memcpy(anchors_offset, anchors_offset2, sizeof(anchors_offset));
std::cout << "anchor generator marker" << std::endl;
} else {
std::cout << "anchor generator rfcn" << std::endl;
}
int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4);
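// 36 ints / (4 coordinates per anchor) = 9 anchors per feature-map location.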
// DLOG << "feature_height: " << feature_height;
// DLOG << "feature_width: " << feature_width;
// DLOG << "num_anchors: " << num_anchors;
// DLOG << "stride_width: " << stride_width;
// DLOG << "stride_height: " << stride_height;
for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
int offset0 = h_idx * feature_width * num_anchors * 4;
for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
int offset1 = w_idx * num_anchors * 4;
for (int idx = 0; idx < num_anchors; idx++) {
int offset = offset0 + offset1 + idx * 4;
anchor_ptr[offset + 0] =
anchors_offset[idx * 4 + 0] + w_idx * stride_width;
anchor_ptr[offset + 1] =
anchors_offset[idx * 4 + 1] + h_idx * stride_height;
anchor_ptr[offset + 2] =
anchors_offset[idx * 4 + 2] + w_idx * stride_width;
anchor_ptr[offset + 3] =
anchors_offset[idx * 4 + 3] + h_idx * stride_height;
}
}
}
return true;
}
template <>
void AnchorGeneratorKernel<FPGA, float>::Compute(
const AnchorGeneratorParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
#endif // ANCHOR_GENERATOR_OP
......@@ -15,7 +15,6 @@ limitations under the License. */
#ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h"
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace operators {
......@@ -31,45 +30,36 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
(float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT
auto channel_num =
(uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT
auto height = inputs[0]->dims()[2];
auto width = inputs[0]->dims()[3];
for (int i = 0; i < image_num; i++) {
auto input = inputs[i];
PADDLE_MOBILE_ENFORCE(
input->dims()[2] == height && input->dims()[3] == width,
"Image height & width should be unified");
images_in[i] = input->data<half>();
channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT
scales_in[i] = input->scale;
}
fpga::format_concat_output(out, height, width, image_num, channel_num);
fpga::ConcatArgs concatArgs = {0};
concatArgs.image_num = image_num;
concatArgs.images_in = images_in;
concatArgs.scales_in = scales_in;
concatArgs.image_out = out->data<half>();
concatArgs.scale_out = out->scale;
concatArgs.channel_num = channel_num;
concatArgs.height = height;
concatArgs.width = width;
param->SetFpgaArgs(concatArgs);
return true;
}
template <>
void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) {
fpga::ComputeFPGAConcat(param.FpgaArgs());
}
template class ConcatKernel<FPGA, float>;
......
......@@ -22,12 +22,15 @@ namespace operators {
template <>
bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
auto bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
......@@ -56,18 +59,18 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i];
}
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
delete new_scale;
delete new_bias;
return true;
}
......
......@@ -23,12 +23,18 @@ namespace operators {
template <>
bool ConvAddBNReluKernel<FPGA, float>::Init(
FusionConvAddBNReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
auto bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
vector<int> paddings = param->Paddings();
vector<int> strides = param->Strides();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -40,7 +46,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
const int channel = out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
......@@ -51,27 +57,41 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] =
bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
bs_ptr[i + 2] = new_scale_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
const int groups = param->Groups();
if (groups == channel) {
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
fpga::DWconvArgs dwconv_arg = {0};
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, strides[0], strides[1],
paddings[0], paddings[1], new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
fpga::fpga_free(new_scale_ptr);
fpga::fpga_free(bs_ptr);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(), strides[0],
strides[1], paddings[0], paddings[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
delete new_scale;
delete new_bias;
}
return true;
}
template <>
void ConvAddBNReluKernel<FPGA, float>::Compute(
const FusionConvAddBNReluParam<FPGA> &param) {
fpga::ComputeFpgaConv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWConv(param.FpgaDwconvArgs());
} else {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
}
} // namespace operators
......
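The groups == channel branch above dispatches to the depthwise path (one filter per channel). As a reference for the arithmetic ComputeDWConv implements, here is a plain single-image NCHW depthwise convolution with per-channel bias; the FPGA path operates on aligned fp16 HWC buffers instead (illustrative sketch only):

// Reference depthwise convolution, NCHW layout, square K x K filter,
// no dilation.  Bounds-checked accumulation emulates zero padding.
void dwconv_ref(const float* in, const float* filter, const float* bias,
                int C, int H, int W, int K, int stride, int pad, float* out) {
  const int Ho = (H + 2 * pad - K) / stride + 1;
  const int Wo = (W + 2 * pad - K) / stride + 1;
  for (int c = 0; c < C; ++c)
    for (int ho = 0; ho < Ho; ++ho)
      for (int wo = 0; wo < Wo; ++wo) {
        float acc = bias[c];
        for (int kh = 0; kh < K; ++kh)
          for (int kw = 0; kw < K; ++kw) {
            const int h = ho * stride - pad + kh;
            const int w = wo * stride - pad + kw;
            if (h >= 0 && h < H && w >= 0 && w < W)
              acc += in[(c * H + h) * W + w] * filter[(c * K + kh) * K + kw];
          }
        out[(c * Ho + ho) * Wo + wo] = acc;
      }
}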
......@@ -21,11 +21,14 @@ namespace operators {
template <>
bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
bool relu_enabled = false;
auto input = const_cast<Tensor *>(param->Input());
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
......@@ -39,12 +42,11 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
}
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
......
......@@ -21,11 +21,14 @@ namespace operators {
template <>
bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
bool relu_enabled = true;
auto input = const_cast<Tensor *>(param->Input());
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
......@@ -39,12 +42,11 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
}
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
......
......@@ -22,10 +22,16 @@ namespace operators {
template <>
bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
bool relu_enabled = false;
auto input = const_cast<Tensor *>(param->Input());
auto filter = const_cast<Tensor *>(param->Filter());
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -45,20 +51,21 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
new_scale_ptr[i] = bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i];
// bs_ptr[i + channel] = new_scale_ptr[i];
// bs_ptr[i] = new_bias_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
delete new_scale;
delete new_bias;
return true;
}
......
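The commented-out assignments above mark the static-quantization change in this commit: the folded BN scale is further multiplied by Si / So * Sf / 127 and the bias by 127 / So, where Si and So are the static input and output scales (tensor->scale[0]) and Sf is the filter maximum returned by filter_find_max. A standalone sketch of the resulting bs_ptr packing, biases in the first half and scales in the second, as the loop above writes them:

#include <vector>

// Pack quantized per-channel (bias, scale) pairs the way the FPGA kernels
// expect: bs[0 .. channel) holds biases, bs[channel .. 2*channel) scales.
std::vector<float> pack_quantized_bs(const std::vector<float>& new_scale,
                                     const std::vector<float>& new_bias,
                                     float Si, float So, float Sf) {
  const size_t channel = new_scale.size();
  std::vector<float> bs(2 * channel);
  for (size_t i = 0; i < channel; ++i) {
    bs[i + channel] = new_scale[i] * Si / So * Sf / 127.0f;  // scale half
    bs[i] = new_bias[i] * 127.0f / So;                       // bias half
  }
  return bs;
}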
......@@ -16,17 +16,20 @@ limitations under the License. */
#include "operators/kernel/conv_bn_relu_kernel.h"
#include <cmath>
#include "fpga/V2/filter.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
bool relu_enabled = true;
auto input = const_cast<Tensor *>(param->Input());
auto filter = const_cast<Tensor *>(param->Filter());
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
const int groups = param->Groups();
auto input = const_cast<LoDTensor *>(param->Input());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -41,32 +44,49 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
for (int i = 0; i < channel; i++) {
new_scale_ptr[i] = bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i];
// bs_ptr[i + channel] = new_scale_ptr[i];
// bs_ptr[i] = new_bias_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
if (groups == channel) {
new_scale_ptr[i] = new_scale_ptr[i] * Si / So;
new_bias_ptr[i] = new_bias_ptr[i] * 127.0 / So;
}
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
if (groups == channel) {
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
fpga::DWconvArgs dwconv_arg = {0};
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
}
delete new_scale;
delete new_bias;
return true;
}
template <>
void ConvBNReluKernel<FPGA, float>::Compute(
const FusionConvBNReluParam<FPGA> &param) {
fpga::ComputeFpgaConv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWConv(param.FpgaDwconvArgs());
} else {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
}
} // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
int channel = out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
// bs_ptr[i + channel] = 1;
// bs_ptr[i] = 0;
bs_ptr[i + channel] = Si / So * Sf / 127.0;
bs_ptr[i] = 0;
}
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
template <>
void ConvKernel<FPGA, float>::Compute(const ConvParam<FPGA> &param) {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
#endif
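The plain conv kernel above uses scale-only bs entries (Si / So * Sf / 127 with zero bias), which presumes activations and filters quantized symmetrically against 127. A host-side illustration of that symmetric scheme, assuming a per-tensor scale S equal to the tensor's absolute maximum (an assumption for illustration, not a paddle_mobile API):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Symmetric int8 quantization: q = round(x * 127 / S), clamped to +-127.
std::vector<int8_t> quantize_int8(const std::vector<float>& x, float S) {
  std::vector<int8_t> q(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    const float v = std::round(x[i] * 127.0f / S);
    q[i] = static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, v)));
  }
  return q;
}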
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_TRANSPOSE_OP
#include "operators/kernel/conv_transpose_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
// const Tensor *bias = param->Bias();
// auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
// PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
// "Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = 0; // bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void ConvTransposeKernel<FPGA, float>::Compute(
const ConvTransposeParam<FPGA> &param) {
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBN_OP
#include "operators/kernel/deconv_add_bn_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->InputBias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void DeconvAddBNKernel<FPGA, float>::Compute(
const FusionDeconvAddBNParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBNRELU_OP
#include "operators/kernel/deconv_add_bn_relu_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DeconvAddBNReluKernel<FPGA, float>::Init(
FusionDeconvAddBNReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->InputBias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void DeconvAddBNReluKernel<FPGA, float>::Compute(
const FusionDeconvAddBNReluParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -23,12 +23,66 @@ namespace operators {
template <>
bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void DeconvAddKernel<FPGA, float>::Compute(
const FusionDeconvAddParam<FPGA> &param) {}
const FusionDeconvAddParam<FPGA> &param) {
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,12 +24,66 @@ namespace operators {
template <>
bool DeconvAddReluKernel<FPGA, float>::Init(
FusionDeconvAddReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void DeconvAddReluKernel<FPGA, float>::Compute(
const FusionDeconvAddReluParam<FPGA> &param) {}
const FusionDeconvAddReluParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVBNRELU_OP
#include "operators/kernel/deconv_bn_relu_kernel.h"
#include <cmath>
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DeconvBNReluKernel<FPGA, float>::Init(
FusionDeconvBNReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->InputBias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
auto bn_bias_ptr = param->InputBias()->data<float>();
const float epsilon = param->Epsilon();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
for (int i = 0; i < channel; i++) {
new_scale_ptr[i] = bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
}
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel];
bs_ptr[i] = new_bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
delete new_scale;
delete new_bias;
return true;
}
template <>
void DeconvBNReluKernel<FPGA, float>::Compute(
const FusionDeconvBNReluParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
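All the deconv kernels above allocate 2 * channel * sub_conv_n scale/bias entries (sub_conv_n being the stride) and fill them with i % channel, one copy of the per-channel pair for each sub-convolution of the stride decomposition. A self-contained sketch mirroring the loop in DeconvBNReluKernel::Init():

#include <vector>

// Replicate per-channel (scale, bias) once per sub-convolution; biases fill
// the first channel * sub_conv_n entries, scales the second half.
std::vector<float> replicate_bs(const std::vector<float>& scale,
                                const std::vector<float>& bias,
                                int sub_conv_n) {
  const int channel = static_cast<int>(scale.size());
  std::vector<float> bs(2 * channel * sub_conv_n);
  for (int i = 0; i < channel * sub_conv_n; ++i) {
    bs[i + sub_conv_n * channel] = scale[i % channel];
    bs[i] = bias[i % channel];
  }
  return bs;
}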
......@@ -15,49 +15,176 @@ limitations under the License. */
#include "operators/kernel/elementwise_add_kernel.h"
#include <string>
#include "fpga/V1/api.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
bool relu_enabled = false;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
auto input_x_ptr = input_x->data<float>();
auto input_y_ptr = input_y->data<float>();
int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]);
fpga::format_fp16_ofm(out, aligned_channel_num);
auto out_ptr = out->mutable_data<float>();
fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.relu_enabled = relu_enabled;
ewaddArgs.const0 = 0x3c00; // =1
ewaddArgs.const1 = 0x3c00; // =1
ewaddArgs.image0.address = input_x_ptr;
ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
ewaddArgs.image0.scale_address = input_x->scale;
ewaddArgs.image0.height = (uint32_t)input_x->dims()[2];
ewaddArgs.image0.width = (uint32_t)input_x->dims()[3];
ewaddArgs.image0.pad_height = 0;
ewaddArgs.image0.pad_width = 0;
ewaddArgs.image1.address = input_y_ptr;
ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1];
ewaddArgs.image1.scale_address = input_y->scale;
ewaddArgs.image1.height = (uint32_t)input_y->dims()[2];
ewaddArgs.image1.width = (uint32_t)input_y->dims()[3];
ewaddArgs.image1.pad_height = 0;
ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = out_ptr;
param->SetFpgaArgs(ewaddArgs);
if (input_y->type() != type_id<float>()) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<half>();
fpga::EWAddArgs ewaddArgs = {0};
// ewaddArgs.relu_enabled = relu_enabled;
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.const0 = 0x3c00; // =1
ewaddArgs.const1 = 0x3c00; // =1
ewaddArgs.image0.address = input_x_ptr;
ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
ewaddArgs.image0.scale_address = input_x->scale;
ewaddArgs.image0.height = (uint32_t)input_x->dims()[2];
ewaddArgs.image0.width = (uint32_t)input_x->dims()[3];
ewaddArgs.image0.pad_height = 0;
ewaddArgs.image0.pad_width = 0;
ewaddArgs.image1.address = input_y_ptr;
ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1];
ewaddArgs.image1.scale_address = input_y->scale;
ewaddArgs.image1.height = (uint32_t)input_y->dims()[2];
ewaddArgs.image1.width = (uint32_t)input_y->dims()[3];
ewaddArgs.image1.pad_height = 0;
ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = out_ptr;
fpga::expand_EW_arg(&ewaddArgs);
param->SetFpgaArgs(ewaddArgs);
} else {
param->float_input_x.Resize(param->InputX()->dims());
param->float_input_x.init(type_id<float>().hash_code());
fpga::format_fp32_ofm(&(param->float_input_x));
param->float_out.Resize(param->InputX()->dims());
param->float_out.mutable_data<float>(param->InputX()->dims());
fpga::format_fp32_ofm(&(param->float_out));
fpga::format_fp16_ofm(out);
}
return true;
}
inline void ElementwiseAddCompute(const ElementwiseAddParam<FPGA> &param) {
auto input_x = param.float_input_x;
auto input_y = param.InputY();
auto Out = param.float_out;
int axis = param.Axis();
const auto &x_dims = input_x.dims();
const auto &y_dims = input_y->dims();
/// axis = -1 represent the last dimensions.
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
size_t batch = 1;
size_t channels = 1;
size_t elementwise_num = 1;
for (int i = 0; i < axis; ++i) {
batch *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels *= y_dims[i];
}
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
elementwise_num *= x_dims[i];
}
const float *bias_data = input_y->data<float>();
const float *input_data = input_x.data<float>();
float *output_data = Out.mutable_data<float>();
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
size_t offset = (i * channels + j) * elementwise_num;
const float *input = input_data + offset;
const float bias = bias_data[j];
float *output = output_data + offset;
// DLOG << "output address: "<< output;
for (int k = 0; k < elementwise_num; ++k) {
output[k] = input[k] + bias;
// DLOG << "output[" << k << "]= " << output[k] ;
}
}
}
}
template <>
void ElementwiseAddKernel<FPGA, float>::Compute(
const ElementwiseAddParam<FPGA> &param) {
fpga::ComputeFpgaEWAdd(param.FpgaArgs());
auto input_y = const_cast<LoDTensor *>(param.InputY());
if (input_y->type() != type_id<float>()) {
fpga::ComputeFpgaEWAdd(param.FpgaArgs());
} else {
auto input_x = const_cast<LoDTensor *>(param.InputX());
auto input_x_float = const_cast<Tensor *>(&(param.float_input_x));
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = input_x->data<half>();
args.image.channels = (uint32_t)(input_x->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = input_x_float->data<float>();
args.output.scale_address = input_x_float->scale;
// fpga::fpga_flush(input_x->data<half>(),input_x->fpga_data_num *
// sizeof(half));
fpga::PerformBypass(args);
fpga::fpga_invalidate(args.output.address,
input_x->fpga_data_num * sizeof(float));
// just for test
/* {
static int cnt = 0;
if(cnt == 0){
std::string str= "first_bypass_data";
float rslt = 0.0f;
fpga::savefile(str, args.output.address, input_x->fpga_data_num,
rslt); cnt++;
}
}*/
ElementwiseAddCompute(param);
auto out_float = const_cast<Tensor *>(&(param.float_out));
DLOG << "out float: " << out_float->data<float>();
fpga::fpga_flush(out_float->data<float>(),
input_x->fpga_data_num * sizeof(float));
// just for test
/*{
static int cnt = 0;
if(cnt == 0){
std::string str= "ew_output_data";
float rslt = 0.0f;
fpga::savefile(str, out_float->data<float>(), input_x->fpga_data_num,
rslt); cnt++;
}
}*/
auto Out = param.Out();
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = out_float->data<float>();
args.image.channels = (uint32_t)(input_x->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = Out->data<half>();
args.output.scale_address = Out->scale;
fpga::PerformBypass(args);
}
}
} // namespace operators
} // namespace paddle_mobile
......
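Compute() above round-trips the data: PerformBypass converts device fp16 to fp32, the broadcast add runs on the CPU, and a second bypass converts the sum back to fp16. For reference, a standalone scalar binary16-to-binary32 conversion, the operation fpga::fp16_2_fp32 performs in the fetch kernel later in this commit (a sketch, not the library code; handles normals, subnormals, infinities and NaN):

#include <cstdint>
#include <cstring>

float fp16_to_fp32(uint16_t h) {
  const uint32_t sign = (h & 0x8000u) << 16;
  uint32_t exp = (h >> 10) & 0x1Fu;
  uint32_t mant = h & 0x3FFu;
  uint32_t bits;
  if (exp == 0) {
    if (mant == 0) {
      bits = sign;  // signed zero
    } else {
      exp = 127 - 15 + 1;  // renormalize a subnormal
      while ((mant & 0x400u) == 0) {
        mant <<= 1;
        --exp;
      }
      bits = sign | (exp << 23) | ((mant & 0x3FFu) << 13);
    }
  } else if (exp == 31) {
    bits = sign | 0x7F800000u | (mant << 13);  // infinity or NaN
  } else {
    bits = sign | ((exp - 15 + 127) << 23) | (mant << 13);
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}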
......@@ -21,18 +21,23 @@ namespace operators {
template <>
bool ElementwiseAddReluKernel<FPGA, float>::Init(
ElementwiseAddReluParam<FPGA> *param) {
bool relu_enabled = true;
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
auto input_x_ptr = input_x->data<float>();
auto input_y_ptr = input_y->data<float>();
int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]);
fpga::format_fp16_ofm(out, aligned_channel_num);
auto out_ptr = out->mutable_data<float>();
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<half>();
fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.relu_enabled = relu_enabled;
// ewaddArgs.relu_enabled = relu_enabled;
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.const0 = 0x3c00; // =1
ewaddArgs.const1 = 0x3c00; // =1
ewaddArgs.image0.address = input_x_ptr;
......@@ -51,6 +56,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = out_ptr;
fpga::expand_EW_arg(&ewaddArgs);
param->SetFpgaArgs(ewaddArgs);
return true;
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#include "operators/kernel/elementwise_mul_kernel.h"
#include "operators/math/elementwise_op_function.h"
namespace paddle_mobile {
namespace operators {
template <typename T>
struct MulFunctor {
inline T operator()(T a, T b) const { return a * b; }
};
template <>
bool ElementwiseMulKernel<FPGA, float>::Init(ElementwiseMulParam<FPGA> *param) {
param->float_input_x.Resize(param->InputX()->dims());
param->float_input_x.init(type_id<float>().hash_code());
fpga::format_fp32_ofm(&(param->float_input_x));
param->float_out.Resize(param->InputX()->dims());
param->float_out.init(type_id<float>().hash_code());
fpga::format_fp32_ofm(&(param->float_out));
auto *out = param->Out();
fpga::format_fp16_ofm(out);
return true;
}
template <>
void ElementwiseMulKernel<FPGA, float>::Compute(
const ElementwiseMulParam<FPGA> &param) {
auto input_x = const_cast<LoDTensor *>(param.InputX());
auto input_x_float = const_cast<Tensor *>(&(param.float_input_x));
// auto input_x_32_ptr =
// const_cast<float*>(param.float_input_x.data<float>());
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = input_x->data<half>();
args.image.channels = (uint32_t)(input_x->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = input_x_float->data<float>();
args.output.scale_address = input_x_float->scale;
fpga::PerformBypass(args);
fpga::fpga_invalidate(args.output.address,
input_x->fpga_data_num * sizeof(float));
auto input_y = param.InputY();
int axis = param.Axis();
auto out_float = const_cast<Tensor *>(&(param.float_out));
ElementwiseComputeEx<MulFunctor<float>, float>(
input_x_float, input_y, axis, MulFunctor<float>(), out_float);
fpga::fpga_flush(out_float->data<float>(),
input_x->fpga_data_num * sizeof(float));
Tensor *Out = param.Out();
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = out_float->data<float>();
args.image.channels = (uint32_t)(Out->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = Out->data<half>();
args.output.scale_address = Out->scale;
fpga::PerformBypass(args);
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -13,44 +13,94 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/feed_kernel.h"
#include "fpga/V2/filter.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
Tensor *output = param->Out();
int aligned_channel = fpga::get_aligned_channel_num(output->dims()[1]);
fpga::format_fp16_ofm(output, aligned_channel);
auto output = param->Out();
int col = param->Col();
DLOG << "col = " << col;
auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
input->init(type_id<float>().hash_code());
input->Resize(output->dims());
if (output->dims().size() != 4) {
return true;
}
fpga::format_fp16_ofm(output);
return true;
}
template <>
void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
auto input =
reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param.Out();
auto output_ptr = output->data<float>();
auto channel = input->dims()[1];
uint32_t aligned_channels =
fpga::filter::calc_aligned_channel((int)channel); // NOLINT
auto output = param.Out();
int col = param.Col();
auto input = const_cast<LoDTensor *>(&param.InputX()->at(col));
kTypeId_t input_type = input->type();
if (input_type == type_id<float>()) {
input->init(type_id<float>().hash_code());
} else {
input->init(type_id<int8_t>().hash_code());
}
input->Resize(output->dims());
if (output->dims().size() != 4) {
size_t size = output->numel() * sizeof(float);
auto output_ptr = output->data<float>();
auto input_ptr = input->data<float>();
auto external_ptr = reinterpret_cast<float *>(input->external_data);
float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
memcpy(output_ptr, p_data, size);
input->external_data = nullptr;
return;
}
fpga::format_image(input);
auto output_ptr = output->data<half>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
if (input_type == type_id<float>()) {
auto input_ptr = input->data<float>();
auto external_ptr = reinterpret_cast<float *>(input->external_data);
float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = p_data;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
} else {
auto input_ptr = input->data<int8_t>();
auto external_ptr = reinterpret_cast<int8_t *>(input->external_data);
int8_t *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = reinterpret_cast<void *>(input_ptr);
args.image.channels = aligned_channels;
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
args.input_data_type = fpga::DATA_TYPE_INT8;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = p_data;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
}
}
template class FeedKernel<FPGA, float>;
......
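Both feed branches above program the bypass with LAYOUT_CHW in and LAYOUT_HWC out, so the engine transposes the layout while casting fp32 or int8 input to fp16. The layout change alone, in plain float, is just (illustrative sketch):

// CHW -> HWC transpose for a single image.
void chw_to_hwc(const float* chw, int C, int H, int W, float* hwc) {
  for (int c = 0; c < C; ++c)
    for (int h = 0; h < H; ++h)
      for (int w = 0; w < W; ++w)
        hwc[(h * W + w) * C + c] = chw[(c * H + h) * W + w];
}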
......@@ -11,22 +11,116 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/fetch_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX());
int col = param->Col();
DLOG << "col = " << col;
auto output = &(param->Out()->at(col));
if (input->type() == type_id<float>()) {
return true;
}
output->init(type_id<float>().hash_code());
output->Resize(input->dims());
fpga::format_fp32_ofm(output);
int outC = 1;
int outH = 1;
int outW = 1;
if (output->dims().size() == 4) {
outC = output->dims()[1];
outH = output->dims()[2];
outW = output->dims()[3];
} else { // 2
outC = output->dims()[1];
}
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
if (alignedCW != unalignedCW) {
param->aligned_out.Resize(input->dims());
param->aligned_out.mutable_data<float>(input->dims());
fpga::fpga_flush(param->aligned_out.data<float>(),
outH * unalignedCW * sizeof(float));
}
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = input->data<half>();
args.image.channels = (uint32_t)(input->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output->data<float>();
args.output.scale_address = output->scale;
param->fpga_bypass_args = args;
return true;
}
void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
int alignCW = paddle_mobile::fpga::align_to_x(input_c * input_w, 16);
int dealignCW = input_c * input_w;
for (int h = 0; h < input_h; ++h) {
auto input_offset = h * alignCW;
auto output_offset = h * dealignCW;
memcpy((dst + output_offset), (src + input_offset),
dealignCW * sizeof(float));
}
}
template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
param.Out()->ShareDataWith(*(param.InputX()));
}
auto input = const_cast<LoDTensor *>(param.InputX());
int col = param.Col();
auto output = &param.Out()->at(col);
if (input->type() == type_id<float>()) {
output->ShareDataWith(*input);
return;
}
fpga::BypassArgs args = param.fpga_bypass_args;
auto input_address = (input->data<half>());
args.image.address = static_cast<void *>(input_address);
float *outdata_ptr =
reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
const int num_th = 32;
if (output->fpga_data_num < num_th) {
fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half));
for (int idx = 0; idx < product(input->dims()); ++idx) {
outdata_ptr[idx] = fpga::fp16_2_fp32(input_address[idx]);
}
return;
}
fpga::PerformBypass(args);
int outC = 1;
int outH = 1;
int outW = 1;
if (output->dims().size() == 4) {
outC = output->dims()[1];
outH = output->dims()[2];
outW = output->dims()[3];
} else { // 2
outC = output->dims()[1];
}
fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
output->fpga_data_num * sizeof(float));
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
if (unalignedCW != alignedCW) {
auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
dealign(outdata_ptr, aligned_ptr, outC, outH, outW);
memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
}
}
template class FetchKernel<FPGA, float>;
} // namespace operators
......
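The dealign() helper above strips the row padding introduced by align_to_x(C * W, IMAGE_ALIGNMENT). A self-contained equivalent, assuming IMAGE_ALIGNMENT == 16 as in the kernel:

#include <cstring>

// Each row holds C*W meaningful floats padded to the next multiple of 16;
// compaction copies only the meaningful prefix of every row.
void compact_rows(const float* src, float* dst, int c, int h, int w) {
  const int unalignedCW = c * w;
  const int alignedCW = (unalignedCW + 15) / 16 * 16;  // align_to_x(cw, 16)
  for (int row = 0; row < h; ++row)
    std::memcpy(dst + row * unalignedCW, src + row * alignedCW,
                unalignedCW * sizeof(float));
}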
......@@ -20,15 +20,18 @@ namespace operators {
template <>
bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
bool relu_enabled = false;
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto filter = const_cast<Tensor *>(param->InputY());
auto filter = const_cast<LoDTensor *>(param->InputY());
const Tensor *input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>();
auto out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number");
// PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
// "Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
......@@ -47,11 +50,16 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
out->Resize(framework::make_ddim({1, channel, 1, 1}));
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
fpga::format_fc_data(filter, out, &bs_ptr);
float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
0, 0, bs_ptr);
fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable,
leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
......
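The FC kernels above keep the weight as a (chw x num) matrix and reshape it to a {num, chw/(h*w), h, w} filter so the layer runs as a convolution producing a 1 x 1 map per output neuron. The arithmetic reproduced is the usual out[n] = sum_i W[i][n] * x[i] + z[n]; a reference sketch, assuming the row-major (chw x num) element order implied by the dims above:

#include <vector>

// Plain fully-connected layer: x has chw elements, W is chw x num
// (row-major), z is the per-output bias.
std::vector<float> fc_ref(const std::vector<float>& x,
                          const std::vector<float>& W,
                          const std::vector<float>& z) {
  const size_t chw = x.size(), num = z.size();
  std::vector<float> out(z);  // start from the bias
  for (size_t i = 0; i < chw; ++i)
    for (size_t n = 0; n < num; ++n)
      out[n] += W[i * num + n] * x[i];
  return out;
}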
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FCRELU_OP
#include "operators/kernel/fc_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto filter = const_cast<LoDTensor *>(param->InputY());
const Tensor *input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>();
auto out = param->Out();
// PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
// "Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = input_z_ptr[i];
}
int num = (uint32_t)filter->dims()[1];
int chw = (uint32_t)filter->dims()[0];
PADDLE_MOBILE_ENFORCE(
chw == input_x->numel(),
"Filter element num should be equal to IFM element num");
int height = (uint32_t)input_x->dims()[2];
int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width;
out->Resize(framework::make_ddim({1, channel, 1, 1}));
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable,
leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
template <>
void FusionFcReluKernel<FPGA, float>::Compute(
const FusionFcReluParam<FPGA> &param) {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PAD2D_OP
#include "operators/kernel/pad2d_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool Pad2DKernel<FPGA, float>::Init(Pad2DParam<FPGA> *param) {
Tensor *output = param->output_;
fpga::format_fp16_ofm(output);
return true;
}
void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) {
auto input_data = (input->data<half>());
auto output_data = (output->data<half>());
auto input_c = input->dims()[1];
auto input_h = input->dims()[2];
auto input_w = input->dims()[3];
auto output_c = output->dims()[1];
auto output_w = output->dims()[3];
auto copysize = input_c * input_w;
for (int h = 0; h < input_h; ++h) {
auto input_offset = h * input_c * input_w;
auto output_offset = h * paddle_mobile::fpga::align_to_x(
output_c * output_w, IMAGE_ALIGNMENT);
memcpy((output_data + output_offset), (input_data + input_offset),
copysize * sizeof(half));
}
}
template <>
void Pad2DKernel<FPGA, float>::Compute(const Pad2DParam<FPGA> &param) {
auto in_x = param.input_;
auto out = param.output_;
fpga::fpga_invalidate((void *)in_x->data<half>(), // NOLINT
in_x->numel() * sizeof(half));
pad2dFunc(in_x, out);
(out->scale)[0] = (in_x->scale)[0];
(out->scale)[1] = (in_x->scale)[1];
DLOG << (out->scale)[0];
DLOG << (out->scale)[1];
size_t outputSize =
out->dims()[2] *
paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]),
IMAGE_ALIGNMENT) *
sizeof(half);
fpga::fpga_flush(out->data<half>(), outputSize);
}
} // namespace operators
} // namespace paddle_mobile
#endif // PAD2D_OP
......@@ -21,18 +21,30 @@ namespace operators {
template <>
bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
auto *input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
Tensor *output = param->Output();
int aligned_channel_num =
fpga::get_aligned_channel_num((int)output->dims()[1]); // NOLINT
fpga::format_fp16_ofm(output, aligned_channel_num);
auto output_ptr = output->mutable_data<float>();
auto *input = const_cast<LoDTensor *>(param->Input());
auto *output = param->Output();
vector<int> ksize = param->Ksize();
vector<int> strides = param->Strides();
vector<int> paddings = param->Paddings();
std::string pooling_type = param->PoolingType();
if (input->type() == type_id<float>()) {
int channels = input->dims()[1];
int height = input->dims()[2];
int width = input->dims()[3];
int num = input->dims()[0];
int out_width = (width + 2 * paddings[1] - ksize[1]) / strides[1] + 1;
int out_height = (height + 2 * paddings[0] - ksize[0]) / strides[0] + 1;
framework::DDim dim =
framework::make_ddim({num, channels, out_height, out_width});
output->mutable_data<float>(dim);
return true;
}
auto input_ptr = input->data<half>();
fpga::format_fp16_ofm(output);
auto output_ptr = output->mutable_data<half>();
fpga::PoolingArgs poolArgs = {0};
poolArgs.mode = pooling_type == "max" ? 0 : 1; // max:0, avg:1
poolArgs.kernel_reciprocal =
......@@ -56,6 +68,34 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
template <>
void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
auto *input = const_cast<LoDTensor *>(param.Input());
if (input->type() == type_id<float>()) {
auto *output = param.Output();
auto in = input->data<float>();
auto N = input->dims()[0];
output->Resize(
{N, output->dims()[1], output->dims()[2], output->dims()[3]});
auto len = output->numel();
auto out = output->mutable_data<float>();
int C = input->dims()[1], H = input->dims()[2], // N = input->dims()[0],
W = input->dims()[3];
int HW = H * W, CHW = C * H * W, WC = W * C;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
out[n * C + c] = 0;
for (int h = 0; h < H; h++) {
for (int w = 0; w < W; w++) {
out[n * C + c] += in[n * CHW + h * WC + w * C +
c]; // in[n * CHW + c * HW + h * W + w]; //
}
}
out[n * C + c] /= HW;
}
}
return;
}
fpga::ComputeFpgaPool(param.FpgaArgs());
}
} // namespace operators
......
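The float fallback in PoolKernel::Init() above sizes its output with the standard pooling rule, shown here as a one-liner for reference:

// out = (in + 2 * pad - k) / stride + 1, integer (floor) division.
// E.g. pool_out_dim(224, 7, 0, 1) == 218.
inline int pool_out_dim(int in, int k, int pad, int stride) {
  return (in + 2 * pad - k) / stride + 1;
}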
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.