Merge pull request #1367 from qnqinan/develop

add fpga dwconv and reshape op and refactor code of fpga conv and dec… fixed#1366

Merge pull request #1367 from qnqinan/develop
add fpga dwconv and reshape op and refactor code of fpga conv and dec… fixed#1366
ff74556f · zhangyang0701 · GitHub · c4531b38 · 96242b6e · ff74556f
21 changed files
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -140,6 +140,16 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
                        max_value);
  filter_tensor->reset_data_ptr(new_data);
 }
+// Replaces the filter tensor's storage with the buffer produced by
+// filter::format_dwconv_filter. A float copy of dims[0] * dims[2] * dims[3]
+// elements is made in fpga_malloc'd memory, passed to the formatter together
+// with scale_ptr, and the resulting pointer is installed with reset_data_ptr.
+void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
+  auto shape = filter_tensor->dims();
+  auto filter_num = shape[0];
+  auto kernel_h = shape[2];
+  auto kernel_w = shape[3];
+  auto src = filter_tensor->data<float>();
+  size_t byte_size = filter_num * kernel_h * kernel_w * sizeof(float);
+  auto formatted = (float *)fpga_malloc(byte_size);  // NOLINT
+  fpga_copy(formatted, src, byte_size);
+  // The formatter takes the pointer by address and may swap in a new buffer.
+  filter::format_dwconv_filter(&formatted, filter_num, kernel_h, kernel_w,
+                               scale_ptr);
+  filter_tensor->reset_data_ptr(formatted);
+}

 void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
@@ -186,6 +196,9 @@ void format_bias_scale_array(float **bias_scale_array,
  bias_scale::format_bias_scale_array(bias_scale_array,
                                      element_num_per_division, num);
 }
+// Public entry point that delegates bias-array formatting to the
+// bias_scale namespace implementation.
+void format_bias_array(float **bias_array, int num) {
+  return bias_scale::format_bias_array(bias_array, num);
+}

 void format_concat_output(framework::Tensor *out, int height, int width,
                          int image_num, uint32_t *channel_num) {
@@ -200,7 +213,36 @@ void format_concat_output(framework::Tensor *out, int height, int width,
  out->Resize(ddim);
  out->reset_data_ptr(data_ptr);
 }
+// Runs the formatting steps shared by the convolution kernels, in order:
+// find the filter's max value and format the filter with it, format the
+// bias/scale array for ofm_tensor->dims()[1] entries, then call
+// format_fp16_ofm on the output tensor.
+void format_conv_data(framework::Tensor *filter_tensor,
+                      framework::Tensor *ofm_tensor, float **bs_ptr,
+                      int group) {
+  const float filter_max = fpga::filter_find_max(filter_tensor);
+  fpga::format_filter(filter_tensor, filter_max, group);
+  const int per_div = fpga::get_filter_num_per_div(filter_tensor, group);
+  const auto out_channels = ofm_tensor->dims()[1];
+  fpga::format_bias_scale_array(bs_ptr, per_div, out_channels);
+  fpga::format_fp16_ofm(ofm_tensor);
+}
+// Deconvolution counterpart of format_conv_data: formats the filter with its
+// max value, formats the bias/scale array for dims()[1] * sub_conv_n entries
+// of the output tensor, then calls format_fp16_ofm on the output tensor.
+void format_deconv_data(framework::Tensor *filter_tensor,
+                        framework::Tensor *ofm_tensor, float **bs_ptr,
+                        int group, int sub_conv_n) {
+  const float filter_max = filter_find_max(filter_tensor);
+  format_deconv_filter(filter_tensor, filter_max, group, sub_conv_n);
+  const int per_div =
+      get_deconv_filter_num_per_div(filter_tensor, group, sub_conv_n);
+  const int out_channels = ofm_tensor->dims()[1];
+  format_bias_scale_array(bs_ptr, per_div, out_channels * sub_conv_n);
+  format_fp16_ofm(ofm_tensor);
+}

+// Depthwise-convolution formatting: formats the filter with the per-filter
+// scales in scale_ptr, formats the bias array for ofm_tensor->dims()[1]
+// entries (the array behind *bias_ptr is replaced), then calls
+// format_fp16_ofm on the output tensor.
+void format_dwconv_data(framework::Tensor *filter_tensor,
+                        framework::Tensor *ofm_tensor, float *scale_ptr,
+                        float **bias_ptr) {
+  format_dwconv_filter(filter_tensor, scale_ptr);
+  auto out_channels = ofm_tensor->dims()[1];
+  format_bias_array(bias_ptr, out_channels);
+  format_fp16_ofm(ofm_tensor);
+}
 void expand_conv_arg(ConvArgs *arg) {
  ConvArgs args = *arg;

@@ -360,7 +402,6 @@ void expand_EW_arg(EWAddArgs *arg) {
  (*arg).driver.output_address_phy = output_address_phy;
  (*arg).driver.coefficient = coefficient;
  (*arg).driver.cmd = cmd;
-
 }  // expand_EW_arg

 void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
@@ -399,7 +440,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
  auto channel = (int)out->dims()[1];  // NOLINT
  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
  int element_num = get_aligned_filter_element_num(
-      (int)(filter->dims()[1] * filter->dims()[2] * filter->dims()[3]));
+      (int)(filter->dims()[1] * filter->dims()[2] *  // NOLINT
+            filter->dims()[3]));

  for (int i = 0; i < n; i++) {
    arg->conv_arg[i].relu_enabled = relu_enabled;
@@ -424,8 +466,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
        element_num *
        align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) *
        sizeof(int8_t);
-    auto filter_head =
-        &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
+    auto filter_head = &(
+        (int8_t *)filter_ptr)[i * element_num * filter_num_per_div];  // NOLINT
    arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
    memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
    fpga_flush(arg->conv_arg[i].filter_address, filter_size);
@@ -441,11 +483,12 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
    if (n > 1) {
      arg->conv_arg[i].output.scale_address =
          (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
-      arg->conv_arg[i].output.address = fpga_malloc(
-          out->dims()[2] *
-          align_to_x((int)(out->dims()[3] * arg->conv_arg[i].filter_num),
-                     IMAGE_ALIGNMENT) *
-          sizeof(half));
+      arg->conv_arg[i].output.address =
+          fpga_malloc(out->dims()[2] *
+                      align_to_x((int)(out->dims()[3] *  // NOLINT
+                                       arg->conv_arg[i].filter_num),
+                                 IMAGE_ALIGNMENT) *
+                      sizeof(half));
    } else {
      arg->conv_arg[i].output.scale_address = out->scale;
      arg->conv_arg[i].output.address = out_ptr;
@@ -474,22 +517,23 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
  arg->sub_conv_num = (uint32_t)stride_h;
  arg->filter_num = (uint32_t)filter->dims()[0];
  uint32_t sub_conv_num = arg->sub_conv_num;
-  int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],
-                                                   padding_w, stride_w);
+  int sub_pad =
+      deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],  // NOLINT
+                                         padding_w, stride_w);
  auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis(
-      (int)filter->dims()[3], stride_w);
+      (int)filter->dims()[3], stride_w);  // NOLINT

  auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
-      (int)input->dims()[3], sub_pad, sub_filter_width);
+      (int)input->dims()[3], sub_pad, sub_filter_width);  // NOLINT
  auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
-      (int)input->dims()[2], sub_pad, sub_filter_width);
+      (int)input->dims()[2], sub_pad, sub_filter_width);  // NOLINT

  arg->sub_output_width = (uint32_t)sub_output_width;
  arg->sub_output_height = (uint32_t)sub_output_height;
  arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
-      stride_w, (int)filter->dims()[3], padding_w);
+      stride_w, (int)filter->dims()[3], padding_w);  // NOLINT

-  auto sub_channels = (int)input->dims()[1];
+  auto sub_channels = (int)input->dims()[1];  // NOLINT
  uint32_t omit_size = arg->omit_size;
  int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
  int sub_filter_num = sub_conv_num * (arg->filter_num);
@@ -499,7 +543,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
  fpga::format_fp16_ofm(out, dims_out_new);
  auto out_ptr = out->data<float>();
  arg->output.address =
-      (half *)out_ptr +
+      (half *)out_ptr +  // NOLINT
      omit_size * sizeof(half) *
          (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
  arg->output.scale_address = out->scale;
@@ -510,31 +554,31 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
  uint32_t split_num =
      group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1;

-  arg->split_conv_args =
-      (SplitConvArgs *)fpga_malloc(sub_conv_num * sizeof(SplitConvArgs));
+  arg->split_conv_args = (SplitConvArgs *)fpga_malloc(  // NOLINT
+      sub_conv_num * sizeof(SplitConvArgs));            // NOLINT
  for (int i = 0; i < sub_conv_num; ++i) {
    arg->split_conv_args[i].filter_num =
        (arg->sub_conv_num) * (arg->filter_num);
    arg->split_conv_args[i].group_num = (uint32_t)group_num;
    arg->split_conv_args[i].split_num = split_num;
    arg->split_conv_args[i].conv_arg =
-        (ConvArgs *)fpga_malloc(split_num * sizeof(ConvArgs));
+        (ConvArgs *)fpga_malloc(split_num * sizeof(ConvArgs));  // NOLINT

    arg->split_conv_args[i].concat_arg.height = sub_output_height;
    arg->split_conv_args[i].concat_arg.width = sub_output_width;
    arg->split_conv_args[i].concat_arg.image_num = split_num;
    arg->split_conv_args[i].concat_arg.images_in =
-        (half **)fpga_malloc(split_num * sizeof(half *));
+        (half **)fpga_malloc(split_num * sizeof(half *));  // NOLINT
    arg->split_conv_args[i].concat_arg.scales_in =
-        (float **)fpga_malloc(split_num * sizeof(float *));
+        (float **)fpga_malloc(split_num * sizeof(float *));  // NOLINT
    arg->split_conv_args[i].concat_arg.channel_num =
-        (uint32_t *)fpga_malloc(split_num * sizeof(uint32_t));
+        (uint32_t *)fpga_malloc(split_num * sizeof(uint32_t));  // NOLINT
  }

  auto filter_num_per_div =
      (uint32_t)get_deconv_filter_num_per_div(filter, group_num, stride_w);
  int element_num = get_aligned_filter_element_num(
-      (int)(sub_channels * sub_filter_width * sub_filter_width));
+      (int)(sub_channels * sub_filter_width * sub_filter_width));  // NOLINT

  int chw = sub_channels * sub_filter_width * sub_filter_width;
  int division_capacity = filter::calc_division_capacity(chw);
@@ -558,14 +602,15 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
      out_addr_offset = 0;

    } else {
-      auto ptr_output = (half *)out_ptr;
+      auto ptr_output = (half *)out_ptr;  // NOLINT
      out_addr_offset =
          sizeof(half) * (sub_conv_num - 1 - i) *
          (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));

-      arg->split_conv_args[i].output.address = (void *)(ptr_output);
+      arg->split_conv_args[i].output.address = (void *)(ptr_output);  // NOLINT

-      auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
+      auto ptr_output_scale =
+          (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
      arg->split_conv_args[i].output.scale_address = ptr_output_scale;
    }

@@ -609,9 +654,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
          align_to_x(arg->split_conv_args[i].conv_arg[j].filter_num,
                     FILTER_NUM_ALIGNMENT) *
          sizeof(int8_t);
-      auto filter_head =
-          &((int8_t *)filter_ptr)[j * element_num * filter_num_per_div +
-                                  i * filter_sub_conv_offset];
+      auto filter_head = &((
+          int8_t *)filter_ptr)[j * element_num * filter_num_per_div +  // NOLINT
+                               i * filter_sub_conv_offset];
      arg->split_conv_args[i].conv_arg[j].filter_address =
          fpga_malloc(filter_size);
      memcpy(arg->split_conv_args[i].conv_arg[j].filter_address, filter_head,
@@ -634,10 +679,12 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
        arg->split_conv_args[i].conv_arg[j].output.scale_address =
            arg->split_conv_args[i].output.scale_address;
      } else {
-        auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
+        auto ptr_output =
+            (half *)fpga_malloc(conv_output_size * sizeof(half));  // NOLINT
        arg->split_conv_args[i].conv_arg[j].output.address =
-            (void *)((half *)ptr_output);
-        auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
+            (void *)((half *)ptr_output);  // NOLINT
+        auto ptr_output_scale =
+            (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
        arg->split_conv_args[i].conv_arg[j].output.scale_address =
            ptr_output_scale;
      }
@@ -660,5 +707,30 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
  fpga_free(bs_ptr);
 }  // fill_deconv_arg

+// Fills a DWconvArgs descriptor from the given tensors and parameters.
+// Data pointers are taken from the filter, input and output tensors; kernel
+// size comes from filter dims 2 and 3, image geometry from input dims 1..3.
+// bias_ptr is stored as-is; this function does not allocate or copy data.
+void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
+                     framework::Tensor *out, framework::Tensor *filter,
+                     bool relu_enabled, int stride_h, int stride_w,
+                     int padding_h, int padding_w, float *bias_ptr) {
+  auto filter_data = filter->data<float>();
+  auto input_data = input->data<float>();
+  auto output_data = out->mutable_data<float>();
+  // Activation flag, bias and filter buffers.
+  arg->relu_enabled = relu_enabled;
+  arg->bias_address = bias_ptr;
+  arg->filter_address = filter_data;
+  // Kernel geometry.
+  auto &kernel = arg->kernel;
+  kernel.height = filter->dims()[2];
+  kernel.width = filter->dims()[3];
+  kernel.stride_h = stride_h;
+  kernel.stride_w = stride_w;
+  // Input image description.
+  auto &image = arg->image;
+  image.address = input_data;
+  image.channels = (uint32_t)input->dims()[1];
+  image.height = (uint32_t)input->dims()[2];
+  image.width = (uint32_t)input->dims()[3];
+  image.pad_height = padding_h;
+  image.pad_width = padding_w;
+  image.scale_address = input->scale;
+  // Output image description.
+  auto &output = arg->output;
+  output.address = output_data;
+  output.scale_address = out->scale;
+}  // end dwconv arg fill
+
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/api.h
+++ b/src/fpga/V1/api.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <string>
 #include "fpga/common/fpga_common.h"
 #include "fpga/common/pe.h"
 #include "framework/tensor.h"
@@ -40,6 +41,7 @@ void format_filter(framework::Tensor* filter_tensor, float max_value,
 void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
 void format_bias_scale_array(float** bias_scale_array,
                             int element_num_per_division, int num);
+void format_bias_array(float** bias_array, int num);
 void format_concat_output(framework::Tensor* out, int height, int width,
                          int image_num, uint32_t* channel_num);

@@ -51,16 +53,28 @@ void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input,
                     framework::Tensor* out, framework::Tensor* filter,
                     bool relu_enabled, int group_num, int stride_h,
                     int stride_w, int padding_h, int padding_w, float* bs_ptr);
+void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
+                     framework::Tensor* out, framework::Tensor* filter,
+                     bool relu_enabled, int stride_h, int stride_w,
+                     int padding_h, int padding_w, float* bias_ptr);

 void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
                          int group_num, int stride);
-
+void format_dwconv_filter(framework::Tensor* filter_tensor, float* scale_ptr);
+void format_conv_data(framework::Tensor* filter_tensor,
+                      framework::Tensor* ofm_tensor, float** bs_ptr, int group);
+void format_deconv_data(framework::Tensor* filter_tensor,
+                        framework::Tensor* ofm_tensor, float** bs_ptr,
+                        int group, int sub_conv_n);
+void format_dwconv_data(framework::Tensor* filter_tensor,
+                        framework::Tensor* ofm_tensor, float* scale_ptr,
+                        float** bias_ptr);
 template <typename Dtype>
 void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
  float data;
  std::ofstream out(filename.c_str());
  for (int i = 0; i < dataSize; ++i) {
-    data = (((Dtype*)buffer)[i]);
+    data = (((Dtype*)buffer)[i]);  // NOLINT
    out << data << std::endl;
  }
  out.close();

--- a/src/fpga/V1/bias_scale.cpp
+++ b/src/fpga/V1/bias_scale.cpp
@@ -82,6 +82,25 @@ void format_bias_scale_array(float **bias_scale_array,
  interleave(bias_scale_array, div_num * element_num_after_division);
  fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
 }
+// Re-allocates *bias_array so that it holds a multiple of BIAS_NUM_ALIGNMENT
+// floats.
+//
+// bias_array: in/out pointer to a buffer of `num` floats. The old buffer is
+//             released with fpga_free and *bias_array is pointed at the new
+//             fpga_malloc'd, aligned buffer.
+// num:        number of valid bias values in the incoming buffer.
+//
+// When num is smaller than one alignment group the padding is filled by
+// repeating the valid values cyclically; otherwise the padding stays zero.
+// NOTE(review): unlike format_bias_scale_array, no fpga_flush is issued on
+// the new buffer here - confirm whether the consumer needs one.
+void format_bias_array(float **bias_array, int num) {
+  float *ptr_unaligned = *bias_array;
+  int num_after_align = align_to_x(num, BIAS_NUM_ALIGNMENT);
+  float *ptr_aligned =
+      (float *)fpga_malloc(num_after_align * sizeof(float));  // NOLINT
+  // Zero first so any padding not overwritten below is well defined.
+  memset(ptr_aligned, 0, num_after_align * sizeof(float));
+  memcpy(ptr_aligned, ptr_unaligned, num * sizeof(float));
+  if (num < BIAS_NUM_ALIGNMENT) {
+    for (int i = num; i < num_after_align; i++) {
+      ptr_aligned[i] = ptr_unaligned[i % num];
+    }
+  }
+  fpga_free(ptr_unaligned);
+  *bias_array = ptr_aligned;
+}

 }  // namespace bias_scale
 }  // namespace fpga

--- a/src/fpga/V1/bias_scale.h
+++ b/src/fpga/V1/bias_scale.h
@@ -22,6 +22,7 @@ void align_element(float** data_in, int num_per_div_before_alignment, int num);
 void interleave(float** data_in, int num_after_alignment);
 void format_bias_scale_array(float** bias_scale_array,
                             int element_num_per_division, int num);
+void format_bias_array(float** bias_array, int num);

 }  // namespace bias_scale
 }  // namespace fpga

--- a/src/fpga/V1/filter.cpp
+++ b/src/fpga/V1/filter.cpp
@@ -277,7 +277,84 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
  fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
                                 num_after_alignment * sizeof(char));
 }
+// Reorders an int16 buffer from [num][height][width] order into
+// [height][width][num] order. A new buffer is obtained from fpga_malloc,
+// *data_in is pointed at it, and the original buffer is released with
+// fpga_free.
+void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
+  int16_t *src = *data_in;
+  int16_t *dst =
+      (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t));  // NOLINT
+  // The source is consumed sequentially, so a running read pointer suffices.
+  const int16_t *reader = src;
+  for (int n = 0; n < num; n++) {
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        dst[h * width * num + w * num + n] = *reader++;
+      }
+    }
+  }
+  *data_in = dst;
+  fpga_free(src);
+}
+
+// Pads every one of the `height` rows of an int16 buffer so that a row holds
+// align_to_x(num * width, FILTER_ELEMENT_ALIGNMENT) elements.
+//
+// data_in: in/out pointer to the buffer. If rows are already aligned nothing
+//          happens; otherwise *data_in is replaced by a new fpga_malloc'd
+//          buffer and the old buffer is released.
+// Rows of at least FILTER_ELEMENT_ALIGNMENT elements are zero padded; shorter
+// rows are filled by repeating the row contents cyclically.
+void align_element_nw(int16_t **data_in, int num, int height, int width) {
+  int unalign_nw = num * width;
+  int align_nw = align_to_x(num * width, FILTER_ELEMENT_ALIGNMENT);
+  if (unalign_nw == align_nw) {
+    return;
+  }
+  int16_t *tmp = *data_in;
+  int num_element = height * align_nw;
+  int16_t *data_tmp =
+      (int16_t *)fpga_malloc(num_element * sizeof(int16_t));  // NOLINT
+  memset(data_tmp, 0, num_element * sizeof(int16_t));
+  if (unalign_nw >= FILTER_ELEMENT_ALIGNMENT) {
+    // Copy each row; the tail keeps the zeros written by memset.
+    for (int h = 0; h < height; h++) {
+      int offset_unalign = h * unalign_nw;
+      int offset_align = h * align_nw;
+      for (int nw = 0; nw < unalign_nw; nw++) {
+        data_tmp[offset_align + nw] = tmp[offset_unalign + nw];
+      }
+    }
+  } else {
+    // Row shorter than one alignment unit: wrap around the source row.
+    for (int h = 0; h < height; h++) {
+      int offset_unalign = h * unalign_nw;
+      int offset_align = h * align_nw;
+      for (int nw = 0; nw < align_nw; nw++) {
+        data_tmp[offset_align + nw] = tmp[offset_unalign + nw % unalign_nw];
+      }
+    }
+  }
+  *data_in = data_tmp;
+  // The incoming buffer is fpga_malloc'd (see convert_to_hwn), so it has to
+  // be released with fpga_free rather than free().
+  fpga_free(tmp);
+}
+// Multiplies each of the `num` filters (height * width floats each) by its
+// own entry of scale_ptr and converts every product with fp32_2_fp16 into an
+// int16_t slot of a new fpga_malloc'd buffer. *data_in is redirected to that
+// buffer (cast to float*) and the original float buffer is freed.
+void quantize_to_fp16(float **data_in, int num, int height, int width,
+                      float *scale_ptr) {
+  float *fp32_src = *data_in;
+  int size = num * height * width;

+  int16_t *fp16_dst = (int16_t *)fpga_malloc(size * sizeof(int16_t));  // NOLINT
+  for (int n = 0; n < num; n++) {
+    const float scale_val = scale_ptr[n];
+    const int base = n * height * width;
+    // One filter occupies a contiguous run of height * width elements.
+    for (int k = 0; k < height * width; k++) {
+      fp16_dst[base + k] = fp32_2_fp16(fp32_src[base + k] * scale_val);
+    }
+  }
+  *data_in = (float *)fp16_dst;  // NOLINT
+  fpga_free(fp32_src);
+}
+// Converts a depthwise filter in place through three steps: scale and
+// convert to 16-bit values (quantize_to_fp16), reorder to height/width/num
+// (convert_to_hwn), and pad each row to FILTER_ELEMENT_ALIGNMENT
+// (align_element_nw). *data_in ends up pointing at the final int16_t buffer,
+// which is then flushed.
+void format_dwconv_filter(float **data_in, int num, int height, int width,
+                          float *scale_ptr) {
+  quantize_to_fp16(data_in, num, height, width, scale_ptr);
+  int16_t **quantize_data = (int16_t **)data_in;  // NOLINT
+  convert_to_hwn(quantize_data, num, height, width);
+  align_element_nw(quantize_data, num, height, width);
+  // The buffer holds int16_t elements, so the flush length must be counted
+  // in int16_t units; sizeof(char) would cover only half of it.
+  fpga_flush(*quantize_data, align_to_x(num * width, FILTER_ELEMENT_ALIGNMENT) *
+                                 height * sizeof(int16_t));
+}
 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/filter.h
+++ b/src/fpga/V1/filter.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-
+#include <cstdint>
 namespace paddle_mobile {
 namespace fpga {
 namespace filter {
@@ -38,6 +38,13 @@ void convert_fc_filter(char** data_in, int num, int chw);
 void format_fc_filter(float** data_in, int num, int channel, int height,
                      int width, int group_num, float max);

+void convert_to_hwn(int16_t** data_in, int num, int height, int width);
+void align_element_nw(int16_t** data_in, int num, int height, int width);
+void quantize_to_fp16(float** data_in, int num, int height, int width,
+                      float* scale_ptr);
+void format_dwconv_filter(float** data_in, int num, int height, int width,
+                          float* scale_ptr);
+
 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/pe.cpp
+++ b/src/fpga/V1/pe.cpp
@@ -24,14 +24,13 @@ limitations under the License. */
 #include <time.h>
 #include <iomanip>
 #include <iostream>
-//#include <iostream>
 #endif

 namespace paddle_mobile {
 namespace fpga {

 using namespace driver;  // NOLINT
-using namespace std;
+using namespace std;     // NOLINT
 #define USE_RELU 1
 #define USE_BIAS 2

@@ -53,7 +52,6 @@ using namespace std;
 #define INTERRUPT_CONV 0x0004
 #define INTERRUPT_POOLING 0x0008
 #define INTERRUPT_EW 0x0010
-//#define INTERRUPT_RESIZE 0x0020

 /* Register offset */
 #define REG_INTERRUPT 0x000
@@ -73,9 +71,6 @@ using namespace std;
 #define REG_FLASH_STATUS 0x218
 #define REG_SN 0x220

-//#define REG_READ_SCALE
-//#define REG_WRITE_SCALE
-
 /*bypass*/
 #define REG_CONVERT_CMD 0x400
 #define REG_CONVERT_SRC_ADDR 0x408
@@ -236,8 +231,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
  reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
  reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
-  reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE);
-  reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE);
+  reg_writeq(*(uint64_t *)args.image.scale_address,  // NOLINT
+             REG_CONV_IMAGE_SCALE);
+  reg_writeq(*(uint64_t *)args.filter_scale_address,  // NOLINT
+             REG_CONV_FILTER_SCALE);
  reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
  reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
  reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
@@ -280,7 +277,6 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  return ret;
 #endif
  return 0;
-
 }  // ComputeBasicConv

 int ComputeFpgaPool(const struct PoolingArgs &args) {
@@ -406,13 +402,11 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
  output_scale = reg_readq(REG_SCALE_PARAMETER);
  output_scale = (output_scale << 32) | (output_scale >> 32);
  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
-  //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);

  return ret;
 #endif
  return 0;
-
 }  // ComputeFpgaPool

 int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
@@ -468,13 +462,10 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
  output_scale = reg_readq(REG_SCALE_PARAMETER);
  output_scale = (output_scale << 32) | (output_scale >> 32);
  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
-  //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
-  //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
  return ret;
 #endif
  return 0;
-
 }  // ComputeFpgaEWAdd

 int PerformBypass(const struct BypassArgs &args) {
@@ -588,13 +579,10 @@ int PerformBypass(const struct BypassArgs &args) {
  output_scale = reg_readq(REG_SCALE_PARAMETER);
  output_scale = (output_scale << 32) | (output_scale >> 32);
  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
-  //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
-  //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
  return ret;
 #endif
  return 0;
-
 }  // PerformBypass

 int ComputeFPGAConcat(const struct ConcatArgs &args) {
@@ -647,13 +635,14 @@ void deconv_post_process(const struct DeconvArgs &args) {
    for (int hh = 0; hh < origin_h; ++hh) {
      int hx = (hh % sub_conv_n);
      auto sub_t =
-          (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1].output.address);
+          (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1]  // NOLINT
+                          .output.address);
      int hi = (hh / sub_conv_n);
      if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
      int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
                  omit_size * channel);
-      fpga_copy((int16_t *)(args.output.address) + deconv_idx, sub_t + sidx,
-                sizeof(int16_t) * deconv_row_len);
+      fpga_copy((int16_t *)(args.output.address) + deconv_idx,    // NOLINT
+                sub_t + sidx, sizeof(int16_t) * deconv_row_len);  // NOLINT
      deconv_idx += align_deconv_row_len;
    }
  }
@@ -678,7 +667,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {

 #ifdef COST_TIME_PRINT
  timeval start, end;
-  long dif_sec, dif_usec;
+  long dif_sec, dif_usec;  // NOLINT
 #endif

  for (int i = 0; i < sub_conv_num; i++) {
@@ -723,18 +712,16 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
 #endif

    //    fpga_flush(args.output.scale_address, 2 * sizeof(float));
-#ifdef COST_TIME_PRINT
-    gettimeofday(&start, NULL);
-#endif
-    deconv_post_process(args);
-#ifdef COST_TIME_PRINT
-    gettimeofday(&end, NULL);
-    dif_sec = end.tv_sec - start.tv_sec;
-    dif_usec = end.tv_usec - start.tv_usec;
-    std::cout << "deconv_post_process  "
-              << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
-              << std::endl;
-#endif
+    /*#ifdef COST_TIME_PRINT
+    gettimeofday(&start,NULL);
+    #endif
+        //deconv_post_process(args);
+    #ifdef COST_TIME_PRINT
+        gettimeofday(&end,NULL);
+     dif_sec = end.tv_sec - start.tv_sec;
+     dif_usec = end.tv_usec - start.tv_usec;
+      std::cout << "deconv_post_process  " << "    cost time: "  <<
+    (dif_sec*1000000+dif_usec)  << "us" << std::endl; #endif*/
  }

  return 0;

--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -25,6 +25,7 @@ namespace fpga {
 #define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
 #define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
 #define BS_NUM_ALIGNMENT 8
+#define BIAS_NUM_ALIGNMENT 16
 #endif

 enum DataType {
@@ -222,7 +223,14 @@ struct DeconvArgs {
  struct ImageOutputArgs output;
  struct SplitConvArgs* split_conv_args;
 };
-
+struct DWconvArgs {  // depthwise-conv argument bundle; see fill_dwconv_arg
+  bool relu_enabled;  // copied from the caller's relu_enabled flag
+  void* bias_address;  // fill_dwconv_arg stores its bias_ptr argument here
+  void* filter_address;  // fill_dwconv_arg stores the filter data pointer here
+  struct KernelArgs kernel;  // kernel height/width and strides
+  struct ImageInputArgs image;  // input address, dims, padding, scale pointer
+  struct ImageOutputArgs output;  // output address and scale pointer
+};
 // static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
 // }
 static inline uint32_t align_to_x(int64_t num, int64_t x) {

--- a/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBN_OP

 #include "operators/kernel/conv_add_bn_kernel.h"
-
+#include <math.h>
 namespace paddle_mobile {
 namespace operators {

@@ -58,14 +58,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);

-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_filter(filter, max_value, param->Groups());
-
-  int element_num_per_div =
-      fpga::get_filter_num_per_div(filter, param->Groups());
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-  fpga::format_fp16_ofm(out);
-
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
                       param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDBNRELU_OP
+
+#include "operators/kernel/conv_add_bn_relu_kernel.h"
+#include <math.h>
+
+namespace paddle_mobile {
+namespace operators {
+
+// Prepares the fused conv + add + batch-norm + relu FPGA kernel.
+//
+// Folds the batch-norm statistics and the add-bias into a per-channel scale
+// and bias, stores them on `param` as new tensors, then formats the filter
+// and output and fills the FPGA argument struct. When the group count equals
+// the output channel count the depthwise path (DWconvArgs) is used,
+// otherwise the regular split-convolution path (SplitConvArgs).
+// Always returns true; shape mismatches are reported via
+// PADDLE_MOBILE_ENFORCE.
+template <>
+bool ConvAddBNReluKernel<FPGA, float>::Init(
+    FusionConvAddBNReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input = const_cast<Tensor *>(param->Input());
+  auto bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+
+  vector<int> paddings = param->Paddings();
+  vector<int> strides = param->Strides();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
+                            bias->dims()[0] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+
+  const int channel = out->dims()[1];
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias->mutable_data<float>({channel});
+
+  // scale' = bn_scale / sqrt(var + eps); bias' = bn_bias + (bias - mean) *
+  // scale'.
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] /
+                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
+    new_bias_ptr[i] =
+        bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+
+  const int groups = param->Groups();
+  if (groups == channel) {
+    // format_dwconv_data ends in format_bias_array, which frees the buffer it
+    // is handed and replaces it with an aligned one. Give it a private copy
+    // so the storage owned by the new_bias tensor stays valid.
+    auto dw_bias_ptr = reinterpret_cast<float *>(
+        fpga::fpga_malloc(channel * sizeof(float)));
+    for (int i = 0; i < channel; i++) {
+      dw_bias_ptr[i] = new_bias_ptr[i];
+    }
+    fpga::format_dwconv_data(filter, out, new_scale_ptr, &dw_bias_ptr);
+    fpga::DWconvArgs dwconv_arg = {0};
+    fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, relu_enabled,
+                          strides[0], strides[1], paddings[0], paddings[1],
+                          dw_bias_ptr);
+    param->SetFpgaArgs(dwconv_arg);
+  } else {
+    // The interleaved bias/scale buffer is only needed on this path, so it
+    // is allocated here instead of being leaked on the depthwise path.
+    // Layout before formatting: [bias x channel][scale x channel].
+    auto bs_ptr = reinterpret_cast<float *>(
+        fpga::fpga_malloc(2 * channel * sizeof(float)));
+    for (int i = 0; i < channel; i++) {
+      bs_ptr[i + channel] = new_scale_ptr[i];
+      bs_ptr[i] = new_bias_ptr[i];
+    }
+    fpga::format_conv_data(filter, out, &bs_ptr, groups);
+    fpga::SplitConvArgs conv_arg = {0};
+    fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, groups,
+                         strides[0], strides[1], paddings[0], paddings[1],
+                         bs_ptr);
+    param->SetFpgaArgs(conv_arg);
+  }
+  return true;
+}
+
+// Executes the prepared FPGA convolution for the non-depthwise case.
+// NOTE(review): when Groups() equals the output channel count (the
+// depthwise case) nothing is executed; the intended call is still disabled.
+template <>
+void ConvAddBNReluKernel<FPGA, float>::Compute(
+    const FusionConvAddBNReluParam<FPGA> &param) {
+  const bool is_depthwise = param.Groups() == param.Output()->dims()[1];
+  if (is_depthwise) {
+    // fpga::ComputeFpgaConv(param.FpgaDwconvArgs());
+    return;
+  }
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V1/conv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp
@@ -38,15 +38,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
    bs_ptr[i] = bias_ptr[i];
  }

-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_filter(filter, max_value, param->Groups());
-
-  int element_num_per_div =
-      fpga::get_filter_num_per_div(filter, param->Groups());
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-
-  fpga::format_fp16_ofm(out);
-
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
                       param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp
@@ -38,15 +38,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
    bs_ptr[i] = bias_ptr[i];
  }

-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_filter(filter, max_value, param->Groups());
-
-  int element_num_per_div =
-      fpga::get_filter_num_per_div(filter, param->Groups());
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-
-  fpga::format_fp16_ofm(out);
-
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
                       param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp
@@ -51,15 +51,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);

-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_filter(filter, max_value, param->Groups());
-
-  int element_num_per_div =
-      fpga::get_filter_num_per_div(filter, param->Groups());
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-
-  fpga::format_fp16_ofm(out);
-
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
                       param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp
@@ -51,15 +51,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);

-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_filter(filter, max_value, param->Groups());
-
-  int element_num_per_div =
-      fpga::get_filter_num_per_div(filter, param->Groups());
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-
-  fpga::format_fp16_ofm(out);
-
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
                       param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
@@ -35,8 +35,8 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
  int channel = out->dims()[1];

  int sub_conv_n = param->Strides()[0];
-  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
-                                           sizeof(float));  // NOLINT
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *  // NOLINT
+                                           sizeof(float));             // NOLINT

  for (int i = 0; i < channel * sub_conv_n; i++) {
    bs_ptr[i + sub_conv_n * channel] = 1;
@@ -49,17 +49,7 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
                        "filter width should be equal to filter height ");
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
-
-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_deconv_filter(filter, max_value, param->Groups(),
-                             param->Strides()[0]);
-
-  int element_num_per_div =
-      fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
-
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
-                                channel * sub_conv_n);
-
+  fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
  fpga::DeconvArgs deconv_arg = {0};
  fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
                        param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
@@ -36,8 +36,8 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
  int channel = out->dims()[1];

  int sub_conv_n = param->Strides()[0];
-  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
-                                           sizeof(float));  // NOLINT
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *  // NOLINT
+                                           sizeof(float));             // NOLINT

  for (int i = 0; i < channel * sub_conv_n; i++) {
    bs_ptr[i + sub_conv_n * channel] = 1;
@@ -50,17 +50,7 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
                        "filter width should be equal to filter height ");
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
-
-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_deconv_filter(filter, max_value, param->Groups(),
-                             param->Strides()[0]);
-
-  int element_num_per_div =
-      fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
-
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
-                                channel * sub_conv_n);
-
+  fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
  fpga::DeconvArgs deconv_arg = {0};
  fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
                        param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/reshape_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/reshape_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE_OP
+
+#include "operators/kernel/reshape_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>  // FPGA/float specialization of the reshape kernel's Init
+bool ReshapeKernel<FPGA, float>::Init(ReshapeParam<FPGA> *param) {
+  return true;  // no setup is performed; param is not read
+}
+
+template <>  // FPGA/float specialization; the body below is empty, so Compute does no work and never reads param
+void ReshapeKernel<FPGA, float>::Compute(const ReshapeParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
@@ -28,18 +28,26 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
  fpga::format_fp32_ofm(out);

  auto float_input = new Tensor;
-  float_input->mutable_data<float>(
-      {1, input->dims()[2], input->dims()[3], input->dims()[1]});
-  fpga::format_fp32_ofm(float_input);
+  if (input->dims().size() == 2) {
+    float_input->mutable_data<float>({1, input->dims()[1]});
+  } else if (input->dims().size() == 4) {
+    float_input->mutable_data<float>(
+        {1, input->dims()[2], input->dims()[3], input->dims()[1]});
+  } else {
+    DLOG << "wrong dimension of softmax input";
+  }

+  fpga::format_fp32_ofm(float_input);
  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_CHW;
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.image.address = input_ptr;
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
+  args.image.height =
+      (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1;
+  args.image.width =
+      (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = float_input->data<float>();
  args.output.scale_address = float_input->scale;
@@ -56,7 +64,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
  fpga::PerformBypass(param.FpgaArgs());
  fpga::fpga_invalidate((void *)in_x->data<float>(),  // NOLINT
                        in_x->numel() * sizeof(float));
-  // TODO: In general case, 0 should be squeezed before softmax input
+  // TODO: In general case, 0 should be squeezed before softmax input  // NOLINT
  math::SoftmaxFuntor<CPU, float>()(in_x, out);
  fpga::fpga_flush(out->data<float>(), out->memory_size());
 }

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -462,6 +462,13 @@ class ConvParam : public OpParam {
 public:
  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
+
+ public:
+  fpga::DWconvArgs fpga_dwconv_args;
+
+ public:
+  const fpga::DWconvArgs &FpgaDwconvArgs() const { return fpga_dwconv_args; }
+  void SetFpgaArgs(const fpga::DWconvArgs &args) { fpga_dwconv_args = args; }
 #endif
 };
 template <typename Dtype>

--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
@@ -38,6 +38,9 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
 #ifdef PADDLE_MOBILE_MALI_GPU
 REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(reshape, ops::ReshapeOp);
+#endif
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(reshape, ops::ReshapeOp);
 #endif

--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -122,6 +122,11 @@ if (CON GREATER -1)
  set(SPLIT_OP ON)
  set(FUSION_DECONVADD_OP ON)
  set(FUSION_DECONVADDRELU_OP ON)
+
+  set(RESHAPE_OP ON)
+  set(FUSION_CONVADDBNRELU_OP ON)
+  set(FUSION_CONVADDBN_OP ON)
+
  set(FOUND_MATCH ON)
 endif()