Merge pull request #1402 from qnqinan/develop

add dw deconv with group in FPGA track fixed#1401

Merge pull request #1402 from qnqinan/develop
add dw deconv with group in FPGA track fixed#1401
b2855761 · zhangyang0701 · GitHub · 0b359bd1 · d934df97 · b2855761
13 changed files
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -151,6 +151,30 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
  filter_tensor->reset_data_ptr(new_data);
 }

+void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
+                           int stride) {  // repack a dw-deconv filter
+  auto dims = filter_tensor->dims();
+  auto num = dims[0], height = dims[2], width = dims[3];
+  auto data_ptr = filter_tensor->data<float>();
+  size_t memory_size = num * height * width * sizeof(float);
+  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
+  fpga_copy(new_data, data_ptr, memory_size);
+  // memory_size ignores dims[1]; presumably it is 1 here — TODO confirm.
+  int hw = height * width;
+  deconv_filter::deconv_NC_convert(&new_data, num, 1, hw);  // channels = 1
+  // From here on num is dims[1] and channel is dims[0].
+  num = dims[1];
+  int channel = dims[0];
+  // The callee forwards scale_ptr to filter::quantize_to_fp16.
+  deconv_filter::DWDconv_format_filter(&new_data, num, channel, height, width,
+                                       scale_ptr, stride);
+
+  //  framework::DDim dims_new =
+  //      framework::make_ddim({num, 1, height, width});
+  //  filter_tensor->Resize(dims_new);
+  filter_tensor->reset_data_ptr(new_data);  // dims are not changed here
+}
+
 void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
  filter_tensor->scale[1] = float(127.0 / max_value);  // NOLINT
@@ -243,6 +267,17 @@ void format_dwconv_data(framework::Tensor *filter_tensor,
  format_bias_array(bias_ptr, channel);
  format_fp16_ofm(ofm_tensor);
 }
+void format_DWDeconv_data(framework::Tensor *filter_tensor,
+                          framework::Tensor *ofm_tensor, float **bs_ptr,
+                          int group, int sub_conv_n) {  // group is not read
+  int channel = ofm_tensor->dims()[1];
+  // dw-deconv: scale pointer is *bs_ptr + sub_conv_n * channel, stride is n
+  format_DWDconv_filter(
+      filter_tensor,
+      (reinterpret_cast<float *>(*bs_ptr) + sub_conv_n * channel), sub_conv_n);
+  format_bias_array(bs_ptr, channel);
+  format_fp16_ofm(ofm_tensor);
+}
 void expand_conv_arg(ConvArgs *arg) {
  ConvArgs args = *arg;

@@ -770,6 +805,7 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
  auto filter_ptr = filter->data<float>();
  auto input_ptr = input->data<float>();
  auto output_ptr = out->mutable_data<float>();
+  arg->sub_conv_num = 1;
  arg->relu_enabled = relu_enabled;
  arg->bias_address = bias_ptr;
  arg->filter_address = filter_ptr;
@@ -788,5 +824,109 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
  arg->output.scale_address = out->scale;
 }  // end dwconv arg fill

+void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
+                       framework::Tensor *out, framework::Tensor *filter,
+                       bool relu_enabled, int stride_h, int stride_w,
+                       int padding_h, int padding_w, float *bias_ptr) {
+  auto filter_ptr = filter->data<float>();
+  auto input_ptr = input->data<float>();
+  auto output_ptr = out->mutable_data<float>();
+  // Builds one DWconvArgs per sub-convolution; there are stride_w of them.
+  auto deleter = [](void *p) { fpga_free(p); };
+  // NOTE(review): stride_h and padding_h are never read in this function.
+  arg->group_num = (uint32_t)filter->dims()[0];
+  arg->sub_conv_num = (uint32_t)stride_w;
+  arg->filter_num = (uint32_t)filter->dims()[0];
+
+  int sub_conv_num = stride_w;
+
+  int sub_pad =
+      deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],  // NOLINT
+                                         padding_w, stride_w);
+  auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis(
+      (int)filter->dims()[3], stride_w);  // NOLINT
+  // Both output axes reuse the width-derived sub_pad and sub_filter_width.
+  auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
+      (int)input->dims()[3], sub_pad, sub_filter_width);  // NOLINT
+  auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
+      (int)input->dims()[2], sub_pad, sub_filter_width);  // NOLINT
+
+  arg->sub_output_width = (uint32_t)sub_output_width;
+  arg->sub_output_height = (uint32_t)sub_output_height;
+  arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
+      stride_w, (int)filter->dims()[3], padding_w);  // NOLINT
+
+  auto sub_channels = (int)input->dims()[1];  // NOLINT
+  uint32_t omit_size = arg->omit_size;
+  int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
+  int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
+  int sub_filter_num = sub_conv_num * (arg->filter_num);
+  // NOTE(review): output_ptr and sub_filter_num are never used.
+  framework::DDim dims_out_new = framework::make_ddim(
+      {1, arg->filter_num, real_out_height, real_out_width});
+  fpga::format_fp16_ofm(out, dims_out_new);
+  auto out_ptr = out->data<float>();
+
+  /*====For Addition
+  arg->output.address =
+      (half *)out_ptr +  // NOLINT
+      omit_size * sizeof(half) *
+          (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
+          */
+  arg->output.address = out_ptr;
+  arg->output.scale_address = out->scale;
+  // Number of 16-bit elements copied out of the filter for each sub-conv.
+  int filter_offset = sub_filter_width * sub_filter_width *
+                      align_to_x(sub_channels, FILTER_ELEMENT_ALIGNMENT) *
+                      arg->sub_conv_num;
+
+  for (int i = 0; i < sub_conv_num; ++i) {
+    arg->dw_conv_args.push_back(std::make_shared<DWconvArgs>());
+
+    arg->dw_conv_args[i]->sub_conv_num = sub_conv_num;
+    arg->dw_conv_args[i]->relu_enabled = relu_enabled;
+    arg->dw_conv_args[i]->bias_address = bias_ptr;  // shared by all sub-convs
+    // Private copy of filter slice i; tracked in vector_dw_conv_space.
+    arg->dw_conv_args[i]->filter_address =
+        fpga_malloc(filter_offset * sizeof(int16_t));
+    memcpy(arg->dw_conv_args[i]->filter_address,
+           (reinterpret_cast<half *>(filter_ptr) + i * filter_offset),
+           filter_offset * sizeof(int16_t));
+    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(arg->dw_conv_args[i]->filter_address),
+        deleter));
+
+    arg->dw_conv_args[i]->kernel.height = (uint32_t)sub_filter_width;
+    arg->dw_conv_args[i]->kernel.width = (uint32_t)sub_filter_width;
+
+    arg->dw_conv_args[i]->kernel.stride_h = (uint32_t)1;
+    arg->dw_conv_args[i]->kernel.stride_w = (uint32_t)1;
+    arg->dw_conv_args[i]->image.address = input_ptr;
+    arg->dw_conv_args[i]->image.channels = (uint32_t)input->dims()[1];
+    arg->dw_conv_args[i]->image.height = (uint32_t)input->dims()[2];
+    arg->dw_conv_args[i]->image.width = (uint32_t)input->dims()[3];
+
+    arg->dw_conv_args[i]->image.pad_height = sub_pad;
+    arg->dw_conv_args[i]->image.pad_width = sub_pad;
+    arg->dw_conv_args[i]->image.scale_address = input->scale;
+    // Per-sub-conv output and scale buffers, also tracked for fpga_free.
+    arg->dw_conv_args[i]->output.address =
+        fpga_malloc(sub_output_height *
+                    align_to_x(sub_output_width * sub_channels * sub_conv_num,
+                               IMAGE_ALIGNMENT) *
+                    sizeof(int16_t));
+    arg->dw_conv_args[i]->output.scale_address =
+        static_cast<float *>(fpga_malloc(2 * sizeof(float)));
+    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(arg->dw_conv_args[i]->output.address),
+        deleter));
+    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
+        reinterpret_cast<char *>(arg->dw_conv_args[i]->output.scale_address),
+        deleter));
+  }
+
+  // arg->output.scale_address = out->scale;
+}  // end DWDeconv arg fill
+
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/api.h
+++ b/src/fpga/V1/api.h
@@ -57,6 +57,10 @@ void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
                     framework::Tensor* out, framework::Tensor* filter,
                     bool relu_enabled, int stride_h, int stride_w,
                     int padding_h, int padding_w, float* bias_ptr);
+void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input,
+                       framework::Tensor* out, framework::Tensor* filter,
+                       bool relu_enabled, int stride_h, int stride_w,
+                       int padding_h, int padding_w, float* bs_ptr);  // bias

 void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
                          int group_num, int stride);
@@ -69,6 +73,10 @@ void format_deconv_data(framework::Tensor* filter_tensor,
 void format_dwconv_data(framework::Tensor* filter_tensor,
                        framework::Tensor* ofm_tensor, float* scale_ptr,
                        float** bias_ptr);
+void format_DWDeconv_data(framework::Tensor* filter_tensor,
+                          framework::Tensor* ofm_tensor, float** bs_ptr,
+                          int group, int sub_conv_n);  // dw-deconv formatter
+
 template <typename Dtype>
 void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
  float data;

--- a/src/fpga/V1/deconv_filter.cpp
+++ b/src/fpga/V1/deconv_filter.cpp
@@ -21,15 +21,6 @@ limitations under the License. */
 #include "fpga/V1/api.h"
 // #include "fpga_api.h"

-// just for test
-//#include <string>
-//#include "deconv.h"
-//#include "deconv_api.h"
-// using namespace std;
-// using namespace paddle_mobile::fpga;
-// using namespace baidu::fpga::deconv::api;
-// namespace api = baidu::fpga::deconv::api;
-
 namespace paddle_mobile {
 namespace fpga {
 namespace deconv_filter {
@@ -42,7 +33,8 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
  float* tmp = *data_in;
  int data_size = num * channel * width * height;
  int hw_len = height * width;
-  auto tmp_data = (float*)fpga_malloc(data_size * sizeof(float));
+  auto tmp_data =
+      reinterpret_cast<float*>(fpga_malloc(data_size * sizeof(float)));
  for (int i = 0; i < num; ++i) {
    for (int j = 0; j < channel; ++j) {
      for (int k = 0; k < hw_len; ++k) {
@@ -97,9 +89,10 @@ int deconv_get_omit(int stride, int filter_width, int pad) {
  return (stride - idx);
 }

-void deconv_get_sub_filter(char** data_in, int height, int width,
-                           int sub_conv_n, int kernel_num, int channel) {
-  char* ptr_tmp = *data_in;
+template <typename T>
+void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
+                           int kernel_num, int channel) {
+  T* ptr_tmp = *data_in;
  int sub_num = kernel_num * sub_conv_n;
  int sub_h = height / sub_conv_n;
  int sub_w = width / sub_conv_n;
@@ -107,7 +100,8 @@ void deconv_get_sub_filter(char** data_in, int height, int width,
  int sub_filter_size =
      kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n;

-  char* ptr_sub_filter = (char*)fpga_malloc(sub_filter_size * sizeof(char));
+  T* ptr_sub_filter =
+      reinterpret_cast<T*>(fpga_malloc(sub_filter_size * sizeof(T)));
  for (int idx = 0; idx < sub_conv_n; ++idx) {
    for (int nn = 0; nn < sub_num; ++nn) {
      int ni = nn % kernel_num;
@@ -124,7 +118,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width,

          fpga_copy(
              ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx,
-              (*data_in) + kidx, channel * sizeof(char));
+              (*data_in) + kidx, channel * sizeof(T));
          // for (int cc =0; cc < channel; ++cc) {
          //     ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] =
          //     (*data_in)[kidx + cc];
@@ -140,7 +134,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width,
 void deconv_NC_convert(float** filter_in, int kernel_num, int channels,
                       int hw) {
  float* tmp = *filter_in;
-  float* ptr_filter = (float*)(paddle_mobile::fpga::fpga_malloc(
+  float* ptr_filter = reinterpret_cast<float*>(paddle_mobile::fpga::fpga_malloc(
      hw * kernel_num * channels * sizeof(float)));

  for (int c = 0; c < channels; ++c) {
@@ -188,7 +182,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
  result2);
  }*/

-  deconv_get_sub_filter(quantize_data, height, width, stride, num, channel);
+  deconv_get_sub_filter<char>(quantize_data, height, width, stride, num,
+                              channel);
  /*{
     char result2 = (char)0;
     string filename = "sub_filter_filter_data";
@@ -212,10 +207,12 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
                                ((residual == 0) ? div_num : (div_num - 1)) +
                            align_to_x(residual, FILTER_NUM_ALIGNMENT);

-  char** ptr_ptr_data = (char**)fpga_malloc(sub_conv_n * sizeof(char*));
+  char** ptr_ptr_data =
+      reinterpret_cast<char**>(fpga_malloc(sub_conv_n * sizeof(char*)));
  int origin_offset = sub_chw * sub_num;
  for (int i = 0; i < sub_conv_n; ++i) {
-    (ptr_ptr_data)[i] = (char*)fpga_malloc(origin_offset * sizeof(char));
+    (ptr_ptr_data)[i] =
+        reinterpret_cast<char*>(fpga_malloc(origin_offset * sizeof(char)));
    fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i,
              origin_offset * sizeof(char));

@@ -233,8 +230,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,

  int align_offset =
      align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment;
-  char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset *
-                                       sizeof(char));  // continuous space
+  char* ptr_space = reinterpret_cast<char*>(fpga_malloc(
+      sub_conv_n * align_offset * sizeof(char)));  // continuous space
  for (int i = 0; i < sub_conv_n; ++i) {
    char* ptr_tmp = (ptr_ptr_data)[i];

@@ -251,7 +248,7 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
    fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset);
    fpga_free(ptr_tmp);
  }
-  *data_in = (float*)ptr_space;
+  *data_in = reinterpret_cast<float*>(ptr_space);

  /*    {
        char result2 = (char)0;
@@ -262,6 +259,22 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
  fpga_flush(ptr_space, sub_conv_n * align_offset * sizeof(char));
 }

+void DWDconv_format_filter(float** data_in, int num, int channel, int height,
+                           int width, float* scale_ptr, int stride) {
+  deconv_inverse_filter(data_in, num, channel, width, height);  // width first
+  // After quantization the same buffer is reinterpreted as int16_t.
+  filter::quantize_to_fp16(data_in, channel, height, width, scale_ptr);
+  int16_t** quantize_data = (int16_t**)data_in;  // NOLINT
+  filter::convert_to_hwn(quantize_data, channel, height, width);
+  // stride is passed as the sub_conv_n argument of deconv_get_sub_filter.
+  deconv_get_sub_filter<int16_t>(quantize_data, height, width, stride, num,
+                                 channel);
+  // Align the element dimension, then flush the aligned buffer.
+  filter::align_element_n(quantize_data, channel, height, width);
+  fpga_flush(*quantize_data, align_to_x(channel, FILTER_ELEMENT_ALIGNMENT) *
+                                 height * width * sizeof(int16_t));
+}
+
 }  // namespace deconv_filter
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/deconv_filter.h
+++ b/src/fpga/V1/deconv_filter.h
@@ -24,11 +24,15 @@ int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
 int deconv_get_sub_filter_axis(int filter_axis, int stride);
 int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis);
 int deconv_get_omit(int stride, int filter_width, int pad);
-void deconv_get_sub_filter(char** data_in, int height, int width,
-                           int sub_conv_n, int kernel_num, int channel);
+
+template <typename T>  // defined in deconv_filter.cpp, not in this header
+void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
+                           int kernel_num, int channel);  // used w/ char, int16_t
 void deconv_format_filter(float** data_in, int num, int channel, int height,
                          int width, int group_num, float max, int stride);
 void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw);
+void DWDconv_format_filter(float** data_in, int num, int channel, int height,
+                           int width, float* scale_ptr, int stride);  // dw path

 }  // namespace deconv_filter
 }  // namespace fpga

--- a/src/fpga/V1/filter.cpp
+++ b/src/fpga/V1/filter.cpp
@@ -346,6 +346,16 @@ void format_dwconv_filter(float **data_in, int num, int height, int width,
  fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
                                 height * width * sizeof(int16_t));
 }
+// Quantize, relayout, align and flush only; no caller is visible in this diff.
+void format_DWDeconv_filter(float **data_in, int num, int height, int width,
+                            float *scale_ptr) {
+  quantize_to_fp16(data_in, num, height, width, scale_ptr);
+  int16_t **quantize_data = (int16_t **)data_in;  // NOLINT
+  convert_to_hwn(quantize_data, num, height, width);
+  align_element_n(quantize_data, num, height, width);
+  fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
+                                 height * width * sizeof(int16_t));
+}
 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/pe.cpp
+++ b/src/fpga/V1/pe.cpp
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "fpga/V1/image.h"
 #include "fpga/common/config.h"
 #include "fpga/common/driver.h"
-
 #ifdef COST_TIME_PRINT
 #include <sys/time.h>
 #include <time.h>
@@ -163,6 +162,7 @@ using namespace std;     // NOLINT
 #define REG_DWCONV_FILTER_BASE_ADDR 0xe08
 #define REG_DWCONV_FILTER_SHAPE 0xe10
 #define REG_DWCONV_FILTER_N_ALIGN 0xe18
+#define REG_DWCONV_FILTER_SUBNUMBER 0xe20
 #define REG_DWCONV_CMD 0xe00

 int ComputeFpgaConv(const struct SplitConvArgs &args) {
@@ -591,6 +591,20 @@ int PerformBypass(const struct BypassArgs &args) {
  return 0;
 }  // PerformBypass

+uint64_t FPGAVersion() {  // reads REG_HARDWARE_STATUS (ZU5 builds only)
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============FPGAVersion===========";
+#endif
+#ifdef PADDLE_MOBILE_ZU5
+  uint64_t fpga_ver = 0;
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);  // guard register access
+  fpga_ver = reg_readq(REG_HARDWARE_STATUS);
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+  return fpga_ver;
+#endif
+  return 0;  // builds without PADDLE_MOBILE_ZU5 report 0
+}  // FPGAVersion
+
 int ComputeFPGAConcat(const struct ConcatArgs &args) {
 #ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFpgaConcat===========";
@@ -655,6 +669,45 @@ void deconv_post_process(const struct DeconvArgs &args) {
  fpga_flush(args.output.address,
             num * align_deconv_row_len * deconv_h * sizeof(int16_t));
 }
+void DWDeconv_post_process(const struct DWDeconvArgs &args) {  // merge rows
+  int sub_conv_n = args.sub_conv_num;
+  int sub_height = args.sub_output_height;
+  int sub_width = args.sub_output_width;
+  int omit_size = args.omit_size;
+  int channel = args.filter_num;
+  int num = 1;
+  int origin_h = sub_height * sub_conv_n;
+  int origin_w = sub_width * sub_conv_n;
+  int align_origin_w = align_to_x(origin_w * channel, IMAGE_ALIGNMENT);
+  int deconv_h = origin_h - 2 * omit_size;
+  int deconv_w = origin_w - 2 * omit_size;
+  int deconv_row_len = deconv_w * channel;
+  int align_deconv_row_len = align_to_x(deconv_row_len, IMAGE_ALIGNMENT);
+  // Each sub buffer has sub_height rows (not origin_h); invalidate only those.
+  for (int idx = 0; idx < sub_conv_n; ++idx) {
+    paddle_mobile::fpga::fpga_invalidate(
+        args.dw_conv_args[idx]->output.address,
+        align_origin_w * sub_height * sizeof(int16_t));
+  }
+  // Row hh <- row hh/sub_conv_n of sub-conv (sub_conv_n - 1 - hh%sub_conv_n).
+  int deconv_idx = 0;
+  for (int nn = 0; nn < num; ++nn) {
+    for (int hh = 0; hh < origin_h; ++hh) {
+      int hx = (hh % sub_conv_n);
+      auto sub_t = (int16_t *)(args.dw_conv_args[sub_conv_n - hx - 1]  // NOLINT
+                                   ->output.address);
+      int hi = (hh / sub_conv_n);
+      if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
+      int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
+                  omit_size * channel);  // skip omit_size pixels on the left
+      fpga_copy((int16_t *)(args.output.address) + deconv_idx,    // NOLINT
+                sub_t + sidx, sizeof(int16_t) * deconv_row_len);  // NOLINT
+      deconv_idx += align_deconv_row_len;
+    }
+  }
+  fpga_flush(args.output.address,
+             num * align_deconv_row_len * deconv_h * sizeof(int16_t));
+}

 int ComputeFpgaDeconv(const struct DeconvArgs &args) {
 #ifdef FPGA_PRINT_MODE
@@ -792,17 +845,21 @@ int ComputeDWConv(const struct DWconvArgs &args) {
      align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT);
  uint64_t filter_amount_per_row_align =
      filter_N_align * (uint64_t)args.kernel.width;
-  uint64_t filter_amount_align = filter_N_align * (uint64_t)args.kernel.width *
-                                 (uint64_t)args.kernel.height;
+  uint64_t sub_filter_amount_align = filter_N_align *
+                                     (uint64_t)args.kernel.width *
+                                     (uint64_t)args.kernel.height;
+  uint64_t filter_amount_align =
+      sub_filter_amount_align * (uint64_t)args.sub_conv_num;

  uint32_t output_height = (uint32_t)(
      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
          args.kernel.stride_h +
      1);
  uint32_t output_width = (uint32_t)(
-      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
-          args.kernel.stride_w +
-      1);
+      ((args.image.width + args.image.pad_width * 2 - args.kernel.width) /
+           args.kernel.stride_w +
+       1) *
+      args.sub_conv_num);

  uint64_t image_amount_per_row =
      align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
@@ -845,12 +902,15 @@ int ComputeDWConv(const struct DWconvArgs &args) {

  /*restart scale*/
  reg_writeq(output_scale, REG_SCALE_PARAMETER);
+
  reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
  reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
  reg_writeq((bias_physical_address << 32 | filter_physical_address),
             REG_DWCONV_FILTER_BASE_ADDR);
  reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32),
             REG_DWCONV_FILTER_SHAPE);
+  reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32),
+             REG_DWCONV_FILTER_SUBNUMBER);
  reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN);

  reg_writeq(
@@ -904,10 +964,88 @@ int ComputeDWConv(const struct DWconvArgs &args) {
  output_scale = reg_readq(REG_SCALE_PARAMETER);
  output_scale = (output_scale << 32) | (output_scale >> 32);
  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
+  DLOG << "output_scale:" << output_scale;
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
  return ret;
 #endif
  return 0;
 }
+int ComputeDWDeconv(const struct DWDeconvArgs &args) {  // always returns 0
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeDWDeconv===========";
+  DLOG << "   filter_num:" << args.filter_num
+       << "   group_num:" << args.group_num << "omit_size:" << args.omit_size
+       << "sub_output_width: " << args.sub_output_width
+       << "sub_output_height: " << args.sub_output_height
+       << "   sub_conv_num:" << args.sub_conv_num;
+  DLOG << "args.output.address: " << args.output.address
+       << "args.output.scale_address: " << args.output.scale_address;
+
+#endif
+  // Runs each sub dw-conv, keeps the largest output scale, then merges rows.
+  int sub_conv_num = args.sub_conv_num;
+
+#ifdef COST_TIME_PRINT
+  timeval start, end;
+  long dif_sec, dif_usec;  // NOLINT
+#endif
+
+  for (int i = 0; i < sub_conv_num; i++) {
+#ifdef COST_TIME_PRINT
+    gettimeofday(&start, NULL);
+#endif
+    // NOTE(review): the return value of ComputeDWConv is ignored here.
+    ComputeDWConv(*args.dw_conv_args[i]);
+#ifdef COST_TIME_PRINT
+    gettimeofday(&end, NULL);
+    dif_sec = end.tv_sec - start.tv_sec;
+    dif_usec = end.tv_usec - start.tv_usec;
+    std::cout << "deconv basic_conv: " << i << " times:  "
+              << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+              << std::endl;
+#endif
+  }
+  // NOTE(review): with sub_conv_num == 1 the output scale is not copied here.
+  if (sub_conv_num > 1) {
+    float max_scale = -1.0f;
+#ifdef COST_TIME_PRINT
+    gettimeofday(&start, NULL);
+#endif
+    for (int i = 0; i < sub_conv_num; i++) {
+      auto sub_scale = args.dw_conv_args[i]->output.scale_address;
+      paddle_mobile::fpga::fpga_invalidate(sub_scale, 2 * sizeof(float));
+      float ptr_scale = sub_scale[0];
+      if (ptr_scale > max_scale) {
+        max_scale = ptr_scale;  // was never updated, so the last scale won
+        args.output.scale_address[0] = ptr_scale;
+        args.output.scale_address[1] = sub_scale[1];
+      }
+    }
+
+#ifdef COST_TIME_PRINT
+    gettimeofday(&end, NULL);
+    dif_sec = end.tv_sec - start.tv_sec;
+    dif_usec = end.tv_usec - start.tv_usec;
+    std::cout << "deconv scale  "
+              << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+              << std::endl;
+#endif
+  }
+  // Interleave the sub-conv results into args.output.
+#ifdef COST_TIME_PRINT
+  gettimeofday(&start, NULL);
+#endif
+  DWDeconv_post_process(args);
+#ifdef COST_TIME_PRINT
+  gettimeofday(&end, NULL);
+  dif_sec = end.tv_sec - start.tv_sec;
+  dif_usec = end.tv_usec - start.tv_usec;
+  std::cout << "deconv_post_process  "
+            << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+            << std::endl;
+#endif
+  return 0;
+}  // ComputeDWDeconv
+
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/common/fpga_common.cpp
+++ b/src/fpga/common/fpga_common.cpp
@@ -76,7 +76,7 @@ int32_t convertmantissa(int32_t i) {
 }

 float fp16_2_fp32(int16_t fp16_num) {
-  int16_t se_fp16 = fp16_num >> 10;
+  int16_t se_fp16 = (fp16_num >> 10) & 0x3f;
  int16_t m_fp16 = fp16_num & 0x3ff;
  int32_t e_fp32 = 0;
  int16_t offset = 0;
@@ -94,7 +94,7 @@ float fp16_2_fp32(int16_t fp16_num) {
    e_fp32 = 0x80000000;
    offset = 0;
  } else if (se_fp16 < 63) {
-    e_fp32 = 0x80000000 + (se_fp16 - 32) << 23;
+    e_fp32 = 0x80000000 + ((se_fp16 - 32) << 23);
    offset = 1024;
  } else {  // se_fp16 == 63
    e_fp32 = 0xC7800000;

--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -229,6 +229,7 @@ struct DeconvArgs {
  std::vector<std::shared_ptr<SplitConvArgs>> split_conv_args;
 };
 struct DWconvArgs {
+  uint32_t sub_conv_num;
  bool relu_enabled;
  void* bias_address;
  void* filter_address;
@@ -236,6 +237,19 @@ struct DWconvArgs {
  struct ImageInputArgs image;
  struct ImageOutputArgs output;
 };
+// Arguments of a depthwise deconvolution run as several sub dw-convolutions.
+struct DWDeconvArgs {
+  uint32_t sub_conv_num;       // fill_DWDeconv_arg sets this from stride_w
+  uint32_t group_num;          // set from filter dims[0]
+  uint32_t filter_num;         // set from filter dims[0]
+  uint32_t omit_size;          // rows/pixels dropped per border when merging
+  uint32_t sub_output_width;   // output width of one sub-convolution
+  uint32_t sub_output_height;  // output height of one sub-convolution
+  struct ImageOutputArgs output;  // merged output and its scale
+  std::vector<std::shared_ptr<DWconvArgs>> dw_conv_args;  // one per sub-conv
+  std::vector<std::shared_ptr<char>> vector_dw_conv_space;  // owns buffers
+};
+
 // static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
 // }
 static inline uint32_t align_to_x(int64_t num, int64_t x) {

--- a/src/fpga/common/pe.h
+++ b/src/fpga/common/pe.h
@@ -18,6 +18,7 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {

+uint64_t FPGAVersion();  // REG_HARDWARE_STATUS on ZU5 builds, otherwise 0
 int PerformBypass(const struct BypassArgs& args);
 int ComputeBasicConv(const struct ConvArgs& args);
 int ComputeFpgaPool(const struct PoolingArgs& args);
@@ -28,5 +29,7 @@ int ComputeFPGAConcat(const struct ConcatArgs& args);
 int ComputeFPGASplit(const struct SplitArgs& args);
 int ComputeFpgaDeconv(const struct DeconvArgs& args);
 int ComputeDWConv(const struct DWconvArgs& args);
+int ComputeDWDeconv(const struct DWDeconvArgs& args);  // always returns 0
+
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
@@ -49,13 +49,23 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
                        "filter width should be equal to filter height ");
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
-  fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
-  fpga::DeconvArgs deconv_arg = {0};
-  fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
-                        param->Groups(), param->Strides()[0],
-                        param->Strides()[1], param->Paddings()[0],
-                        param->Paddings()[1], bs_ptr);
-  param->SetFpgaArgs(deconv_arg);
+  if (param->Groups() == channel) {
+    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
+                               sub_conv_n);
+    fpga::DWDeconvArgs DWDeconv_arg = {0};
+    fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, relu_enabled,
+                            param->Strides()[0], param->Strides()[1],
+                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(DWDeconv_arg);
+  } else {
+    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+    fpga::DeconvArgs deconv_arg = {0};
+    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
+                          param->Groups(), param->Strides()[0],
+                          param->Strides()[1], param->Paddings()[0],
+                          param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(deconv_arg);
+  }

  return true;
 }
@@ -63,7 +73,11 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
 template <>
 void DeconvAddKernel<FPGA, float>::Compute(
    const FusionDeconvAddParam<FPGA> &param) {
-  fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  if (param.Groups() == param.Output()->dims()[1]) {  // depthwise case
+    fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());  // keep in sync w/ Init
+  } else {  // generic deconv path, as before this change
+    fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  }
 }

 }  // namespace operators

--- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
@@ -50,20 +50,35 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
                        "filter width should be equal to filter height ");
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
-  fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
-  fpga::DeconvArgs deconv_arg = {0};
-  fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
-                        param->Groups(), param->Strides()[0],
-                        param->Strides()[1], param->Paddings()[0],
-                        param->Paddings()[1], bs_ptr);
-  param->SetFpgaArgs(deconv_arg);
+  if (param->Groups() == channel) {
+    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
+                               sub_conv_n);
+    fpga::DWDeconvArgs DWDeconv_arg = {0};
+    fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, relu_enabled,
+                            param->Strides()[0], param->Strides()[1],
+                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(DWDeconv_arg);
+  } else {
+    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+    fpga::DeconvArgs deconv_arg = {0};
+    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
+                          param->Groups(), param->Strides()[0],
+                          param->Strides()[1], param->Paddings()[0],
+                          param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(deconv_arg);
+  }
  return true;
 }

 template <>
 void DeconvAddReluKernel<FPGA, float>::Compute(
    const FusionDeconvAddReluParam<FPGA> &param) {
-  fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  // Depthwise when Groups() equals the output channel count (dims()[1]).
+  if (param.Groups() == param.Output()->dims()[1]) {
+    fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());  // keep in sync w/ Init
+  } else {  // generic deconv path, as before this change
+    fpga::ComputeFpgaDeconv(param.FpgaArgs());
+  }
 }

 }  // namespace operators

--- a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SIGMOID_OP
+
+#include "operators/kernel/activation_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+
+using framework::DDim;
+using framework::Tensor;
+
+// Prepares the FP32 staging tensor and the bypass descriptor (FP16 HWC in,
+// FP32 CHW out) that Compute() runs before applying sigmoid on the CPU.
+// Returns false when the input rank is neither 2 nor 4.
+template <>
+bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
+  auto input = const_cast<Tensor *>(param->InputX());
+  auto input_ptr = input->data<float>();
+  auto out = param->Out();
+  fpga::format_fp32_ofm(out);
+
+  // Validate the rank before allocating: the old code only logged (with a
+  // softmax message) and then went on to use an unallocated staging tensor.
+  const auto rank = input->dims().size();
+  if (rank != 2 && rank != 4) {
+    DLOG << "wrong dimension of sigmoid input";
+    return false;
+  }
+
+  auto float_input = new Tensor;  // ownership passes to param below
+  if (rank == 2) {
+    float_input->mutable_data<float>({1, input->dims()[1]});
+  } else {
+    float_input->mutable_data<float>(
+        {1, input->dims()[2], input->dims()[3], input->dims()[1]});
+  }
+
+  fpga::format_fp32_ofm(float_input);
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_CHW;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = input_ptr;
+  // A rank-2 input is treated as a 1x1 image with dims()[1] channels.
+  args.image.height = (rank == 4) ? (uint32_t)input->dims()[2] : 1;
+  args.image.width = (rank == 4) ? (uint32_t)input->dims()[3] : 1;
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.output.address = float_input->data<float>();
+  args.output.scale_address = float_input->scale;
+  param->SetFloatInput(float_input);
+  param->SetFpgaArgs(args);
+
+  return true;
+}
+template <typename T>  // logistic function 1 / (1 + e^-a)
+T Sigmoid(const T a) {
+  T tmp = -1.0f * a;
+  return (1.0 / (1.0 + exp(tmp)));  // double literals; result converts to T
+}
+template <typename T>  // applies Sigmoid<T> to every element of input
+void sigmoidFuntor(Tensor *input, Tensor *output) {
+  auto *input_ptr = input->data<T>();
+  auto *output_ptr = output->mutable_data<T>();
+  for (int i = 0; i < input->numel(); i++) {  // count comes from input
+    *(output_ptr + i) = Sigmoid<T>(*(input_ptr + i));
+  }
+}
+template <>
+void SigmoidKernel<FPGA, float>::Compute(const SigmoidParam<FPGA> &param) {
+  Tensor *in_x = param.FloatInput();  // FP32 tensor from Init, else InputX
+  Tensor *out = param.Out();
+  // Run the bypass, invalidate in_x, apply sigmoid on the CPU, flush out.
+  fpga::PerformBypass(param.FpgaArgs());
+  fpga::fpga_invalidate((void *)in_x->data<float>(),  // NOLINT
+                        in_x->numel() * sizeof(float));
+  // TODO: In general case, 0 should be squeezed before softmax input  // NOLINT
+  sigmoidFuntor<float>(in_x, out);
+  fpga::fpga_flush(out->data<float>(), out->memory_size());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1078,6 +1078,20 @@ class SigmoidParam : public OpParam {
 private:
  RType *input_x_;
  RType *out_;
+#ifdef PADDLE_MOBILE_FPGA
+  // FPGA-only state: an optional float input tensor and the bypass arguments.
+ private:
+  std::shared_ptr<RType> float_input_x_;
+  fpga::BypassArgs fpga_bypass_args;
+  // SetFloatInput() takes ownership; FloatInput() falls back to input_x_.
+ public:
+  RType *FloatInput() const {
+    return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
+  }
+  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
+  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
+  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+#endif
 };
 #endif

@@ -2357,10 +2371,17 @@ class ConvTransposeParam : public OpParam {

 private:
  fpga::DeconvArgs fpga_conv_args;
+  fpga::DWDeconvArgs fpga_DWDeconv_args;

 public:
  const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; }
+  const fpga::DWDeconvArgs &FpgaDWDconvArgs() const {  // depthwise deconv args
+    return fpga_DWDeconv_args;
+  }
  void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; }
+  void SetFpgaArgs(const fpga::DWDeconvArgs &args) {  // overload; copies args
+    fpga_DWDeconv_args = args;
+  }
 #endif
 };