Merge branch 'develop' into add_concat_int8

965fce05 · xiebaiyuan · GitHub · 88baf9ca · 9437e287 · 965fce05
22 changed files
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -21,6 +21,9 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {

+#define USE_RELU 1
+#define USE_BIAS 2
+
 int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); }

 void format_image(framework::Tensor *image_tensor) {
@@ -172,6 +175,170 @@ void format_concat_output(framework::Tensor *out, int height, int width,
  out->reset_data_ptr(data_ptr);
 }

+// Derives every value the convolution PE needs from the user-facing fields of
+// *arg and caches them in arg->driver, so that ComputeBasicConv only has to
+// write the cached values to the registers.
+//
+// Reads:  arg->image, arg->kernel, arg->filter_num, arg->group_num,
+//         arg->relu_enabled, arg->sb_address, arg->filter_address and
+//         arg->output.address (the addresses are translated with
+//         vaddr_to_paddr here, so they must already be assigned).
+// Writes: arg->driver only.
+void expand_conv_arg(ConvArgs *arg) {
+  // Work on a snapshot so no input is re-read after driver fields are set.
+  ConvArgs args = *arg;
+  uint64_t fpga_bias_scale_len =
+      align_to_x(args.filter_num / args.group_num, 8) * args.group_num;

+  uint64_t output_height =
+      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
+          args.kernel.stride_h +
+      1;
+  uint64_t output_width =
+      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
+          args.kernel.stride_w +
+      1;

+  auto filter_per_group = (uint64_t)(args.filter_num / args.group_num);
+  auto channel_per_group = (uint64_t)(args.image.channels / args.group_num);

+  uint64_t image_row_count = ((uint64_t)args.image.width) *
+                             ((uint64_t)args.image.channels);  // without align
+  uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT);
+  uint64_t image_one_pad_per_row =
+      align_to_x(image_row_count, IMAGE_ALIGNMENT) +
+      ((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels);
+  uint64_t filter_amount_all =
+      align_to_x(((uint64_t)args.kernel.height) *
+                     ((uint64_t)args.kernel.width) * channel_per_group,
+                 FILTER_ELEMENT_ALIGNMENT);

+  uint64_t output_amount_per_row =
+      align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGNMENT);

+  // Find the widest run of output columns (res_win) whose input block still
+  // satisfies the "<= 2048" bound below.
+  // NOTE(review): 2048 and the "/ 16" look like hardware buffer depth and
+  // word size -- confirm against the FPGA design.
+  uint64_t res_win;
+  for (res_win = 1; res_win <= output_width; res_win = res_win + 1) {
+    if ((align_to_x(
+             (args.image.channels *
+              (args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
+             IMAGE_ALIGNMENT) /
+             16 +
+         1) *
+            args.kernel.height >
+        2048) {
+      break;
+    }
+  }

+  // The loop always exits one past the last width that fits: on the first
+  // width that exceeds the bound (break) or on output_width + 1 (no break).
+  // Step back unconditionally. The previous `res_win != output_width` guard
+  // skipped the step-back when the break happened exactly at output_width,
+  // leaving a window that exceeds the bound.
+  // NOTE(review): if even a single-column window does not fit (or
+  // output_width is 0) res_win becomes 0 and the block_num division below
+  // divides by zero, exactly as before this change.
+  res_win -= 1;

+  // Round odd windows (other than 1) down to an even width.
+  if (((res_win % 2) != 0) && (res_win != 1)) {
+    res_win = res_win - 1;
+  }
+  uint64_t res_fit = res_win;

+  // Split each output row into block_num blocks of block_len columns; the
+  // final block takes whatever is left (block_last).
+  uint64_t block_num = (output_width + res_fit - 1) / res_fit;
+  uint64_t block_len = res_fit;
+  uint64_t block_last = output_width - res_fit * (block_num - 1);

+  uint64_t res_amount_per_row = output_width * args.filter_num;
+  uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;

+  uint64_t image_block_amount_per_row =
+      args.kernel.stride_w * (res_fit)*args.image.channels;
+  uint64_t filter_pad_width_mul_channel =
+      args.image.pad_width * args.image.channels;
+  uint64_t image_amount_per_row_multi_win_first =
+      image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
+  uint64_t image_amount_per_row_multi_win =
+      image_amount_per_row * (4 * args.kernel.stride_h);

+  uint64_t image_block_num = block_num;
+  // Same block-length formula as in the search loop, evaluated for the
+  // regular block width and for the last block.
+  uint64_t image_block_len =
+      align_to_x((args.image.channels *
+                  (args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
+                 IMAGE_ALIGNMENT) /
+          16 +
+      1;
+  uint64_t image_block_len_last =
+      align_to_x(
+          (args.image.channels *
+           (args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
+          IMAGE_ALIGNMENT) /
+          16 +
+      1;
+  uint64_t image_win_cnt = block_len;
+  uint64_t image_win_cnt_last = block_last;
+  uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8;
+  uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
+  if (prog_full_cnt == 1023) {
+    prog_full_cnt--;
+  }
+  uint64_t post_prog_full_cnt =
+      (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
+          ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
+          : 0;
+  // Bias is always enabled; ReLU only when requested.
+  uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;

+  arg->driver.image_address_phy = vaddr_to_paddr(args.image.address);
+  arg->driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
+  arg->driver.filter_address_phy = vaddr_to_paddr(args.filter_address);
+  arg->driver.output_address_phy = vaddr_to_paddr(args.output.address);
+  arg->driver.output_height = output_height;
+  arg->driver.output_width = output_width;
+  arg->driver.filter_per_group = filter_per_group;
+  arg->driver.channel_per_group = channel_per_group;
+  arg->driver.image_amount_per_row = image_amount_per_row;
+  arg->driver.image_one_pad_per_row = image_one_pad_per_row;
+  arg->driver.filter_amount_all = filter_amount_all;
+  arg->driver.output_amount_per_row = output_amount_per_row;
+  arg->driver.image_block_amount_per_row = image_block_amount_per_row;
+  arg->driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel;
+  arg->driver.image_amount_per_row_multi_win_first =
+      image_amount_per_row_multi_win_first;
+  arg->driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win;
+  arg->driver.image_block_num = image_block_num;
+  arg->driver.image_block_len = image_block_len;
+  arg->driver.image_block_len_last = image_block_len_last;
+  arg->driver.image_win_cnt = image_win_cnt;
+  arg->driver.image_win_cnt_last = image_win_cnt_last;
+  arg->driver.res_row_data_align4_pad = res_row_data_align4_pad;
+  arg->driver.prog_full_cnt = prog_full_cnt;
+  arg->driver.post_prog_full_cnt = post_prog_full_cnt;
+  arg->driver.fpga_bias_scale_len = fpga_bias_scale_len;
+  arg->driver.cmd = cmd;
+}  // expand_conv_arg()
+
+// Precomputes the values that ComputeFpgaEWAdd writes to the element-wise
+// registers and stores them in arg->driver. Only arg->driver is modified;
+// everything else in *arg is read from a snapshot taken on entry.
+void expand_EW_arg(EWAddArgs *arg) {
+  EWAddArgs snapshot = *arg;
+  auto &drv = arg->driver;

+  // ReLU is the only optional command bit for this operation.
+  uint64_t command = 0;
+  if (snapshot.relu_enabled) {
+    command = USE_RELU;
+  }

+  // Element count and packed geometry are both taken from image0.
+  uint64_t in_w = (uint64_t)snapshot.image0.width;
+  uint64_t in_h = (uint64_t)snapshot.image0.height;
+  uint64_t in_c = (uint64_t)snapshot.image0.channels;
+  uint64_t element_count = in_w * in_h * in_c;

+  // const0 goes in the upper 32 bits, const1 in the lower bits.
+  uint64_t packed_consts =
+      ((uint64_t)snapshot.const0 << 32) | (uint64_t)snapshot.const1;

+  // Translate the buffers in the same order as before: image0, image1, output.
+  uint64_t in0_phy = vaddr_to_paddr(snapshot.image0.address);
+  uint64_t in1_phy = vaddr_to_paddr(snapshot.image1.address);
+  uint64_t out_phy = vaddr_to_paddr(snapshot.output.address);

+  uint64_t row_amount = align_to_x(
+      (uint64_t)snapshot.image0.width * (uint64_t)snapshot.image0.channels,
+      IMAGE_ALIGNMENT);
+  // Layout: channels << 32 | width << 16 | height.
+  uint64_t packed_pixel = (in_c << 32) | (in_w << 16) | in_h;

+  drv.image0_address_phy = in0_phy;
+  drv.image1_address_phy = in1_phy;
+  drv.datalen = element_count;
+  drv.image_image_pixel = packed_pixel;
+  drv.image_amount_per_row = row_amount;
+  drv.output_address_phy = out_phy;
+  drv.coefficient = packed_consts;
+  drv.cmd = command;
+}  // expand_EW_arg
+
 void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                    framework::Tensor *out, framework::Tensor *filter,
                    bool relu_enabled, int group_num, int stride_h,
@@ -206,7 +373,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
  auto channel = (int)out->dims()[1];  // NOLINT
  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
  int element_num = get_aligned_filter_element_num(
-      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
+      (int)(filter->dims()[1] * filter->dims()[2] * filter->dims()[3]));

  for (int i = 0; i < n; i++) {
    arg->conv_arg[i].relu_enabled = relu_enabled;
@@ -223,24 +390,23 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
    arg->conv_arg[i].filter_scale_address = filter->scale;
-    //    arg->conv_arg[i].filter_address = &(
-    //        (int8_t *)filter_ptr)[i * element_num * filter_num_per_div];  //
-    //        NOLINT
-    //    arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
-
    arg->conv_arg[i].filter_num = (uint32_t)(
        i == n - 1 ? channel - (n - 1) * filter_num_per_div  // NOLINT
                   : filter_num_per_div);

    size_t filter_size =
-        element_num * arg->conv_arg[i].filter_num * sizeof(int8_t);
+        element_num *
+        align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) *
+        sizeof(int8_t);
    auto filter_head =
        &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
    arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
    memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
    fpga_flush(arg->conv_arg[i].filter_address, filter_size);

-    size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float);
+    size_t bs_size = 2 *
+                     align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
+                     sizeof(float);
    auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
    arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
    memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
@@ -249,11 +415,11 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
    if (n > 1) {
      arg->conv_arg[i].output.scale_address =
          (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
-      arg->conv_arg[i].output.address =
-          fpga_malloc(out->dims()[2] *
-                      align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num,
-                                 IMAGE_ALIGNMENT) *
-                      sizeof(half));
+      arg->conv_arg[i].output.address = fpga_malloc(
+          out->dims()[2] *
+          align_to_x((int)(out->dims()[3] * arg->conv_arg[i].filter_num),
+                     IMAGE_ALIGNMENT) *
+          sizeof(half));
    } else {
      arg->conv_arg[i].output.scale_address = out->scale;
      arg->conv_arg[i].output.address = out_ptr;
@@ -263,10 +429,13 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
        (half *)arg->conv_arg[i].output.address;  // NOLINT
    arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
    arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
+
+    expand_conv_arg(&arg->conv_arg[i]);
  }
  filter->reset_data_ptr(nullptr);
  fpga_free(bs_ptr);
-}
+}  // fill_split_arg
+
 void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
                     framework::Tensor *out, framework::Tensor *filter,
                     bool relu_enabled, int group_num, int stride_h,
@@ -277,28 +446,27 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
  auto out_ptr = out->data<float>();

  arg->group_num = (uint32_t)group_num;
-  arg->sub_conv_num = stride_h;
+  arg->sub_conv_num = (uint32_t)stride_h;
  arg->filter_num = (uint32_t)filter->dims()[0];
-
  int sub_conv_num = arg->sub_conv_num;
  int sub_stride = 1;
-  int sub_pad = deconv_filter::deconv_calc_sub_pad(filter->dims()[3], padding_w,
-                                                   stride_w);
-  int sub_filter_width =
-      deconv_filter::deconv_get_sub_filter_axis(filter->dims()[3], stride_w);
+  int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],
+                                                   padding_w, stride_w);
+  int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis(
+      (int)filter->dims()[3], stride_w);

  int sub_output_width = deconv_filter::deconv_get_sub_out_axis(
-      input->dims()[3], sub_pad, sub_filter_width);
+      (int)input->dims()[3], sub_pad, sub_filter_width);
  int sub_output_height = deconv_filter::deconv_get_sub_out_axis(
-      input->dims()[2], sub_pad, sub_filter_width);
+      (int)input->dims()[2], sub_pad, sub_filter_width);

-  arg->sub_output_width = sub_output_width;
-  arg->sub_output_height = sub_output_height;
-  arg->omit_size =
-      deconv_filter::deconv_get_omit(stride_w, filter->dims()[3], padding_w);
+  arg->sub_output_width = (uint32_t)sub_output_width;
+  arg->sub_output_height = (uint32_t)sub_output_height;
+  arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
+      stride_w, (int)filter->dims()[3], padding_w);
  arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs));

-  int sub_channels = (int32_t)input->dims()[1];
+  int sub_channels = (int)input->dims()[1];
  int omit_size = arg->omit_size;
  int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
  int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
@@ -318,42 +486,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,

  for (int i = 0; i < sub_conv_num; ++i) {
    arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num);
-    arg->conv_args[i].group_num = group_num;
+    arg->conv_args[i].group_num = (uint32_t)group_num;

    arg->conv_args[i].filter_scale_address = filter->scale;
    arg->conv_args[i].relu_enabled = relu_enabled;

-    arg->conv_args[i].kernel.width = sub_filter_width;
-    arg->conv_args[i].kernel.height = sub_filter_width;
+    arg->conv_args[i].kernel.width = (uint32_t)sub_filter_width;
+    arg->conv_args[i].kernel.height = (uint32_t)sub_filter_width;
    arg->conv_args[i].kernel.stride_w = 1;
    arg->conv_args[i].kernel.stride_h = 1;

    // DeconvParam.conv_args[i].image.address = (void*)ptr_image;
    arg->conv_args[i].image.scale_address = input->scale;
-    arg->conv_args[i].image.channels = sub_channels;
+    arg->conv_args[i].image.channels = (uint32_t)sub_channels;
    arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
    arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
-    arg->conv_args[i].image.pad_width = sub_pad;
-    arg->conv_args[i].image.pad_height = sub_pad;
+    arg->conv_args[i].image.pad_width = (uint32_t)sub_pad;
+    arg->conv_args[i].image.pad_height = (uint32_t)sub_pad;
    arg->conv_args[i].image.address = input_ptr;
-
    arg->conv_args[i].sb_address = (void *)bs_ptr;

-    char *filter_sub_space =
+    auto filter_sub_space =
        (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char));
    fpga_copy(filter_sub_space,
              (char *)filter_ptr + i * align_conv_sub_filter_count,
-              align_conv_sub_filter_count);
+              (size_t)align_conv_sub_filter_count);
    arg->conv_args[i].filter_address = (void *)(filter_sub_space);
-    fpga_flush(filter_sub_space, align_conv_sub_filter_count);
+    fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count);

    if (sub_conv_num == 1) {
      arg->conv_args[i].output.address = out_ptr;
      arg->conv_args[i].output.scale_address = out->scale;
    } else {
-      half *ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
+      auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
      arg->conv_args[i].output.address = (void *)((half *)ptr_output);
-      float *ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
+      auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
      arg->conv_args[i].output.scale_address = ptr_output_scale;
    }
  }
@@ -361,6 +528,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
  // fpga_free(filter_ptr);
-}
+}  // fill_deconv_arg
+
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/bias_scale.h
+++ b/src/fpga/V1/bias_scale.h
@@ -14,8 +14,6 @@ limitations under the License. */

 #pragma once

-#define BS_NUM_ALIGNMENT 8
-
 namespace paddle_mobile {
 namespace fpga {
 namespace bias_scale {

--- a/src/fpga/V1/deconv_bias_scale.h
+++ b/src/fpga/V1/deconv_bias_scale.h
@@ -14,8 +14,6 @@ limitations under the License. */

 #pragma once

-#define BS_NUM_ALIGNMENT 8
-
 namespace paddle_mobile {
 namespace fpga {
 namespace deconv_bias_scale {

--- a/src/fpga/V1/filter.h
+++ b/src/fpga/V1/filter.h
@@ -14,9 +14,6 @@ limitations under the License. */

 #pragma once

-#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
-#define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
-
 namespace paddle_mobile {
 namespace fpga {
 namespace filter {

--- a/src/fpga/V1/image.cpp
+++ b/src/fpga/V1/image.cpp
@@ -111,25 +111,37 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
  fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
 }

-void split_image(int16_t *image_in, float *scale_in, void **images_out,
-                 float **scales_out, int image_num, uint32_t *channel_nums,
-                 int height, int width) {
+void split_image(int16_t *image_in, const float *scale_in, void **images_out,
+                 float **scales_out, int image_num,
+                 const uint32_t *channel_nums, int height, int width) {  // splits image_in into image_num outputs by channel
  int total_channel = 0;
  for (int i = 0; i < image_num; i++) {
    scales_out[i][0] = scale_in[0];
    scales_out[i][1] = scale_in[1];
    total_channel += channel_nums[i];
  }
+  int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT);  // input size: height rows of aligned width*total_channel
+  fpga_invalidate(image_in, element_num * sizeof(int16_t));  // NOTE(review): presumably drops stale cached input before the CPU reads it -- confirm

+  int src_offset = 0, des_offset = 0;
  for (int h = 0; h < height; h++) {
-    int src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT);
-    for (int i = 0; i < image_num; i++) {
-      int des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT);
-      memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset,
-             channel_nums[i] * sizeof(int16_t));
-      src_offset += channel_nums[i];
+    for (int w = 0; w < width; w++) {  // handle one pixel (h, w) at a time
+      src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) +
+                   w * total_channel;  // start of pixel (h, w) in the aligned input row
+      for (int i = 0; i < image_num; i++) {
+        des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
+                     w * channel_nums[i];  // same pixel in output i, using that output's own row stride
+        memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset,
+               channel_nums[i] * sizeof(int16_t));
+        src_offset += channel_nums[i];  // advance to the channels belonging to the next output
+      }
    }
  }
+
+  for (int i = 0; i < image_num; i++) {
+    element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT);  // size of output i including row alignment
+    fpga_flush(images_out[i], element_num * sizeof(int16_t));  // NOTE(review): presumably writes the CPU copy back for the FPGA -- confirm
+  }
 }

 }  // namespace image

--- a/src/fpga/V1/image.h
+++ b/src/fpga/V1/image.h
@@ -14,9 +14,8 @@ limitations under the License. */

 #pragma once

-#include <stdint.h>
+#include <cstdint>

-#define IMAGE_ALIGNMENT 16  // Aligned to 16
 namespace paddle_mobile {
 namespace fpga {
 namespace image {
@@ -24,13 +23,16 @@ namespace image {
 void convert_to_hwc(float** data_in, int channel, int height, int width);
 void align_element_conv(float** data_in, int height, int cw);
 void format_image(float** data_in, int channel, int height, int width);
+
+// Concat featuremaps along channel direction
 void concat_images(int16_t** images_in, float** scales_in, void* image_out,
                   float* scale_out, int image_num, uint32_t* channel_num,
-                   int height,
-                   int width);  // Concat featuremaps along channel direction
-void split_image(int16_t* image_in, float* scale_in, void** images_out,
-                 float** scales_out, int image_num, uint32_t* channel_nums,
-                 int height, int width);
+                   int height, int width);
+
+// Split featuremap along channel direction
+void split_image(int16_t* image_in, const float* scale_in, void** images_out,
+                 float** scales_out, int image_num,
+                 const uint32_t* channel_nums, int height, int width);
 }  // namespace image
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/pe.cpp
+++ b/src/fpga/V1/pe.cpp
@@ -203,29 +203,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
 #endif
-  cout << "    relu_enabled:" << args.relu_enabled
-       << "    sb_address:" << args.sb_address
-       << "    filter_address:" << args.filter_address
-       << "    filter_num:" << args.filter_num
-       << "    group_num:" << args.group_num;
-  cout << "    image_address:" << args.image.address
-       << "    image_scale_address:" << args.image.scale_address
-       << "    image_channels:" << args.image.channels
-       << "    image_height:" << args.image.height
-       << "    image_width:" << args.image.width
-       << "    pad_height:" << args.image.pad_height
-       << "    pad_width:" << args.image.pad_width;
-  cout << "    kernel_height:" << args.kernel.height
-       << "    kernel_width:" << args.kernel.width
-       << "    stride_h:" << args.kernel.stride_h
-       << "    stride_w:" << args.kernel.stride_w;
-  cout << "    out_address:" << args.output.address
-       << "    out_scale_address:" << args.output.scale_address;

 #ifdef PADDLE_MOBILE_ZU5
-  DLOG << "Conv";
-  // return 0;
-  uint64_t timer_cnt;
+  int ret = 0;
+  uint64_t output_scale = 0;
+  /*
  uint64_t output_scale;
  uint64_t image_scale;
  uint64_t filter_scale;
@@ -233,14 +215,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  uint64_t sb_address_phy = 0;
  uint64_t filter_address_phy = 0;
  uint64_t output_address_phy = 0;
-  int ret = 0;
+

  fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float));
  fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float));
-
-  cout << "image_scale :" << hex << (image_scale) << endl;
-  cout << "filter_scale :" << hex << (filter_scale) << endl;
-
  uint64_t filterlen = (uint64_t)args.kernel.width *
                       (uint64_t)args.kernel.height *
                       (uint64_t)args.image.channels;
@@ -349,8 +327,8 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  filter_address_phy = vaddr_to_paddr(args.filter_address);
  output_address_phy = vaddr_to_paddr(args.output.address);

-  /*SDK刷Cache保证数据一致性*/
  uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
+*/

  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
@@ -359,78 +337,63 @@ int ComputeBasicConv(const struct ConvArgs &args) {
    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
    return ret;
  }
-
-  /*restart scale*/
  reg_writeq(output_scale, REG_SCALE_PARAMETER);
-
-  reg_writeq(image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
-  reg_writeq(filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
-  reg_writeq(sb_address_phy, REG_CONV_SB_BASE_ADDR);
-  reg_writeq(output_address_phy, REG_CONV_RESULT_BASE_ADDR);
-
  reg_writeq(
      ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
      REG_CONV_IMAGE_PIXEL);
  reg_writeq(
      ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
      REG_CONV_FILTER_PIXEL);
-  reg_writeq(output_height | (output_width << 32), REG_CONV_RESULT_PIXEL);
+  reg_writeq(args.driver.output_height | (args.driver.output_width << 32),
+             REG_CONV_RESULT_PIXEL);
  reg_writeq(((uint64_t)args.image.pad_height) |
                 (((uint64_t)args.image.pad_width) << 32),
             REG_CONV_PAD_PIXEL);
  reg_writeq(((uint64_t)args.kernel.stride_h) |
                 (((uint64_t)args.kernel.stride_w) << 32),
             REG_CONV_STEP_PIXEL);
-
  reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
  reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
  reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
+  reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE);
+  reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE);
+
+  reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
+  reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
+  reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
+  reg_writeq(args.driver.output_address_phy, REG_CONV_RESULT_BASE_ADDR);
+  reg_writeq(args.driver.filter_per_group, REG_CONV_FILTER_PER_GROUP);
+  reg_writeq(args.driver.channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
+  reg_writeq(args.driver.image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
+  reg_writeq(args.driver.image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
+  reg_writeq(args.driver.filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
+  reg_writeq(args.driver.output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
+  reg_writeq(args.driver.image_block_amount_per_row, 0xca8);
+  reg_writeq(args.driver.filter_pad_width_mul_channel, 0xcb0);
+  reg_writeq(args.driver.image_amount_per_row_multi_win_first, 0xcb8);
+  reg_writeq(args.driver.image_amount_per_row_multi_win, 0xcc0);
+  reg_writeq(args.driver.image_block_num, 0xcc8);
+  reg_writeq(args.driver.image_block_len, 0xcd0);
+  reg_writeq(args.driver.image_block_len_last, 0xcd8);
+  reg_writeq(args.driver.image_win_cnt, 0xce0);
+  reg_writeq(args.driver.image_win_cnt_last, 0xce8);
+  reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8);
+  reg_writeq(args.driver.prog_full_cnt, 0xd08);
+  reg_writeq(args.driver.post_prog_full_cnt, 0xd10);
+  reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
+
+  reg_writeq(args.driver.cmd, REG_CONV_CMD);

-  reg_writeq(filter_per_group, REG_CONV_FILTER_PER_GROUP);
-  reg_writeq(channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
-
-  reg_writeq(image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
-  reg_writeq(image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
-  reg_writeq(filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
-  reg_writeq(output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
-
-  reg_writeq(image_block_amount_per_row, 0xca8);
-  reg_writeq(filter_pad_width_mul_channel, 0xcb0);
-  reg_writeq(image_amount_per_row_multi_win_first, 0xcb8);
-  reg_writeq(image_amount_per_row_multi_win, 0xcc0);
-  reg_writeq(image_block_num, 0xcc8);
-  reg_writeq(image_block_len, 0xcd0);
-  reg_writeq(image_block_len_last, 0xcd8);
-  reg_writeq(image_win_cnt, 0xce0);
-  reg_writeq(image_win_cnt_last, 0xce8);
-  reg_writeq(res_row_data_align4_pad, 0xcf8);
-  reg_writeq(prog_full_cnt, 0xd08);
-  reg_writeq(post_prog_full_cnt, 0xd10);
-  reg_writeq(fpga_bias_scale_len / 4, 0xd20);
-
-  /*write scale*/
-  reg_writeq(image_scale, REG_CONV_IMAGE_SCALE);
-  reg_writeq(filter_scale, REG_CONV_FILTER_SCALE);
-
-  reg_writeq(cmd, REG_CONV_CMD);
-
-  DLOG << "before reg poll";
  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
    g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
    ret = -EIO;
    DLOG << "Conv Wait Irq Timeout!";
  }
-  DLOG << "after reg poll";
-  usleep(40);
-
-  /*SDK 无效 Cache保证数据一致性*/

  output_scale = reg_readq(REG_SCALE_PARAMETER);
  output_scale = (output_scale << 32) | (output_scale >> 32);
  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
-  cout << "output_scale :" << hex << (output_scale) << endl;

-  //*(args.output.scale_address) = output_scale;
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);

  return ret;
@@ -575,9 +538,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
    DLOG << "Pooling Wait Irq Timeout!";
  }
  DLOG << "after reg poll";
-  usleep(40);
-
-  /*SDK 无效 Cache保证数据一致性*/

  // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
  output_scale = reg_readq(REG_SCALE_PARAMETER);
@@ -615,11 +575,9 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
       << "   out_scale_address:" << args.output.scale_address;
 #endif
 #ifdef PADDLE_MOBILE_ZU5
-  DLOG << "Conv";
-  // return 0;
  int ret = 0;
  uint64_t output_scale = 0;
-  uint64_t timer_cnt = 0;
+  /*uint64_t timer_cnt = 0;
  uint64_t image0_address_phy = 0;
  uint64_t image1_address_phy = 0;
  uint64_t output_address_phy = 0;
@@ -629,54 +587,44 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
                     (uint64_t)args.image0.height *
                     (uint64_t)args.image0.channels;
  uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
-
-  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
-  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
-    ret = -EIO;
-    DLOG << "Conv Status Error!";
-    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
-    return ret;
-  }
-
  image0_address_phy = vaddr_to_paddr(args.image0.address);
  image1_address_phy = vaddr_to_paddr(args.image1.address);
  output_address_phy = vaddr_to_paddr(args.output.address);

  uint64_t image_amount_per_row =
-      align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
-                 IMAGE_ALIGN);
+  align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
+             IMAGE_ALIGN);
  uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
                               ((uint64_t)args.image0.width << 16) |
-                               (uint64_t)args.image0.height;
+                               (uint64_t)args.image0.height;*/

-  /*SDK刷Cache保证数据一致性*/
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
+    ret = -EIO;
+    DLOG << "EW Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+    return ret;
+  }

-  /*restart scale*/
  reg_writeq(output_scale, REG_SCALE_PARAMETER);
-
-  reg_writeq(image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
-  reg_writeq(image1_address_phy, REG_EW_IMAGE1_BASE_ADDR);
-  reg_writeq(datalen, REG_EW_DATA_LEN);
-  reg_writeq(image_image_pixel, REG_EW_IMAGE_PIXEL);
-  reg_writeq(image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW);
-
-  reg_writeq(output_address_phy, REG_EW_RESULT_BASE_ADDR);
-  reg_writeq(coefficient, REG_EW_COEFFICIENT);
-
-  reg_writeq(cmd, REG_EW_CMD);
+  reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
+  reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR);
+  reg_writeq(args.driver.datalen, REG_EW_DATA_LEN);
+  reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL);
+  reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW);
+  reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR);
+  reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT);
+  reg_writeq(args.driver.cmd, REG_EW_CMD);

  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
-    g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
+    g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR;
    ret = -EIO;
    DLOG << "EW Wait Irq Timeout!";
  }
-  usleep(40);

-  /*SDK 无效 Cache保证数据一致性*/
  output_scale = reg_readq(REG_SCALE_PARAMETER);
  output_scale = (output_scale << 32) | (output_scale >> 32);
  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
-
  //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
  //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
@@ -802,9 +750,7 @@ int PerformBypass(const struct BypassArgs &args) {
    DLOG << "BYPASS Wait Irq Timeout!";
  }
  DLOG << "after reg poll";
-  usleep(40);

-  /*SDK 无效 Cache保证数据一致性*/
  output_scale = reg_readq(REG_SCALE_PARAMETER);
  output_scale = (output_scale << 32) | (output_scale >> 32);
  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
@@ -883,8 +829,9 @@ void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel,
  *data_in = ptr_deconv;
  fpga_free(ptr_tmp);
 }
+
 int ComputeFpgaDeconv(const struct DeconvArgs &args) {
-#ifdef FPGA_TEST_MODE
+#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFPGADeConv===========";
  DLOG << "   filter_num:" << args.filter_num
       << "   group_num:" << args.group_num

--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -146,11 +146,11 @@ int format_conv_data(framework::Tensor *filter_tensor,
 }

 int format_fc_data(framework::Tensor *filter_tensor,
-                   framework::Tensor *ofm_tensor, float *bs_ptr) {
+                   framework::Tensor *ofm_tensor, float **bs_ptr) {
  float max_value = fpga::filter_find_max(filter_tensor);
  fpga::format_fc_filter(filter_tensor, max_value);
  int aligned_num = get_aligned_filter_num(filter_tensor);
-  fpga::format_bias_scale_array(&bs_ptr,
+  fpga::format_bias_scale_array(bs_ptr,
                                (int)filter_tensor->dims()[0],  // NOLINT
                                aligned_num);
  int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
@@ -214,7 +214,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
    arg->conv_arg[i].output.scale_address = out->scale;

    int num_after_alignment = filter::calc_aligned_num(
-        (int)input->dims()[1], arg->filter_num);  // NOLINT
+        arg->filter_num, (int)input->dims()[1]);  // NOLINT
    arg->conv_arg[i].free_space =
        fpga_malloc(num_after_alignment * 2 * sizeof(half));
  }

--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -41,7 +41,7 @@ void format_concat_output(framework::Tensor* out, int height, int width,
 int format_conv_data(framework::Tensor* filter_tensor,
                     framework::Tensor* ofm_tensor, float** bs_ptr, int group);
 int format_fc_data(framework::Tensor* filter_tensor,
-                   framework::Tensor* ofm_tensor, float* bs_ptr);
+                   framework::Tensor* ofm_tensor, float** bs_ptr);
 void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
                    framework::Tensor* out, framework::Tensor* filter,
                    bool relu_enabled, int group_num, int stride_h,

--- a/src/fpga/common/driver.cpp
+++ b/src/fpga/common/driver.cpp
@@ -137,8 +137,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {

  for (i = 0; i < timeout; i++) {
    if (val == reg_readq(reg)) {
-      std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg
-                << std::endl;
      break;
    }
  }
@@ -401,8 +399,6 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) {
  DLOG << "dest:" << dest << " src:" << src << " size:" << num;

  for (i = 0; i < num; i++) {
-    // DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
-    // usleep(1);
    *((int8_t *)dest + i) = *((int8_t *)src + i);  // NOLINT
  }


--- a/src/fpga/common/driver.h
+++ b/src/fpga/common/driver.h
@@ -103,22 +103,15 @@ struct FPGA_INFO {
 extern struct FPGA_INFO g_fpgainfo;

 inline uint64_t reg_readq(uint32_t offset) {
-  // DLOG << "offset : " << offset;
  uint64_t value =
      *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
                             offset);                                // NOLINT
-  // DLOG << "read end";
-  usleep(10);
-
  return value;
 }

 inline void reg_writeq(uint64_t value, uint32_t offset) {
-  // DLOG << "offset : " << offset << ", value : " << value;
  *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
                         offset) = value;
-  // DLOG << "write end";
-  usleep(10);
 }

 int open_device_driver();

--- a/src/fpga/common/fpga_common.cpp
+++ b/src/fpga/common/fpga_common.cpp
@@ -92,7 +92,8 @@ void fpga_free(void *ptr) {
 }
 void fpga_copy(void *dest, const void *src, size_t num) {
 #ifdef PADDLE_MOBILE_ZU5
-  driver::fpga_copy_driver(dest, src, num);
+  // driver::fpga_copy_driver(dest, src, num);
+  memcpy(dest, src, num);
 #else
  memcpy(dest, src, num);
 #endif

--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -20,6 +20,13 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {

+#ifdef PADDLE_MOBILE_FPGA_V1
+#define IMAGE_ALIGNMENT 16           // Aligned to 16
+#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
+#define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
+#define BS_NUM_ALIGNMENT 8  // presumably bias/scale count alignment; confirm
+#endif
+
 enum DataType {
  DATA_TYPE_FP32 = 1,
  DATA_TYPE_FP16 = 0,
@@ -52,19 +59,70 @@ struct ImageOutputArgs {
  float* scale_address;  // output scale address;
  uint64_t timer_cnt;    // time counter for FPGA computation
 };
+#ifdef PADDLE_MOBILE_FPGA_V1
+struct ConvDriverParam {  // NOTE(review): presumably set by expand_conv_arg()
+  uint64_t image_address_phy;
+  uint64_t filter_address_phy;
+  uint64_t sb_address_phy;
+  uint64_t output_address_phy;
+
+  uint64_t output_height;      // TODO confirm: (h + 2*pad_h - k_h)/stride_h + 1
+  uint64_t output_width;       // TODO confirm: (w + 2*pad_w - k_w)/stride_w + 1
+  uint64_t filter_per_group;   // TODO confirm: filter_num / group_num
+  uint64_t channel_per_group;  // TODO confirm: image.channels / group_num
+
+  uint64_t image_amount_per_row;   // TODO confirm: aligned width * channels
+  uint64_t image_one_pad_per_row;  // TODO confirm: aligned row + pad_w * ch
+  uint64_t filter_amount_all;
+  uint64_t output_amount_per_row;
+
+  uint64_t image_block_amount_per_row;
+  uint64_t filter_pad_width_mul_channel;
+  uint64_t image_amount_per_row_multi_win_first;
+  uint64_t image_amount_per_row_multi_win;
+  uint64_t image_block_num;
+  uint64_t image_block_len;
+  uint64_t image_block_len_last;
+  uint64_t image_win_cnt;
+  uint64_t image_win_cnt_last;
+  uint64_t res_row_data_align4_pad;
+  uint64_t prog_full_cnt;
+  uint64_t post_prog_full_cnt;
+  uint64_t fpga_bias_scale_len;  // TODO confirm: align8(filters/group) * groups
+  uint64_t cmd;
+};
+
+struct EWAddDriverParam {  // one reg_writeq() per field, to the register noted
+  uint64_t image0_address_phy;    // REG_EW_IMAGE0_BASE_ADDR
+  uint64_t image1_address_phy;    // REG_EW_IMAGE1_BASE_ADDR
+  uint64_t datalen;               // REG_EW_DATA_LEN
+  uint64_t image_image_pixel;     // REG_EW_IMAGE_PIXEL
+  uint64_t image_amount_per_row;  // REG_EW_IMAGE_AMOUNT_PER_ROW
+  uint64_t output_address_phy;    // REG_EW_RESULT_BASE_ADDR
+  uint64_t coefficient;           // REG_EW_COEFFICIENT
+  uint64_t cmd;                   // REG_EW_CMD (written last)
+};
+#endif

 struct ConvArgs {
  bool relu_enabled;
  void* sb_address;  // scale and bias
  void* filter_address;
  float* filter_scale_address;
-  void* free_space;  // used by FPGA logic
  uint32_t filter_num;
  uint32_t group_num;

  struct KernelArgs kernel;
  struct ImageInputArgs image;  // input image;
  struct ImageOutputArgs output;
+
+#ifdef PADDLE_MOBILE_FPGA_V2
+  void* free_space;  // used by FPGA logic
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA_V1
+  struct ConvDriverParam driver;
+#endif
 };

 struct ConcatArgs {
@@ -115,6 +173,9 @@ struct EWAddArgs {
  struct ImageInputArgs image0;
  struct ImageInputArgs image1;
  struct ImageOutputArgs output;
+#ifdef PADDLE_MOBILE_FPGA_V1
+  struct EWAddDriverParam driver;
+#endif
 };

 struct BypassArgs {
@@ -150,5 +211,9 @@ void fpga_copy(void* dest, const void* src, size_t num);
 int fpga_flush(void* address, size_t size);
 int fpga_invalidate(void* address, size_t size);

+uint64_t vaddr_to_paddr(void* address);
+void expand_conv_arg(ConvArgs* arg);
+void expand_EW_arg(EWAddArgs* arg);
+
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/common/pe.h
+++ b/src/fpga/common/pe.h
@@ -26,6 +26,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs& args);
 int ComputeFpgaConv(const struct SplitConvArgs& args);
 int ComputeFPGAConcat(const struct ConcatArgs& args);
 int ComputeFPGASplit(const struct SplitArgs& args);
+int ComputeFpgaDeconv(const struct DeconvArgs& args);

 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/operators/kernel/arm/quantize_kernel.cpp
+++ b/src/operators/kernel/arm/quantize_kernel.cpp
@@ -19,10 +19,12 @@ limitations under the License. */

 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
+#endif

 namespace paddle_mobile {
 namespace operators {

+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #ifndef __aarch64__
 inline float32_t vmaxvq_f32(float32x4_t r) {
  float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));

--- a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
@@ -49,6 +49,7 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
  ewaddArgs.image1.pad_width = 0;
  ewaddArgs.output.scale_address = out->scale;
  ewaddArgs.output.address = out_ptr;
+  fpga::expand_EW_arg(&ewaddArgs);
  param->SetFpgaArgs(ewaddArgs);
  return true;
 }

--- a/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
@@ -50,6 +50,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
  ewaddArgs.image1.pad_width = 0;
  ewaddArgs.output.scale_address = out->scale;
  ewaddArgs.output.address = out_ptr;
+  fpga::expand_EW_arg(&ewaddArgs);
  param->SetFpgaArgs(ewaddArgs);
  return true;
 }

--- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
@@ -24,8 +24,12 @@ template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
  auto input = const_cast<Tensor *>(param->InputX());
  auto input_ptr = input->data<float>();
+  auto out = param->Out();
+  fpga::format_fp32_ofm(out);
+
  auto float_input = new Tensor;
-  float_input->mutable_data<float>({1, input->dims()[1]});
+  float_input->mutable_data<float>(
+      {1, input->dims()[2], input->dims()[3], input->dims()[1]});
  fpga::format_fp32_ofm(float_input);

  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
@@ -34,8 +38,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.image.address = input_ptr;
-  args.image.height = 1;
-  args.image.width = 1;
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = float_input->data<float>();
  args.output.scale_address = float_input->scale;
@@ -50,9 +54,9 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
  Tensor *out = param.Out();

  fpga::PerformBypass(param.FpgaArgs());
-  fpga::fpga_invalidate(
-      (void *)in_x->data<float>(),  // NOLINT
-      fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float));
+  fpga::fpga_invalidate((void *)in_x->data<float>(),  // NOLINT
+                        in_x->numel() * sizeof(float));
+  // TODO: In general case, 0 should be squeezed before softmax input
  math::SoftmaxFuntor<CPU, float>()(in_x, out);
  fpga::fpga_flush(out->data<float>(), out->memory_size());
 }

--- a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
@@ -21,7 +21,7 @@ namespace operators {
 template <>
 bool ElementwiseAddReluKernel<FPGA, float>::Init(
    ElementwiseAddReluParam<FPGA> *param) {
-  bool relu_enabled = false;
+  bool relu_enabled = true;
  auto *input_x = const_cast<LoDTensor *>(param->InputX());
  auto *input_y = const_cast<LoDTensor *>(param->InputY());
  auto *out = param->Out();

--- a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
@@ -47,7 +47,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {

  out->Resize(framework::make_ddim({1, channel, 1, 1}));
  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
-  fpga::format_fc_data(filter, out, bs_ptr);
+  fpga::format_fc_data(filter, out, &bs_ptr);

  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,

--- a/src/operators/math/depthwise_conv3x3_int8_arm64.cpp
+++ b/src/operators/math/depthwise_conv3x3_int8_arm64.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// NEON detection accepts both spellings: the legacy __ARM_NEON__ and the ACLE
+// __ARM_NEON, which is the only one some aarch64 toolchains define. Testing
+// __ARM_NEON__ alone would silently compile these specializations out there.
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(__aarch64__)
+
+#include "operators/math/depthwise_conv3x3.h"
+#include <arm_neon.h>
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+// template<>
+// void DepthwiseConv3x3<int8_t, int32_t>(
+//     const framework::Tensor *input, const framework::Tensor *filter,
+//     const std::vector<int> &strides, framework::Tensor *output) {
+//   PADDLE_MOBILE_THROW_EXCEPTION(
+//       "Depthwise conv with generic strides has not been implemented.");
+// }
+
+/**
+ * Placeholder <int8_t, int32_t> specialization of the stride-1 depthwise 3x3
+ * convolution for arm v8 builds.
+ *
+ * No computation is performed and none of the arguments are read: the body
+ * only invokes PADDLE_MOBILE_THROW_EXCEPTION with a "not implemented" message.
+ */
+template <>
+void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
+                                         const framework::Tensor &filter,
+                                         const std::vector<int> &paddings,
+                                         framework::Tensor *output) {
+  PADDLE_MOBILE_THROW_EXCEPTION(
+      "Depthwise conv3x3 with stride 1 for arm v8 has not been implemented.");
+}
+
+/**
+ * Placeholder <int8_t, int32_t> specialization of the stride-2 depthwise 3x3
+ * convolution for arm v8 builds.
+ *
+ * No computation is performed and none of the arguments are read: the body
+ * only invokes PADDLE_MOBILE_THROW_EXCEPTION with a "not implemented" message.
+ */
+template <>
+void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
+                                         const framework::Tensor &filter,
+                                         const std::vector<int> &paddings,
+                                         framework::Tensor *output) {
+  PADDLE_MOBILE_THROW_EXCEPTION(
+      "Depthwise conv3x3 with stride 2 for arm v8 has not been implemented.");
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // (__ARM_NEON__ || __ARM_NEON) && __aarch64__
--- a/src/operators/math/winograd/winograd_transform_f6k3_arm64.cpp
+++ b/src/operators/math/winograd/winograd_transform_f6k3_arm64.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// Inspired by https://arxiv.org/abs/1509.09308, with reference to the nnpack
+// and ncnn projects.
+
+#ifdef CONV_OP
+
+#ifdef __aarch64__
+
+#include "operators/math/pad.h"
+#include "operators/math/winograd/winograd_transform.h"
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+template <>
+void winograd_transform_weight<8, 3>(const framework::Tensor &weight,
+                                     framework::Tensor *output) {
+  /*
+   * Placeholder for aarch64 builds (this file is compiled only when both
+   * CONV_OP and __aarch64__ are defined). Nothing is transformed: neither
+   * `weight` nor `output` is accessed, and the only statement is the
+   * PADDLE_MOBILE_THROW_EXCEPTION call below.
+   *
+   * Formulas kept here as the specification for the future implementation.
+   * NOTE(review): g0..g2 presumably are the three kernel taps along one axis
+   * and w0..w7 the eight transformed values -- confirm against the existing
+   * non-aarch64 implementation.
+   *
+   * w0 = g0
+   * w1 = ((g0 + g2) + g1) * (-2.0 / 9)
+   * w2 = ((g0 + g2) - g1) * (-2.0 / 9)
+   * w3 = ((g0 + 4 * g2) + 2 * g1) * (1.0 / 90)
+   * w4 = ((g0 + 4 * g2) - 2 * g1) * (1.0 / 90)
+   * w5 = ((g2 + 4 * g0) + 2 * g1) * (1.0 / 180)
+   * w6 = ((g2 + 4 * g0) - 2 * g1) * (1.0 / 180)
+   * w7 = g2
+   */
+  // TODO(hjchen2): implement the weight transform for arm v8.
+  PADDLE_MOBILE_THROW_EXCEPTION(
+      "Winograd for arm v8 has not been implemented.");
+}
+
+template <>
+void winograd_transform_input<8, 3>(const framework::Tensor &input,
+                                    framework::Tensor *output) {
+  /*
+   * Placeholder for aarch64 builds (this file is compiled only when both
+   * CONV_OP and __aarch64__ are defined). Nothing is transformed: neither
+   * `input` nor `output` is accessed, and the only statement is the
+   * PADDLE_MOBILE_THROW_EXCEPTION call below.
+   *
+   * Formulas kept here as the specification for the future implementation.
+   * NOTE(review): d0..d7 presumably are eight consecutive input samples of
+   * one tile row/column and x0..x7 the transformed values -- confirm against
+   * the existing non-aarch64 implementation.
+   *
+   * x0 = (d0 - d6) + (d4 - d2) * 5.25
+   * x1 = (d2 + d6) - 4.25 * (d4 + d3) + (d1 + d5)
+   * x2 = (d2 + d6) - 4.25 * (d4 - d3) - (d1 + d5)
+   * x3 = (0.25 * d2 - 1.25 * d4 + d6) + (0.5 * d1 - 2.5 * d3 + 2 * d5)
+   * x4 = (0.25 * d2 - 1.25 * d4 + d6) - (0.5 * d1 - 2.5 * d3 + 2 * d5)
+   * x5 = (4 * d2 - 5 * d4 + d6) + (2 * d1 - 2.5 * d3 + 0.5 * d5)
+   * x6 = (4 * d2 - 5 * d4 + d6) - (2 * d1 - 2.5 * d3 + 0.5 * d5)
+   * x7 = (d7 - d1) + (d3 - d5) * 5.25
+   */
+  // TODO(hjchen2): implement the input transform for arm v8.
+  PADDLE_MOBILE_THROW_EXCEPTION(
+      "Winograd for arm v8 has not been implemented.");
+}
+
+template <>
+void winograd_transform_output<8, 3>(const framework::Tensor &input,
+                                     const framework::Tensor &weight,
+                                     framework::Tensor *output) {
+  /*
+   * TODO(hjchen2): implement the output transform for arm v8.
+   *
+   * Placeholder for aarch64 builds (this file is compiled only when both
+   * CONV_OP and __aarch64__ are defined). Nothing is computed: `input`,
+   * `weight` and `output` are not accessed, and the only statement is the
+   * PADDLE_MOBILE_THROW_EXCEPTION call below.
+   */
+  PADDLE_MOBILE_THROW_EXCEPTION(
+      "Winograd for arm v8 has not been implemented.");
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // __aarch64__
+#endif  // CONV_OP