fix some bugs in fpga V2 track and update fpga V2 pe code

7a8b998f · qnqinan · 344c1df7 · 7a8b998f · 7a8b998f · 7a8b998f
6 changed files
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -204,7 +204,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,

    arg->conv_arg[i].image.address = input_ptr;
    arg->conv_arg[i].image.scale_address = input->scale;
-    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
+    arg->conv_arg[i].image.channels =
+        (uint32_t)get_aligned_channel_num((int)(input->dims()[1]));  // NOLINT
    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
@@ -216,7 +217,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
    int num_after_alignment = filter::calc_aligned_num(
        arg->filter_num, (int)input->dims()[1]);  // NOLINT
    arg->conv_arg[i].free_space =
-        fpga_malloc(num_after_alignment * 2 * sizeof(half));
+        fpga_malloc(num_after_alignment * 2 * sizeof(float));  // half
  }
 }


--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <memory.h>
 #include <algorithm>
 #include "fpga/common/fpga_common.h"
-
 namespace paddle_mobile {
 namespace fpga {
 namespace filter {
@@ -88,12 +87,25 @@ void align_filter(float **data_in, int num, int channel, int height,
  *data_in = new_data;
  fpga_free(temp);
 }
-
+// Swaps the buffer behind *data_in for a freshly fpga_malloc'ed array of
+// data_size 16-bit values, one fp32_2_fp16 result per source float, then
+// hands the old buffer to fpga_free. *data_in keeps its float* type only.
+void convert_to_fp16(float **data_in, int data_size) {
+  float *fp32_src = *data_in;
+  int16_t *fp16_dst =
+      (int16_t *)fpga_malloc(data_size * sizeof(int16_t));  // NOLINT
+  for (int idx = 0; idx < data_size; ++idx) {
+    fp16_dst[idx] = fp32_2_fp16(fp32_src[idx]);
+  }
+  *data_in = (float *)fp16_dst;  // NOLINT
+  fpga_free(fp32_src);
+}
 void format_filter(float **data_in, int num, int channel, int height, int width,
                   int group_num, float max) {
  convert_to_hwc(data_in, num, channel, height, width);
  align_filter(data_in, num, channel, height, width);
  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  convert_to_fp16(data_in, pixel_num);
  fpga_flush(*data_in, pixel_num * sizeof(float));
 }

@@ -115,6 +127,7 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
  convert_fc_filter(data_in, num, chw);
  align_filter(data_in, num, channel, height, width);
  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  convert_to_fp16(data_in, pixel_num);
  fpga_flush(*data_in, pixel_num * sizeof(float));
 }


--- a/src/fpga/V2/pe.cpp
+++ b/src/fpga/V2/pe.cpp
@@ -13,40 +13,53 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "fpga/common/pe.h"
+#include "fpga/V2/api.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
 #include "fpga/common/config.h"
 #include "fpga/common/driver.h"

+using namespace std;                          // NOLINT
+using namespace paddle_mobile::fpga::driver;  // NOLINT
+
 namespace paddle_mobile {
 namespace fpga {
-#define MUL8(x) ((x)*8)
-#define BYPASS_DONE 1
+#define MUL8(x) ((x) * 8)  // parenthesized so MUL8(a + b) == (a + b) * 8
+#define BYPASS_DONE 2      // value passed to fpga_regpoll for the bypass step
+#define CONV_DONE 1        // value passed to fpga_regpoll for the conv step
+
+// Output extent along one axis: (src_len + 2 * pad - kernel_len) / kernel_step
+// + 1 with truncating int division; a kernel_step of 0 yields 0.
+static inline int get_image_out_axis(int src_len, int pad, int kernel_len,
+                                     int kernel_step) {
+  return kernel_step == 0
+             ? 0
+             : (src_len + 2 * pad - kernel_len) / kernel_step + 1;
+}

 float Findfp16Max() {
  uint16_t abs_vals[16];
  uint64_t max_fp16;

-  max_fp16 = driver::reg_readq(MUL8(49));
-  abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
-  abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
-  abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
-  abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = driver::reg_readq(MUL8(50));
-  abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
-  abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
-  abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
-  abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = driver::reg_readq(MUL8(51));
-  abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16));         // NOLINT
-  abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16));   // NOLINT
-  abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
-  abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = driver::reg_readq(MUL8(52));
-  abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16));
-  abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
-  abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
-  abs_vals[15] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+  max_fp16 = reg_readq(MUL8(49));
+  abs_vals[0] = (uint16_t)(0x0000007fff & (max_fp16));        // NOLINT
+  abs_vals[1] = (uint16_t)(0x0000007fff & (max_fp16 >> 16));  // NOLINT
+  abs_vals[2] = (uint16_t)(0x0000007fff & (max_fp16 >> 32));  // NOLINT
+  abs_vals[3] = (uint16_t)(0x0000007fff & (max_fp16 >> 48));  // NOLINT
+  max_fp16 = reg_readq(MUL8(50));
+  abs_vals[4] = (uint16_t)(0x0000007fff & (max_fp16));        // NOLINT
+  abs_vals[5] = (uint16_t)(0x0000007fff & (max_fp16 >> 16));  // NOLINT
+  abs_vals[6] = (uint16_t)(0x0000007fff & (max_fp16 >> 32));  // NOLINT
+  abs_vals[7] = (uint16_t)(0x0000007fff & (max_fp16 >> 48));  // NOLINT
+  max_fp16 = reg_readq(MUL8(51));
+  abs_vals[8] = (uint16_t)(0x0000007fff & (max_fp16));         // NOLINT
+  abs_vals[9] = (uint16_t)(0x0000007fff & (max_fp16 >> 16));   // NOLINT
+  abs_vals[10] = (uint16_t)(0x0000007fff & (max_fp16 >> 32));  // NOLINT
+  abs_vals[11] = (uint16_t)(0x0000007fff & (max_fp16 >> 48));  // NOLINT
+  max_fp16 = reg_readq(MUL8(52));
+  abs_vals[12] = (uint16_t)(0x0000007fff & (max_fp16));
+  abs_vals[13] = (uint16_t)(0x0000007fff & (max_fp16 >> 16));  // NOLINT
+  abs_vals[14] = (uint16_t)(0x0000007fff & (max_fp16 >> 32));  // NOLINT
+  abs_vals[15] = (uint16_t)(0x0000007fff & (max_fp16 >> 48));  // NOLINT

  uint16_t tmp = 0;
  for (int i = 0; i < 16; i++) {
@@ -54,6 +67,7 @@ float Findfp16Max() {
      tmp = abs_vals[i];
    }
  }
+  DLOG << "max value found: " << fp16_2_fp32(tmp);
  return fp16_2_fp32(tmp) / 127.0f;
 }

@@ -88,7 +102,473 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  return 0;
 #endif

-  return 0;
+  uint64_t ifm_pixel_num =
+      ((args.image.width) * (args.image.height) * args.image.channels);
+  uint64_t ifm_memory_size = ifm_pixel_num * sizeof(short);          // NOLINT
+  uint64_t flt_pixel_num = (args.filter_num * (args.kernel.width) *  // NOLINT
+                            (args.kernel.height) * args.image.channels);
+  uint64_t filter_memory_size = flt_pixel_num * sizeof(short);  // NOLINT
+
+  uint64_t bn_pixel_num = (args.filter_num * 2);  // NOLINT
+  uint64_t bn_memory_size = bn_pixel_num * sizeof(float);
+
+  uint64_t ofm_width =
+      ((args.image.width) + 2 * args.image.pad_width - args.kernel.width) /
+          (args.kernel.stride_w) +
+      1;
+  uint64_t ofm_height = ((args.image.height) + 2 * (args.image.pad_height) -
+                         (args.kernel.height)) /
+                            (args.kernel.stride_h) +
+                        1;
+
+  uint32_t filter_num = args.filter_num;
+  uint32_t image_channels = args.image.channels;
+
+  DLOG << "filter_num: " << filter_num;
+  uint64_t ifm_src_paddr = vaddr_to_paddr((args.image.address));
+  uint64_t flt_src_paddr = vaddr_to_paddr((args.filter_address));
+  uint64_t sb_src_paddr = vaddr_to_paddr((args.free_space));
+  uint64_t ifm_dst_paddr = vaddr_to_paddr((args.output.address));
+  /**********BN******************/
+  float image_inv_scale = (args.image.scale_address)[0];
+  float filter_inv_scale = (args.filter_scale_address)[0];
+  float scale_tmp = image_inv_scale * filter_inv_scale;
+  int idx = 0;
+  float tmp = 0.0;
+  float *convert_sb_addr = (float *)(args.free_space);  // NOLINT
+  for (idx = 0; idx < args.filter_num * 2; idx++) {
+    if (idx % 2 == 1) {
+      tmp = ((float *)(args.sb_address))[idx] * scale_tmp;  // NOLINT
+    } else {
+      tmp = ((float *)(args.sb_address))[idx];  // NOLINT
+    }
+    convert_sb_addr[idx] = tmp;  // NOLINT
+  }
+
+  fpga_flush(convert_sb_addr, args.filter_num * 2 * sizeof(float));
+  reg_writeq(1, MUL8(24));
+  usleep(1);
+  reg_writeq(0, MUL8(24));
+
+  reg_writeq(sb_src_paddr, MUL8(27));
+  reg_writeq(0, MUL8(0));
+
+  uint64_t bps_addr = 0x8c00000000000000;
+  bps_addr += bn_memory_size;
+  reg_writeq(bps_addr, MUL8(0));
+  int ret = -1;
+  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffff);
+  if (ret) {
+    DLOG << "conv bypass failed";
+    return ret;
+  }
+  reg_readq(MUL8(63));
+
+  /*********configuring registers*************/
+  uint32_t cmd_image_vir_base_addr = (uint32_t)ifm_src_paddr;
+  uint32_t cmd_filter_vir_base_addr = (uint32_t)flt_src_paddr;
+  uint32_t cmd_scale_base_addr = (uint32_t)sb_src_paddr;
+  uint32_t conv_ofm_addr_base = (uint32_t)ifm_dst_paddr;
+  uint64_t cmd_group_num = args.group_num;
+  uint64_t cmd_filter_per_group = filter_num / cmd_group_num;
+
+  uint64_t cmd_flt_sqr_len = (args.kernel.width) * (args.kernel.height);
+  uint64_t cmd_ifm_pre_row_num = 0;
+
+  if (1 == args.image.height) {
+    cmd_ifm_pre_row_num = 1;
+  } else {
+    cmd_ifm_pre_row_num =
+        (args.kernel.height) - (args.image.pad_height) + (args.kernel.stride_h);
+  }
+  uint64_t cmd_flt_pre_batch_num = 1;
+  uint64_t cmd_ifm_pack_num_per_row_mns1 =
+      (uint64_t)(((args.image.channels) + 127) / 128) - 1;
+  uint64_t cmd_bn_num = filter_num;
+  uint64_t cmd_bias_num = filter_num;
+  uint64_t cmd_ifm_stride_row_length = args.image.width * args.kernel.stride_h;
+  uint64_t cmd_flt_pack_num_per_kernel_mns1 =
+      (uint64_t)(((args.image.channels) + 127) / 128) - 1;
+  uint64_t cmd_ofm_width_mns1 = (uint64_t)(
+      ((args.image.width) - (args.kernel.width) + 2 * (args.image.pad_width)) /
+      (args.kernel.stride_w));
+  uint64_t cmd_ofm_height =
+      (uint64_t)(((args.image.height) - (args.kernel.height) +
+                  2 * (args.image.pad_height)) /
+                 (args.kernel.stride_h)) +
+      1;
+
+  uint64_t cmd_channel_num = 0;
+  uint64_t cmd_ifm_pack_len = 0;
+  uint64_t cmd_channel_per_group = 0;
+  uint64_t cmd_flt_batch_num_mns1 = 0;
+  uint64_t cmd_flt_N_impl = 8;
+  uint64_t cmd_ifm_C_impl = 16;
+  uint64_t cmd_flt_pack_length = 0;
+  uint64_t cmd_step_h_mul_row_byte_len = 0;
+  uint64_t cmd_pad_h_mul_row_byte_len = 0;
+  uint64_t cmd_ifm_pack_byte_length = 16 * ((((args.image.width) + 7) / 8) * 8);
+  uint64_t row_len_align = args.image.width;
+  if (image_channels > 64) {
+    cmd_channel_num = (uint64_t)((((args.image.channels) + 127)) / 128) * 128;
+    cmd_ifm_pack_len = 128 * (args.image.width);
+    cmd_channel_per_group = 128;
+    cmd_flt_batch_num_mns1 = (uint64_t)(((args.filter_num + 7)) / 8 - 1);
+    cmd_flt_N_impl = 8;
+    cmd_ifm_C_impl = 128;
+    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 128;
+    cmd_step_h_mul_row_byte_len =
+        (args.kernel.stride_h) * cmd_channel_num * (args.image.width);
+    cmd_pad_h_mul_row_byte_len =
+        (args.image.pad_height) * cmd_channel_num * (args.image.width);
+    cmd_ifm_pack_byte_length = 128 * (args.image.width);
+    row_len_align = args.image.width * (cmd_ifm_pack_num_per_row_mns1 + 1);
+  } else if (image_channels > 32) {
+    cmd_channel_num = 64;
+    cmd_ifm_pack_len = 64 * (args.image.width);
+    cmd_channel_per_group = 64;
+    cmd_flt_batch_num_mns1 = (uint64_t)((((args.filter_num) + 15)) / 16 - 1);
+    cmd_flt_N_impl = 16;
+    cmd_ifm_C_impl = 64;
+    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 64;
+    cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num *
+                                  ((((args.image.width) + 1)) / 2) * 2;
+    cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num *
+                                 ((((args.image.width) + 1)) / 2) * 2;
+    cmd_ifm_pack_byte_length =
+        64 * (uint64_t)((((args.image.width) + 1)) / 2) * 2;
+    row_len_align = (uint64_t)((((args.image.width) + 1)) / 2);
+  } else if (image_channels > 16) {
+    cmd_channel_num = 32;
+    cmd_ifm_pack_len = 32 * (args.image.width);
+    cmd_channel_per_group = 32;
+    cmd_flt_batch_num_mns1 = (uint64_t)((((args.filter_num) + 31)) / 32 - 1);
+    cmd_flt_N_impl = 32;
+    cmd_ifm_C_impl = 32;
+    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 32;
+    cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num *
+                                  ((((args.image.width) + 3)) / 4) * 4;
+    cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num *
+                                 ((((args.image.width) + 3)) / 4) * 4;
+    cmd_ifm_pack_byte_length =
+        32 * (uint64_t)((((args.image.width) + 3)) / 4) * 4;
+    row_len_align = (uint64_t)((((args.image.width) + 3)) / 4);
+  } else {
+    cmd_channel_num = 16;
+    cmd_ifm_pack_len = 16 * (args.image.width);
+    cmd_channel_per_group = 16;
+    cmd_flt_batch_num_mns1 = (uint64_t)((((args.filter_num) + 63)) / 64 - 1);
+    cmd_flt_N_impl = 64;
+    cmd_ifm_C_impl = 16;
+    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 16;
+    cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num *
+                                  ((((args.image.width) + 7)) / 8) * 8;
+    cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num *
+                                 ((((args.image.width) + 7)) / 8) * 8;
+    cmd_ifm_pack_byte_length = 16 * ((((args.image.width) + 7)) / 8) * 8;
+    row_len_align = (uint64_t)((((args.image.width) + 7)) / 8);
+  }
+  uint64_t cmd_flt_length =
+      (args.kernel.width) * (args.kernel.height) * cmd_channel_num;
+  uint64_t cmd_ifm_row_byte_length = cmd_channel_num * (args.image.width);
+
+  uint64_t cmd_ifm_buf_col_len = 0;
+
+  uint64_t ifm_one_batch_len =
+      (1048576 / ((args.image.width) * cmd_channel_num));
+  uint64_t cmd_ifm_batch_num_tmp = (uint64_t)(
+      ((args.image.height) + ifm_one_batch_len - 1) / ifm_one_batch_len);
+  if (1 == cmd_ifm_batch_num_tmp) {
+    cmd_ifm_buf_col_len = args.image.height;
+  } else {
+    if (((args.image.height) / (cmd_ifm_batch_num_tmp) % 2) == 0) {
+      cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp;
+    } else {
+      cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp - 1;
+    }
+  }
+  uint64_t cmd_ifm_batch_num_mns1 =
+      (((args.image.height) + cmd_ifm_buf_col_len - 1) / cmd_ifm_buf_col_len) -
+      1;
+  uint64_t cmd_flt_cycle_num_mns1 = cmd_ifm_batch_num_mns1;
+  uint64_t cmd_flt_total_batch_num = filter_num / cmd_flt_N_impl;
+  uint64_t cmd_ifm_buf_col_len_rem =
+      (args.image.height) -
+      cmd_ifm_batch_num_mns1 * cmd_ifm_buf_col_len;  //= -4;
+  uint64_t cmd_flt_N_len = args.kernel.width * args.kernel.height *
+                           (cmd_flt_pack_num_per_kernel_mns1 + 1);
+
+  //-------- ofm batch number reg &&  initial URAM reading address
+  // logic-----------------
+  uint64_t cmd_init_raddr_cnt = 1;
+  uint64_t cmd_init_raddr_flag = 0;
+  int64_t cmd_init_raddr_index = -8;
+  int64_t cmd_init_raddr_col_0 = -4;
+  int64_t cmd_init_raddr_col_1 = -4;
+  uint64_t conv_ofm_buf_col_len = 0;
+  uint64_t conv_ofm_buf_col_len_rem = 0;
+
+  if (((args.image.pad_height) % (2 * (args.kernel.stride_h))) == 0) {
+    cmd_init_raddr_cnt = 0;
+    cmd_init_raddr_flag = 0;
+    cmd_init_raddr_index =
+        0 - (int64_t)row_len_align * (((args.image.pad_height) + 1) / 2);
+    cmd_init_raddr_col_0 = cmd_init_raddr_index;
+    cmd_init_raddr_col_1 = cmd_init_raddr_index;
+  } else if (((args.image.pad_height) -
+              2 * ((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <=
+             (args.kernel.stride_h)) {
+    cmd_init_raddr_cnt =
+        (args.kernel.stride_h) -
+        ((args.image.pad_height) -
+         ((args.image.pad_height) / (2 * (args.kernel.stride_h))));
+    cmd_init_raddr_flag = 1;
+    cmd_init_raddr_index =
+        0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) -
+        (int64_t)row_len_align *
+            ((args.image.pad_height) / (2 * args.kernel.stride_h));
+    cmd_init_raddr_col_0 =
+        0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) -
+        (int64_t)row_len_align *
+            ((args.image.pad_height) / (2 * (args.kernel.stride_h)));
+    cmd_init_raddr_col_1 = 0;
+  } else if (((args.image.pad_height) -
+              2 * ((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <=
+             2 * (args.kernel.stride_h)) {
+    cmd_init_raddr_cnt =
+        2 * (args.kernel.stride_h) *
+            (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
+             (2 * (args.kernel.stride_h))) -
+        (args.image.pad_height);
+    cmd_init_raddr_flag = 0;
+    cmd_init_raddr_index =
+        0 - (int64_t)row_len_align * (int64_t)(args.kernel.stride_h) *
+                (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
+                 (2 * (args.kernel.stride_h)));
+    cmd_init_raddr_col_0 =
+        0 -
+        (int64_t)row_len_align *
+            ((args.image.pad_height) / (2 * (args.kernel.stride_h))) -
+        (int64_t)row_len_align *
+            (2 * (args.kernel.stride_h) *
+                 (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
+                  (2 * (args.kernel.stride_h))) -
+             (args.image.pad_height));
+    cmd_init_raddr_col_1 = cmd_init_raddr_col_0;
+  }
+
+  if (cmd_ifm_batch_num_mns1 == 0) {
+    if ((args.kernel.height) <= (args.kernel.stride_h)) {
+      conv_ofm_buf_col_len = (args.image.height) + 2 * (args.image.pad_height) -
+                             3 * (args.kernel.stride_h);
+    } else {
+      conv_ofm_buf_col_len = (args.image.height) + 2 * (args.image.pad_height) -
+                             2 * (args.kernel.stride_h) - (args.kernel.height);
+    }
+    conv_ofm_buf_col_len_rem = conv_ofm_buf_col_len;
+  } else {
+    int N_rem = 0;
+    int row_rem = 0;
+
+    if ((args.kernel.height) <= (args.kernel.stride_h)) {
+      conv_ofm_buf_col_len = cmd_ifm_buf_col_len - 3 * (args.kernel.stride_h);
+      N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) /
+                  (args.kernel.stride_h) +
+              1;
+      row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem;
+      conv_ofm_buf_col_len_rem = cmd_ifm_buf_col_len_rem +
+                                 2 * (args.image.pad_height) + row_rem -
+                                 3 * (args.kernel.stride_h);
+    } else {
+      conv_ofm_buf_col_len = cmd_ifm_buf_col_len + 2 * (args.image.pad_height) -
+                             2 * (args.kernel.stride_h) - (args.kernel.height);
+      N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) /
+                  (args.kernel.stride_h) +
+              1;
+      row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem;
+      conv_ofm_buf_col_len_rem =
+          cmd_ifm_buf_col_len_rem + (args.image.pad_height) + row_rem -
+          2 * (args.kernel.stride_h) - (args.kernel.height);
+    }
+  }
+  //-----------------------  para functions --------------------------------
+  float filter_quant_scale_tmp = ((args.filter_scale_address)[1]);
+  float image_quant_scale_tmp = ((args.image.scale_address)[1]);
+
+  uint32_t cmd_filter_quant_scale =
+      *(uint32_t *)(&filter_quant_scale_tmp);  // NOLINT
+  uint32_t cmd_image_quant_scale =
+      *(uint32_t *)(&image_quant_scale_tmp);  // NOLINT
+
+  uint64_t wParallelsim = cmd_flt_N_impl >> 3;
+  uint64_t wParallelsim_num =
+      (uint64_t)(((args.filter_num) + cmd_flt_N_impl - 1) / cmd_flt_N_impl) - 1;
+  uint64_t win_size = (args.kernel.width) * (args.kernel.height) *
+                          (cmd_ifm_pack_num_per_row_mns1 + 1) -
+                      1;
+  uint64_t conv_ofm_width = (((args.image.width) - (args.kernel.width) +
+                              (args.image.pad_width) + (args.image.pad_width)) /
+                             (args.kernel.stride_w));
+  uint64_t conv_ofm_dma_length = cmd_flt_N_impl * sizeof(short);   // NOLINT
+  uint64_t conv_ofm_dma_stride = args.filter_num * sizeof(short);  // NOLINT
+  uint64_t conv_ofm_height_batch_tmp =
+      get_image_out_axis(args.image.height, args.image.pad_height,
+                         args.kernel.height, args.kernel.stride_h);
+  uint64_t conv_ofm_height_batch = (conv_ofm_height_batch_tmp + 1) / 2 - 1;
+  uint64_t o_ust_rst = 0;
+  uint64_t conv_ofm_dma_repeat =
+      (uint64_t)(((((args.image.width) - (args.kernel.width) +
+                    (args.image.pad_width) + (args.image.pad_width))) /
+                  (args.kernel.stride_w)) +
+                 1);
+  uint64_t conv_ofm_dma_offset =
+      args.filter_num * conv_ofm_dma_repeat * sizeof(short);  // NOLINT
+  uint64_t conv_ofm_inter_stride = conv_ofm_dma_offset * 2;
+  //----------------- register concatenation ------------------
+  uint64_t cmd_ifm_flt_base_addr = ((uint64_t)cmd_filter_vir_base_addr << 32) |
+                                   ((uint64_t)cmd_image_vir_base_addr);
+  uint64_t cmd_ifm_flt_dim = ((uint64_t)(args.kernel.height) << 48) |
+                             ((uint64_t)(args.kernel.width) << 32) |
+                             ((uint64_t)(args.image.height) << 16) |
+                             ((uint64_t)(args.image.width));
+  uint64_t cmd_pad_step_size = ((uint64_t)(args.kernel.stride_h) << 48) |
+                               ((uint64_t)(args.kernel.stride_w) << 32) |
+                               ((uint64_t)(args.image.pad_height) << 16) |
+                               ((uint64_t)(args.image.pad_width));
+  uint64_t cmd_param1 = ((uint64_t)cmd_filter_per_group << 48) |
+                        ((uint64_t)cmd_channel_num << 32) |
+                        ((uint64_t)filter_num << 16) |
+                        ((uint64_t)cmd_group_num);
+  uint64_t cmd_param2 =
+      ((uint64_t)cmd_flt_sqr_len << 48) | ((uint64_t)cmd_ifm_pack_len << 32) |
+      ((uint64_t)cmd_ifm_pre_row_num << 16) | ((uint64_t)cmd_channel_per_group);
+  uint64_t cmd_param3 = ((uint64_t)cmd_flt_batch_num_mns1 << 48) |
+                        ((uint64_t)cmd_flt_total_batch_num << 32) |
+                        ((uint64_t)cmd_flt_N_impl << 16) |
+                        ((uint64_t)cmd_flt_pre_batch_num);
+  uint64_t cmd_param4 = ((uint64_t)cmd_ifm_pack_num_per_row_mns1 << 48) |
+                        ((uint64_t)cmd_bn_num << 32) |
+                        ((uint64_t)cmd_bias_num << 16) |
+                        ((uint64_t)cmd_flt_N_len);
+  uint64_t cmd_param5 = ((uint64_t)cmd_ifm_stride_row_length << 48) |
+                        ((uint64_t)cmd_flt_pack_length << 32) |
+                        ((uint64_t)cmd_flt_cycle_num_mns1 << 16) |
+                        ((uint64_t)cmd_flt_pack_num_per_kernel_mns1);
+  uint64_t cmd_param6 = ((uint64_t)cmd_ofm_width_mns1 << 48) |
+                        ((uint64_t)cmd_ifm_batch_num_mns1 << 32) |
+                        ((uint64_t)cmd_ifm_buf_col_len << 16) |
+                        ((uint64_t)cmd_ifm_C_impl);
+  uint64_t cmd_param7 = ((uint64_t)conv_ofm_inter_stride << 32) |
+                        ((uint64_t)cmd_ifm_buf_col_len_rem << 16) |
+                        ((uint64_t)cmd_ofm_height);
+  uint64_t cmd_param8 =
+      ((uint64_t)cmd_flt_length << 32) | ((uint64_t)cmd_ifm_row_byte_length);
+  uint64_t cmd_ifm_flt_quant_scale =
+      (((uint64_t)cmd_filter_quant_scale) << 32) |
+      ((uint64_t)cmd_image_quant_scale);
+  uint64_t cmd_step_pad_mul_row_len =
+      ((uint64_t)cmd_pad_h_mul_row_byte_len << 32) |
+      ((uint64_t)cmd_step_h_mul_row_byte_len);
+  //---- ofm paras ----
+  uint64_t cmd_conv_param_reg = ((uint64_t)wParallelsim_num << 32) |
+                                ((uint64_t)wParallelsim << 16) |
+                                ((uint64_t)win_size);
+  uint64_t cmd_ofm_addr_width_reg =
+      ((uint64_t)conv_ofm_width << 32) | ((uint64_t)conv_ofm_addr_base);
+  uint64_t cmd_intra_stride_atoms_reg =
+      ((uint64_t)conv_ofm_dma_length << 32) | ((uint64_t)conv_ofm_dma_stride);
+  uint64_t cmd_ofm_height_batch_reg =
+      ((uint64_t)conv_ofm_buf_col_len_rem << 48) |
+      ((uint64_t)conv_ofm_buf_col_len << 32) |
+      ((uint64_t)conv_ofm_height_batch + 0x80000000);
+  uint64_t cmd_user_ctrl_reg = ((uint64_t)o_ust_rst);
+  uint64_t cmd_wdma_param_reg =
+      ((uint64_t)(conv_ofm_dma_repeat | 0x80000000) << 32) |
+      ((uint64_t)conv_ofm_dma_offset);
+
+  uint64_t cmd_init_raddr_reg = ((cmd_init_raddr_col_1 & 0xffff) << 48) |
+                                ((cmd_init_raddr_col_0 & 0xffff) << 32) |
+                                (((cmd_init_raddr_index & 0xffff) << 16)) |
+                                (cmd_init_raddr_flag & 0xffff) << 15 |
+                                ((cmd_init_raddr_cnt & 0xffff));
+
+  // The previous initializer read cmd_para31 inside its own declaration, i.e.
+  // an indeterminate value (undefined behavior) that was then written to a
+  // register. NOTE(review): presumably only the relu flag is meant to be
+  // sent here -- confirm against the register map.
+  uint64_t cmd_para31 = static_cast<uint64_t>(args.relu_enabled);
+
+  DLOG << "cmd_init_raddr_col_1 = " << hex << cmd_init_raddr_col_1;
+
+  DLOG << "cmd_init_raddr_col_0 = " << hex << cmd_init_raddr_col_0;
+  DLOG << "cmd_init_raddr_index = " << hex << cmd_init_raddr_index;  //
+  DLOG << "cmd_init_raddr_cnt = " << hex << cmd_init_raddr_cnt;
+  DLOG << "conv_ofm_height_batch = " << conv_ofm_height_batch;
+
+  DLOG << "cmd_ifm_flt_base_addr = " << hex << cmd_ifm_flt_base_addr;
+  DLOG << "cmd_scale_base_addr = " << hex << cmd_scale_base_addr;
+  DLOG << "cmd_ifm_flt_dim = " << hex << cmd_ifm_flt_dim;
+  DLOG << "cmd_pad_step_size = " << hex << cmd_pad_step_size;
+  DLOG << "cmd_param1 = " << hex << cmd_param1;
+  DLOG << "cmd_param2 = " << hex << cmd_param2;
+  DLOG << "cmd_param3 = " << hex << cmd_param3;
+  DLOG << "cmd_param4 = " << hex << cmd_param4;
+  DLOG << "cmd_param5 = " << hex << cmd_param5;
+  DLOG << "cmd_param6 = " << hex << cmd_param6;
+  DLOG << "cmd_param7 = " << hex << cmd_param7;
+  DLOG << "cmd_param8 =  " << hex << cmd_param8;
+  DLOG << "cmd_ifm_flt_quant_scale =  " << hex << cmd_ifm_flt_quant_scale;
+  DLOG << "cmd_step_pad_mul_row_len = " << hex << cmd_step_pad_mul_row_len;
+  DLOG << "cmd_ifm_pack_byte_length = " << hex << cmd_ifm_pack_byte_length;
+  DLOG << "cmd_conv_param_reg = " << hex << cmd_conv_param_reg;
+  DLOG << "cmd_ofm_addr_width_reg = " << hex << cmd_ofm_addr_width_reg;
+  DLOG << "cmd_intra_stride_atoms_reg = " << hex << cmd_intra_stride_atoms_reg;
+  DLOG << "cmd_init_raddr_reg = " << hex << cmd_init_raddr_reg;
+  DLOG << "cmd_ofm_height_batch_reg = " << hex << cmd_ofm_height_batch_reg;
+  DLOG << "cmd_wdma_param_reg = " << hex << cmd_wdma_param_reg;
+  DLOG << "cmd_para31 = " << hex << cmd_para31;
+
+  reg_writeq(cmd_ifm_flt_base_addr, MUL8(1));
+  reg_writeq(cmd_scale_base_addr, MUL8(2));
+  reg_writeq(cmd_ifm_flt_dim, MUL8(3));
+  reg_writeq(cmd_pad_step_size, MUL8(4));
+  reg_writeq(cmd_param1, MUL8(5));
+  reg_writeq(cmd_param2, MUL8(6));
+  reg_writeq(cmd_param3, MUL8(7));
+  reg_writeq(cmd_param4, MUL8(8));
+  reg_writeq(cmd_param5, MUL8(9));
+  reg_writeq(cmd_param6, MUL8(10));
+  reg_writeq(cmd_param7, MUL8(11));
+  reg_writeq(cmd_param8, MUL8(12));
+  reg_writeq(cmd_ifm_flt_quant_scale, MUL8(13));
+  reg_writeq(cmd_step_pad_mul_row_len, MUL8(14));
+  reg_writeq(cmd_ifm_pack_byte_length, MUL8(15));
+  reg_writeq(cmd_conv_param_reg, MUL8(16));
+  reg_writeq(cmd_ofm_addr_width_reg, MUL8(17));
+  reg_writeq(cmd_intra_stride_atoms_reg, MUL8(18));
+
+  reg_writeq(cmd_init_raddr_reg, MUL8(29));
+  reg_writeq(cmd_para31, MUL8(31));
+
+  reg_writeq(0, MUL8(19));
+  reg_writeq(cmd_ofm_height_batch_reg, MUL8(19));
+  reg_writeq(cmd_ofm_height_batch_reg & 0xffffffff00000000, MUL8(19));
+
+  reg_writeq(cmd_wdma_param_reg, MUL8(25));
+
+  reg_writeq(0, MUL8(0));
+  reg_writeq(0x4000000000000000, MUL8(0));
+
+  ret = fpga_regpoll(MUL8(48), CONV_DONE, 0xffffff);
+  if (ret == -1) {
+    DLOG << "fpga conv no interrupt!!";
+    return ret;
+  }
+  reg_readq(MUL8(63));
+
+  usleep(10);
+  float scale = Findfp16Max();
+  (args.output.scale_address)[0] = scale;                 // NOLINT
+  (args.output.scale_address)[1] = (float)(1.0 / scale);  // NOLINT
+  DLOG << "Findfp16Max scale = " << scale;
+  DLOG << "ret=" << ret;
+  return ret;
 }

 int ComputeFpgaPool(const struct PoolingArgs &args) {
@@ -97,7 +577,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
  DLOG << "   mode:" << args.mode
       << "   kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
  DLOG << "   image_address:" << args.image.address
-       << "   image_scale_address:" << args.image.scale_address
       << "   image_channels:" << args.image.channels
       << "   image_height:" << args.image.height
       << "   image_width:" << args.image.width
@@ -107,13 +586,467 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
       << "   kernel_width:" << args.kernel.width
       << "   stride_h:" << args.kernel.stride_h
       << "   stride_w:" << args.kernel.stride_w;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
+  DLOG << "   out_address:" << args.output.address;
 #endif
 #ifndef PADDLE_MOBILE_ZU5
  return 0;
 #endif
-  return 0;
+
+  uint32_t filter_num_align = 0;
+  filter_num_align = args.image.channels;
+
+  DLOG << "______db_______: begin to set registers. ";
+  uint64_t ifm_pixel_num =
+      ((args.image.width) * (args.image.height) * args.image.channels);
+  uint64_t ifm_memory_size = ifm_pixel_num * sizeof(short);  // NOLINT
+  uint64_t flt_pixel_num = 0;
+  uint64_t filter_memory_size = 0;
+  // TODO(review): bn_memory_size is never read in this function; needed?
+  uint64_t bn_pixel_num = (filter_num_align * 2);
+  uint64_t bn_memory_size = bn_pixel_num * sizeof(uint16_t);
+
+  uint64_t ofm_width =
+      ((args.image.width) + 2 * args.image.pad_width - args.kernel.width) /
+          (args.kernel.stride_w) +
+      1;
+  uint64_t ofm_height = ((args.image.height) + 2 * (args.image.pad_height) -
+                         (args.kernel.height)) /
+                            (args.kernel.stride_h) +
+                        1;
+
+  uint32_t filter_num = filter_num_align;
+  uint32_t image_channels = args.image.channels;
+
+  uint64_t ifm_src_paddr = vaddr_to_paddr((args.image.address));
+  uint64_t flt_src_paddr = 0;
+  uint64_t sb_src_paddr = 0;
+  uint64_t ifm_dst_paddr = vaddr_to_paddr((args.output.address));
+
+  /**********BN******************/
+  float image_inv_scale = 0;
+  float filter_inv_scale = 0;
+  int idx = 0;
+  DLOG << "______db_______: reset registers. ";
+  reg_writeq(1, MUL8(24));
+  usleep(1);
+  reg_writeq(0, MUL8(24));
+  /*********configuring registers*************/
+  uint32_t cmd_image_vir_base_addr = (uint32_t)ifm_src_paddr;
+  uint32_t cmd_filter_vir_base_addr = (uint32_t)flt_src_paddr;
+  uint32_t cmd_scale_base_addr = (uint32_t)sb_src_paddr;
+  uint32_t conv_ofm_addr_base = (uint32_t)ifm_dst_paddr;
+  uint64_t cmd_group_num = 1;  // args.group_num;
+  uint64_t cmd_filter_per_group = filter_num / cmd_group_num;
+
+  uint64_t cmd_flt_sqr_len = (args.kernel.width) * (args.kernel.height);
+  uint64_t cmd_ifm_pre_row_num = args.kernel.height;
+  if ((args.kernel.height == args.image.height) &&
+      (0 == args.image.pad_height)) {
+    cmd_ifm_pre_row_num = (args.kernel.height);
+  } else {
+    cmd_ifm_pre_row_num =
+        (args.kernel.height) - (args.image.pad_height) + (args.kernel.stride_h);
+  }
+  uint64_t cmd_flt_pre_batch_num = 1;
+  uint64_t cmd_ifm_pack_num_per_row_mns1 =
+      (uint64_t)(((args.image.channels) + 63) / 64) - 1;
+  uint64_t cmd_bn_num = filter_num;
+  uint64_t cmd_bias_num = filter_num;
+  uint64_t cmd_ifm_stride_row_length = args.image.width * args.kernel.stride_h;
+  uint64_t cmd_flt_pack_num_per_kernel_mns1 =
+      (uint64_t)(((args.image.channels) + 63) / 64) - 1;
+  uint64_t cmd_ofm_width_mns1 = (uint64_t)(
+      ((args.image.width) - (args.kernel.width) + 2 * (args.image.pad_width)) /
+      (args.kernel.stride_w));
+  uint64_t cmd_ofm_height =
+      (uint64_t)(((args.image.height) - (args.kernel.height) +
+                  2 * (args.image.pad_height)) /
+                 (args.kernel.stride_h)) +
+      1;
+
+  uint64_t cmd_channel_num = 0;
+  uint64_t cmd_ifm_pack_len = 0;
+  uint64_t cmd_channel_per_group = 0;
+  uint64_t cmd_flt_batch_num_mns1 = 0;
+  uint64_t cmd_flt_N_impl = 8;
+  uint64_t cmd_ifm_C_impl = 16;
+  uint64_t cmd_flt_pack_length = 0;
+  uint64_t cmd_step_h_mul_row_byte_len = 0;
+  uint64_t cmd_pad_h_mul_row_byte_len = 0;
+  uint64_t cmd_ifm_pack_byte_length = 16 * ((((args.image.width) + 7) / 8) * 8);
+  uint64_t row_len_align = args.image.width;
+  uint64_t cmd_flt_cycle_num_mns1 = 0;
+  if (image_channels > 32) {
+    cmd_channel_num = (uint64_t)((((args.image.channels) + 63)) / 64) * 64;
+    cmd_ifm_pack_len = 64 * (args.image.width);
+    cmd_channel_per_group = 64;
+    cmd_flt_batch_num_mns1 = (uint64_t)(((filter_num + 7)) / 8 - 1);
+    cmd_flt_N_impl = 8;
+    cmd_ifm_C_impl = 64;
+    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 64;
+    cmd_step_h_mul_row_byte_len =
+        (args.kernel.stride_h) * cmd_channel_num * args.image.width;
+    cmd_pad_h_mul_row_byte_len =
+        (args.image.pad_height) * cmd_channel_num * args.image.width;
+    cmd_ifm_pack_byte_length = 64 * args.image.width;
+    row_len_align = args.image.width * (cmd_ifm_pack_num_per_row_mns1 + 1);
+    cmd_flt_cycle_num_mns1 = (cmd_channel_num / 64) - 1;
+  } else if (image_channels > 16) {
+    cmd_channel_num = 32;
+    cmd_ifm_pack_len = 32 * (args.image.width);
+    cmd_channel_per_group = 32;
+    cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1);
+    cmd_flt_N_impl = 16;
+    cmd_ifm_C_impl = 32;
+    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 32;
+    cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num *
+                                  ((((args.image.width) + 1)) / 2) * 2;
+    cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num *
+                                 ((((args.image.width) + 1)) / 2) * 2;
+    cmd_ifm_pack_byte_length =
+        32 * (uint64_t)((((args.image.width) + 1)) / 2) * 2;
+    row_len_align = (uint64_t)((((args.image.width) + 1)) / 2);
+    cmd_flt_cycle_num_mns1 = 0;
+  } else if (image_channels > 8) {
+    cmd_channel_num = 16;
+    cmd_ifm_pack_len = 16 * (args.image.width);
+    cmd_channel_per_group = 16;
+    cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1);
+    cmd_flt_N_impl = 32;
+    cmd_ifm_C_impl = 16;
+    cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 16;
+    cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num *
+                                  ((((args.image.width) + 3)) / 4) * 4;
+    cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num *
+                                 ((((args.image.width) + 3)) / 4) * 4;
+    cmd_ifm_pack_byte_length =
+        16 * (uint64_t)((((args.image.width) + 3)) / 4) * 4;
+    row_len_align = (uint64_t)((((args.image.width) + 3)) / 4);
+    cmd_flt_cycle_num_mns1 = 0;
+  }
+
+  cmd_flt_N_impl = 16;
+  cmd_flt_batch_num_mns1 = 0;
+  cmd_flt_pack_length = 64;
+  uint64_t cmd_flt_N_len = 0;
+  uint64_t cmd_flt_length = 64;
+
+  uint64_t cmd_ifm_row_byte_length = cmd_channel_num * (args.image.width);
+
+  uint64_t cmd_ifm_buf_col_len = 0;
+
+  uint64_t ifm_one_batch_len =
+      (1048576 / ((args.image.width) * cmd_channel_num));
+  uint64_t cmd_ifm_batch_num_tmp = (uint64_t)(
+      ((args.image.height) + ifm_one_batch_len - 1) / ifm_one_batch_len);
+  if (1 == cmd_ifm_batch_num_tmp) {
+    cmd_ifm_buf_col_len = args.image.height;
+  } else {
+    if (((args.image.height) / (cmd_ifm_batch_num_tmp) % 2) == 0) {
+      cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp;
+    } else {
+      cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp - 1;
+    }
+  }
+  uint64_t cmd_ifm_batch_num_mns1 =
+      (((args.image.height) + cmd_ifm_buf_col_len - 1) / cmd_ifm_buf_col_len) -
+      1;
+
+  uint64_t cmd_flt_total_batch_num = 1;
+  uint64_t cmd_ifm_buf_col_len_rem =
+      (args.image.height) -
+      cmd_ifm_batch_num_mns1 * cmd_ifm_buf_col_len;  //= -4;
+
+  //-------- ofm batch number reg &&  initial URAM reading address
+  uint64_t cmd_init_raddr_cnt = 1;
+  uint64_t cmd_init_raddr_flag = 0;
+  int64_t cmd_init_raddr_index = -8;
+  int64_t cmd_init_raddr_col_0 = -4;
+  int64_t cmd_init_raddr_col_1 = -4;
+  int64_t conv_ofm_buf_col_len = 0;
+  int64_t conv_ofm_buf_col_len_rem = 0;
+
+  if (((args.image.pad_height) % (2 * (args.kernel.stride_h))) == 0) {
+    cmd_init_raddr_cnt = 0;
+    cmd_init_raddr_flag = 0;
+    cmd_init_raddr_index =
+        0 - (int64_t)row_len_align * (((args.image.pad_height) + 1) / 2);
+    cmd_init_raddr_col_0 = cmd_init_raddr_index;
+    cmd_init_raddr_col_1 = cmd_init_raddr_index;
+  } else if (((args.image.pad_height) -
+              2 * ((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <=
+             (args.kernel.stride_h)) {
+    cmd_init_raddr_cnt =
+        (args.kernel.stride_h) -
+        ((args.image.pad_height) -
+         ((args.image.pad_height) / (2 * (args.kernel.stride_h))));
+    cmd_init_raddr_flag = 1;
+    cmd_init_raddr_index =
+        0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) -
+        (int64_t)row_len_align *
+            ((args.image.pad_height) / (2 * args.kernel.stride_h));
+    cmd_init_raddr_col_0 =
+        0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) -
+        (int64_t)row_len_align *
+            ((args.image.pad_height) / (2 * (args.kernel.stride_h)));
+    cmd_init_raddr_col_1 =
+        cmd_init_raddr_col_0 + args.kernel.stride_h * (int64_t)row_len_align;
+  } else if (((args.image.pad_height) -
+              2 * ((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <=
+             2 * (args.kernel.stride_h)) {
+    cmd_init_raddr_cnt =
+        2 * (args.kernel.stride_h) *
+            (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
+             (2 * (args.kernel.stride_h))) -
+        (args.image.pad_height);
+    cmd_init_raddr_flag = 0;
+    cmd_init_raddr_index =
+        0 - (int64_t)row_len_align * (int64_t)(args.kernel.stride_h) *
+                (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
+                 (2 * (args.kernel.stride_h)));
+    cmd_init_raddr_col_0 =
+        0 -
+        (int64_t)row_len_align *
+            ((args.image.pad_height) / (2 * (args.kernel.stride_h))) -
+        (int64_t)row_len_align *
+            (2 * (args.kernel.stride_h) *
+                 (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) /
+                  (2 * (args.kernel.stride_h))) -
+             (args.image.pad_height));
+    cmd_init_raddr_col_1 = cmd_init_raddr_col_0;
+  }
+
+  if (cmd_ifm_batch_num_mns1 == 0) {
+    if ((args.kernel.height) <= (args.kernel.stride_h)) {
+      conv_ofm_buf_col_len = (args.image.height) + 2 * (args.image.pad_height) -
+                             3 * (args.kernel.stride_h);
+    } else {
+      conv_ofm_buf_col_len = (args.image.height) + 2 * (args.image.pad_height) -
+                             2 * (args.kernel.stride_h) - (args.kernel.height);
+    }
+    conv_ofm_buf_col_len_rem = conv_ofm_buf_col_len;
+  } else {
+    int N_rem = 0;
+    int row_rem = 0;
+
+    if ((args.kernel.height) <= (args.kernel.stride_h)) {
+      conv_ofm_buf_col_len = cmd_ifm_buf_col_len - 3 * (args.kernel.stride_h);
+      N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) /
+                  (args.kernel.stride_h) +
+              1;
+      row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem;
+      conv_ofm_buf_col_len_rem = cmd_ifm_buf_col_len_rem +
+                                 2 * (args.image.pad_height) + row_rem -
+                                 3 * (args.kernel.stride_h);
+    } else {
+      conv_ofm_buf_col_len = cmd_ifm_buf_col_len + 2 * (args.image.pad_height) -
+                             2 * (args.kernel.stride_h) - (args.kernel.height);
+      N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) /
+                  (args.kernel.stride_h) +
+              1;
+      row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem;
+      conv_ofm_buf_col_len_rem =
+          cmd_ifm_buf_col_len_rem + (args.image.pad_height) + row_rem -
+          2 * (args.kernel.stride_h) - (args.kernel.height);
+    }
+  }
+
+  //----------------------- parameter calculations --------------------------
+  uint64_t cmd_filter_quant_scale = 0x3c00;
+  uint64_t cmd_image_quant_scale = 0x3c00;
+  uint64_t wParallelsim = cmd_ifm_C_impl >> 3;
+  uint64_t wParallelsim_num = cmd_flt_cycle_num_mns1;
+  uint64_t win_size = (args.kernel.width) * (args.kernel.height) *
+                          (cmd_ifm_pack_num_per_row_mns1 + 1) -
+                      1;  //
+  uint64_t conv_ofm_width = (((args.image.width) - (args.kernel.width) +
+                              (args.image.pad_width) + (args.image.pad_width)) /
+                             (args.kernel.stride_w));
+  uint64_t conv_ofm_dma_length = cmd_channel_num * sizeof(short);  // NOLINT
+  uint64_t conv_ofm_dma_stride = conv_ofm_dma_length;
+  uint64_t conv_ofm_height_batch_tmp =
+      (args.image.height + 2 * args.image.pad_height - args.kernel.height) /
+          args.kernel.stride_h +
+      1;
+
+  uint64_t conv_ofm_height_batch = (conv_ofm_height_batch_tmp + 1) / 2 - 1;
+  uint64_t o_ust_rst = 0;
+  uint64_t conv_ofm_dma_repeat =
+      (uint64_t)(((((args.image.width) - (args.kernel.width) +
+                    (args.image.pad_width) + (args.image.pad_width))) /
+                  (args.kernel.stride_w)) +
+                 1);
+  uint64_t conv_ofm_dma_offset =
+      args.image.channels * conv_ofm_dma_repeat * sizeof(short);  // NOLINT
+  uint64_t conv_ofm_inter_stride = conv_ofm_dma_offset * 2;
+  //----------------- register concatenation ------------------
+  uint64_t cmd_ifm_flt_base_addr = ((uint64_t)cmd_filter_vir_base_addr << 32) |
+                                   ((uint64_t)cmd_image_vir_base_addr);
+  uint64_t cmd_ifm_flt_dim = ((uint64_t)(args.kernel.height) << 48) |
+                             ((uint64_t)(args.kernel.width) << 32) |
+                             ((uint64_t)(args.image.height) << 16) |
+                             ((uint64_t)(args.image.width));
+  uint64_t cmd_pad_step_size = ((uint64_t)(args.kernel.stride_h) << 48) |
+                               ((uint64_t)(args.kernel.stride_w) << 32) |
+                               ((uint64_t)(args.image.pad_height) << 16) |
+                               ((uint64_t)(args.image.pad_width));
+  uint64_t cmd_param1 = ((uint64_t)cmd_filter_per_group << 48) |
+                        ((uint64_t)cmd_channel_num << 32) |
+                        ((uint64_t)filter_num << 16) |
+                        ((uint64_t)cmd_group_num);
+  uint64_t cmd_param2 =
+      ((uint64_t)cmd_flt_sqr_len << 48) | ((uint64_t)cmd_ifm_pack_len << 32) |
+      ((uint64_t)cmd_ifm_pre_row_num << 16) | ((uint64_t)cmd_channel_per_group);
+  uint64_t cmd_param3 = ((uint64_t)cmd_flt_batch_num_mns1 << 48) |
+                        ((uint64_t)cmd_flt_total_batch_num << 32) |
+                        ((uint64_t)cmd_flt_N_impl << 16) |
+                        ((uint64_t)cmd_flt_pre_batch_num);
+  uint64_t cmd_param4 = ((uint64_t)cmd_ifm_pack_num_per_row_mns1 << 48) |
+                        ((uint64_t)cmd_bn_num << 32) |
+                        ((uint64_t)cmd_bias_num << 16) |
+                        ((uint64_t)cmd_flt_N_len);
+  uint64_t cmd_param5 = ((uint64_t)cmd_ifm_stride_row_length << 48) |
+                        ((uint64_t)cmd_flt_pack_length << 32) |
+                        ((uint64_t)cmd_flt_cycle_num_mns1 << 16) |
+                        ((uint64_t)cmd_flt_pack_num_per_kernel_mns1);
+  uint64_t cmd_param6 = ((uint64_t)cmd_ofm_width_mns1 << 48) |
+                        ((uint64_t)cmd_ifm_batch_num_mns1 << 32) |
+                        ((uint64_t)cmd_ifm_buf_col_len << 16) |
+                        ((uint64_t)cmd_ifm_C_impl);
+  uint64_t cmd_param7 = ((uint64_t)conv_ofm_inter_stride << 32) |
+                        ((uint64_t)cmd_ifm_buf_col_len_rem << 16) |
+                        ((uint64_t)cmd_ofm_height);
+  uint64_t cmd_param8 =
+      ((uint64_t)cmd_flt_length << 32) | ((uint64_t)cmd_ifm_row_byte_length);
+  uint64_t cmd_ifm_flt_quant_scale = ((uint64_t)cmd_filter_quant_scale << 32) |
+                                     ((uint64_t)cmd_image_quant_scale);
+  uint64_t cmd_step_pad_mul_row_len =
+      ((uint64_t)cmd_pad_h_mul_row_byte_len << 32) |
+      ((uint64_t)cmd_step_h_mul_row_byte_len);
+  //---- ofm parameters ----
+  uint64_t cmd_conv_param_reg = ((uint64_t)wParallelsim_num << 32) |
+                                ((uint64_t)wParallelsim << 16) |
+                                ((uint64_t)win_size);
+  uint64_t cmd_ofm_addr_width_reg =
+      ((uint64_t)conv_ofm_width << 32) | ((uint64_t)conv_ofm_addr_base);
+  uint64_t cmd_intra_stride_atoms_reg =
+      ((uint64_t)conv_ofm_dma_length << 32) | ((uint64_t)conv_ofm_dma_stride);
+  uint64_t cmd_ofm_height_batch_reg =
+      ((uint64_t)(conv_ofm_buf_col_len_rem & 0xffff) << 48) |
+      ((uint64_t)(conv_ofm_buf_col_len & 0xffff) << 32) |
+      ((uint64_t)conv_ofm_height_batch + 0x80000000);
+  uint64_t cmd_user_ctrl_reg = ((uint64_t)o_ust_rst);
+  uint64_t cmd_wdma_param_reg =
+      ((uint64_t)(conv_ofm_dma_repeat | 0x80000000) << 32) |
+      ((uint64_t)conv_ofm_dma_offset);
+  uint64_t cmd_init_raddr_reg = ((cmd_init_raddr_col_1 & 0xffff) << 48) |
+                                ((cmd_init_raddr_col_0 & 0xffff) << 32) |
+                                (((cmd_init_raddr_index & 0xffff) << 16)) |
+                                (cmd_init_raddr_flag & 0xffff) << 15 |
+                                ((cmd_init_raddr_cnt & 0xffff));
+
+  DLOG << "cmd_init_raddr_col_1 = " << hex << cmd_init_raddr_col_1;
+
+  DLOG << "cmd_init_raddr_col_0 = " << hex << cmd_init_raddr_col_0;
+  DLOG << "cmd_init_raddr_index = " << hex << cmd_init_raddr_index;  //
+  DLOG << "cmd_init_raddr_cnt = " << hex << cmd_init_raddr_cnt;
+  DLOG << "conv_ofm_buf_col_len = " << hex << conv_ofm_buf_col_len;
+  DLOG << "conv_ofm_buf_col_len_rem = " << hex << conv_ofm_buf_col_len_rem;
+  DLOG << "cmd_ifm_flt_base_addr = " << hex << cmd_ifm_flt_base_addr;
+  DLOG << "cmd_scale_base_addr = " << hex << cmd_scale_base_addr;
+  DLOG << "cmd_ifm_flt_dim = " << hex << cmd_ifm_flt_dim;
+  DLOG << "cmd_pad_step_size = " << hex << cmd_pad_step_size;
+  DLOG << "cmd_param1 = " << hex << cmd_param1;
+  DLOG << "cmd_param2 = " << hex << cmd_param2;
+  DLOG << "cmd_param3 = " << hex << cmd_param3;
+  DLOG << "cmd_param4 = " << hex << cmd_param4;
+  DLOG << "cmd_param5 = " << hex << cmd_param5;
+  DLOG << "cmd_param6 = " << hex << cmd_param6;
+  DLOG << "cmd_param7 = " << hex << cmd_param7;
+  DLOG << "cmd_param8 =  " << hex << cmd_param8;
+  DLOG << "cmd_ifm_flt_quant_scale =  " << hex << cmd_ifm_flt_quant_scale;
+  DLOG << "cmd_step_pad_mul_row_len = " << hex << cmd_step_pad_mul_row_len;
+  DLOG << "cmd_ifm_pack_byte_length = " << hex << cmd_ifm_pack_byte_length;
+  DLOG << "cmd_conv_param_reg = " << hex << cmd_conv_param_reg;
+  DLOG << "cmd_ofm_addr_width_reg = " << hex << cmd_ofm_addr_width_reg;
+  DLOG << "cmd_intra_stride_atoms_reg = " << hex << cmd_intra_stride_atoms_reg;
+  DLOG << "cmd_init_raddr_reg = " << hex << cmd_init_raddr_reg;
+  DLOG << "cmd_ofm_height_batch_reg = " << hex << cmd_ofm_height_batch_reg;
+  DLOG << "cmd_wdma_param_reg = " << hex << cmd_wdma_param_reg;
+  DLOG << "pooling_mode = " << hex << args.mode;
+
+  reg_writeq(cmd_ifm_flt_base_addr, MUL8(1));
+  reg_writeq(cmd_scale_base_addr, MUL8(2));
+  reg_writeq(cmd_ifm_flt_dim, MUL8(3));
+  reg_writeq(cmd_pad_step_size, MUL8(4));
+  reg_writeq(cmd_param1, MUL8(5));
+  reg_writeq(cmd_param2, MUL8(6));
+  reg_writeq(cmd_param3, MUL8(7));
+  reg_writeq(cmd_param4, MUL8(8));
+  reg_writeq(cmd_param5, MUL8(9));
+  reg_writeq(cmd_param6, MUL8(10));
+  reg_writeq(cmd_param7, MUL8(11));
+  reg_writeq(cmd_param8, MUL8(12));
+  reg_writeq(cmd_ifm_flt_quant_scale, MUL8(13));
+  reg_writeq(cmd_step_pad_mul_row_len, MUL8(14));
+  reg_writeq(cmd_ifm_pack_byte_length, MUL8(15));
+  reg_writeq(cmd_conv_param_reg, MUL8(16));
+  reg_writeq(cmd_ofm_addr_width_reg, MUL8(17));
+  reg_writeq(cmd_intra_stride_atoms_reg, MUL8(18));
+
+  reg_writeq(cmd_init_raddr_reg, MUL8(29));
+
+  reg_writeq(0, MUL8(19));
+  reg_writeq(cmd_ofm_height_batch_reg, MUL8(19));
+  reg_writeq(cmd_ofm_height_batch_reg & 0xffffffff00000000, MUL8(19));
+
+  reg_writeq(cmd_wdma_param_reg, MUL8(25));
+
+  /******************************************************************/
+  uint64_t cmd_mult_factor = ((uint64_t)args.kernel_reciprocal) |
+                             ((uint64_t)args.kernel_reciprocal << 16);
+  reg_writeq(cmd_mult_factor, MUL8(30));
+  /******************************************************************/
+
+  reg_writeq(0, MUL8(0));
+  if (args.mode == 0) {  // max pooling
+    reg_writeq(0x2200000000000000, MUL8(0));
+  } else {  // average pooling
+    reg_writeq(0x2400000000000000, MUL8(0));
+  }
+  int ret = -1;
+  ret = fpga_regpoll(MUL8(48), CONV_DONE, 0x00ffff);
+  if (ret == -1) {
+    DLOG << "fpga pooling no interrupt!!";
+    return ret;
+  }
+  reg_readq(MUL8(63));
+  usleep(10);
+  // get max value
+  float scale = Findfp16Max();
+  (args.output.scale_address)[0] = scale;                 // NOLINT
+  (args.output.scale_address)[1] = (float)(1.0 / scale);  // NOLINT
+  DLOG << "Findfp16Max scale = " << scale;
+  DLOG << "ret=" << ret;
+  return ret;
+}
+
+int get_ofm_batch_size(int width, int channel) {
+  int pad_channel, row_size;
+
+  if (64 < channel) {
+    pad_channel = (int)((channel + 127) / 128) * 128;  // NOLINT
+  } else if (32 < channel && channel <= 64) {
+    pad_channel = ((channel + 63) / (64)) * 64;
+  } else if (16 < channel && channel <= 32) {
+    pad_channel = ((channel + 31) / (32)) * 32;
+  } else if (channel <= 16) {
+    pad_channel = ((channel + 15) / (16)) * 16;
+  }
+
+  row_size = pad_channel * width;
+
+  return row_size;
 }

 int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
@@ -123,26 +1056,525 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
  DLOG << "   image0_address:" << args.image0.address
-       << "   image0_scale_address:" << args.image0.scale_address
       << "   image0_channels:" << args.image0.channels
       << "   image0_height:" << args.image0.height
-       << "   image0_width:" << args.image0.width
-       << "   pad0_height:" << args.image0.pad_height
-       << "   pad0_width:" << args.image0.pad_width;
+       << "   image0_width:" << args.image0.width;
  DLOG << "   image1_address:" << args.image1.address
-       << "   image1_scale_address:" << args.image1.scale_address
       << "   image1_channels:" << args.image1.channels
       << "   image1_height:" << args.image1.height
-       << "   image1_width:" << args.image1.width
-       << "   pad1_height:" << args.image1.pad_height
-       << "   pad_width:" << args.image1.pad_width;
-  DLOG << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
+       << "   image1_width:" << args.image1.width;
+  DLOG << "   out_address:" << args.output.address;
 #endif
 #ifndef PADDLE_MOBILE_ZU5
  return 0;
 #endif
-  return 0;
+  uint32_t filter_num_align = args.image0.channels;
+
+  uint32_t const_kernel_width_1 = 1;
+  uint32_t const_stride_width_1 = 1;
+  uint32_t const_kernel_height_2 = 2;
+  uint32_t const_stride_height_2 = 2;
+  uint32_t const_pad_height_0 = 0;
+  uint32_t const_pad_width_0 = 0;
+  uint32_t ew_image_height = args.image0.height * 2;
+
+  DLOG << "______db_______: begin to set registers. ";
+  uint64_t ifm_pixel_num =
+      ((args.image0.width) * (args.image0.height) * args.image0.channels);
+  uint64_t ifm_memory_size = ifm_pixel_num * sizeof(short);  // NOLINT
+  uint64_t flt_pixel_num = 0;
+  uint64_t filter_memory_size = 0;
+  uint64_t bn_pixel_num = (filter_num_align * 2);
+  uint64_t bn_memory_size = bn_pixel_num * sizeof(uint16_t);
+
+  uint64_t ofm_width =
+      ((args.image0.width) + 2 * const_pad_width_0 - const_kernel_width_1) /
+          (const_stride_width_1) +
+      1;
+  uint64_t ofm_height =
+      ((ew_image_height) + 2 * (const_pad_height_0) - (const_kernel_height_2)) /
+          (const_stride_height_2) +
+      1;
+
+  uint32_t filter_num = filter_num_align;
+  uint32_t image_channels = args.image0.channels;
+
+  uint64_t ifm_src_paddr = vaddr_to_paddr((args.image0.address));
+  uint64_t flt_src_paddr = vaddr_to_paddr((args.image1.address));
+  uint64_t ifm_dst_paddr = vaddr_to_paddr((args.output.address));
+  float image_inv_scale = 0;
+  float filter_inv_scale = 0;
+  int idx = 0;
+
+  DLOG << "______db_______: reset registers. ";
+
+  reg_writeq(1, MUL8(24));
+  usleep(1);
+  reg_writeq(0, MUL8(24));
+
+  /*********configuring registers*************/
+  uint32_t cmd_image_vir_base_addr = (uint32_t)ifm_src_paddr;
+  uint32_t cmd_filter_vir_base_addr = (uint32_t)flt_src_paddr;
+  uint32_t cmd_scale_base_addr = 0;
+  uint32_t conv_ofm_addr_base = (uint32_t)ifm_dst_paddr;
+  uint64_t cmd_group_num = 1;
+  uint64_t cmd_filter_per_group = filter_num / cmd_group_num;
+
+  uint64_t cmd_flt_sqr_len = (const_kernel_width_1) * (const_kernel_height_2);
+  uint64_t cmd_ifm_pre_row_num = const_kernel_height_2;
+  if ((const_kernel_height_2 == ew_image_height) && (0 == const_pad_height_0)) {
+    cmd_ifm_pre_row_num = (const_kernel_height_2);
+  } else {
+    cmd_ifm_pre_row_num = (const_kernel_height_2) - (const_pad_height_0) +
+                          (const_stride_height_2);
+  }
+  uint64_t cmd_flt_pre_batch_num = 1;
+  uint64_t cmd_ifm_pack_num_per_row_mns1 =
+      (uint64_t)(((args.image0.channels) + 63) / 64) - 1;
+  uint64_t cmd_bn_num = filter_num;
+  uint64_t cmd_bias_num = filter_num;
+  uint64_t cmd_ifm_stride_row_length =
+      args.image0.width * const_stride_height_2;
+  uint64_t cmd_flt_pack_num_per_kernel_mns1 =
+      (uint64_t)(((args.image0.channels) + 63) / 64) - 1;
+  uint64_t cmd_ofm_width_mns1 = (uint64_t)(
+      ((args.image0.width) - (const_kernel_width_1) + 2 * (const_pad_width_0)) /
+      (const_stride_width_1));
+  uint64_t cmd_ofm_height =
+      (uint64_t)(((args.image0.height) * 2 - (const_kernel_height_2) +
+                  2 * (const_pad_height_0)) /
+                 (const_stride_height_2)) +
+      1;
+
+  uint64_t cmd_channel_num = 0;
+  uint64_t cmd_ifm_pack_len = 0;
+  uint64_t cmd_channel_per_group = 0;
+  uint64_t cmd_flt_batch_num_mns1 = 0;
+  uint64_t cmd_flt_N_impl = 8;
+  uint64_t cmd_ifm_C_impl = 16;
+  uint64_t cmd_flt_pack_length = 0;
+  uint64_t cmd_step_h_mul_row_byte_len = 0;
+  uint64_t cmd_pad_h_mul_row_byte_len = 0;
+  uint64_t cmd_ifm_pack_byte_length =
+      16 * ((((args.image0.width) + 7) / 8) * 8);
+  uint64_t row_len_align = args.image0.width;
+  uint64_t cmd_flt_cycle_num_mns1 = 0;
+  if (image_channels > 32) {
+    cmd_channel_num = (uint64_t)((((args.image0.channels) + 63)) / 64) * 64;
+    cmd_ifm_pack_len = 64 * (args.image0.width);
+    cmd_channel_per_group = 64;
+    cmd_flt_batch_num_mns1 = (uint64_t)(((filter_num + 7)) / 8 - 1);
+    cmd_flt_N_impl = 8;
+    cmd_ifm_C_impl = 64;
+    cmd_flt_pack_length = (const_kernel_width_1) * (const_kernel_height_2)*64;
+    cmd_step_h_mul_row_byte_len =
+        (const_stride_height_2)*cmd_channel_num * args.image0.width;
+    cmd_pad_h_mul_row_byte_len =
+        (const_pad_height_0)*cmd_channel_num * args.image0.width;
+    cmd_ifm_pack_byte_length = 64 * args.image0.width;
+    row_len_align = args.image0.width;
+    cmd_flt_cycle_num_mns1 = (cmd_channel_num / 64) - 1;
+  } else if (image_channels > 16) {
+    cmd_channel_num = 32;
+    cmd_ifm_pack_len = 32 * (args.image0.width);
+    cmd_channel_per_group = 32;
+    cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1);
+    cmd_flt_N_impl = 16;
+    cmd_ifm_C_impl = 32;
+    cmd_flt_pack_length = (const_kernel_width_1) * (const_kernel_height_2)*32;
+    cmd_step_h_mul_row_byte_len = (const_stride_height_2)*cmd_channel_num *
+                                  ((((args.image0.width) + 1)) / 2) * 2;
+    cmd_pad_h_mul_row_byte_len = (const_pad_height_0)*cmd_channel_num *
+                                 ((((args.image0.width) + 1)) / 2) * 2;
+    cmd_ifm_pack_byte_length =
+        32 * (uint64_t)((((args.image0.width) + 1)) / 2) * 2;
+    row_len_align = (uint64_t)((((args.image0.width) + 1)) / 2);
+    cmd_flt_cycle_num_mns1 = 0;
+  } else if (image_channels > 8) {
+    cmd_channel_num = 16;
+    cmd_ifm_pack_len = 16 * (args.image0.width);
+    cmd_channel_per_group = 16;
+    cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1);
+    cmd_flt_N_impl = 32;
+    cmd_ifm_C_impl = 16;
+    cmd_flt_pack_length = (const_kernel_width_1) * (const_kernel_height_2)*16;
+    cmd_step_h_mul_row_byte_len = (const_stride_height_2)*cmd_channel_num *
+                                  ((((args.image0.width) + 3)) / 4) * 4;
+    cmd_pad_h_mul_row_byte_len = (const_pad_height_0)*cmd_channel_num *
+                                 ((((args.image0.width) + 3)) / 4) * 4;
+    cmd_ifm_pack_byte_length =
+        16 * (uint64_t)((((args.image0.width) + 3)) / 4) * 4;
+    row_len_align = (uint64_t)((((args.image0.width) + 3)) / 4);
+    cmd_flt_cycle_num_mns1 = 0;
+  }
+
+  cmd_flt_N_impl = 16;
+  cmd_flt_batch_num_mns1 = 0;
+  cmd_flt_pack_length = 64;
+  uint64_t cmd_flt_N_len = 0;
+  uint64_t cmd_flt_length = 64;
+  uint64_t cmd_ifm_row_byte_length = cmd_channel_num * (args.image0.width);
+  uint64_t cmd_ifm_buf_col_len = 0;
+  uint64_t ifm_one_batch_len =
+      (1048576 / ((2 * row_len_align) * cmd_channel_num));
+  uint64_t cmd_ifm_batch_num_tmp = (uint64_t)(
+      ((ew_image_height) + ifm_one_batch_len - 1) / ifm_one_batch_len);
+  DLOG << "ifm_one_batch_len = " << hex << ifm_one_batch_len;
+  DLOG << "cmd_ifm_batch_num_tmp = " << hex << cmd_ifm_batch_num_tmp;
+
+  if (1 == cmd_ifm_batch_num_tmp) {
+    cmd_ifm_buf_col_len = ew_image_height;
+  } else {
+    cmd_ifm_buf_col_len = ifm_one_batch_len;
+  }
+  uint64_t cmd_ifm_batch_num_mns1 =
+      (((ew_image_height) + cmd_ifm_buf_col_len - 1) / cmd_ifm_buf_col_len) - 1;
+  DLOG << "___db____ew____:cmd_ifm_batch_num_mns1 = " << hex
+       << cmd_ifm_batch_num_mns1;
+
+  uint64_t cmd_flt_total_batch_num = 1;
+  uint64_t cmd_ifm_buf_col_len_rem =
+      (ew_image_height)-cmd_ifm_batch_num_mns1 * cmd_ifm_buf_col_len;
+  //-------- ofm batch number reg &&  initial URAM reading address
+  // logic-----------------
+  uint64_t cmd_init_raddr_cnt = 1;
+  uint64_t cmd_init_raddr_flag = 0;
+  int64_t cmd_init_raddr_index = -8;
+  int64_t cmd_init_raddr_col_0 = -4;
+  int64_t cmd_init_raddr_col_1 = -4;
+  int64_t conv_ofm_buf_col_len = 0;
+  int64_t conv_ofm_buf_col_len_rem = 0;
+
+  if (((const_pad_height_0) % (2 * (const_stride_height_2))) == 0) {
+    cmd_init_raddr_cnt = 0;
+    cmd_init_raddr_flag = 0;
+    cmd_init_raddr_index =
+        0 - (int64_t)row_len_align * (((const_pad_height_0) + 1) / 2);
+    cmd_init_raddr_col_0 = cmd_init_raddr_index;
+    cmd_init_raddr_col_1 = cmd_init_raddr_index;
+  } else if (((const_pad_height_0)-2 *
+              ((const_pad_height_0) / (2 * (const_stride_height_2)))) <=
+             (const_stride_height_2)) {
+    cmd_init_raddr_cnt =
+        (const_stride_height_2) -
+        ((const_pad_height_0) -
+         ((const_pad_height_0) / (2 * (const_stride_height_2))));
+    cmd_init_raddr_flag = 1;
+    cmd_init_raddr_index =
+        0 - (int64_t)row_len_align * (int64_t)(const_pad_height_0) -
+        (int64_t)row_len_align *
+            ((const_pad_height_0) / (2 * const_stride_height_2));
+    cmd_init_raddr_col_0 =
+        0 - (int64_t)row_len_align * (int64_t)(const_pad_height_0) -
+        (int64_t)row_len_align *
+            ((const_pad_height_0) / (2 * (const_stride_height_2)));
+    cmd_init_raddr_col_1 =
+        cmd_init_raddr_col_0 +
+        const_stride_height_2 * (int64_t)row_len_align;  // 0;
+  } else if (((const_pad_height_0)-2 *
+              ((const_pad_height_0) / (2 * (const_stride_height_2)))) <=
+             2 * (const_stride_height_2)) {
+    cmd_init_raddr_cnt =
+        2 * (const_stride_height_2) *
+            (((const_pad_height_0) + 2 * (const_stride_height_2)-1) /
+             (2 * (const_stride_height_2))) -
+        (const_pad_height_0);
+    cmd_init_raddr_flag = 0;
+    cmd_init_raddr_index =
+        0 - (int64_t)row_len_align * (int64_t)(const_stride_height_2) *
+                (((const_pad_height_0) + 2 * (const_stride_height_2)-1) /
+                 (2 * (const_stride_height_2)));
+    cmd_init_raddr_col_0 =
+        0 -
+        (int64_t)row_len_align *
+            ((const_pad_height_0) / (2 * (const_stride_height_2))) -
+        (int64_t)row_len_align *
+            (2 * (const_stride_height_2) *
+                 (((const_pad_height_0) + 2 * (const_stride_height_2)-1) /
+                  (2 * (const_stride_height_2))) -
+             (const_pad_height_0));
+    cmd_init_raddr_col_1 = cmd_init_raddr_col_0;
+  }
+
+  if (cmd_ifm_batch_num_mns1 == 0) {
+    if ((const_kernel_height_2) <= (const_stride_height_2)) {
+      conv_ofm_buf_col_len = cmd_ifm_buf_col_len + 2 * (const_pad_height_0)-3 *
+                                                       (const_stride_height_2);
+    } else {
+      conv_ofm_buf_col_len =
+          cmd_ifm_buf_col_len +
+          2 * (const_pad_height_0)-3 * (const_stride_height_2) -
+          (const_kernel_height_2);
+    }
+    conv_ofm_buf_col_len_rem = conv_ofm_buf_col_len;
+  } else {
+    int N_rem = 0;
+    int row_rem = 0;
+
+    if ((const_kernel_height_2) <= (const_stride_height_2)) {
+      conv_ofm_buf_col_len = cmd_ifm_buf_col_len - 3 * (const_stride_height_2);
+      N_rem = (cmd_ifm_buf_col_len - (const_kernel_height_2)) /
+                  (const_stride_height_2) +
+              1;
+      row_rem = cmd_ifm_buf_col_len - (const_stride_height_2)*N_rem;
+      conv_ofm_buf_col_len_rem = cmd_ifm_buf_col_len_rem +
+                                 2 * (const_pad_height_0) + row_rem -
+                                 3 * (const_stride_height_2);
+    } else {
+      conv_ofm_buf_col_len =
+          cmd_ifm_buf_col_len +
+          2 * (const_pad_height_0)-3 * (const_stride_height_2) -
+          (const_kernel_height_2);
+      N_rem = (cmd_ifm_buf_col_len - (const_kernel_height_2)) /
+                  (const_stride_height_2) +
+              1;
+      row_rem = cmd_ifm_buf_col_len - (const_stride_height_2)*N_rem;
+      conv_ofm_buf_col_len_rem =
+          cmd_ifm_buf_col_len_rem + (const_pad_height_0) + row_rem -
+          3 * (const_stride_height_2) - (const_kernel_height_2);
+    }
+  }
+
+  //*************************
+  uint64_t ifm_height_raw_batch = 0;
+  uint64_t cmd_ofm_height_batch_reg;
+  uint64_t conv_ofm_height_batch_tmp = 0;
+  uint64_t conv_ofm_height_batch[16];
+  int ofm_height_norm_batch;
+  int height_batch_num;
+
+  int row_norm_size = get_ofm_batch_size(args.image0.width, cmd_channel_num);
+  int ifm_norm_size =
+      ew_image_height * row_norm_size * sizeof(short);  // NOLINT
+
+  if (ifm_norm_size <= (1024 * 1024)) {
+    conv_ofm_height_batch[0] =
+        get_image_out_axis(ew_image_height, const_pad_height_0,
+                           const_kernel_height_2, const_stride_height_2);
+    height_batch_num = 0;
+  } else if (row_norm_size < (1024 * 1024)) {
+    // raw ifm batch; the ofm batch height should be a multiple of 2 (2*N)
+    ifm_height_raw_batch =
+        (int)(((double)(1024 * 1024) - row_norm_size + 1) /  // NOLINT
+              (double)(2 * row_norm_size));                  // NOLINT
+    ofm_height_norm_batch = get_image_out_axis(
+        ifm_height_raw_batch, 0, const_kernel_height_2, const_stride_height_2);
+    if (ofm_height_norm_batch % 2 == 0) {
+      ofm_height_norm_batch = ofm_height_norm_batch;
+    } else {
+      ofm_height_norm_batch = ofm_height_norm_batch - 1;
+    }
+
+    DLOG << "ofm_height_norm_batch = " << hex << ofm_height_norm_batch;
+    int ofm_height_rems = cmd_ofm_height;
+    int i = 0;
+    for (i = 0; 0 < ofm_height_rems; i++) {
+      if (ofm_height_norm_batch <= ofm_height_rems) {
+        ofm_height_rems = ofm_height_rems - ofm_height_norm_batch;
+        conv_ofm_height_batch[i] = ofm_height_norm_batch;
+        DLOG << "ofm_height_norm_batch[i] = " << hex
+             << conv_ofm_height_batch[i];
+      } else {
+        conv_ofm_height_batch[i] = ofm_height_rems;
+        break;
+      }
+    }
+    height_batch_num = i;
+  }
+  //*************************
+
+  //-----------------------  para functions --------------------------------
+  uint64_t cmd_filter_quant_scale = 0x3c00;
+  uint64_t cmd_image_quant_scale = 0x3c00;
+  uint64_t wParallelsim = cmd_ifm_C_impl >> 3;
+  uint64_t wParallelsim_num = cmd_flt_cycle_num_mns1;
+  uint64_t win_size = (const_kernel_width_1) * (const_kernel_height_2) *
+                          (cmd_ifm_pack_num_per_row_mns1 + 1) -
+                      1;  //
+  uint64_t conv_ofm_width = (((args.image0.width) - (const_kernel_width_1) +
+                              (const_pad_width_0) + (const_pad_width_0)) /
+                             (const_stride_width_1));
+  uint64_t conv_ofm_dma_length = cmd_channel_num * sizeof(short);  // NOLINT
+  uint64_t conv_ofm_dma_stride = cmd_channel_num * sizeof(short);  // NOLINT
+  uint64_t cmd_image_addr_low = 0;
+  uint64_t cmd_image_addr_high = 0;
+  uint64_t cmd_image_addr_diff = 0;
+
+  if (cmd_filter_vir_base_addr < cmd_image_vir_base_addr) {
+    cmd_image_addr_low = (uint64_t)cmd_filter_vir_base_addr;
+    cmd_image_addr_high = (uint64_t)cmd_image_vir_base_addr;
+  } else {
+    cmd_image_addr_low = (uint64_t)cmd_image_vir_base_addr;
+    cmd_image_addr_high = (uint64_t)cmd_filter_vir_base_addr;
+  }
+
+  cmd_image_addr_diff = cmd_image_addr_high - cmd_image_addr_low;
+  uint64_t o_ust_rst = 0;
+  uint64_t conv_ofm_dma_repeat =
+      (uint64_t)(((((args.image0.width) - (const_kernel_width_1) +
+                    (const_pad_width_0) + (const_pad_width_0))) /
+                  (const_stride_width_1)) +
+                 1);
+  uint64_t conv_ofm_dma_offset =
+      cmd_channel_num * conv_ofm_dma_repeat * sizeof(short);  // NOLINT
+  uint64_t conv_ofm_inter_stride = conv_ofm_dma_offset * 2;
+  //----------------- register concatenation ------------------
+  uint64_t cmd_ifm_flt_base_addr =
+      (cmd_image_addr_high << 32) | (cmd_image_addr_low);
+
+  uint64_t cmd_ifm_flt_dim = ((uint64_t)(const_kernel_height_2) << 48) |
+                             ((uint64_t)(const_kernel_width_1) << 32) |
+                             ((uint64_t)(ew_image_height) << 16) |
+                             ((uint64_t)(args.image0.width));
+  uint64_t cmd_pad_step_size = ((uint64_t)(const_stride_height_2) << 48) |
+                               ((uint64_t)(const_stride_width_1) << 32) |
+                               ((uint64_t)(const_pad_height_0) << 16) |
+                               ((uint64_t)(const_pad_width_0));
+  uint64_t cmd_param1 = ((uint64_t)cmd_filter_per_group << 48) |
+                        ((uint64_t)cmd_channel_num << 32) |
+                        ((uint64_t)filter_num << 16) |
+                        ((uint64_t)cmd_group_num);
+  uint64_t cmd_param2 =
+      ((uint64_t)cmd_flt_sqr_len << 48) | ((uint64_t)cmd_ifm_pack_len << 32) |
+      ((uint64_t)cmd_ifm_pre_row_num << 16) | ((uint64_t)cmd_channel_per_group);
+  uint64_t cmd_param3 = ((uint64_t)cmd_flt_batch_num_mns1 << 48) |
+                        ((uint64_t)cmd_flt_total_batch_num << 32) |
+                        ((uint64_t)cmd_flt_N_impl << 16) |
+                        ((uint64_t)cmd_flt_pre_batch_num);
+  uint64_t cmd_param4 = ((uint64_t)cmd_ifm_pack_num_per_row_mns1 << 48) |
+                        ((uint64_t)cmd_bn_num << 32) |
+                        ((uint64_t)cmd_bias_num << 16) |
+                        ((uint64_t)cmd_flt_N_len);
+  uint64_t cmd_param5 = ((uint64_t)cmd_ifm_stride_row_length << 48) |
+                        ((uint64_t)cmd_flt_pack_length << 32) |
+                        ((uint64_t)cmd_flt_cycle_num_mns1 << 16) |
+                        ((uint64_t)cmd_flt_pack_num_per_kernel_mns1);
+  uint64_t cmd_param6 = ((uint64_t)cmd_ofm_width_mns1 << 48) |
+                        ((uint64_t)cmd_ifm_batch_num_mns1 << 32) |
+                        ((uint64_t)cmd_ifm_buf_col_len << 16) |
+                        ((uint64_t)cmd_ifm_C_impl);
+  uint64_t cmd_param7 = ((uint64_t)conv_ofm_inter_stride << 32) |
+                        ((uint64_t)cmd_ifm_buf_col_len_rem << 16) |
+                        ((uint64_t)cmd_ofm_height);
+  uint64_t cmd_param8 =
+      ((uint64_t)cmd_flt_length << 32) | ((uint64_t)cmd_ifm_row_byte_length);
+  uint64_t cmd_ifm_flt_quant_scale = ((uint64_t)cmd_filter_quant_scale << 32) |
+                                     ((uint64_t)cmd_image_quant_scale);
+  uint64_t cmd_step_pad_mul_row_len =
+      ((uint64_t)cmd_pad_h_mul_row_byte_len << 32) |
+      ((uint64_t)cmd_step_h_mul_row_byte_len);
+  //---- ofm paras ----
+  uint64_t cmd_conv_param_reg = ((uint64_t)wParallelsim_num << 32) |
+                                ((uint64_t)wParallelsim << 16) |
+                                ((uint64_t)win_size);
+  uint64_t cmd_ofm_addr_width_reg =
+      ((uint64_t)conv_ofm_width << 32) | ((uint64_t)conv_ofm_addr_base);
+  uint64_t cmd_intra_stride_atoms_reg =
+      ((uint64_t)conv_ofm_dma_length << 32) | ((uint64_t)conv_ofm_dma_stride);
+  uint64_t cmd_user_ctrl_reg = ((uint64_t)o_ust_rst);
+  uint64_t cmd_wdma_param_reg =
+      ((uint64_t)(conv_ofm_dma_repeat | 0x80000000) << 32) |
+      ((uint64_t)conv_ofm_dma_offset);
+  uint64_t cmd_init_raddr_reg = ((cmd_init_raddr_col_1 & 0xffff) << 48) |
+                                ((cmd_init_raddr_col_0 & 0xffff) << 32) |
+                                (((cmd_init_raddr_index & 0xffff) << 16)) |
+                                (cmd_init_raddr_flag & 0xffff) << 15 |
+                                ((cmd_init_raddr_cnt & 0xffff));
+  uint64_t cmd_mult_factor =
+      ((uint64_t)args.const0) | ((uint64_t)args.const1 << 16);
+  uint64_t cmd_para31 = (cmd_para31 & 0x1) | args.relu_enabled;
+
+  DLOG << "cmd_init_raddr_col_1 = " << hex << cmd_init_raddr_col_1;
+  DLOG << "cmd_init_raddr_col_0 = " << hex << cmd_init_raddr_col_0;
+  DLOG << "cmd_init_raddr_index = " << hex << cmd_init_raddr_index;  //
+  DLOG << "cmd_init_raddr_cnt = " << hex << cmd_init_raddr_cnt;
+  DLOG << "cmd_ifm_buf_col_len = " << hex << cmd_ifm_buf_col_len;
+  DLOG << "cmd_ifm_buf_col_len_rem = " << hex << cmd_ifm_buf_col_len_rem;
+  DLOG << "conv_ofm_buf_col_len = " << hex << conv_ofm_buf_col_len;
+  DLOG << "conv_ofm_buf_col_len_rem = " << hex << conv_ofm_buf_col_len_rem;
+  DLOG << "cmd_ifm_flt_base_addr = " << hex << cmd_ifm_flt_base_addr;
+  DLOG << "cmd_scale_base_addr = " << hex << cmd_scale_base_addr;
+  DLOG << "cmd_ifm_flt_dim = " << hex << cmd_ifm_flt_dim;
+  DLOG << "cmd_pad_step_size = " << hex << cmd_pad_step_size;
+  DLOG << "cmd_param1 = " << hex << cmd_param1;
+  DLOG << "cmd_param2 = " << hex << cmd_param2;
+  DLOG << "cmd_param3 = " << hex << cmd_param3;
+  DLOG << "cmd_param4 = " << hex << cmd_param4;
+  DLOG << "cmd_param5 = " << hex << cmd_param5;
+  DLOG << "cmd_param6 = " << hex << cmd_param6;
+  DLOG << "cmd_param7 = " << hex << cmd_param7;
+  DLOG << "cmd_param8 =  " << hex << cmd_param8;
+  DLOG << "cmd_ifm_flt_quant_scale =  " << hex << cmd_ifm_flt_quant_scale;
+  DLOG << "cmd_step_pad_mul_row_len = " << hex << cmd_step_pad_mul_row_len;
+  DLOG << "cmd_ifm_pack_byte_length = " << hex << cmd_ifm_pack_byte_length;
+  DLOG << "cmd_conv_param_reg = " << hex << cmd_conv_param_reg;
+  DLOG << "cmd_ofm_addr_width_reg = " << hex << cmd_ofm_addr_width_reg;
+  DLOG << "cmd_intra_stride_atoms_reg = " << hex << cmd_intra_stride_atoms_reg;
+  DLOG << "cmd_init_raddr_reg = " << hex << cmd_init_raddr_reg;
+  DLOG << "cmd_mult_factor = " << hex << cmd_mult_factor;
+  DLOG << "cmd_wdma_param_reg = " << hex << cmd_wdma_param_reg;
+  DLOG << "cmd_para31 = " << hex << cmd_para31;
+
+  reg_writeq(cmd_ifm_flt_base_addr, MUL8(1));
+  reg_writeq(cmd_scale_base_addr, MUL8(2));
+  reg_writeq(cmd_ifm_flt_dim, MUL8(3));
+  reg_writeq(cmd_pad_step_size, MUL8(4));
+  reg_writeq(cmd_param1, MUL8(5));
+  reg_writeq(cmd_param2, MUL8(6));
+  reg_writeq(cmd_param3, MUL8(7));
+  reg_writeq(cmd_param4, MUL8(8));
+  reg_writeq(cmd_param5, MUL8(9));
+  reg_writeq(cmd_param6, MUL8(10));
+  reg_writeq(cmd_param7, MUL8(11));
+  reg_writeq(cmd_param8, MUL8(12));
+  reg_writeq(cmd_ifm_flt_quant_scale, MUL8(13));
+  reg_writeq(cmd_step_pad_mul_row_len, MUL8(14));
+  reg_writeq(cmd_ifm_pack_byte_length, MUL8(15));
+  reg_writeq(cmd_conv_param_reg, MUL8(16));
+  reg_writeq(cmd_ofm_addr_width_reg, MUL8(17));
+  reg_writeq(cmd_intra_stride_atoms_reg, MUL8(18));
+
+  reg_writeq(cmd_init_raddr_reg, MUL8(29));
+  reg_writeq(cmd_para31, MUL8(31));
+
+  reg_writeq(0, MUL8(19));
+  for (int i = 0; i < height_batch_num + 1; i++) {
+    conv_ofm_height_batch_tmp =
+        int((conv_ofm_height_batch[i] + 1) / 2) - 1;  // NOLINT
+    cmd_ofm_height_batch_reg =
+        ((uint64_t)(conv_ofm_buf_col_len_rem & 0xffff) << 48) |
+        ((uint64_t)(conv_ofm_buf_col_len & 0xffff) << 32) |
+        ((uint64_t)conv_ofm_height_batch_tmp + 0x80000000);
+    reg_writeq(cmd_ofm_height_batch_reg, MUL8(19));
+    reg_writeq(cmd_ofm_height_batch_reg & 0xffffffff00000000, MUL8(19));
+    usleep(1);
+  }
+  reg_writeq(cmd_wdma_param_reg, MUL8(25));
+  DLOG << "cmd_ofm_height_batch_reg = " << hex << cmd_ofm_height_batch_reg;
+
+  /******************************************************************/
+  reg_writeq(cmd_mult_factor, MUL8(30));
+  /******************************************************************/
+
+  reg_writeq(0, MUL8(0));
+
+  reg_writeq(0x2100000000000000, MUL8(0));
+
+  int ret = fpga_regpoll(MUL8(48), CONV_DONE, 0xffffff);
+  if (ret == -1) {
+    DLOG << "fpga EW no interrupt!!";
+    return ret;
+  }
+  reg_readq(MUL8(63));
+  usleep(10);
+  // get max value
+  float scale = Findfp16Max();
+  (args.output.scale_address)[0] = scale;                 // NOLINT
+  (args.output.scale_address)[1] = (float)(1.0 / scale);  // NOLINT
+  DLOG << "Findfp16Max scale = " << scale;
+
+  DLOG << "ret=" << ret;
+  return ret;
 }

 int PerformBypass(const struct BypassArgs &args) {
@@ -166,60 +1598,63 @@ int PerformBypass(const struct BypassArgs &args) {
  return 0;
 #endif

-  //  uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
-  //  uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
-  //  uint64_t bp_enable;
-  //  int64_t length;
-  //  uint64_t pixels;
-  //
-  //  // fp32->fp16
-  //  if ((args.input_data_type) && (!args.output_data_type)) {
-  //    pixels = (args.image.channels) * (args.image.width) *
-  //    (args.image.height); length = pixels * sizeof(float); bp_enable =
-  //    0x8800000000000000 + length;
-  //  }
-  //  // fp16->fp32
-  //  else if ((!args.input_data_type) && (args.output_data_type)) {
-  //    pixels = filter::calc_aligned_channel((args.image.channels)) *
-  //             (args.image.width) * (args.image.height);
-  //    length = pixels * sizeof(short);
-  //    length = align_to_x((int)length, 64);  // NOLINT
-  //    bp_enable = 0x8a00000000000000 + length;
-  //  }
-  //  // fp16->fp16 findmax
-  //  else if ((!args.input_data_type) && (!args.output_data_type)) {
-  //    pixels = (args.image.channels) * (args.image.width) *
-  //    (args.image.height); length = pixels * sizeof(short); bp_enable =
-  //    0x8900000000000000 + length;
-  //  } else {
-  //    return -1;
-  //  }
-  //
-  //  // start bypass
-  //  driver::reg_writeq(ifm_src_paddr, MUL8(27));
-  //  driver::reg_writeq(ifm_dst_paddr, MUL8(28));
-  //  driver::reg_writeq(0, MUL8(0));
-  //  driver::reg_writeq(bp_enable, MUL8(0));
-  //  // poll
-  //  int ret = -1;
-  //  ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
-  //  if (ret != -1) {
-  //    // clear "irq"
-  //    driver::reg_readq(MUL8(63));
-  //  }
-  //  // get max value
-  //  if ((!args.input_data_type) && (!args.output_data_type)) {
-  //    float scale = Findfp16Max();
-  //    args.output.scale_address[0] = (float)(1.0 / scale);  // NOLINT
-  //    args.output.scale_address[1] = scale;
-  //  }
-  // return ret;
+  uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
+  uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
+  uint64_t bp_enable;
+  int64_t length;
+  uint64_t pixels;
+  // Derive bp_enable and the byte length from the input/output data types.
+  // fp32->fp16: length is pixels * sizeof(float)
+  if ((args.input_data_type) && (!args.output_data_type)) {
+    DLOG << "fp32-fp16";
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(float);
+    bp_enable = 0x8800000000000000UL + (uint64_t)length;
+  }
+  // fp16->fp32: aligned channel count; length passed through align_to_x
+  else if ((!args.input_data_type) && (args.output_data_type)) {  // NOLINT
+    DLOG << "fp16-fp32";
+    pixels = filter::calc_aligned_channel((args.image.channels)) *
+             (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);       // NOLINT
+    length = align_to_x((int)length, 64);  // NOLINT
+    bp_enable = 0x8a00000000000000UL + length;
+  }
+  // fp16->fp16 findmax: length is pixels * sizeof(short)
+  else if ((!args.input_data_type) && (!args.output_data_type)) {  // NOLINT
+    DLOG << "16-16";
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);  // NOLINT
+    bp_enable = 0x8900000000000000 + length;
+  } else {
+    return -1;  // remaining case: both data-type flags are set
+  }
+  // start bypass: write 0 to MUL8(0), then src/dst addresses, then bp_enable
+  reg_writeq(0, MUL8(0));
+  reg_writeq(ifm_src_paddr, MUL8(27));
+  reg_writeq(ifm_dst_paddr, MUL8(28));
+  reg_writeq(bp_enable, MUL8(0));
+  int ret = -1;
+  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffff);
+  // NOTE(review): a -1 result from fpga_regpoll does not stop what follows.
+  if (ret != -1) {
+    DLOG << "test done";
+  }
+  reg_readq(MUL8(63));
+  usleep(10);
+  // get max value (runs for every mode) and store it with its reciprocal
+  float scale = Findfp16Max();
+  args.output.scale_address[0] = scale;                 // NOLINT
+  args.output.scale_address[1] = (float)(1.0 / scale);  // NOLINT
+  DLOG << "ret=" << ret;
+  return ret;
 }

 int ComputeFPGAConcat(const struct ConcatArgs &args) {
 #ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFpgaConcat===========";
  DLOG << "   Image_num: " << args.image_num
+
       << "   out_address:" << args.image_out
       << "   out_scale_address:" << args.scale_out
       << "   out_channel:" << args.out_channel;

--- a/src/fpga/common/fpga_common.cpp
+++ b/src/fpga/common/fpga_common.cpp
@@ -113,6 +113,12 @@ int fpga_invalidate(void *address, size_t size) {
  return 0;
 #endif
 }
-
+uint64_t vaddr_to_paddr(void *address) {  // wraps driver::vaddr_to_paddr
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;  // PADDLE_MOBILE_ZU5 not defined: always 0
+#else
+  return driver::vaddr_to_paddr(address);  // ZU5 build: ask the driver
+#endif
+}
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/operators/kernel/fpga/V2/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "operators/kernel/feed_kernel.h"
-
+#include "fpga/V2/filter.h"
 namespace paddle_mobile {
 namespace operators {

@@ -24,7 +24,6 @@ bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
  fpga::format_fp16_ofm(output, aligned_channel);
  return true;
 }
-
 template <>
 void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
  auto input =
@@ -33,6 +32,9 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
  auto input_ptr = input->data<float>();
  Tensor *output = param.Out();
  auto output_ptr = output->data<float>();
+  auto channel = input->dims()[1];
+  uint32_t aligned_channels =
+      fpga::filter::calc_aligned_channel((int)channel);  // NOLINT

  fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};

@@ -41,7 +43,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
  args.input_layout_type = fpga::LAYOUT_CHW;
  args.output_layout_type = fpga::LAYOUT_HWC;
  args.image.address = reinterpret_cast<void *>(input_ptr);
-  args.image.channels = (uint32_t)input->dims()[1];
+  args.image.channels = aligned_channels;
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.pad_height = 0;

--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
@@ -25,7 +25,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
  auto input_ptr = input->data<float>();
  auto float_input = new Tensor;
  float_input->mutable_data<float>({1, input->dims()[1]});
-  fpga::format_fp32_ofm(float_input, 8);
+  fpga::format_fp32_ofm(float_input, 1024);

  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;