From 7a8b998fadf3a0eedaaec6751bf4609052b98b59 Mon Sep 17 00:00:00 2001 From: qnqinan Date: Wed, 12 Dec 2018 13:45:50 +0800 Subject: [PATCH] fix some bugs in fpga V2 track and update fpga V2 pe code --- src/fpga/V2/api.cpp | 5 +- src/fpga/V2/filter.cpp | 17 +- src/fpga/V2/pe.cpp | 1607 ++++++++++++++++- src/fpga/common/fpga_common.cpp | 8 +- src/operators/kernel/fpga/V2/feed_kernel.cpp | 8 +- .../kernel/fpga/V2/softmax_kernel.cpp | 2 +- 6 files changed, 1552 insertions(+), 95 deletions(-) mode change 100644 => 100755 src/fpga/common/fpga_common.cpp mode change 100644 => 100755 src/operators/kernel/fpga/V2/softmax_kernel.cpp diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp index 8010e43a61..6e1090c00e 100644 --- a/src/fpga/V2/api.cpp +++ b/src/fpga/V2/api.cpp @@ -204,7 +204,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->conv_arg[i].image.address = input_ptr; arg->conv_arg[i].image.scale_address = input->scale; - arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1]; + arg->conv_arg[i].image.channels = + (uint32_t)get_aligned_channel_num((int)(input->dims()[1])); // NOLINT arg->conv_arg[i].image.height = (uint32_t)input->dims()[2]; arg->conv_arg[i].image.width = (uint32_t)input->dims()[3]; arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; @@ -216,7 +217,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, int num_after_alignment = filter::calc_aligned_num( arg->filter_num, (int)input->dims()[1]); // NOLINT arg->conv_arg[i].free_space = - fpga_malloc(num_after_alignment * 2 * sizeof(half)); + fpga_malloc(num_after_alignment * 2 * sizeof(float)); // half } } diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp index b17ce4406b..3b0692a99e 100644 --- a/src/fpga/V2/filter.cpp +++ b/src/fpga/V2/filter.cpp @@ -16,7 +16,6 @@ limitations under the License. 
*/ #include #include #include "fpga/common/fpga_common.h" - namespace paddle_mobile { namespace fpga { namespace filter { @@ -88,12 +87,25 @@ void align_filter(float **data_in, int num, int channel, int height, *data_in = new_data; fpga_free(temp); } - +void convert_to_fp16(float **data_in, int data_size) { + float *tmp = *data_in; + // half_float::half *tmp_data = (half_float::half *)fpga_malloc(data_size * + // sizeof(half_float::half)); + int16_t *tmp_data = + (int16_t *)fpga_malloc(data_size * sizeof(int16_t)); // NOLINT + for (int i = 0; i < data_size; i++) { + // tmp_data[i] = (half_float::half)((*data_in)[i]); + tmp_data[i] = fp32_2_fp16((*data_in)[i]); + } + *data_in = (float *)tmp_data; // NOLINT + fpga_free(tmp); +} void format_filter(float **data_in, int num, int channel, int height, int width, int group_num, float max) { convert_to_hwc(data_in, num, channel, height, width); align_filter(data_in, num, channel, height, width); int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); + convert_to_fp16(data_in, pixel_num); fpga_flush(*data_in, pixel_num * sizeof(float)); } @@ -115,6 +127,7 @@ void format_fc_filter(float **data_in, int num, int channel, int height, convert_fc_filter(data_in, num, chw); align_filter(data_in, num, channel, height, width); int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); + convert_to_fp16(data_in, pixel_num); fpga_flush(*data_in, pixel_num * sizeof(float)); } diff --git a/src/fpga/V2/pe.cpp b/src/fpga/V2/pe.cpp index 35ef06de2d..d22bd17175 100644 --- a/src/fpga/V2/pe.cpp +++ b/src/fpga/V2/pe.cpp @@ -13,40 +13,53 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "fpga/common/pe.h" +#include "fpga/V2/api.h" #include "fpga/V2/filter.h" #include "fpga/V2/image.h" #include "fpga/common/config.h" #include "fpga/common/driver.h" +using namespace std; // NOLINT +using namespace paddle_mobile::fpga::driver; // NOLINT + namespace paddle_mobile { namespace fpga { -#define MUL8(x) ((x)*8) -#define BYPASS_DONE 1 +#define MUL8(x) (x * 8) +#define BYPASS_DONE 2 +#define CONV_DONE 1 + +static inline int get_image_out_axis(int src_len, int pad, int kernel_len, + int kernel_step) { + if (kernel_step == 0) { + return 0; + } + return (src_len + 2 * pad - kernel_len) / kernel_step + 1; +} float Findfp16Max() { uint16_t abs_vals[16]; uint64_t max_fp16; - max_fp16 = driver::reg_readq(MUL8(49)); - abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT - abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT - abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT - abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT - max_fp16 = driver::reg_readq(MUL8(50)); - abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT - abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT - abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT - abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT - max_fp16 = driver::reg_readq(MUL8(51)); - abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT - abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT - abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT - abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT - max_fp16 = driver::reg_readq(MUL8(52)); - abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16)); - abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT - abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT - abs_vals[15] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + max_fp16 = reg_readq(MUL8(49)); + abs_vals[0] = 
(uint16_t)(0x0000007fff & (max_fp16)); // NOLINT + abs_vals[1] = (uint16_t)(0x0000007fff & (max_fp16 >> 16)); // NOLINT + abs_vals[2] = (uint16_t)(0x0000007fff & (max_fp16 >> 32)); // NOLINT + abs_vals[3] = (uint16_t)(0x0000007fff & (max_fp16 >> 48)); // NOLINT + max_fp16 = reg_readq(MUL8(50)); + abs_vals[4] = (uint16_t)(0x0000007fff & (max_fp16)); // NOLINT + abs_vals[5] = (uint16_t)(0x0000007fff & (max_fp16 >> 16)); // NOLINT + abs_vals[6] = (uint16_t)(0x0000007fff & (max_fp16 >> 32)); // NOLINT + abs_vals[7] = (uint16_t)(0x0000007fff & (max_fp16 >> 48)); // NOLINT + max_fp16 = reg_readq(MUL8(51)); + abs_vals[8] = (uint16_t)(0x0000007fff & (max_fp16)); // NOLINT + abs_vals[9] = (uint16_t)(0x0000007fff & (max_fp16 >> 16)); // NOLINT + abs_vals[10] = (uint16_t)(0x0000007fff & (max_fp16 >> 32)); // NOLINT + abs_vals[11] = (uint16_t)(0x0000007fff & (max_fp16 >> 48)); // NOLINT + max_fp16 = reg_readq(MUL8(52)); + abs_vals[12] = (uint16_t)(0x0000007fff & (max_fp16)); + abs_vals[13] = (uint16_t)(0x0000007fff & (max_fp16 >> 16)); // NOLINT + abs_vals[14] = (uint16_t)(0x0000007fff & (max_fp16 >> 32)); // NOLINT + abs_vals[15] = (uint16_t)(0x0000007fff & (max_fp16 >> 48)); // NOLINT uint16_t tmp = 0; for (int i = 0; i < 16; i++) { @@ -54,6 +67,7 @@ float Findfp16Max() { tmp = abs_vals[i]; } } + DLOG << "max value found: " << fp16_2_fp32(tmp); return fp16_2_fp32(tmp) / 127.0f; } @@ -88,7 +102,473 @@ int ComputeBasicConv(const struct ConvArgs &args) { return 0; #endif - return 0; + uint64_t ifm_pixel_num = + ((args.image.width) * (args.image.height) * args.image.channels); + uint64_t ifm_memory_size = ifm_pixel_num * sizeof(short); // NOLINT + uint64_t flt_pixel_num = (args.filter_num * (args.kernel.width) * // NOLINT + (args.kernel.height) * args.image.channels); + uint64_t filter_memory_size = flt_pixel_num * sizeof(short); // NOLINT + + uint64_t bn_pixel_num = (args.filter_num * 2); // NOLINT + uint64_t bn_memory_size = bn_pixel_num * sizeof(float); + + uint64_t ofm_width 
= + ((args.image.width) + 2 * args.image.pad_width - args.kernel.width) / + (args.kernel.stride_w) + + 1; + uint64_t ofm_height = ((args.image.height) + 2 * (args.image.pad_height) - + (args.kernel.height)) / + (args.kernel.stride_h) + + 1; + + uint32_t filter_num = args.filter_num; + uint32_t image_channels = args.image.channels; + + DLOG << "filter_num: " << filter_num; + uint64_t ifm_src_paddr = vaddr_to_paddr((args.image.address)); + uint64_t flt_src_paddr = vaddr_to_paddr((args.filter_address)); + uint64_t sb_src_paddr = vaddr_to_paddr((args.free_space)); + uint64_t ifm_dst_paddr = vaddr_to_paddr((args.output.address)); + /**********BN******************/ + float image_inv_scale = (args.image.scale_address)[0]; + float filter_inv_scale = (args.filter_scale_address)[0]; + float scale_tmp = image_inv_scale * filter_inv_scale; + int idx = 0; + float tmp = 0.0; + float *convert_sb_addr = (float *)(args.free_space); // NOLINT + for (idx = 0; idx < args.filter_num * 2; idx++) { + if (idx % 2 == 1) { + tmp = ((float *)(args.sb_address))[idx] * scale_tmp; // NOLINT + } else { + tmp = ((float *)(args.sb_address))[idx]; // NOLINT + } + convert_sb_addr[idx] = tmp; // NOLINT + } + + fpga_flush(convert_sb_addr, args.filter_num * 2 * sizeof(float)); + reg_writeq(1, MUL8(24)); + usleep(1); + reg_writeq(0, MUL8(24)); + + reg_writeq(sb_src_paddr, MUL8(27)); + reg_writeq(0, MUL8(0)); + + uint64_t bps_addr = 0x8c00000000000000; + bps_addr += bn_memory_size; + reg_writeq(bps_addr, MUL8(0)); + int ret = -1; + ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffff); + if (ret) { + DLOG << "conv bypass failed"; + return ret; + } + reg_readq(MUL8(63)); + + /*********configuring registers*************/ + uint32_t cmd_image_vir_base_addr = (uint32_t)ifm_src_paddr; + uint32_t cmd_filter_vir_base_addr = (uint32_t)flt_src_paddr; + uint32_t cmd_scale_base_addr = (uint32_t)sb_src_paddr; + uint32_t conv_ofm_addr_base = (uint32_t)ifm_dst_paddr; + uint64_t cmd_group_num = args.group_num; + uint64_t 
cmd_filter_per_group = filter_num / cmd_group_num; + + uint64_t cmd_flt_sqr_len = (args.kernel.width) * (args.kernel.height); + uint64_t cmd_ifm_pre_row_num = 0; + + if (1 == args.image.height) { + cmd_ifm_pre_row_num = 1; + } else { + cmd_ifm_pre_row_num = + (args.kernel.height) - (args.image.pad_height) + (args.kernel.stride_h); + } + uint64_t cmd_flt_pre_batch_num = 1; + uint64_t cmd_ifm_pack_num_per_row_mns1 = + (uint64_t)(((args.image.channels) + 127) / 128) - 1; + uint64_t cmd_bn_num = filter_num; + uint64_t cmd_bias_num = filter_num; + uint64_t cmd_ifm_stride_row_length = args.image.width * args.kernel.stride_h; + uint64_t cmd_flt_pack_num_per_kernel_mns1 = + (uint64_t)(((args.image.channels) + 127) / 128) - 1; + uint64_t cmd_ofm_width_mns1 = (uint64_t)( + ((args.image.width) - (args.kernel.width) + 2 * (args.image.pad_width)) / + (args.kernel.stride_w)); + uint64_t cmd_ofm_height = + (uint64_t)(((args.image.height) - (args.kernel.height) + + 2 * (args.image.pad_height)) / + (args.kernel.stride_h)) + + 1; + + uint64_t cmd_channel_num = 0; + uint64_t cmd_ifm_pack_len = 0; + uint64_t cmd_channel_per_group = 0; + uint64_t cmd_flt_batch_num_mns1 = 0; + uint64_t cmd_flt_N_impl = 8; + uint64_t cmd_ifm_C_impl = 16; + uint64_t cmd_flt_pack_length = 0; + uint64_t cmd_step_h_mul_row_byte_len = 0; + uint64_t cmd_pad_h_mul_row_byte_len = 0; + uint64_t cmd_ifm_pack_byte_length = 16 * ((((args.image.width) + 7) / 8) * 8); + uint64_t row_len_align = args.image.width; + if (image_channels > 64) { + cmd_channel_num = (uint64_t)((((args.image.channels) + 127)) / 128) * 128; + cmd_ifm_pack_len = 128 * (args.image.width); + cmd_channel_per_group = 128; + cmd_flt_batch_num_mns1 = (uint64_t)(((args.filter_num + 7)) / 8 - 1); + cmd_flt_N_impl = 8; + cmd_ifm_C_impl = 128; + cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 128; + cmd_step_h_mul_row_byte_len = + (args.kernel.stride_h) * cmd_channel_num * (args.image.width); + cmd_pad_h_mul_row_byte_len = + 
(args.image.pad_height) * cmd_channel_num * (args.image.width); + cmd_ifm_pack_byte_length = 128 * (args.image.width); + row_len_align = args.image.width * (cmd_ifm_pack_num_per_row_mns1 + 1); + } else if (image_channels > 32) { + cmd_channel_num = 64; + cmd_ifm_pack_len = 64 * (args.image.width); + cmd_channel_per_group = 64; + cmd_flt_batch_num_mns1 = (uint64_t)((((args.filter_num) + 15)) / 16 - 1); + cmd_flt_N_impl = 16; + cmd_ifm_C_impl = 64; + cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 64; + cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num * + ((((args.image.width) + 1)) / 2) * 2; + cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num * + ((((args.image.width) + 1)) / 2) * 2; + cmd_ifm_pack_byte_length = + 64 * (uint64_t)((((args.image.width) + 1)) / 2) * 2; + row_len_align = (uint64_t)((((args.image.width) + 1)) / 2); + } else if (image_channels > 16) { + cmd_channel_num = 32; + cmd_ifm_pack_len = 32 * (args.image.width); + cmd_channel_per_group = 32; + cmd_flt_batch_num_mns1 = (uint64_t)((((args.filter_num) + 31)) / 32 - 1); + cmd_flt_N_impl = 32; + cmd_ifm_C_impl = 32; + cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 32; + cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num * + ((((args.image.width) + 3)) / 4) * 4; + cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num * + ((((args.image.width) + 3)) / 4) * 4; + cmd_ifm_pack_byte_length = + 32 * (uint64_t)((((args.image.width) + 3)) / 4) * 4; + row_len_align = (uint64_t)((((args.image.width) + 3)) / 4); + } else { + cmd_channel_num = 16; + cmd_ifm_pack_len = 16 * (args.image.width); + cmd_channel_per_group = 16; + cmd_flt_batch_num_mns1 = (uint64_t)((((args.filter_num) + 63)) / 64 - 1); + cmd_flt_N_impl = 64; + cmd_ifm_C_impl = 16; + cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 16; + cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num * + 
((((args.image.width) + 7)) / 8) * 8; + cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num * + ((((args.image.width) + 7)) / 8) * 8; + cmd_ifm_pack_byte_length = 16 * ((((args.image.width) + 7)) / 8) * 8; + row_len_align = (uint64_t)((((args.image.width) + 7)) / 8); + } + uint64_t cmd_flt_length = + (args.kernel.width) * (args.kernel.height) * cmd_channel_num; + uint64_t cmd_ifm_row_byte_length = cmd_channel_num * (args.image.width); + + uint64_t cmd_ifm_buf_col_len = 0; + + uint64_t ifm_one_batch_len = + (1048576 / ((args.image.width) * cmd_channel_num)); + uint64_t cmd_ifm_batch_num_tmp = (uint64_t)( + ((args.image.height) + ifm_one_batch_len - 1) / ifm_one_batch_len); + if (1 == cmd_ifm_batch_num_tmp) { + cmd_ifm_buf_col_len = args.image.height; + } else { + if (((args.image.height) / (cmd_ifm_batch_num_tmp) % 2) == 0) { + cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp; + } else { + cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp - 1; + } + } + uint64_t cmd_ifm_batch_num_mns1 = + (((args.image.height) + cmd_ifm_buf_col_len - 1) / cmd_ifm_buf_col_len) - + 1; + uint64_t cmd_flt_cycle_num_mns1 = cmd_ifm_batch_num_mns1; + uint64_t cmd_flt_total_batch_num = filter_num / cmd_flt_N_impl; + uint64_t cmd_ifm_buf_col_len_rem = + (args.image.height) - + cmd_ifm_batch_num_mns1 * cmd_ifm_buf_col_len; //= -4; + uint64_t cmd_flt_N_len = args.kernel.width * args.kernel.height * + (cmd_flt_pack_num_per_kernel_mns1 + 1); + + //-------- ofm batch number reg && initial URAM reading address + // logic----------------- + uint64_t cmd_init_raddr_cnt = 1; + uint64_t cmd_init_raddr_flag = 0; + int64_t cmd_init_raddr_index = -8; + int64_t cmd_init_raddr_col_0 = -4; + int64_t cmd_init_raddr_col_1 = -4; + uint64_t conv_ofm_buf_col_len = 0; + uint64_t conv_ofm_buf_col_len_rem = 0; + + if (((args.image.pad_height) % (2 * (args.kernel.stride_h))) == 0) { + cmd_init_raddr_cnt = 0; + cmd_init_raddr_flag = 0; + cmd_init_raddr_index = + 0 - 
(int64_t)row_len_align * (((args.image.pad_height) + 1) / 2); + cmd_init_raddr_col_0 = cmd_init_raddr_index; + cmd_init_raddr_col_1 = cmd_init_raddr_index; + } else if (((args.image.pad_height) - + 2 * ((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <= + (args.kernel.stride_h)) { + cmd_init_raddr_cnt = + (args.kernel.stride_h) - + ((args.image.pad_height) - + ((args.image.pad_height) / (2 * (args.kernel.stride_h)))); + cmd_init_raddr_flag = 1; + cmd_init_raddr_index = + 0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) - + (int64_t)row_len_align * + ((args.image.pad_height) / (2 * args.kernel.stride_h)); + cmd_init_raddr_col_0 = + 0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) - + (int64_t)row_len_align * + ((args.image.pad_height) / (2 * (args.kernel.stride_h))); + cmd_init_raddr_col_1 = 0; + } else if (((args.image.pad_height) - + 2 * ((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <= + 2 * (args.kernel.stride_h)) { + cmd_init_raddr_cnt = + 2 * (args.kernel.stride_h) * + (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) / + (2 * (args.kernel.stride_h))) - + (args.image.pad_height); + cmd_init_raddr_flag = 0; + cmd_init_raddr_index = + 0 - (int64_t)row_len_align * (int64_t)(args.kernel.stride_h) * + (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) / + (2 * (args.kernel.stride_h))); + cmd_init_raddr_col_0 = + 0 - + (int64_t)row_len_align * + ((args.image.pad_height) / (2 * (args.kernel.stride_h))) - + (int64_t)row_len_align * + (2 * (args.kernel.stride_h) * + (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) / + (2 * (args.kernel.stride_h))) - + (args.image.pad_height)); + cmd_init_raddr_col_1 = cmd_init_raddr_col_0; + } + + if (cmd_ifm_batch_num_mns1 == 0) { + if ((args.kernel.height) <= (args.kernel.stride_h)) { + conv_ofm_buf_col_len = (args.image.height) + 2 * (args.image.pad_height) - + 3 * (args.kernel.stride_h); + } else { + conv_ofm_buf_col_len = (args.image.height) + 2 * 
(args.image.pad_height) - + 2 * (args.kernel.stride_h) - (args.kernel.height); + } + conv_ofm_buf_col_len_rem = conv_ofm_buf_col_len; + } else { + int N_rem = 0; + int row_rem = 0; + + if ((args.kernel.height) <= (args.kernel.stride_h)) { + conv_ofm_buf_col_len = cmd_ifm_buf_col_len - 3 * (args.kernel.stride_h); + N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) / + (args.kernel.stride_h) + + 1; + row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem; + conv_ofm_buf_col_len_rem = cmd_ifm_buf_col_len_rem + + 2 * (args.image.pad_height) + row_rem - + 3 * (args.kernel.stride_h); + } else { + conv_ofm_buf_col_len = cmd_ifm_buf_col_len + 2 * (args.image.pad_height) - + 2 * (args.kernel.stride_h) - (args.kernel.height); + N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) / + (args.kernel.stride_h) + + 1; + row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem; + conv_ofm_buf_col_len_rem = + cmd_ifm_buf_col_len_rem + (args.image.pad_height) + row_rem - + 2 * (args.kernel.stride_h) - (args.kernel.height); + } + } + //----------------------- para functions -------------------------------- + float filter_quant_scale_tmp = ((args.filter_scale_address)[1]); + float image_quant_scale_tmp = ((args.image.scale_address)[1]); + + uint32_t cmd_filter_quant_scale = + *(uint32_t *)(&filter_quant_scale_tmp); // NOLINT + uint32_t cmd_image_quant_scale = + *(uint32_t *)(&image_quant_scale_tmp); // NOLINT + + uint64_t wParallelsim = cmd_flt_N_impl >> 3; + uint64_t wParallelsim_num = + (uint64_t)(((args.filter_num) + cmd_flt_N_impl - 1) / cmd_flt_N_impl) - 1; + uint64_t win_size = (args.kernel.width) * (args.kernel.height) * + (cmd_ifm_pack_num_per_row_mns1 + 1) - + 1; + uint64_t conv_ofm_width = (((args.image.width) - (args.kernel.width) + + (args.image.pad_width) + (args.image.pad_width)) / + (args.kernel.stride_w)); + uint64_t conv_ofm_dma_length = cmd_flt_N_impl * sizeof(short); // NOLINT + uint64_t conv_ofm_dma_stride = args.filter_num * sizeof(short); // 
NOLINT + uint64_t conv_ofm_height_batch_tmp = + get_image_out_axis(args.image.height, args.image.pad_height, + args.kernel.height, args.kernel.stride_h); + uint64_t conv_ofm_height_batch = (conv_ofm_height_batch_tmp + 1) / 2 - 1; + uint64_t o_ust_rst = 0; + uint64_t conv_ofm_dma_repeat = + (uint64_t)(((((args.image.width) - (args.kernel.width) + + (args.image.pad_width) + (args.image.pad_width))) / + (args.kernel.stride_w)) + + 1); + uint64_t conv_ofm_dma_offset = + args.filter_num * conv_ofm_dma_repeat * sizeof(short); // NOLINT + uint64_t conv_ofm_inter_stride = conv_ofm_dma_offset * 2; + //----------------- register contation ------------------ + uint64_t cmd_ifm_flt_base_addr = ((uint64_t)cmd_filter_vir_base_addr << 32) | + ((uint64_t)cmd_image_vir_base_addr); + uint64_t cmd_ifm_flt_dim = ((uint64_t)(args.kernel.height) << 48) | + ((uint64_t)(args.kernel.width) << 32) | + ((uint64_t)(args.image.height) << 16) | + ((uint64_t)(args.image.width)); + uint64_t cmd_pad_step_size = ((uint64_t)(args.kernel.stride_h) << 48) | + ((uint64_t)(args.kernel.stride_w) << 32) | + ((uint64_t)(args.image.pad_height) << 16) | + ((uint64_t)(args.image.pad_width)); + uint64_t cmd_param1 = ((uint64_t)cmd_filter_per_group << 48) | + ((uint64_t)cmd_channel_num << 32) | + ((uint64_t)filter_num << 16) | + ((uint64_t)cmd_group_num); + uint64_t cmd_param2 = + ((uint64_t)cmd_flt_sqr_len << 48) | ((uint64_t)cmd_ifm_pack_len << 32) | + ((uint64_t)cmd_ifm_pre_row_num << 16) | ((uint64_t)cmd_channel_per_group); + uint64_t cmd_param3 = ((uint64_t)cmd_flt_batch_num_mns1 << 48) | + ((uint64_t)cmd_flt_total_batch_num << 32) | + ((uint64_t)cmd_flt_N_impl << 16) | + ((uint64_t)cmd_flt_pre_batch_num); + uint64_t cmd_param4 = ((uint64_t)cmd_ifm_pack_num_per_row_mns1 << 48) | + ((uint64_t)cmd_bn_num << 32) | + ((uint64_t)cmd_bias_num << 16) | + ((uint64_t)cmd_flt_N_len); + uint64_t cmd_param5 = ((uint64_t)cmd_ifm_stride_row_length << 48) | + ((uint64_t)cmd_flt_pack_length << 32) | + 
((uint64_t)cmd_flt_cycle_num_mns1 << 16) | + ((uint64_t)cmd_flt_pack_num_per_kernel_mns1); + uint64_t cmd_param6 = ((uint64_t)cmd_ofm_width_mns1 << 48) | + ((uint64_t)cmd_ifm_batch_num_mns1 << 32) | + ((uint64_t)cmd_ifm_buf_col_len << 16) | + ((uint64_t)cmd_ifm_C_impl); + uint64_t cmd_param7 = ((uint64_t)conv_ofm_inter_stride << 32) | + ((uint64_t)cmd_ifm_buf_col_len_rem << 16) | + ((uint64_t)cmd_ofm_height); + uint64_t cmd_param8 = + ((uint64_t)cmd_flt_length << 32) | ((uint64_t)cmd_ifm_row_byte_length); + uint64_t cmd_ifm_flt_quant_scale = + (((uint64_t)cmd_filter_quant_scale) << 32) | + ((uint64_t)cmd_image_quant_scale); + uint64_t cmd_step_pad_mul_row_len = + ((uint64_t)cmd_pad_h_mul_row_byte_len << 32) | + ((uint64_t)cmd_step_h_mul_row_byte_len); + //---- ofm paras ---- + uint64_t cmd_conv_param_reg = ((uint64_t)wParallelsim_num << 32) | + ((uint64_t)wParallelsim << 16) | + ((uint64_t)win_size); + uint64_t cmd_ofm_addr_width_reg = + ((uint64_t)conv_ofm_width << 32) | ((uint64_t)conv_ofm_addr_base); + uint64_t cmd_intra_stride_atoms_reg = + ((uint64_t)conv_ofm_dma_length << 32) | ((uint64_t)conv_ofm_dma_stride); + uint64_t cmd_ofm_height_batch_reg = + ((uint64_t)conv_ofm_buf_col_len_rem << 48) | + ((uint64_t)conv_ofm_buf_col_len << 32) | + ((uint64_t)conv_ofm_height_batch + 0x80000000); + uint64_t cmd_user_ctrl_reg = ((uint64_t)o_ust_rst); + uint64_t cmd_wdma_param_reg = + ((uint64_t)(conv_ofm_dma_repeat | 0x80000000) << 32) | + ((uint64_t)conv_ofm_dma_offset); + + uint64_t cmd_init_raddr_reg = ((cmd_init_raddr_col_1 & 0xffff) << 48) | + ((cmd_init_raddr_col_0 & 0xffff) << 32) | + (((cmd_init_raddr_index & 0xffff) << 16)) | + (cmd_init_raddr_flag & 0xffff) << 15 | + ((cmd_init_raddr_cnt & 0xffff)); + + uint64_t cmd_para31 = (cmd_para31 & 0x1) | args.relu_enabled; + + DLOG << "cmd_init_raddr_col_1 = " << hex << cmd_init_raddr_col_1; + + DLOG << "cmd_init_raddr_col_0 = " << hex << cmd_init_raddr_col_0; + DLOG << "cmd_init_raddr_index = " << hex << 
cmd_init_raddr_index; // + DLOG << "cmd_init_raddr_cnt = " << hex << cmd_init_raddr_cnt; + DLOG << "conv_ofm_height_batch = " << conv_ofm_height_batch; + + DLOG << "cmd_ifm_flt_base_addr = " << hex << cmd_ifm_flt_base_addr; + DLOG << "cmd_scale_base_addr = " << hex << cmd_scale_base_addr; + DLOG << "cmd_ifm_flt_dim = " << hex << cmd_ifm_flt_dim; + DLOG << "cmd_pad_step_size = " << hex << cmd_pad_step_size; + DLOG << "cmd_param1 = " << hex << cmd_param1; + DLOG << "cmd_param2 = " << hex << cmd_param2; + DLOG << "cmd_param3 = " << hex << cmd_param3; + DLOG << "cmd_param4 = " << hex << cmd_param4; + DLOG << "cmd_param5 = " << hex << cmd_param5; + DLOG << "cmd_param6 = " << hex << cmd_param6; + DLOG << "cmd_param7 = " << hex << cmd_param7; + DLOG << "cmd_param8 = " << hex << cmd_param8; + DLOG << "cmd_ifm_flt_quant_scale = " << hex << cmd_ifm_flt_quant_scale; + DLOG << "cmd_step_pad_mul_row_len = " << hex << cmd_step_pad_mul_row_len; + DLOG << "cmd_ifm_pack_byte_length = " << hex << cmd_ifm_pack_byte_length; + DLOG << "cmd_conv_param_reg = " << hex << cmd_conv_param_reg; + DLOG << "cmd_ofm_addr_width_reg = " << hex << cmd_ofm_addr_width_reg; + DLOG << "cmd_intra_stride_atoms_reg = " << hex << cmd_intra_stride_atoms_reg; + DLOG << "cmd_init_raddr_reg = " << hex << cmd_init_raddr_reg; + DLOG << "cmd_ofm_height_batch_reg = " << hex << cmd_ofm_height_batch_reg; + DLOG << "cmd_wdma_param_reg = " << hex << cmd_wdma_param_reg; + DLOG << "cmd_para31 = " << hex << cmd_para31; + + reg_writeq(cmd_ifm_flt_base_addr, MUL8(1)); + reg_writeq(cmd_scale_base_addr, MUL8(2)); + reg_writeq(cmd_ifm_flt_dim, MUL8(3)); + reg_writeq(cmd_pad_step_size, MUL8(4)); + reg_writeq(cmd_param1, MUL8(5)); + reg_writeq(cmd_param2, MUL8(6)); + reg_writeq(cmd_param3, MUL8(7)); + reg_writeq(cmd_param4, MUL8(8)); + reg_writeq(cmd_param5, MUL8(9)); + reg_writeq(cmd_param6, MUL8(10)); + reg_writeq(cmd_param7, MUL8(11)); + reg_writeq(cmd_param8, MUL8(12)); + reg_writeq(cmd_ifm_flt_quant_scale, MUL8(13)); + 
reg_writeq(cmd_step_pad_mul_row_len, MUL8(14)); + reg_writeq(cmd_ifm_pack_byte_length, MUL8(15)); + reg_writeq(cmd_conv_param_reg, MUL8(16)); + reg_writeq(cmd_ofm_addr_width_reg, MUL8(17)); + reg_writeq(cmd_intra_stride_atoms_reg, MUL8(18)); + + reg_writeq(cmd_init_raddr_reg, MUL8(29)); + reg_writeq(cmd_para31, MUL8(31)); + + reg_writeq(0, MUL8(19)); + reg_writeq(cmd_ofm_height_batch_reg, MUL8(19)); + reg_writeq(cmd_ofm_height_batch_reg & 0xffffffff00000000, MUL8(19)); + + reg_writeq(cmd_wdma_param_reg, MUL8(25)); + + reg_writeq(0, MUL8(0)); + reg_writeq(0x4000000000000000, MUL8(0)); + + ret = fpga_regpoll(MUL8(48), CONV_DONE, 0xffffff); + if (ret == -1) { + DLOG << "fpga conv no interrupt!!"; + return ret; + } + reg_readq(MUL8(63)); + + usleep(10); + float scale = Findfp16Max(); + (args.output.scale_address)[0] = scale; // NOLINT + (args.output.scale_address)[1] = (float)(1.0 / scale); // NOLINT + DLOG << "Findfp16Max scale = " << scale; + DLOG << "ret=" << ret; + return ret; } int ComputeFpgaPool(const struct PoolingArgs &args) { @@ -97,7 +577,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { DLOG << " mode:" << args.mode << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address << " image_channels:" << args.image.channels << " image_height:" << args.image.height << " image_width:" << args.image.width @@ -107,13 +586,467 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { << " kernel_width:" << args.kernel.width << " stride_h:" << args.kernel.stride_h << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; + DLOG << " out_address:" << args.output.address; #endif #ifndef PADDLE_MOBILE_ZU5 return 0; #endif - return 0; + + uint32_t filter_num_align = 0; + filter_num_align = args.image.channels; + + DLOG << "______db_______: begin to set registers. 
"; + uint64_t ifm_pixel_num = + ((args.image.width) * (args.image.height) * args.image.channels); + uint64_t ifm_memory_size = ifm_pixel_num * sizeof(short); // NOLINT + uint64_t flt_pixel_num = 0; + uint64_t filter_memory_size = 0; + //!! ??? + uint64_t bn_pixel_num = (filter_num_align * 2); + uint64_t bn_memory_size = bn_pixel_num * sizeof(uint16_t); + + uint64_t ofm_width = + ((args.image.width) + 2 * args.image.pad_width - args.kernel.width) / + (args.kernel.stride_w) + + 1; + uint64_t ofm_height = ((args.image.height) + 2 * (args.image.pad_height) - + (args.kernel.height)) / + (args.kernel.stride_h) + + 1; + + uint32_t filter_num = filter_num_align; + uint32_t image_channels = args.image.channels; + + uint64_t ifm_src_paddr = vaddr_to_paddr((args.image.address)); + uint64_t flt_src_paddr = 0; + uint64_t sb_src_paddr = 0; + uint64_t ifm_dst_paddr = vaddr_to_paddr((args.output.address)); + + /**********BN******************/ + float image_inv_scale = 0; + float filter_inv_scale = 0; + int idx = 0; + DLOG << "______db_______: reset registers. 
"; + reg_writeq(1, MUL8(24)); + usleep(1); + reg_writeq(0, MUL8(24)); + /*********configuring registers*************/ + uint32_t cmd_image_vir_base_addr = (uint32_t)ifm_src_paddr; + uint32_t cmd_filter_vir_base_addr = (uint32_t)flt_src_paddr; + uint32_t cmd_scale_base_addr = (uint32_t)sb_src_paddr; + uint32_t conv_ofm_addr_base = (uint32_t)ifm_dst_paddr; + uint64_t cmd_group_num = 1; // args.group_num; + uint64_t cmd_filter_per_group = filter_num / cmd_group_num; + + uint64_t cmd_flt_sqr_len = (args.kernel.width) * (args.kernel.height); + uint64_t cmd_ifm_pre_row_num = args.kernel.height; + if ((args.kernel.height == args.image.height) && + (0 == args.image.pad_height)) { + cmd_ifm_pre_row_num = (args.kernel.height); + } else { + cmd_ifm_pre_row_num = + (args.kernel.height) - (args.image.pad_height) + (args.kernel.stride_h); + } + uint64_t cmd_flt_pre_batch_num = 1; + uint64_t cmd_ifm_pack_num_per_row_mns1 = + (uint64_t)(((args.image.channels) + 63) / 64) - 1; + uint64_t cmd_bn_num = filter_num; + uint64_t cmd_bias_num = filter_num; + uint64_t cmd_ifm_stride_row_length = args.image.width * args.kernel.stride_h; + uint64_t cmd_flt_pack_num_per_kernel_mns1 = + (uint64_t)(((args.image.channels) + 63) / 64) - 1; + uint64_t cmd_ofm_width_mns1 = (uint64_t)( + ((args.image.width) - (args.kernel.width) + 2 * (args.image.pad_width)) / + (args.kernel.stride_w)); + uint64_t cmd_ofm_height = + (uint64_t)(((args.image.height) - (args.kernel.height) + + 2 * (args.image.pad_height)) / + (args.kernel.stride_h)) + + 1; + + uint64_t cmd_channel_num = 0; + uint64_t cmd_ifm_pack_len = 0; + uint64_t cmd_channel_per_group = 0; + uint64_t cmd_flt_batch_num_mns1 = 0; + uint64_t cmd_flt_N_impl = 8; + uint64_t cmd_ifm_C_impl = 16; + uint64_t cmd_flt_pack_length = 0; + uint64_t cmd_step_h_mul_row_byte_len = 0; + uint64_t cmd_pad_h_mul_row_byte_len = 0; + uint64_t cmd_ifm_pack_byte_length = 16 * ((((args.image.width) + 7) / 8) * 8); + uint64_t row_len_align = args.image.width; + uint64_t 
cmd_flt_cycle_num_mns1 = 0; + if (image_channels > 32) { + cmd_channel_num = (uint64_t)((((args.image.channels) + 63)) / 64) * 64; + cmd_ifm_pack_len = 64 * (args.image.width); + cmd_channel_per_group = 64; + cmd_flt_batch_num_mns1 = (uint64_t)(((filter_num + 7)) / 8 - 1); + cmd_flt_N_impl = 8; + cmd_ifm_C_impl = 64; + cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 64; + cmd_step_h_mul_row_byte_len = + (args.kernel.stride_h) * cmd_channel_num * args.image.width; + cmd_pad_h_mul_row_byte_len = + (args.image.pad_height) * cmd_channel_num * args.image.width; + cmd_ifm_pack_byte_length = 64 * args.image.width; + row_len_align = args.image.width * (cmd_ifm_pack_num_per_row_mns1 + 1); + cmd_flt_cycle_num_mns1 = (cmd_channel_num / 64) - 1; + } else if (image_channels > 16) { + cmd_channel_num = 32; + cmd_ifm_pack_len = 32 * (args.image.width); + cmd_channel_per_group = 32; + cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1); + cmd_flt_N_impl = 16; + cmd_ifm_C_impl = 32; + cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 32; + cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num * + ((((args.image.width) + 1)) / 2) * 2; + cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * cmd_channel_num * + ((((args.image.width) + 1)) / 2) * 2; + cmd_ifm_pack_byte_length = + 32 * (uint64_t)((((args.image.width) + 1)) / 2) * 2; + row_len_align = (uint64_t)((((args.image.width) + 1)) / 2); + cmd_flt_cycle_num_mns1 = 0; + } else if (image_channels > 8) { + cmd_channel_num = 16; + cmd_ifm_pack_len = 16 * (args.image.width); + cmd_channel_per_group = 16; + cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1); + cmd_flt_N_impl = 32; + cmd_ifm_C_impl = 16; + cmd_flt_pack_length = (args.kernel.width) * (args.kernel.height) * 16; + cmd_step_h_mul_row_byte_len = (args.kernel.stride_h) * cmd_channel_num * + ((((args.image.width) + 3)) / 4) * 4; + cmd_pad_h_mul_row_byte_len = (args.image.pad_height) * 
cmd_channel_num * + ((((args.image.width) + 3)) / 4) * 4; + cmd_ifm_pack_byte_length = + 16 * (uint64_t)((((args.image.width) + 3)) / 4) * 4; + row_len_align = (uint64_t)((((args.image.width) + 3)) / 4); + cmd_flt_cycle_num_mns1 = 0; + } + + cmd_flt_N_impl = 16; + cmd_flt_batch_num_mns1 = 0; + cmd_flt_pack_length = 64; + uint64_t cmd_flt_N_len = 0; + uint64_t cmd_flt_length = 64; + + uint64_t cmd_ifm_row_byte_length = cmd_channel_num * (args.image.width); + + uint64_t cmd_ifm_buf_col_len = 0; + + uint64_t ifm_one_batch_len = + (1048576 / ((args.image.width) * cmd_channel_num)); + uint64_t cmd_ifm_batch_num_tmp = (uint64_t)( + ((args.image.height) + ifm_one_batch_len - 1) / ifm_one_batch_len); + if (1 == cmd_ifm_batch_num_tmp) { + cmd_ifm_buf_col_len = args.image.height; + } else { + if (((args.image.height) / (cmd_ifm_batch_num_tmp) % 2) == 0) { + cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp; + } else { + cmd_ifm_buf_col_len = (args.image.height) / cmd_ifm_batch_num_tmp - 1; + } + } + uint64_t cmd_ifm_batch_num_mns1 = + (((args.image.height) + cmd_ifm_buf_col_len - 1) / cmd_ifm_buf_col_len) - + 1; + + uint64_t cmd_flt_total_batch_num = 1; + uint64_t cmd_ifm_buf_col_len_rem = + (args.image.height) - + cmd_ifm_batch_num_mns1 * cmd_ifm_buf_col_len; //= -4; + + //-------- ofm batch number reg && initial URAM reading address + uint64_t cmd_init_raddr_cnt = 1; + uint64_t cmd_init_raddr_flag = 0; + int64_t cmd_init_raddr_index = -8; + int64_t cmd_init_raddr_col_0 = -4; + int64_t cmd_init_raddr_col_1 = -4; + int64_t conv_ofm_buf_col_len = 0; + int64_t conv_ofm_buf_col_len_rem = 0; + + if (((args.image.pad_height) % (2 * (args.kernel.stride_h))) == 0) { + cmd_init_raddr_cnt = 0; + cmd_init_raddr_flag = 0; + cmd_init_raddr_index = + 0 - (int64_t)row_len_align * (((args.image.pad_height) + 1) / 2); + cmd_init_raddr_col_0 = cmd_init_raddr_index; + cmd_init_raddr_col_1 = cmd_init_raddr_index; + } else if (((args.image.pad_height) - + 2 * 
((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <= + (args.kernel.stride_h)) { + cmd_init_raddr_cnt = + (args.kernel.stride_h) - + ((args.image.pad_height) - + ((args.image.pad_height) / (2 * (args.kernel.stride_h)))); + cmd_init_raddr_flag = 1; + cmd_init_raddr_index = + 0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) - + (int64_t)row_len_align * + ((args.image.pad_height) / (2 * args.kernel.stride_h)); + cmd_init_raddr_col_0 = + 0 - (int64_t)row_len_align * (int64_t)(args.image.pad_height) - + (int64_t)row_len_align * + ((args.image.pad_height) / (2 * (args.kernel.stride_h))); + cmd_init_raddr_col_1 = + cmd_init_raddr_col_0 + args.kernel.stride_h * (int64_t)row_len_align; + } else if (((args.image.pad_height) - + 2 * ((args.image.pad_height) / (2 * (args.kernel.stride_h)))) <= + 2 * (args.kernel.stride_h)) { + cmd_init_raddr_cnt = + 2 * (args.kernel.stride_h) * + (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) / + (2 * (args.kernel.stride_h))) - + (args.image.pad_height); + cmd_init_raddr_flag = 0; + cmd_init_raddr_index = + 0 - (int64_t)row_len_align * (int64_t)(args.kernel.stride_h) * + (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) / + (2 * (args.kernel.stride_h))); + cmd_init_raddr_col_0 = + 0 - + (int64_t)row_len_align * + ((args.image.pad_height) / (2 * (args.kernel.stride_h))) - + (int64_t)row_len_align * + (2 * (args.kernel.stride_h) * + (((args.image.pad_height) + 2 * (args.kernel.stride_h) - 1) / + (2 * (args.kernel.stride_h))) - + (args.image.pad_height)); + cmd_init_raddr_col_1 = cmd_init_raddr_col_0; + } + + if (cmd_ifm_batch_num_mns1 == 0) { + if ((args.kernel.height) <= (args.kernel.stride_h)) { + conv_ofm_buf_col_len = (args.image.height) + 2 * (args.image.pad_height) - + 3 * (args.kernel.stride_h); + } else { + conv_ofm_buf_col_len = (args.image.height) + 2 * (args.image.pad_height) - + 2 * (args.kernel.stride_h) - (args.kernel.height); + } + conv_ofm_buf_col_len_rem = conv_ofm_buf_col_len; + } 
else { + int N_rem = 0; + int row_rem = 0; + + if ((args.kernel.height) <= (args.kernel.stride_h)) { + conv_ofm_buf_col_len = cmd_ifm_buf_col_len - 3 * (args.kernel.stride_h); + N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) / + (args.kernel.stride_h) + + 1; + row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem; + conv_ofm_buf_col_len_rem = cmd_ifm_buf_col_len_rem + + 2 * (args.image.pad_height) + row_rem - + 3 * (args.kernel.stride_h); + } else { + conv_ofm_buf_col_len = cmd_ifm_buf_col_len + 2 * (args.image.pad_height) - + 2 * (args.kernel.stride_h) - (args.kernel.height); + N_rem = (cmd_ifm_buf_col_len - (args.kernel.height)) / + (args.kernel.stride_h) + + 1; + row_rem = cmd_ifm_buf_col_len - (args.kernel.stride_h) * N_rem; + conv_ofm_buf_col_len_rem = + cmd_ifm_buf_col_len_rem + (args.image.pad_height) + row_rem - + 2 * (args.kernel.stride_h) - (args.kernel.height); + } + } + + //----------------------- para functions -------------------------------- + uint64_t cmd_filter_quant_scale = 0x3c00; + uint64_t cmd_image_quant_scale = 0x3c00; + uint64_t wParallelsim = cmd_ifm_C_impl >> 3; + uint64_t wParallelsim_num = cmd_flt_cycle_num_mns1; + uint64_t win_size = (args.kernel.width) * (args.kernel.height) * + (cmd_ifm_pack_num_per_row_mns1 + 1) - + 1; // + uint64_t conv_ofm_width = (((args.image.width) - (args.kernel.width) + + (args.image.pad_width) + (args.image.pad_width)) / + (args.kernel.stride_w)); + uint64_t conv_ofm_dma_length = cmd_channel_num * sizeof(short); // NOLINT + uint64_t conv_ofm_dma_stride = conv_ofm_dma_length; + uint64_t conv_ofm_height_batch_tmp = + (args.image.height + 2 * args.image.pad_height - args.kernel.height) / + args.kernel.stride_h + + 1; + + uint64_t conv_ofm_height_batch = (conv_ofm_height_batch_tmp + 1) / 2 - 1; + uint64_t o_ust_rst = 0; + uint64_t conv_ofm_dma_repeat = + (uint64_t)(((((args.image.width) - (args.kernel.width) + + (args.image.pad_width) + (args.image.pad_width))) / + (args.kernel.stride_w)) + + 1); + 
uint64_t conv_ofm_dma_offset = + args.image.channels * conv_ofm_dma_repeat * sizeof(short); // NOLINT + uint64_t conv_ofm_inter_stride = conv_ofm_dma_offset * 2; + //----------------- register contation ------------------ + uint64_t cmd_ifm_flt_base_addr = ((uint64_t)cmd_filter_vir_base_addr << 32) | + ((uint64_t)cmd_image_vir_base_addr); + uint64_t cmd_ifm_flt_dim = ((uint64_t)(args.kernel.height) << 48) | + ((uint64_t)(args.kernel.width) << 32) | + ((uint64_t)(args.image.height) << 16) | + ((uint64_t)(args.image.width)); + uint64_t cmd_pad_step_size = ((uint64_t)(args.kernel.stride_h) << 48) | + ((uint64_t)(args.kernel.stride_w) << 32) | + ((uint64_t)(args.image.pad_height) << 16) | + ((uint64_t)(args.image.pad_width)); + uint64_t cmd_param1 = ((uint64_t)cmd_filter_per_group << 48) | + ((uint64_t)cmd_channel_num << 32) | + ((uint64_t)filter_num << 16) | + ((uint64_t)cmd_group_num); + uint64_t cmd_param2 = + ((uint64_t)cmd_flt_sqr_len << 48) | ((uint64_t)cmd_ifm_pack_len << 32) | + ((uint64_t)cmd_ifm_pre_row_num << 16) | ((uint64_t)cmd_channel_per_group); + uint64_t cmd_param3 = ((uint64_t)cmd_flt_batch_num_mns1 << 48) | + ((uint64_t)cmd_flt_total_batch_num << 32) | + ((uint64_t)cmd_flt_N_impl << 16) | + ((uint64_t)cmd_flt_pre_batch_num); + uint64_t cmd_param4 = ((uint64_t)cmd_ifm_pack_num_per_row_mns1 << 48) | + ((uint64_t)cmd_bn_num << 32) | + ((uint64_t)cmd_bias_num << 16) | + ((uint64_t)cmd_flt_N_len); + uint64_t cmd_param5 = ((uint64_t)cmd_ifm_stride_row_length << 48) | + ((uint64_t)cmd_flt_pack_length << 32) | + ((uint64_t)cmd_flt_cycle_num_mns1 << 16) | + ((uint64_t)cmd_flt_pack_num_per_kernel_mns1); + uint64_t cmd_param6 = ((uint64_t)cmd_ofm_width_mns1 << 48) | + ((uint64_t)cmd_ifm_batch_num_mns1 << 32) | + ((uint64_t)cmd_ifm_buf_col_len << 16) | + ((uint64_t)cmd_ifm_C_impl); + uint64_t cmd_param7 = ((uint64_t)conv_ofm_inter_stride << 32) | + ((uint64_t)cmd_ifm_buf_col_len_rem << 16) | + ((uint64_t)cmd_ofm_height); + uint64_t cmd_param8 = + 
((uint64_t)cmd_flt_length << 32) | ((uint64_t)cmd_ifm_row_byte_length); + uint64_t cmd_ifm_flt_quant_scale = ((uint64_t)cmd_filter_quant_scale << 32) | + ((uint64_t)cmd_image_quant_scale); + uint64_t cmd_step_pad_mul_row_len = + ((uint64_t)cmd_pad_h_mul_row_byte_len << 32) | + ((uint64_t)cmd_step_h_mul_row_byte_len); + //---- ofm paras ---- + uint64_t cmd_conv_param_reg = ((uint64_t)wParallelsim_num << 32) | + ((uint64_t)wParallelsim << 16) | + ((uint64_t)win_size); + uint64_t cmd_ofm_addr_width_reg = + ((uint64_t)conv_ofm_width << 32) | ((uint64_t)conv_ofm_addr_base); + uint64_t cmd_intra_stride_atoms_reg = + ((uint64_t)conv_ofm_dma_length << 32) | ((uint64_t)conv_ofm_dma_stride); + uint64_t cmd_ofm_height_batch_reg = + ((uint64_t)(conv_ofm_buf_col_len_rem & 0xffff) << 48) | + ((uint64_t)(conv_ofm_buf_col_len & 0xffff) << 32) | + ((uint64_t)conv_ofm_height_batch + 0x80000000); + uint64_t cmd_user_ctrl_reg = ((uint64_t)o_ust_rst); + uint64_t cmd_wdma_param_reg = + ((uint64_t)(conv_ofm_dma_repeat | 0x80000000) << 32) | + ((uint64_t)conv_ofm_dma_offset); + uint64_t cmd_init_raddr_reg = ((cmd_init_raddr_col_1 & 0xffff) << 48) | + ((cmd_init_raddr_col_0 & 0xffff) << 32) | + (((cmd_init_raddr_index & 0xffff) << 16)) | + (cmd_init_raddr_flag & 0xffff) << 15 | + ((cmd_init_raddr_cnt & 0xffff)); + + DLOG << "cmd_init_raddr_col_1 = " << hex << cmd_init_raddr_col_1; + + DLOG << "cmd_init_raddr_col_0 = " << hex << cmd_init_raddr_col_0; + DLOG << "cmd_init_raddr_index = " << hex << cmd_init_raddr_index; // + DLOG << "cmd_init_raddr_cnt = " << hex << cmd_init_raddr_cnt; + DLOG << "conv_ofm_buf_col_len = " << hex << conv_ofm_buf_col_len; + DLOG << "conv_ofm_buf_col_len_rem = " << hex << conv_ofm_buf_col_len_rem; + DLOG << "cmd_ifm_flt_base_addr = " << hex << cmd_ifm_flt_base_addr; + DLOG << "cmd_scale_base_addr = " << hex << cmd_scale_base_addr; + DLOG << "cmd_ifm_flt_dim = " << hex << cmd_ifm_flt_dim; + DLOG << "cmd_pad_step_size = " << hex << cmd_pad_step_size; + DLOG << 
"cmd_param1 = " << hex << cmd_param1; + DLOG << "cmd_param2 = " << hex << cmd_param2; + DLOG << "cmd_param3 = " << hex << cmd_param3; + DLOG << "cmd_param4 = " << hex << cmd_param4; + DLOG << "cmd_param5 = " << hex << cmd_param5; + DLOG << "cmd_param6 = " << hex << cmd_param6; + DLOG << "cmd_param7 = " << hex << cmd_param7; + DLOG << "cmd_param8 = " << hex << cmd_param8; + DLOG << "cmd_ifm_flt_quant_scale = " << hex << cmd_ifm_flt_quant_scale; + DLOG << "cmd_step_pad_mul_row_len = " << hex << cmd_step_pad_mul_row_len; + DLOG << "cmd_ifm_pack_byte_length = " << hex << cmd_ifm_pack_byte_length; + DLOG << "cmd_conv_param_reg = " << hex << cmd_conv_param_reg; + DLOG << "cmd_ofm_addr_width_reg = " << hex << cmd_ofm_addr_width_reg; + DLOG << "cmd_intra_stride_atoms_reg = " << hex << cmd_intra_stride_atoms_reg; + DLOG << "cmd_init_raddr_reg = " << hex << cmd_init_raddr_reg; + DLOG << "cmd_ofm_height_batch_reg = " << hex << cmd_ofm_height_batch_reg; + DLOG << "cmd_wdma_param_reg = " << hex << cmd_wdma_param_reg; + DLOG << "pooling_mode = " << hex << args.mode; + + reg_writeq(cmd_ifm_flt_base_addr, MUL8(1)); + reg_writeq(cmd_scale_base_addr, MUL8(2)); + reg_writeq(cmd_ifm_flt_dim, MUL8(3)); + reg_writeq(cmd_pad_step_size, MUL8(4)); + reg_writeq(cmd_param1, MUL8(5)); + reg_writeq(cmd_param2, MUL8(6)); + reg_writeq(cmd_param3, MUL8(7)); + reg_writeq(cmd_param4, MUL8(8)); + reg_writeq(cmd_param5, MUL8(9)); + reg_writeq(cmd_param6, MUL8(10)); + reg_writeq(cmd_param7, MUL8(11)); + reg_writeq(cmd_param8, MUL8(12)); + reg_writeq(cmd_ifm_flt_quant_scale, MUL8(13)); + reg_writeq(cmd_step_pad_mul_row_len, MUL8(14)); + reg_writeq(cmd_ifm_pack_byte_length, MUL8(15)); + reg_writeq(cmd_conv_param_reg, MUL8(16)); + reg_writeq(cmd_ofm_addr_width_reg, MUL8(17)); + reg_writeq(cmd_intra_stride_atoms_reg, MUL8(18)); + + reg_writeq(cmd_init_raddr_reg, MUL8(29)); + + reg_writeq(0, MUL8(19)); + reg_writeq(cmd_ofm_height_batch_reg, MUL8(19)); + reg_writeq(cmd_ofm_height_batch_reg & 
0xffffffff00000000, MUL8(19)); + + reg_writeq(cmd_wdma_param_reg, MUL8(25)); + + /******************************************************************/ + uint64_t cmd_mult_factor = ((uint64_t)args.kernel_reciprocal) | + ((uint64_t)args.kernel_reciprocal << 16); + reg_writeq(cmd_mult_factor, MUL8(30)); + /******************************************************************/ + + reg_writeq(0, MUL8(0)); + if (args.mode == 0) { // max pooling + reg_writeq(0x2200000000000000, MUL8(0)); + } else { // average pooling + reg_writeq(0x2400000000000000, MUL8(0)); + } + int ret = -1; + ret = fpga_regpoll(MUL8(48), CONV_DONE, 0x00ffff); + if (ret == -1) { + DLOG << "fpga pooling no interrupt!!"; + return ret; + } + reg_readq(MUL8(63)); + usleep(10); + // get max value + float scale = Findfp16Max(); + (args.output.scale_address)[0] = scale; // NOLINT + (args.output.scale_address)[1] = (float)(1.0 / scale); // NOLINT + DLOG << "Findfp16Max scale = " << scale; + DLOG << "ret=" << ret; + return ret; +} + +// Computes row_size = pad_channel * width, where pad_channel is `channel` +// padded by range: a multiple of 128 when channel > 64, 64 for 33..64, +// 32 for 17..32, and ((channel + 15) / 16) * 16 for channel <= 16 (i.e. 16 +// for 1..16). The result is an element count; no element size is applied here. +int get_ofm_batch_size(int width, int channel) { + int pad_channel, row_size; + + // The four ranges below cover every int value, so pad_channel is always assigned. + if (64 < channel) { + pad_channel = (int)((channel + 127) / 128) * 128; // NOLINT + } else if (32 < channel && channel <= 64) { + pad_channel = ((channel + 63) / (64)) * 64; + } else if (16 < channel && channel <= 32) { + pad_channel = ((channel + 31) / (32)) * 32; + } else if (channel <= 16) { + pad_channel = ((channel + 15) / (16)) * 16; + } + + row_size = pad_channel * width; + + return row_size; } int ComputeFpgaEWAdd(const struct EWAddArgs &args) { @@ -123,26 +1056,525 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { << " const0:" << fp16_2_fp32(int16_t(args.const0)) << " const1:" << fp16_2_fp32(int16_t(args.const1)); DLOG << " image0_address:" << args.image0.address - << " image0_scale_address:" << args.image0.scale_address << " image0_channels:" << args.image0.channels << " image0_height:" << args.image0.height - << " image0_width:" << args.image0.width - << " pad0_height:" <<
args.image0.pad_height - << " pad0_width:" << args.image0.pad_width; + << " image0_width:" << args.image0.width; DLOG << " image1_address:" << args.image1.address - << " image1_scale_address:" << args.image1.scale_address << " image1_channels:" << args.image1.channels << " image1_height:" << args.image1.height - << " image1_width:" << args.image1.width - << " pad1_height:" << args.image1.pad_height - << " pad_width:" << args.image1.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; + << " image1_width:" << args.image1.width; + DLOG << " out_address:" << args.output.address; #endif #ifndef PADDLE_MOBILE_ZU5 return 0; #endif - return 0; + uint32_t filter_num_align = args.image0.channels; + + uint32_t const_kernel_width_1 = 1; + uint32_t const_stride_width_1 = 1; + uint32_t const_kernel_height_2 = 2; + uint32_t const_stride_height_2 = 2; + uint32_t const_pad_height_0 = 0; + uint32_t const_pad_width_0 = 0; + uint32_t ew_image_height = args.image0.height * 2; + + DLOG << "______db_______: begin to set registers. 
"; + uint64_t ifm_pixel_num = + ((args.image0.width) * (args.image0.height) * args.image0.channels); + uint64_t ifm_memory_size = ifm_pixel_num * sizeof(short); // NOLINT + uint64_t flt_pixel_num = 0; + uint64_t filter_memory_size = 0; + uint64_t bn_pixel_num = (filter_num_align * 2); + uint64_t bn_memory_size = bn_pixel_num * sizeof(uint16_t); + + uint64_t ofm_width = + ((args.image0.width) + 2 * const_pad_width_0 - const_kernel_width_1) / + (const_stride_width_1) + + 1; + uint64_t ofm_height = + ((ew_image_height) + 2 * (const_pad_height_0) - (const_kernel_height_2)) / + (const_stride_height_2) + + 1; + + uint32_t filter_num = filter_num_align; + uint32_t image_channels = args.image0.channels; + + uint64_t ifm_src_paddr = vaddr_to_paddr((args.image0.address)); + uint64_t flt_src_paddr = vaddr_to_paddr((args.image1.address)); + uint64_t ifm_dst_paddr = vaddr_to_paddr((args.output.address)); + float image_inv_scale = 0; + float filter_inv_scale = 0; + int idx = 0; + + DLOG << "______db_______: reset registers. 
"; + + reg_writeq(1, MUL8(24)); + usleep(1); + reg_writeq(0, MUL8(24)); + + /*********configuring registers*************/ + uint32_t cmd_image_vir_base_addr = (uint32_t)ifm_src_paddr; + uint32_t cmd_filter_vir_base_addr = (uint32_t)flt_src_paddr; + uint32_t cmd_scale_base_addr = 0; + uint32_t conv_ofm_addr_base = (uint32_t)ifm_dst_paddr; + uint64_t cmd_group_num = 1; + uint64_t cmd_filter_per_group = filter_num / cmd_group_num; + + uint64_t cmd_flt_sqr_len = (const_kernel_width_1) * (const_kernel_height_2); + uint64_t cmd_ifm_pre_row_num = const_kernel_height_2; + if ((const_kernel_height_2 == ew_image_height) && (0 == const_pad_height_0)) { + cmd_ifm_pre_row_num = (const_kernel_height_2); + } else { + cmd_ifm_pre_row_num = (const_kernel_height_2) - (const_pad_height_0) + + (const_stride_height_2); + } + uint64_t cmd_flt_pre_batch_num = 1; + uint64_t cmd_ifm_pack_num_per_row_mns1 = + (uint64_t)(((args.image0.channels) + 63) / 64) - 1; + uint64_t cmd_bn_num = filter_num; + uint64_t cmd_bias_num = filter_num; + uint64_t cmd_ifm_stride_row_length = + args.image0.width * const_stride_height_2; + uint64_t cmd_flt_pack_num_per_kernel_mns1 = + (uint64_t)(((args.image0.channels) + 63) / 64) - 1; + uint64_t cmd_ofm_width_mns1 = (uint64_t)( + ((args.image0.width) - (const_kernel_width_1) + 2 * (const_pad_width_0)) / + (const_stride_width_1)); + uint64_t cmd_ofm_height = + (uint64_t)(((args.image0.height) * 2 - (const_kernel_height_2) + + 2 * (const_pad_height_0)) / + (const_stride_height_2)) + + 1; + + uint64_t cmd_channel_num = 0; + uint64_t cmd_ifm_pack_len = 0; + uint64_t cmd_channel_per_group = 0; + uint64_t cmd_flt_batch_num_mns1 = 0; + uint64_t cmd_flt_N_impl = 8; + uint64_t cmd_ifm_C_impl = 16; + uint64_t cmd_flt_pack_length = 0; + uint64_t cmd_step_h_mul_row_byte_len = 0; + uint64_t cmd_pad_h_mul_row_byte_len = 0; + uint64_t cmd_ifm_pack_byte_length = + 16 * ((((args.image0.width) + 7) / 8) * 8); + uint64_t row_len_align = args.image0.width; + uint64_t 
cmd_flt_cycle_num_mns1 = 0; + if (image_channels > 32) { + cmd_channel_num = (uint64_t)((((args.image0.channels) + 63)) / 64) * 64; + cmd_ifm_pack_len = 64 * (args.image0.width); + cmd_channel_per_group = 64; + cmd_flt_batch_num_mns1 = (uint64_t)(((filter_num + 7)) / 8 - 1); + cmd_flt_N_impl = 8; + cmd_ifm_C_impl = 64; + cmd_flt_pack_length = (const_kernel_width_1) * (const_kernel_height_2)*64; + cmd_step_h_mul_row_byte_len = + (const_stride_height_2)*cmd_channel_num * args.image0.width; + cmd_pad_h_mul_row_byte_len = + (const_pad_height_0)*cmd_channel_num * args.image0.width; + cmd_ifm_pack_byte_length = 64 * args.image0.width; + row_len_align = args.image0.width; + cmd_flt_cycle_num_mns1 = (cmd_channel_num / 64) - 1; + } else if (image_channels > 16) { + cmd_channel_num = 32; + cmd_ifm_pack_len = 32 * (args.image0.width); + cmd_channel_per_group = 32; + cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1); + cmd_flt_N_impl = 16; + cmd_ifm_C_impl = 32; + cmd_flt_pack_length = (const_kernel_width_1) * (const_kernel_height_2)*32; + cmd_step_h_mul_row_byte_len = (const_stride_height_2)*cmd_channel_num * + ((((args.image0.width) + 1)) / 2) * 2; + cmd_pad_h_mul_row_byte_len = (const_pad_height_0)*cmd_channel_num * + ((((args.image0.width) + 1)) / 2) * 2; + cmd_ifm_pack_byte_length = + 32 * (uint64_t)((((args.image0.width) + 1)) / 2) * 2; + row_len_align = (uint64_t)((((args.image0.width) + 1)) / 2); + cmd_flt_cycle_num_mns1 = 0; + } else if (image_channels > 8) { + cmd_channel_num = 16; + cmd_ifm_pack_len = 16 * (args.image0.width); + cmd_channel_per_group = 16; + cmd_flt_batch_num_mns1 = (uint64_t)((((filter_num) + 15)) / 16 - 1); + cmd_flt_N_impl = 32; + cmd_ifm_C_impl = 16; + cmd_flt_pack_length = (const_kernel_width_1) * (const_kernel_height_2)*16; + cmd_step_h_mul_row_byte_len = (const_stride_height_2)*cmd_channel_num * + ((((args.image0.width) + 3)) / 4) * 4; + cmd_pad_h_mul_row_byte_len = (const_pad_height_0)*cmd_channel_num * + 
((((args.image0.width) + 3)) / 4) * 4; + cmd_ifm_pack_byte_length = + 16 * (uint64_t)((((args.image0.width) + 3)) / 4) * 4; + row_len_align = (uint64_t)((((args.image0.width) + 3)) / 4); + cmd_flt_cycle_num_mns1 = 0; + } + + cmd_flt_N_impl = 16; + cmd_flt_batch_num_mns1 = 0; + cmd_flt_pack_length = 64; + uint64_t cmd_flt_N_len = 0; + uint64_t cmd_flt_length = 64; + uint64_t cmd_ifm_row_byte_length = cmd_channel_num * (args.image0.width); + uint64_t cmd_ifm_buf_col_len = 0; + uint64_t ifm_one_batch_len = + (1048576 / ((2 * row_len_align) * cmd_channel_num)); + uint64_t cmd_ifm_batch_num_tmp = (uint64_t)( + ((ew_image_height) + ifm_one_batch_len - 1) / ifm_one_batch_len); + DLOG << "ifm_one_batch_len = " << hex << ifm_one_batch_len; + DLOG << "cmd_ifm_batch_num_tmp = " << hex << cmd_ifm_batch_num_tmp; + + if (1 == cmd_ifm_batch_num_tmp) { + cmd_ifm_buf_col_len = ew_image_height; + } else { + cmd_ifm_buf_col_len = ifm_one_batch_len; + } + uint64_t cmd_ifm_batch_num_mns1 = + (((ew_image_height) + cmd_ifm_buf_col_len - 1) / cmd_ifm_buf_col_len) - 1; + DLOG << "___db____ew____:cmd_ifm_batch_num_mns1 = " << hex + << cmd_ifm_batch_num_mns1; + + uint64_t cmd_flt_total_batch_num = 1; + uint64_t cmd_ifm_buf_col_len_rem = + (ew_image_height)-cmd_ifm_batch_num_mns1 * cmd_ifm_buf_col_len; + //-------- ofm batch number reg && initial URAM reading address + // logic----------------- + uint64_t cmd_init_raddr_cnt = 1; + uint64_t cmd_init_raddr_flag = 0; + int64_t cmd_init_raddr_index = -8; + int64_t cmd_init_raddr_col_0 = -4; + int64_t cmd_init_raddr_col_1 = -4; + int64_t conv_ofm_buf_col_len = 0; + int64_t conv_ofm_buf_col_len_rem = 0; + + if (((const_pad_height_0) % (2 * (const_stride_height_2))) == 0) { + cmd_init_raddr_cnt = 0; + cmd_init_raddr_flag = 0; + cmd_init_raddr_index = + 0 - (int64_t)row_len_align * (((const_pad_height_0) + 1) / 2); + cmd_init_raddr_col_0 = cmd_init_raddr_index; + cmd_init_raddr_col_1 = cmd_init_raddr_index; + } else if (((const_pad_height_0)-2 * + 
((const_pad_height_0) / (2 * (const_stride_height_2)))) <= + (const_stride_height_2)) { + cmd_init_raddr_cnt = + (const_stride_height_2) - + ((const_pad_height_0) - + ((const_pad_height_0) / (2 * (const_stride_height_2)))); + cmd_init_raddr_flag = 1; + cmd_init_raddr_index = + 0 - (int64_t)row_len_align * (int64_t)(const_pad_height_0) - + (int64_t)row_len_align * + ((const_pad_height_0) / (2 * const_stride_height_2)); + cmd_init_raddr_col_0 = + 0 - (int64_t)row_len_align * (int64_t)(const_pad_height_0) - + (int64_t)row_len_align * + ((const_pad_height_0) / (2 * (const_stride_height_2))); + cmd_init_raddr_col_1 = + cmd_init_raddr_col_0 + + const_stride_height_2 * (int64_t)row_len_align; // 0; + } else if (((const_pad_height_0)-2 * + ((const_pad_height_0) / (2 * (const_stride_height_2)))) <= + 2 * (const_stride_height_2)) { + cmd_init_raddr_cnt = + 2 * (const_stride_height_2) * + (((const_pad_height_0) + 2 * (const_stride_height_2)-1) / + (2 * (const_stride_height_2))) - + (const_pad_height_0); + cmd_init_raddr_flag = 0; + cmd_init_raddr_index = + 0 - (int64_t)row_len_align * (int64_t)(const_stride_height_2) * + (((const_pad_height_0) + 2 * (const_stride_height_2)-1) / + (2 * (const_stride_height_2))); + cmd_init_raddr_col_0 = + 0 - + (int64_t)row_len_align * + ((const_pad_height_0) / (2 * (const_stride_height_2))) - + (int64_t)row_len_align * + (2 * (const_stride_height_2) * + (((const_pad_height_0) + 2 * (const_stride_height_2)-1) / + (2 * (const_stride_height_2))) - + (const_pad_height_0)); + cmd_init_raddr_col_1 = cmd_init_raddr_col_0; + } + + if (cmd_ifm_batch_num_mns1 == 0) { + if ((const_kernel_height_2) <= (const_stride_height_2)) { + conv_ofm_buf_col_len = cmd_ifm_buf_col_len + 2 * (const_pad_height_0)-3 * + (const_stride_height_2); + } else { + conv_ofm_buf_col_len = + cmd_ifm_buf_col_len + + 2 * (const_pad_height_0)-3 * (const_stride_height_2) - + (const_kernel_height_2); + } + conv_ofm_buf_col_len_rem = conv_ofm_buf_col_len; + } else { + int N_rem = 0; + 
int row_rem = 0; + + if ((const_kernel_height_2) <= (const_stride_height_2)) { + conv_ofm_buf_col_len = cmd_ifm_buf_col_len - 3 * (const_stride_height_2); + N_rem = (cmd_ifm_buf_col_len - (const_kernel_height_2)) / + (const_stride_height_2) + + 1; + row_rem = cmd_ifm_buf_col_len - (const_stride_height_2)*N_rem; + conv_ofm_buf_col_len_rem = cmd_ifm_buf_col_len_rem + + 2 * (const_pad_height_0) + row_rem - + 3 * (const_stride_height_2); + } else { + conv_ofm_buf_col_len = + cmd_ifm_buf_col_len + + 2 * (const_pad_height_0)-3 * (const_stride_height_2) - + (const_kernel_height_2); + N_rem = (cmd_ifm_buf_col_len - (const_kernel_height_2)) / + (const_stride_height_2) + + 1; + row_rem = cmd_ifm_buf_col_len - (const_stride_height_2)*N_rem; + conv_ofm_buf_col_len_rem = + cmd_ifm_buf_col_len_rem + (const_pad_height_0) + row_rem - + 3 * (const_stride_height_2) - (const_kernel_height_2); + } + } + + //************************* + uint64_t ifm_height_raw_batch = 0; + uint64_t cmd_ofm_height_batch_reg; + uint64_t conv_ofm_height_batch_tmp = 0; + uint64_t conv_ofm_height_batch[16]; + int ofm_height_norm_batch; + int height_batch_num; + + int row_norm_size = get_ofm_batch_size(args.image0.width, cmd_channel_num); + int ifm_norm_size = + ew_image_height * row_norm_size * sizeof(short); // NOLINT + + if (ifm_norm_size <= (1024 * 1024)) { + conv_ofm_height_batch[0] = + get_image_out_axis(ew_image_height, const_pad_height_0, + const_kernel_height_2, const_stride_height_2); + height_batch_num = 0; + } else if (row_norm_size < (1024 * 1024)) { + // raw ifm batch ,should make ofm be 2*N + ifm_height_raw_batch = + (int)(((double)(1024 * 1024) - row_norm_size + 1) / // NOLINT + (double)(2 * row_norm_size)); // NOLINT + ofm_height_norm_batch = get_image_out_axis( + ifm_height_raw_batch, 0, const_kernel_height_2, const_stride_height_2); + if (ofm_height_norm_batch % 2 == 0) { + ofm_height_norm_batch = ofm_height_norm_batch; + } else { + ofm_height_norm_batch = ofm_height_norm_batch - 1; + } + + 
DLOG << "ofm_height_norm_batch = " << hex << ofm_height_norm_batch; + int ofm_height_rems = cmd_ofm_height; + int i = 0; + for (i = 0; 0 < ofm_height_rems; i++) { + if (ofm_height_norm_batch <= ofm_height_rems) { + ofm_height_rems = ofm_height_rems - ofm_height_norm_batch; + conv_ofm_height_batch[i] = ofm_height_norm_batch; + DLOG << "ofm_height_norm_batch[i] = " << hex + << conv_ofm_height_batch[i]; + } else { + conv_ofm_height_batch[i] = ofm_height_rems; + break; + } + } + height_batch_num = i; + } + //************************* + + //----------------------- para functions -------------------------------- + uint64_t cmd_filter_quant_scale = 0x3c00; + uint64_t cmd_image_quant_scale = 0x3c00; + uint64_t wParallelsim = cmd_ifm_C_impl >> 3; + uint64_t wParallelsim_num = cmd_flt_cycle_num_mns1; + uint64_t win_size = (const_kernel_width_1) * (const_kernel_height_2) * + (cmd_ifm_pack_num_per_row_mns1 + 1) - + 1; // + uint64_t conv_ofm_width = (((args.image0.width) - (const_kernel_width_1) + + (const_pad_width_0) + (const_pad_width_0)) / + (const_stride_width_1)); + uint64_t conv_ofm_dma_length = cmd_channel_num * sizeof(short); // NOLINT + uint64_t conv_ofm_dma_stride = cmd_channel_num * sizeof(short); // NOLINT + uint64_t cmd_image_addr_low = 0; + uint64_t cmd_image_addr_high = 0; + uint64_t cmd_image_addr_diff = 0; + + if (cmd_filter_vir_base_addr < cmd_image_vir_base_addr) { + cmd_image_addr_low = (uint64_t)cmd_filter_vir_base_addr; + cmd_image_addr_high = (uint64_t)cmd_image_vir_base_addr; + } else { + cmd_image_addr_low = (uint64_t)cmd_image_vir_base_addr; + cmd_image_addr_high = (uint64_t)cmd_filter_vir_base_addr; + } + + cmd_image_addr_diff = cmd_image_addr_high - cmd_image_addr_low; + uint64_t o_ust_rst = 0; + uint64_t conv_ofm_dma_repeat = + (uint64_t)(((((args.image0.width) - (const_kernel_width_1) + + (const_pad_width_0) + (const_pad_width_0))) / + (const_stride_width_1)) + + 1); + uint64_t conv_ofm_dma_offset = + cmd_channel_num * conv_ofm_dma_repeat * 
sizeof(short); // NOLINT + uint64_t conv_ofm_inter_stride = conv_ofm_dma_offset * 2; + //----------------- register contation ------------------ + uint64_t cmd_ifm_flt_base_addr = + (cmd_image_addr_high << 32) | (cmd_image_addr_low); + + uint64_t cmd_ifm_flt_dim = ((uint64_t)(const_kernel_height_2) << 48) | + ((uint64_t)(const_kernel_width_1) << 32) | + ((uint64_t)(ew_image_height) << 16) | + ((uint64_t)(args.image0.width)); + uint64_t cmd_pad_step_size = ((uint64_t)(const_stride_height_2) << 48) | + ((uint64_t)(const_stride_width_1) << 32) | + ((uint64_t)(const_pad_height_0) << 16) | + ((uint64_t)(const_pad_width_0)); + uint64_t cmd_param1 = ((uint64_t)cmd_filter_per_group << 48) | + ((uint64_t)cmd_channel_num << 32) | + ((uint64_t)filter_num << 16) | + ((uint64_t)cmd_group_num); + uint64_t cmd_param2 = + ((uint64_t)cmd_flt_sqr_len << 48) | ((uint64_t)cmd_ifm_pack_len << 32) | + ((uint64_t)cmd_ifm_pre_row_num << 16) | ((uint64_t)cmd_channel_per_group); + uint64_t cmd_param3 = ((uint64_t)cmd_flt_batch_num_mns1 << 48) | + ((uint64_t)cmd_flt_total_batch_num << 32) | + ((uint64_t)cmd_flt_N_impl << 16) | + ((uint64_t)cmd_flt_pre_batch_num); + uint64_t cmd_param4 = ((uint64_t)cmd_ifm_pack_num_per_row_mns1 << 48) | + ((uint64_t)cmd_bn_num << 32) | + ((uint64_t)cmd_bias_num << 16) | + ((uint64_t)cmd_flt_N_len); + uint64_t cmd_param5 = ((uint64_t)cmd_ifm_stride_row_length << 48) | + ((uint64_t)cmd_flt_pack_length << 32) | + ((uint64_t)cmd_flt_cycle_num_mns1 << 16) | + ((uint64_t)cmd_flt_pack_num_per_kernel_mns1); + uint64_t cmd_param6 = ((uint64_t)cmd_ofm_width_mns1 << 48) | + ((uint64_t)cmd_ifm_batch_num_mns1 << 32) | + ((uint64_t)cmd_ifm_buf_col_len << 16) | + ((uint64_t)cmd_ifm_C_impl); + uint64_t cmd_param7 = ((uint64_t)conv_ofm_inter_stride << 32) | + ((uint64_t)cmd_ifm_buf_col_len_rem << 16) | + ((uint64_t)cmd_ofm_height); + uint64_t cmd_param8 = + ((uint64_t)cmd_flt_length << 32) | ((uint64_t)cmd_ifm_row_byte_length); + uint64_t cmd_ifm_flt_quant_scale = 
((uint64_t)cmd_filter_quant_scale << 32) | + ((uint64_t)cmd_image_quant_scale); + uint64_t cmd_step_pad_mul_row_len = + ((uint64_t)cmd_pad_h_mul_row_byte_len << 32) | + ((uint64_t)cmd_step_h_mul_row_byte_len); + //---- ofm paras ---- + uint64_t cmd_conv_param_reg = ((uint64_t)wParallelsim_num << 32) | + ((uint64_t)wParallelsim << 16) | + ((uint64_t)win_size); + uint64_t cmd_ofm_addr_width_reg = + ((uint64_t)conv_ofm_width << 32) | ((uint64_t)conv_ofm_addr_base); + uint64_t cmd_intra_stride_atoms_reg = + ((uint64_t)conv_ofm_dma_length << 32) | ((uint64_t)conv_ofm_dma_stride); + uint64_t cmd_user_ctrl_reg = ((uint64_t)o_ust_rst); + uint64_t cmd_wdma_param_reg = + ((uint64_t)(conv_ofm_dma_repeat | 0x80000000) << 32) | + ((uint64_t)conv_ofm_dma_offset); + uint64_t cmd_init_raddr_reg = ((cmd_init_raddr_col_1 & 0xffff) << 48) | + ((cmd_init_raddr_col_0 & 0xffff) << 32) | + (((cmd_init_raddr_index & 0xffff) << 16)) | + (cmd_init_raddr_flag & 0xffff) << 15 | + ((cmd_init_raddr_cnt & 0xffff)); + uint64_t cmd_mult_factor = + ((uint64_t)args.const0) | ((uint64_t)args.const1 << 16); + // cmd_para31 carries only args.relu_enabled. The previous initializer read + // cmd_para31 inside its own definition, i.e. an indeterminate value, so the + // word written to the register was nondeterministic when relu was disabled. + uint64_t cmd_para31 = (uint64_t)args.relu_enabled; + + DLOG << "cmd_init_raddr_col_1 = " << hex << cmd_init_raddr_col_1; + DLOG << "cmd_init_raddr_col_0 = " << hex << cmd_init_raddr_col_0; + DLOG << "cmd_init_raddr_index = " << hex << cmd_init_raddr_index; // + DLOG << "cmd_init_raddr_cnt = " << hex << cmd_init_raddr_cnt; + DLOG << "cmd_ifm_buf_col_len = " << hex << cmd_ifm_buf_col_len; + DLOG << "cmd_ifm_buf_col_len_rem = " << hex << cmd_ifm_buf_col_len_rem; + DLOG << "conv_ofm_buf_col_len = " << hex << conv_ofm_buf_col_len; + DLOG << "conv_ofm_buf_col_len_rem = " << hex << conv_ofm_buf_col_len_rem; + DLOG << "cmd_ifm_flt_base_addr = " << hex << cmd_ifm_flt_base_addr; + DLOG << "cmd_scale_base_addr = " << hex << cmd_scale_base_addr; + DLOG << "cmd_ifm_flt_dim = " << hex << cmd_ifm_flt_dim; + DLOG << "cmd_pad_step_size = " << hex << cmd_pad_step_size; + DLOG << "cmd_param1 = " << hex
<< cmd_param1; + DLOG << "cmd_param2 = " << hex << cmd_param2; + DLOG << "cmd_param3 = " << hex << cmd_param3; + DLOG << "cmd_param4 = " << hex << cmd_param4; + DLOG << "cmd_param5 = " << hex << cmd_param5; + DLOG << "cmd_param6 = " << hex << cmd_param6; + DLOG << "cmd_param7 = " << hex << cmd_param7; + DLOG << "cmd_param8 = " << hex << cmd_param8; + DLOG << "cmd_ifm_flt_quant_scale = " << hex << cmd_ifm_flt_quant_scale; + DLOG << "cmd_step_pad_mul_row_len = " << hex << cmd_step_pad_mul_row_len; + DLOG << "cmd_ifm_pack_byte_length = " << hex << cmd_ifm_pack_byte_length; + DLOG << "cmd_conv_param_reg = " << hex << cmd_conv_param_reg; + DLOG << "cmd_ofm_addr_width_reg = " << hex << cmd_ofm_addr_width_reg; + DLOG << "cmd_intra_stride_atoms_reg = " << hex << cmd_intra_stride_atoms_reg; + DLOG << "cmd_init_raddr_reg = " << hex << cmd_init_raddr_reg; + DLOG << "cmd_mult_factor = " << hex << cmd_mult_factor; + DLOG << "cmd_wdma_param_reg = " << hex << cmd_wdma_param_reg; + DLOG << "cmd_para31 = " << hex << cmd_para31; + + reg_writeq(cmd_ifm_flt_base_addr, MUL8(1)); + reg_writeq(cmd_scale_base_addr, MUL8(2)); + reg_writeq(cmd_ifm_flt_dim, MUL8(3)); + reg_writeq(cmd_pad_step_size, MUL8(4)); + reg_writeq(cmd_param1, MUL8(5)); + reg_writeq(cmd_param2, MUL8(6)); + reg_writeq(cmd_param3, MUL8(7)); + reg_writeq(cmd_param4, MUL8(8)); + reg_writeq(cmd_param5, MUL8(9)); + reg_writeq(cmd_param6, MUL8(10)); + reg_writeq(cmd_param7, MUL8(11)); + reg_writeq(cmd_param8, MUL8(12)); + reg_writeq(cmd_ifm_flt_quant_scale, MUL8(13)); + reg_writeq(cmd_step_pad_mul_row_len, MUL8(14)); + reg_writeq(cmd_ifm_pack_byte_length, MUL8(15)); + reg_writeq(cmd_conv_param_reg, MUL8(16)); + reg_writeq(cmd_ofm_addr_width_reg, MUL8(17)); + reg_writeq(cmd_intra_stride_atoms_reg, MUL8(18)); + + reg_writeq(cmd_init_raddr_reg, MUL8(29)); + reg_writeq(cmd_para31, MUL8(31)); + + reg_writeq(0, MUL8(19)); + for (int i = 0; i < height_batch_num + 1; i++) { + conv_ofm_height_batch_tmp = + 
int((conv_ofm_height_batch[i] + 1) / 2) - 1; // NOLINT + cmd_ofm_height_batch_reg = + ((uint64_t)(conv_ofm_buf_col_len_rem & 0xffff) << 48) | + ((uint64_t)(conv_ofm_buf_col_len & 0xffff) << 32) | + ((uint64_t)conv_ofm_height_batch_tmp + 0x80000000); + reg_writeq(cmd_ofm_height_batch_reg, MUL8(19)); + reg_writeq(cmd_ofm_height_batch_reg & 0xffffffff00000000, MUL8(19)); + usleep(1); + } + reg_writeq(cmd_wdma_param_reg, MUL8(25)); + DLOG << "cmd_ofm_height_batch_reg = " << hex << cmd_ofm_height_batch_reg; + + /******************************************************************/ + reg_writeq(cmd_mult_factor, MUL8(30)); + /******************************************************************/ + + reg_writeq(0, MUL8(0)); + + reg_writeq(0x2100000000000000, MUL8(0)); + + int ret = fpga_regpoll(MUL8(48), CONV_DONE, 0xffffff); + if (ret == -1) { + DLOG << "fpga EW no interrupt!!"; + return ret; + } + reg_readq(MUL8(63)); + usleep(10); + // get max value + float scale = Findfp16Max(); + (args.output.scale_address)[0] = scale; // NOLINT + (args.output.scale_address)[1] = (float)(1.0 / scale); // NOLINT + DLOG << "Findfp16Max scale = " << scale; + + DLOG << "ret=" << ret; + return ret; } int PerformBypass(const struct BypassArgs &args) { @@ -166,60 +1598,63 @@ int PerformBypass(const struct BypassArgs &args) { return 0; #endif - // uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address); - // uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address); - // uint64_t bp_enable; - // int64_t length; - // uint64_t pixels; - // - // // fp32->fp16 - // if ((args.input_data_type) && (!args.output_data_type)) { - // pixels = (args.image.channels) * (args.image.width) * - // (args.image.height); length = pixels * sizeof(float); bp_enable = - // 0x8800000000000000 + length; - // } - // // fp16->fp32 - // else if ((!args.input_data_type) && (args.output_data_type)) { - // pixels = filter::calc_aligned_channel((args.image.channels)) * - // (args.image.width) * 
(args.image.height); - // length = pixels * sizeof(short); - // length = align_to_x((int)length, 64); // NOLINT - // bp_enable = 0x8a00000000000000 + length; - // } - // // fp16->fp16 findmax - // else if ((!args.input_data_type) && (!args.output_data_type)) { - // pixels = (args.image.channels) * (args.image.width) * - // (args.image.height); length = pixels * sizeof(short); bp_enable = - // 0x8900000000000000 + length; - // } else { - // return -1; - // } - // - // // start bypass - // driver::reg_writeq(ifm_src_paddr, MUL8(27)); - // driver::reg_writeq(ifm_dst_paddr, MUL8(28)); - // driver::reg_writeq(0, MUL8(0)); - // driver::reg_writeq(bp_enable, MUL8(0)); - // // poll - // int ret = -1; - // ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff); - // if (ret != -1) { - // // clear "irq" - // driver::reg_readq(MUL8(63)); - // } - // // get max value - // if ((!args.input_data_type) && (!args.output_data_type)) { - // float scale = Findfp16Max(); - // args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT - // args.output.scale_address[1] = scale; - // } - // return ret; + uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address); + uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address); + uint64_t bp_enable; + int64_t length; + uint64_t pixels; + + // fp32->fp16 + if ((args.input_data_type) && (!args.output_data_type)) { + DLOG << "fp32-fp16"; + pixels = (args.image.channels) * (args.image.width) * (args.image.height); + length = pixels * sizeof(float); + bp_enable = 0x8800000000000000UL + (uint64_t)length; + } + // fp16->fp32 + else if ((!args.input_data_type) && (args.output_data_type)) { // NOLINT + DLOG << "fp16-fp32"; + pixels = filter::calc_aligned_channel((args.image.channels)) * + (args.image.width) * (args.image.height); + length = pixels * sizeof(short); // NOLINT + length = align_to_x((int)length, 64); // NOLINT + bp_enable = 0x8a00000000000000UL + length; + } + // fp16->fp16 findmax + else if ((!args.input_data_type) && 
(!args.output_data_type)) { // NOLINT + DLOG << "16-16"; + pixels = (args.image.channels) * (args.image.width) * (args.image.height); + length = pixels * sizeof(short); // NOLINT + bp_enable = 0x8900000000000000 + length; + } else { + return -1; + } + // start bypass + reg_writeq(0, MUL8(0)); + reg_writeq(ifm_src_paddr, MUL8(27)); + reg_writeq(ifm_dst_paddr, MUL8(28)); + reg_writeq(bp_enable, MUL8(0)); + int ret = -1; + ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffff); + + if (ret != -1) { + DLOG << "test done"; + } + reg_readq(MUL8(63)); + usleep(10); + // get max value + float scale = Findfp16Max(); + args.output.scale_address[0] = scale; // NOLINT + args.output.scale_address[1] = (float)(1.0 / scale); // NOLINT + DLOG << "ret=" << ret; + return ret; } int ComputeFPGAConcat(const struct ConcatArgs &args) { #ifdef FPGA_PRINT_MODE DLOG << "=============ComputeFpgaConcat==========="; DLOG << " Image_num: " << args.image_num + << " out_address:" << args.image_out << " out_scale_address:" << args.scale_out << " out_channel:" << args.out_channel; diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp old mode 100644 new mode 100755 index 53c9173723..16be003955 --- a/src/fpga/common/fpga_common.cpp +++ b/src/fpga/common/fpga_common.cpp @@ -113,6 +113,12 @@ int fpga_invalidate(void *address, size_t size) { return 0; #endif } - +uint64_t vaddr_to_paddr(void *address) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::vaddr_to_paddr(address); +#else + return 0; +#endif +} } // namespace fpga } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V2/feed_kernel.cpp b/src/operators/kernel/fpga/V2/feed_kernel.cpp index d1a721b5eb..7c4b999e7c 100644 --- a/src/operators/kernel/fpga/V2/feed_kernel.cpp +++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "operators/kernel/feed_kernel.h" - +#include "fpga/V2/filter.h" namespace paddle_mobile { namespace operators { @@ -24,7 +24,6 @@ bool FeedKernel::Init(FeedParam *param) { fpga::format_fp16_ofm(output, aligned_channel); return true; } - template <> void FeedKernel::Compute(const FeedParam &param) { auto input = @@ -33,6 +32,9 @@ void FeedKernel::Compute(const FeedParam &param) { auto input_ptr = input->data(); Tensor *output = param.Out(); auto output_ptr = output->data(); + auto channel = input->dims()[1]; + uint32_t aligned_channels = + fpga::filter::calc_aligned_channel((int)channel); // NOLINT fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; @@ -41,7 +43,7 @@ void FeedKernel::Compute(const FeedParam &param) { args.input_layout_type = fpga::LAYOUT_CHW; args.output_layout_type = fpga::LAYOUT_HWC; args.image.address = reinterpret_cast(input_ptr); - args.image.channels = (uint32_t)input->dims()[1]; + args.image.channels = aligned_channels; args.image.height = (uint32_t)input->dims()[2]; args.image.width = (uint32_t)input->dims()[3]; args.image.pad_height = 0; diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp old mode 100644 new mode 100755 index bbdb35b715..5232364ac2 --- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp @@ -25,7 +25,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { auto input_ptr = input->data(); auto float_input = new Tensor; float_input->mutable_data({1, input->dims()[1]}); - fpga::format_fp32_ofm(float_input, 8); + fpga::format_fp32_ofm(float_input, 1024); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; args.input_layout_type = fpga::LAYOUT_HWC; -- GitLab