diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index f2a16b2784f1d07d1322023470539bcd26315751..878544d07191cd7fdb24590b4c14335656b89813 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -81,6 +81,13 @@ int get_plit_num(framework::Tensor *filter_tensor) { int div_capacity = filter::calc_division_capacity(chw); return filter::calc_split_num(num, div_capacity); } +int get_deconv_plit_num(framework::Tensor *filter_tensor, int stride) { + auto dims = filter_tensor->dims(); + auto chw = dims[1] * dims[2] / stride * dims[3] / stride; + auto num = dims[0] * stride; + int div_capacity = filter::calc_division_capacity(chw); + return filter::calc_split_num(num, div_capacity); +} int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) { auto dims = filter_tensor->dims(); @@ -90,6 +97,15 @@ int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) { return filter::calc_num_per_div(num, group_num, div_capacity); } +int get_deconv_filter_num_per_div(framework::Tensor *filter_tensor, + int group_num, int stride) { + auto dims = filter_tensor->dims(); + auto chw = dims[1] * dims[2] / stride * dims[3] / stride; + auto num = dims[0] * stride; + int div_capacity = filter::calc_division_capacity(chw); + return filter::calc_num_per_div(num, group_num, div_capacity); +} + int get_aligned_filter_element_num(int chw) { return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); } @@ -448,14 +464,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->sub_output_height = (uint32_t)sub_output_height; arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit( stride_w, (int)filter->dims()[3], padding_w); - arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs)); - auto sub_channels = (int)input->dims()[1]; + arg->output.address = out_ptr; + arg->output.scale_address = out->scale; + + int sub_channels = (int)input->dims()[1]; + int omit_size = arg->omit_size; + int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; + int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size; int sub_filter_num = sub_conv_num * (arg->filter_num); int conv_output_size = (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) * sub_output_height; + int ouput_size = conv_output_size * sub_conv_num; int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT); int align_sub_filter_count = @@ -464,50 +486,160 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, int align_conv_sub_filter_count = align_sub_filter_count * align_sub_filter_num; + int split_num = + group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1; + + arg->split_conv_args = + (SplitConvArgs *)fpga_malloc(sub_conv_num * sizeof(SplitConvArgs)); for (int i = 0; i < sub_conv_num; ++i) { - arg->conv_args[i].filter_num = arg->sub_conv_num * arg->filter_num; - arg->conv_args[i].group_num = (uint32_t)group_num; - - arg->conv_args[i].filter_scale_address = filter->scale; - arg->conv_args[i].relu_enabled = relu_enabled; - - arg->conv_args[i].kernel.width = (uint32_t)sub_filter_width; - arg->conv_args[i].kernel.height = (uint32_t)sub_filter_width; - arg->conv_args[i].kernel.stride_w = 1; - arg->conv_args[i].kernel.stride_h = 1; - - arg->conv_args[i].image.scale_address = input->scale; - arg->conv_args[i].image.channels = (uint32_t)sub_channels; - arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; - arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; - arg->conv_args[i].image.pad_width = (uint32_t)sub_pad; - arg->conv_args[i].image.pad_height = (uint32_t)sub_pad; - arg->conv_args[i].image.address = input_ptr; - arg->conv_args[i].sb_address = bs_ptr; - - auto filter_sub_space = - (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char)); - fpga_copy(filter_sub_space, - (char *)filter_ptr + i * align_conv_sub_filter_count, - (size_t)align_conv_sub_filter_count); - arg->conv_args[i].filter_address = filter_sub_space; - fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count); + arg->split_conv_args[i].filter_num = + (arg->sub_conv_num) * (arg->filter_num); + arg->split_conv_args[i].group_num = (uint32_t)group_num; + arg->split_conv_args[i].split_num = split_num; + arg->split_conv_args[i].conv_arg = + (ConvArgs *)fpga_malloc(split_num * sizeof(ConvArgs)); + + arg->split_conv_args[i].concat_arg.height = sub_output_height; + arg->split_conv_args[i].concat_arg.width = sub_output_width; + arg->split_conv_args[i].concat_arg.image_num = split_num; + arg->split_conv_args[i].concat_arg.images_in = + (half **)fpga_malloc(split_num * sizeof(half *)); + arg->split_conv_args[i].concat_arg.scales_in = + (float **)fpga_malloc(split_num * sizeof(float *)); + arg->split_conv_args[i].concat_arg.channel_num = + (uint32_t *)fpga_malloc(split_num * sizeof(uint32_t)); + // arg->split_conv_args[i].concat_arg.image_out = + // fpga_malloc(conv_output_size * sizeof(half)); + // arg->split_conv_args[i].concat_arg.scale_out = fpga_malloc(2 * + // sizeof(float)); + } + int filter_num_per_div = + get_deconv_filter_num_per_div(filter, group_num, stride_w); + int element_num = get_aligned_filter_element_num( + (int)(sub_channels * sub_filter_width * sub_filter_width)); + + int chw = sub_channels * sub_filter_width * sub_filter_width; + int division_capacity = filter::calc_division_capacity(chw); + int num_per_div_before_alignment = + filter::calc_num_per_div(sub_filter_num, group_num, division_capacity); + int num_per_div_after_alignment = + align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); + int div_num = (sub_filter_num + num_per_div_before_alignment - 1) / + num_per_div_before_alignment; + int residual = sub_filter_num % num_per_div_before_alignment; + int num_after_alignment = num_per_div_after_alignment * + ((residual == 0) ? div_num : (div_num - 1)) + + align_to_x(residual, FILTER_NUM_ALIGNMENT); + + int filter_sub_conv_offset = element_num * num_after_alignment; + for (int i = 0; i < sub_conv_num; ++i) { if (sub_conv_num == 1) { - arg->conv_args[i].output.address = out_ptr; - arg->conv_args[i].output.scale_address = out->scale; + arg->split_conv_args[i].output.address = arg->output.address; + arg->split_conv_args[i].output.scale_address = arg->output.scale_address; + } else { - auto ptr_output = fpga_malloc(conv_output_size * sizeof(half)); - arg->conv_args[i].output.address = ptr_output; + auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); + arg->split_conv_args[i].output.address = (void *)((half *)ptr_output); auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); - arg->conv_args[i].output.scale_address = ptr_output_scale; + arg->split_conv_args[i].output.scale_address = ptr_output_scale; } - expand_conv_arg(&arg->conv_args[i]); - } - arg->output.address = out_ptr; - arg->output.scale_address = out->scale; + for (int j = 0; j < split_num; ++j) { + arg->split_conv_args[i].conv_arg[j].relu_enabled = relu_enabled; + arg->split_conv_args[i].conv_arg[j].group_num = (uint32_t)group_num; + + arg->split_conv_args[i].conv_arg[j].kernel.width = + (uint32_t)sub_filter_width; + arg->split_conv_args[i].conv_arg[j].kernel.height = + (uint32_t)sub_filter_width; + arg->split_conv_args[i].conv_arg[j].kernel.stride_w = 1; + arg->split_conv_args[i].conv_arg[j].kernel.stride_h = 1; + + arg->split_conv_args[i].conv_arg[j].image.scale_address = input->scale; + arg->split_conv_args[i].conv_arg[j].image.channels = + (uint32_t)sub_channels; + arg->split_conv_args[i].conv_arg[j].image.width = + (uint32_t)input->dims()[3]; + arg->split_conv_args[i].conv_arg[j].image.height = + (uint32_t)input->dims()[2]; + arg->split_conv_args[i].conv_arg[j].image.pad_width = (uint32_t)sub_pad; + arg->split_conv_args[i].conv_arg[j].image.pad_height = (uint32_t)sub_pad; + arg->split_conv_args[i].conv_arg[j].image.address = input_ptr; + + arg->split_conv_args[i].conv_arg[j].filter_scale_address = filter->scale; + arg->split_conv_args[i].conv_arg[j].filter_num = (uint32_t)( + j == split_num - 1 + ? sub_filter_num - (split_num - 1) * filter_num_per_div // NOLINT + : filter_num_per_div); + + size_t filter_size = + element_num * + align_to_x(arg->split_conv_args[i].conv_arg[j].filter_num, + FILTER_NUM_ALIGNMENT) * + sizeof(int8_t); + auto filter_head = + &((int8_t *)filter_ptr)[j * element_num * filter_num_per_div + + i * filter_sub_conv_offset]; + arg->split_conv_args[i].conv_arg[j].filter_address = + fpga_malloc(filter_size); + memcpy(arg->split_conv_args[i].conv_arg[j].filter_address, filter_head, + filter_size); + fpga_flush(arg->split_conv_args[i].conv_arg[j].filter_address, + filter_size); + + { + static int test_cnt = 0; + signed char result = 0; + if (test_cnt <= 1) { + std::string filename = "deconv_split_flt" + std::to_string(test_cnt); + + fpga::savefile( + filename, arg->split_conv_args[i].conv_arg[j].filter_address, + filter_size, result); + test_cnt++; + } + } + + size_t bs_align_num = align_to_x( + arg->split_conv_args[i].conv_arg[j].filter_num, BS_NUM_ALIGNMENT); + size_t bs_size = 2 * bs_align_num * sizeof(float); + auto bs_head = &bs_ptr[j * filter_num_per_div * 2]; + + arg->split_conv_args[i].conv_arg[j].sb_address = fpga_malloc(bs_size); + memcpy(arg->split_conv_args[i].conv_arg[j].sb_address, bs_head, bs_size); + fpga_flush(arg->split_conv_args[i].conv_arg[j].sb_address, bs_size); + + if (split_num == 1) { + arg->split_conv_args[i].conv_arg[j].output.address = + arg->split_conv_args[i].output.address; + arg->split_conv_args[i].conv_arg[j].output.scale_address = + arg->split_conv_args[i].output.scale_address; + } else { + auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); + arg->split_conv_args[i].conv_arg[j].output.address = + (void *)((half *)ptr_output); + auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); + arg->split_conv_args[i].conv_arg[j].output.scale_address = + ptr_output_scale; + } + arg->split_conv_args[i].concat_arg.images_in[j] = + (half *)arg->split_conv_args[i].conv_arg[j].output.address; // NOLINT + arg->split_conv_args[i].concat_arg.scales_in[j] = + arg->split_conv_args[i].conv_arg[j].output.scale_address; + arg->split_conv_args[i].concat_arg.channel_num[j] = + arg->split_conv_args[i].conv_arg[j].filter_num; + + expand_conv_arg(&(arg->split_conv_args[i].conv_arg[j])); + } + + arg->split_conv_args[i].concat_arg.image_out = + arg->split_conv_args[i].output.address; + arg->split_conv_args[i].concat_arg.scale_out = + arg->split_conv_args[i].output.scale_address; + } filter->reset_data_ptr(nullptr); + fpga_free(bs_ptr); } // fill_deconv_arg } // namespace fpga diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h index 712973b16dd32a389c06526802a06d78bad4fab1..f6ce24c4edf0d81c0cb6a9bfa65b02ea6eeb813a 100644 --- a/src/fpga/V1/api.h +++ b/src/fpga/V1/api.h @@ -27,7 +27,12 @@ void format_fp32_ofm(framework::Tensor* ofm_tensor); float filter_find_max(framework::Tensor* filter_tensor); int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num); +int get_deconv_filter_num_per_div(framework::Tensor* filter_tensor, + int group_num, int stride); + int get_plit_num(framework::Tensor* filter_tensor); +int get_deconv_plit_num(framework::Tensor* filter_tensor, int stride); + int get_aligned_filter_element_num(int chw); void format_filter(framework::Tensor* filter_tensor, float max_value, int group_num); diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index 4121b4f0a458728bece76bbef44201991c46a0a7..f58e3a5bfde2f66b26d26c529ade229f79d5d05a 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -13,15 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "fpga/common/pe.h" +#include "common/types.h" #include "fpga/V1/filter.h" #include "fpga/V1/image.h" #include "fpga/common/config.h" #include "fpga/common/driver.h" +#ifdef COST_TIME_PRINT +#include +#include +#include +#include +//#include +#endif + namespace paddle_mobile { namespace fpga { using namespace driver; // NOLINT +using namespace std; #define USE_RELU 1 #define USE_BIAS 2 @@ -162,15 +172,17 @@ int ComputeFpgaConv(const struct SplitConvArgs &args) { << " group_num:" << args.group_num << " split_num:" << args.split_num; #endif - + int ret = 0; int split_num = args.split_num; for (int i = 0; i < split_num; i++) { - ComputeBasicConv(args.conv_arg[i]); + ret |= ComputeBasicConv(args.conv_arg[i]); } if (split_num > 1) { ComputeFPGAConcat(args.concat_arg); } + + return ret; } int ComputeBasicConv(const struct ConvArgs &args) { @@ -250,12 +262,13 @@ int ComputeBasicConv(const struct ConvArgs &args) { reg_writeq(args.driver.post_prog_full_cnt, 0xd10); reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20); reg_writeq(args.driver.cmd, REG_CONV_CMD); - + DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; ret = -EIO; DLOG << "Conv Wait Irq Timeout!"; } + DLOG << "after reg poll"; output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); @@ -289,6 +302,8 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { << " out_scale_address:" << args.output.scale_address; #endif #ifdef PADDLE_MOBILE_ZU5 + DLOG << "Polling"; + // return 0; uint64_t output_scale = 0; uint64_t timer_cnt = 0; int ret = 0; @@ -561,11 +576,13 @@ int PerformBypass(const struct BypassArgs &args) { reg_writeq(datalen, REG_CONVERT_LENGTH); reg_writeq(cmd, REG_CONVERT_CMD); + DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR; ret = -EIO; DLOG << "BYPASS Wait Irq Timeout!"; } + DLOG << "after reg poll"; output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); @@ -619,37 +636,29 @@ void deconv_post_process(const struct DeconvArgs &args) { int align_deconv_row_len = align_to_x(deconv_row_len, 16); for (int idx = 0; idx < sub_conv_n; ++idx) { - fpga_invalidate(args.conv_args[idx].output.address, - align_origin_w * origin_h * sizeof(int16_t)); + paddle_mobile::fpga::fpga_invalidate( + args.split_conv_args[idx].output.address, + align_origin_w * origin_h * sizeof(int16_t)); } - auto ptr_deconv = (int16_t *)fpga_malloc(num * align_deconv_row_len * - deconv_h * sizeof(int16_t)); - memset(ptr_deconv, 0, - num * align_deconv_row_len * deconv_h * sizeof(int16_t)); int deconv_idx = 0; for (int nn = 0; nn < num; ++nn) { for (int hh = 0; hh < origin_h; ++hh) { int hx = (hh % sub_conv_n); auto sub_t = - (int16_t *)(args.conv_args[sub_conv_n - hx - 1].output.address); + (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1].output.address); int hi = (hh / sub_conv_n); if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w + omit_size * channel); - - fpga_copy(ptr_deconv + deconv_idx, sub_t + sidx, + fpga_copy((int16_t *)(args.output.address) + deconv_idx, sub_t + sidx, sizeof(int16_t) * deconv_row_len); deconv_idx += align_deconv_row_len; } } - fpga_copy(args.output.address, ptr_deconv, - num * align_deconv_row_len * deconv_h * sizeof(int16_t)); fpga_flush(args.output.address, num * align_deconv_row_len * deconv_h * sizeof(int16_t)); - fpga_free(ptr_deconv); - -} // deconv_post_process +} int ComputeFpgaDeconv(const struct DeconvArgs &args) { #ifdef FPGA_PRINT_MODE @@ -661,32 +670,70 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { << " sub_conv_num:" << args.sub_conv_num; DLOG << "args.output.address: " << args.output.address << "args.output.scale_address: " << args.output.scale_address; - DLOG << "args.conv_args.sb_address: " << (args.conv_args)->sb_address - << "args.conv_args.filter_address: " << (args.conv_args)->filter_address; -#endif -#ifndef PADDLE_MOBILE_ZU5 - return 0; #endif int sub_conv_num = args.sub_conv_num; + +#ifdef COST_TIME_PRINT + timeval start, end; + long dif_sec, dif_usec; +#endif + for (int i = 0; i < sub_conv_num; i++) { - ComputeBasicConv(args.conv_args[i]); +#ifdef COST_TIME_PRINT + gettimeofday(&start, NULL); +#endif + + ComputeFpgaConv(args.split_conv_args[i]); +#ifdef COST_TIME_PRINT + gettimeofday(&end, NULL); + dif_sec = end.tv_sec - start.tv_sec; + dif_usec = end.tv_usec - start.tv_usec; + std::cout << "deconv basic_conv: " << i << " times: " + << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" + << std::endl; +#endif } if (sub_conv_num > 1) { float max_scale = -1.0f; +#ifdef COST_TIME_PRINT + gettimeofday(&start, NULL); +#endif for (int i = 0; i < sub_conv_num; i++) { paddle_mobile::fpga::fpga_invalidate( - args.conv_args[i].output.scale_address, 2 * sizeof(float)); - float ptr_scale = (args.conv_args[i].output.scale_address)[0]; + args.split_conv_args[i].output.scale_address, 2 * sizeof(float)); + float ptr_scale = (args.split_conv_args[i].output.scale_address)[0]; if (ptr_scale > max_scale) { args.output.scale_address[0] = ptr_scale; args.output.scale_address[1] = - (args.conv_args[i].output.scale_address)[1]; + (args.split_conv_args[i].output.scale_address)[1]; } } + +#ifdef COST_TIME_PRINT + gettimeofday(&end, NULL); + dif_sec = end.tv_sec - start.tv_sec; + dif_usec = end.tv_usec - start.tv_usec; + std::cout << "deconv scale " + << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" + << std::endl; +#endif + + // fpga_flush(args.output.scale_address, 2 * sizeof(float)); +#ifdef COST_TIME_PRINT + gettimeofday(&start, NULL); +#endif deconv_post_process(args); +#ifdef COST_TIME_PRINT + gettimeofday(&end, NULL); + dif_sec = end.tv_sec - start.tv_sec; + dif_usec = end.tv_usec - start.tv_usec; + std::cout << "deconv_post_process " + << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" + << std::endl; +#endif } return 0; diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp index d9f4078eb514d73a7d7a1481d557323d94d6f8cc..00b546a5d6bbde97591827fbcdf4854b6ee82a38 100755 --- a/src/fpga/common/fpga_common.cpp +++ b/src/fpga/common/fpga_common.cpp @@ -59,6 +59,9 @@ int close_device() { void *fpga_malloc(size_t size) { static uint64_t counter = 0; + if (size <= 0) { + size = 1; + } #ifdef PADDLE_MOBILE_ZU5 auto ptr = driver::fpga_malloc_driver(size); #else diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index af6f18eb57b0e710359235790529c15ce4e82917..eabe03c29f2f8a6805b63131e767de9e86d7c055 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -210,7 +210,7 @@ struct DeconvArgs { uint32_t sub_output_width; uint32_t sub_output_height; struct ImageOutputArgs output; - struct ConvArgs* conv_args; + struct SplitConvArgs* split_conv_args; }; // static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; diff --git a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp index 7ab25193c86228ba290bc4a79fd1fa203a8e3617..18435d29ab64182ae26721374967fb7c0774437d 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp @@ -54,11 +54,11 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { fpga::format_deconv_filter(filter, max_value, param->Groups(), param->Strides()[0]); - // int element_num_per_div = - // fpga::get_filter_num_per_div(filter, param->Groups()); + int element_num_per_div = + fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n); - // deconv only support group=1 && no spilt - fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n, + // + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel * sub_conv_n); fpga::format_fp16_ofm(out); diff --git a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp index a9a6909b00b7608d5edb26aa943c1c9cae663b3e..92f3addd821a36974de1bab9226d152329259ccd 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp @@ -55,11 +55,10 @@ bool DeconvAddReluKernel::Init( fpga::format_deconv_filter(filter, max_value, param->Groups(), param->Strides()[0]); - // int element_num_per_div = - // fpga::get_filter_num_per_div(filter, param->Groups()); + int element_num_per_div = + fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n); - // deconv only support group=1 && no spilt - fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n, + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel * sub_conv_n); fpga::format_fp16_ofm(out);