From 6b14134f17c97606fb17f1f3861c5ee22829f350 Mon Sep 17 00:00:00 2001 From: zhangyang Date: Mon, 3 Dec 2018 14:07:45 +0800 Subject: [PATCH] add deconv op for V1 for FPGA track --- src/fpga/V1/api.cpp | 28 +- src/fpga/V1/filter.cpp | 43 +- src/fpga/V1/pe.cpp | 787 +++++++++++++++++++++++++++++++++- src/fpga/common/driver.cpp | 18 +- src/fpga/common/driver.h | 9 +- src/fpga/common/fpga_common.h | 31 +- 6 files changed, 864 insertions(+), 52 deletions(-) diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 7c1f15f7c9..ef7d5c13dc 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -196,19 +196,35 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; arg->conv_arg[i].filter_scale_address = filter->scale; - arg->conv_arg[i].filter_address = &( - (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT - arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; + // arg->conv_arg[i].filter_address = &( + // (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // + // NOLINT + // arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; + arg->conv_arg[i].filter_num = (uint32_t)( i == n - 1 ? 
channel - (n - 1) * filter_num_per_div // NOLINT : filter_num_per_div); + size_t filter_size = + element_num * arg->conv_arg[i].filter_num * sizeof(int8_t); + auto filter_head = + &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; + arg->conv_arg[i].filter_address = fpga_malloc(filter_size); + memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); + fpga_flush(arg->conv_arg[i].filter_address, filter_size); + + size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float); + auto bs_head = &bs_ptr[i * filter_num_per_div * 2]; + arg->conv_arg[i].sb_address = fpga_malloc(bs_size); + memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); + fpga_flush(arg->conv_arg[i].sb_address, bs_size); + if (n > 1) { arg->conv_arg[i].output.scale_address = (float *)fpga_malloc(2 * sizeof(float)); // NOLINT arg->conv_arg[i].output.address = - fpga_malloc(input->dims()[2] * - align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num, + fpga_malloc(out->dims()[2] * + align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num, IMAGE_ALIGNMENT) * sizeof(half)); } else { @@ -221,6 +237,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; } + filter->reset_data_ptr(nullptr); + fpga_free(bs_ptr); } } // namespace fpga diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp index 157ac90a60..d67c9fdc18 100644 --- a/src/fpga/V1/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -137,24 +137,23 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num, int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); int num_per_div_after_alignment = align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - if (num_per_div_after_alignment != num_per_div_before_alignment) { - char *tmp = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int 
num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - memset(data_tmp, 0, num_element * sizeof(char)); + char *tmp = *data_in; + int div_num = + (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; + int num_element = div_num * num_per_div_after_alignment * align_chw; + char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - for (i = 0; i < div_num; i++) { - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, - num_per_div_before_alignment * align_chw); - } + memset(data_tmp, 0, num_element * sizeof(char)); - *data_in = data_tmp; - fpga_free(tmp); + for (i = 0; i < div_num; i++) { + memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, + *data_in + num_per_div_before_alignment * align_chw * i, + num_per_div_before_alignment * align_chw); } + + *data_in = data_tmp; + fpga_free(tmp); } void reorder(char **data_in, int num_after_alignment, int chw) { @@ -223,7 +222,10 @@ void format_filter(float **data_in, int num, int channel, int height, int width, char **quantize_data = (char **)data_in; // NOLINT convert_to_hwc(quantize_data, num, channel, height, width); align_element(quantize_data, num, chw); - align_num(quantize_data, num_per_div_before_alignment, num, chw); + if (num_after_alignment != num) { + align_num(quantize_data, num_per_div_before_alignment, num, chw); + } + reorder(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw); fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * @@ -254,15 +256,18 @@ void format_fc_filter(float **data_in, int num, int channel, int height, align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * div_num; + int 
residual = num % num_per_div_before_alignment; + int num_after_alignment = num_per_div_after_alignment * + ((residual == 0) ? div_num : (div_num - 1)) + + align_to_x(residual, FILTER_NUM_ALIGNMENT); quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_fc_filter(quantize_data, num, chw); align_element(quantize_data, num, chw); - align_num(quantize_data, num_per_div_before_alignment, num, chw); + if (num_after_alignment != num) { + align_num(quantize_data, num_per_div_before_alignment, num, chw); + } reorder(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw); fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index 1f0e5768a7..d62f015e66 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -13,16 +13,172 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "fpga/common/pe.h" +#include +#include +#include +#include "common/types.h" #include "fpga/V1/filter.h" #include "fpga/V1/image.h" #include "fpga/common/config.h" #include "fpga/common/driver.h" +using namespace std; +using namespace paddle_mobile::fpga::driver; // NOLINT namespace paddle_mobile { namespace fpga { +#define IMAGE_ALIGN 16 +#define FILTER_ALIGN 16 +#define FILTER_NUM_ALIGN 32 +#define USE_RELU 1 +#define USE_BIAS 2 + +// bypass cmd +#define CMD_FP16_TO_FP16 0 +#define CMD_FP16_TO_FP32 1 +#define CMD_FP32_TO_FP16 2 +#define CMD_FP32_TO_FP32 3 + +// bypass macro +#define SIZE_FP16 2 +#define SIZE_FP32 4 + +#define PE_IRQ_TIMEOUT 1000000 + +/* Interrupt bit-set offset*/ +#define INTERRUPT_RSVD 0x0001 +#define INTERRUPT_BYPASS 0x0002 +#define INTERRUPT_CONV 0x0004 +#define INTERRUPT_POOLING 0x0008 +#define INTERRUPT_EW 0x0010 +//#define INTERRUPT_RESIZE 0x0020 + +/* Register offset */ +#define REG_INTERRUPT 0x000 +#define REG_VERSION 0x008 +#define REG_TEMPERATURE 0x010 +#define 
REG_FPGA_RESET 0x018 +#define REG_TEST_REGISTER 0x048 +#define REG_HARDWARE_STATUS 0x050 + +#define REG_TIMER_COUNTER 0x070 + +#define REG_SCALE_PARAMETER 0x080 + +#define REG_FLASH_CMD 0x200 +#define REG_FLASH_DATA 0x208 +#define REG_FLASH_CONFIG 0x210 +#define REG_FLASH_STATUS 0x218 +#define REG_SN 0x220 + +//#define REG_READ_SCALE +//#define REG_WRITE_SCALE + +/*bypass*/ +#define REG_CONVERT_CMD 0x400 +#define REG_CONVERT_SRC_ADDR 0x408 +#define REG_CONVERT_DST_ADDR 0x410 +#define REG_CONVERT_LENGTH 0x418 + +/*resize*/ +#define REG_RESIZE_CMD 0x600 +#define REG_RESIZE_CHANNEL_NUMBER 0x608 +#define REG_RESIZE_INPUT_IMAGE_PIXEL 0x610 +#define REG_RESIZE_OUTPUT_IMAGE_PIXEL 0x618 +#define REG_RESIZE_INPUT_BASE_ADDR 0x620 +#define REG_RESIZE_WEIGHT_BASE_ADDR 0x628 +#define REG_RESIZE_SRC_POS_BASE_ADDR 0x630 +#define REG_RESIZE_OUTPUT_BASE_ADDR 0x638 + +/*pooling*/ +#define REG_POOLING_CMD 0x800 +#define REG_POOLING_IMAGE_BASE_ADDR 0x808 +#define REG_POOLING_RESULT_BASE_ADDR 0x810 +#define REG_POOLING_IMAGE_PIXEL 0x818 +#define REG_POOLING_WINDOW_SIZE 0x820 +#define REG_POOLING_RESULT_PIXEL 0x828 +#define REG_POOLING_PAD_PIXEL 0x830 +#define REG_POOLING_STEP_PIXEL 0x838 +#define REG_POOLING_CHANNEL_NUMBER 0x840 +#define REG_POOLING_IMAGE_AMOUNT_PER_ROW 0x848 +#define REG_POOLING_IMAGE_ONE_PAD_PER_ROW 0x850 +#define REG_POOLING_IMAGE_TWO_PAD_PER_ROW 0x858 +#define REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT 0x860 +#define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868 +#define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870 +#define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878 +#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880 +#define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888 +#define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898 +#define REG_POOLING_MODE_RECIPROCAL 0x890 + +/*conv*/ +#define REG_CONV_CMD 0xC00 +#define REG_CONV_IMAGE_BASE_ADDR 0xC08 +#define REG_CONV_FILTER_BASE_ADDR 0xC10 +#define REG_CONV_SB_BASE_ADDR 0xC18 +#define REG_CONV_RESULT_BASE_ADDR 0xC20 +#define 
REG_CONV_IMAGE_PIXEL 0xC28 +#define REG_CONV_FILTER_PIXEL 0xC30 +#define REG_CONV_RESULT_PIXEL 0xC38 +#define REG_CONV_PAD_PIXEL 0xC40 +#define REG_CONV_STEP_PIXEL 0xC48 +#define REG_CONV_GROUP_NUMBER 0xC50 +#define REG_CONV_FILTER_NUMBER 0xC58 +#define REG_CONV_CHANNEL_NUMBER 0xC60 +#define REG_CONV_FILTER_PER_GROUP 0xC68 +#define REG_CONV_CHANNEL_PER_GROUP 0xC70 +#define REG_CONV_IMAGE_AMOUNT_PER_ROW 0xC78 +#define REG_CONV_IMAGE_ONE_PAD_PER_ROW 0xC80 +#define REG_CONV_IMAGE_TWO_PAD_PER_ROW 0xC88 +#define REG_CONV_FILTER_AMOUNT_ALL 0xC90 +#define REG_CONV_RESULT_AMOUNT_PER_ROW 0xC98 +#define REG_CONV_RESULT_LAST_VALID 0xCA0 + +#define REG_CONV_BLOCK_AMOUNT_PER_ROW 0xCA8 +#define REG_CONV_FILTER_PAD_WIDTH_MUL_CH 0xCB0 +#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN_F 0xCB8 +#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN 0xCC0 +#define REG_CONV_IMAGE_BLOCK_NUM 0xCC8 +#define REG_CONV_IMAGE_BLOCK_LEN 0xCD0 +#define REG_CONV_IMAGE_BLOCK_LEN_LAST 0xCD8 +#define REG_CONV_IMAGE_WIN_CNT 0xCE0 +#define REG_CONV_IMAGE_WIN_CNT_LAST 0xCE8 +#define REG_CONV_RES_ROW_DATA_ALIGN4_PAD 0xCF8 +#define REG_CONV_PROG_FULL_CNT 0xD08 +#define REG_CONV_POST_PROG_FULL_CNT 0xD10 +#define REG_CONV_FPGA_BIAS_SCALE_LEN 0xD20 + +#define REG_CONV_IMAGE_SCALE 0xD28 +#define REG_CONV_FILTER_SCALE 0xD30 + +/*ew*/ +#define REG_EW_CMD 0x0F00 +#define REG_EW_IMAGE0_BASE_ADDR 0x0F08 +#define REG_EW_IMAGE1_BASE_ADDR 0x0F10 +#define REG_EW_RESULT_BASE_ADDR 0x0F18 +#define REG_EW_DATA_LEN 0x0F20 +#define REG_EW_COEFFICIENT 0x0F28 +#define REG_EW_IMAGE_PIXEL 0x0F30 +#define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38 + int ComputeFpgaConv(const struct SplitConvArgs &args) { - ComputeBasicConv(args.conv_arg[0]); +// ComputeBasicConv(args.conv_arg[0]); +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFPGAConv==========="; + DLOG << " filter_num:" << args.filter_num + << " group_num:" << args.group_num + << " split_num:" << args.split_num; +#endif + + int split_num = args.split_num; + for (int i = 0; i < 
split_num; i++) { + ComputeBasicConv(args.conv_arg[i]); + } + + if (split_num > 1) { + ComputeFPGAConcat(args.concat_arg); + } } int ComputeBasicConv(const struct ConvArgs &args) { @@ -47,9 +203,237 @@ int ComputeBasicConv(const struct ConvArgs &args) { DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; #endif + cout << " relu_enabled:" << args.relu_enabled + << " sb_address:" << args.sb_address + << " filter_address:" << args.filter_address + << " filter_num:" << args.filter_num + << " group_num:" << args.group_num; + cout << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + cout << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + cout << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; -#ifndef PADDLE_MOBILE_ZU5 - return 0; +#ifdef PADDLE_MOBILE_ZU5 + DLOG << "Conv"; + // return 0; + uint64_t timer_cnt; + uint64_t output_scale; + uint64_t image_scale; + uint64_t filter_scale; + uint64_t image_address_phy = 0; + uint64_t sb_address_phy = 0; + uint64_t filter_address_phy = 0; + uint64_t output_address_phy = 0; + int ret = 0; + + fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float)); + fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float)); + + cout << "image_scale :" << hex << (image_scale) << endl; + cout << "filter_scale :" << hex << (filter_scale) << endl; + + uint64_t filterlen = (uint64_t)args.kernel.width * + (uint64_t)args.kernel.height * + (uint64_t)args.image.channels; + filterlen = align_to_x(filterlen, FILTER_ALIGN); + filterlen *= 
align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGN); + uint64_t fpga_bias_scale_len = + align_to_x(args.filter_num / args.group_num, 8) * args.group_num; + + uint64_t output_height = + (args.image.height + args.image.pad_height * 2 - args.kernel.height) / + args.kernel.stride_h + + 1; + uint64_t output_width = + (args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + + 1; + uint64_t output_size = + output_height * output_width * (uint64_t)args.filter_num; + + uint64_t filter_per_group = (uint64_t)(args.filter_num / args.group_num); + uint64_t channel_per_group = (uint64_t)(args.image.channels / args.group_num); + + uint64_t image_row_count = ((uint64_t)args.image.width) * + ((uint64_t)args.image.channels); // without align + uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGN); + uint64_t image_one_pad_per_row = + align_to_x(image_row_count, IMAGE_ALIGN) + + ((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels); + uint64_t filter_amount_all = + align_to_x(((uint64_t)args.kernel.height) * + ((uint64_t)args.kernel.width) * channel_per_group, + FILTER_ALIGN); + + uint64_t output_amount_per_row = + align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGN); + + // find the opt partition strategy + uint64_t res_win; + uint64_t res_fit = 0; + for (res_win = 1; res_win <= output_width; res_win = res_win + 1) { + if ((align_to_x( + (args.image.channels * + (args.kernel.width + (res_win - 1) * args.kernel.stride_w)), + IMAGE_ALIGN) / + 16 + + 1) * + args.kernel.height > + 2048) { + break; + } + } + + if (res_win != output_width) { + res_win -= 1; + } + + if (((res_win % 2) != 0) && (res_win != 1)) { + res_win = res_win - 1; + } + res_fit = res_win; + + uint64_t block_num = (output_width + res_fit - 1) / res_fit; + uint64_t block_len = res_fit; + uint64_t block_last = output_width - res_fit * (block_num - 1); + + uint64_t res_amount_per_row = output_width * args.filter_num; + uint64_t 
res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; + + uint64_t image_block_amount_per_row = + args.kernel.stride_w * (res_fit)*args.image.channels; + uint64_t filter_pad_width_mul_channel = + args.image.pad_width * args.image.channels; + uint64_t image_amount_per_row_multi_win_first = + image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height); + uint64_t image_amount_per_row_multi_win = + image_amount_per_row * (4 * args.kernel.stride_h); + + uint64_t image_block_num = block_num; + uint64_t image_block_len = + align_to_x((args.image.channels * + (args.kernel.width + (block_len - 1) * args.kernel.stride_w)), + IMAGE_ALIGN) / + 16 + + 1; + uint64_t image_block_len_last = + align_to_x( + (args.image.channels * + (args.kernel.width + (block_last - 1) * args.kernel.stride_w)), + IMAGE_ALIGN) / + 16 + + 1; + uint64_t image_win_cnt = block_len; + uint64_t image_win_cnt_last = block_last; + uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8; + uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1; + if (prog_full_cnt == 1023) { + prog_full_cnt--; + } + uint64_t post_prog_full_cnt = + (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) + ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) + : 0; + + image_address_phy = vaddr_to_paddr(args.image.address); + sb_address_phy = vaddr_to_paddr(args.sb_address); + filter_address_phy = vaddr_to_paddr(args.filter_address); + output_address_phy = vaddr_to_paddr(args.output.address); + + /*SDK刷Cache保证数据一致性*/ + uint64_t cmd = 0UL | (args.relu_enabled ? 
USE_RELU : 0) | USE_BIAS; + + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { + ret = -EIO; + DLOG << "Conv Status Error!"; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; + } + + /*restart scale*/ + reg_writeq(output_scale, REG_SCALE_PARAMETER); + + reg_writeq(image_address_phy, REG_CONV_IMAGE_BASE_ADDR); + reg_writeq(filter_address_phy, REG_CONV_FILTER_BASE_ADDR); + reg_writeq(sb_address_phy, REG_CONV_SB_BASE_ADDR); + reg_writeq(output_address_phy, REG_CONV_RESULT_BASE_ADDR); + + reg_writeq( + ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), + REG_CONV_IMAGE_PIXEL); + reg_writeq( + ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), + REG_CONV_FILTER_PIXEL); + reg_writeq(output_height | (output_width << 32), REG_CONV_RESULT_PIXEL); + reg_writeq(((uint64_t)args.image.pad_height) | + (((uint64_t)args.image.pad_width) << 32), + REG_CONV_PAD_PIXEL); + reg_writeq(((uint64_t)args.kernel.stride_h) | + (((uint64_t)args.kernel.stride_w) << 32), + REG_CONV_STEP_PIXEL); + + reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER); + reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER); + reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER); + + reg_writeq(filter_per_group, REG_CONV_FILTER_PER_GROUP); + reg_writeq(channel_per_group, REG_CONV_CHANNEL_PER_GROUP); + + reg_writeq(image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW); + reg_writeq(image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW); + reg_writeq(filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL); + reg_writeq(output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW); + + reg_writeq(image_block_amount_per_row, 0xca8); + reg_writeq(filter_pad_width_mul_channel, 0xcb0); + reg_writeq(image_amount_per_row_multi_win_first, 0xcb8); + reg_writeq(image_amount_per_row_multi_win, 0xcc0); + reg_writeq(image_block_num, 0xcc8); + reg_writeq(image_block_len, 0xcd0); + 
reg_writeq(image_block_len_last, 0xcd8); + reg_writeq(image_win_cnt, 0xce0); + reg_writeq(image_win_cnt_last, 0xce8); + reg_writeq(res_row_data_align4_pad, 0xcf8); + reg_writeq(prog_full_cnt, 0xd08); + reg_writeq(post_prog_full_cnt, 0xd10); + reg_writeq(fpga_bias_scale_len / 4, 0xd20); + + /*write scale*/ + reg_writeq(image_scale, REG_CONV_IMAGE_SCALE); + reg_writeq(filter_scale, REG_CONV_FILTER_SCALE); + + reg_writeq(cmd, REG_CONV_CMD); + + DLOG << "before reg poll"; + if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { + g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; + ret = -EIO; + DLOG << "Conv Wait Irq Timeout!"; + } + DLOG << "after reg poll"; + usleep(40); + + /*SDK 无效 Cache保证数据一致性*/ + + output_scale = reg_readq(REG_SCALE_PARAMETER); + output_scale = (output_scale << 32) | (output_scale >> 32); + fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); + cout << "output_scale :" << hex << (output_scale) << endl; + + //*(args.output.scale_address) = output_scale; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + + return ret; #endif return 0; @@ -74,8 +458,135 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; #endif -#ifndef PADDLE_MOBILE_ZU5 - return 0; +#ifdef PADDLE_MOBILE_ZU5 + DLOG << "Polling"; + // return 0; + uint64_t output_scale = 0; + uint64_t timer_cnt = 0; + int ret = 0; + uint64_t cmd = 0; + + uint64_t image_physical_address = 0; + uint64_t output_physical_address = 0; + + image_physical_address = vaddr_to_paddr(args.image.address); + output_physical_address = vaddr_to_paddr(args.output.address); + + uint32_t output_height = (uint32_t)( + (args.image.height + args.image.pad_height * 2 - args.kernel.height) / + args.kernel.stride_h + + 1); + uint32_t output_width = (uint32_t)( + (args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + + 1); + + uint64_t 
image_amount_per_row = align_to_x( + (uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGN); + uint64_t image_one_pad_per_row = + align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, + FILTER_ALIGN) + + (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; + uint64_t image_two_pad_per_row = align_to_x( + ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * + (uint64_t)args.image.channels, + IMAGE_ALIGN); + uint64_t image_row_mul_pooling_hight = + image_amount_per_row * (uint64_t)args.kernel.height; + uint64_t image_row_mul_pad_hight = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_row_mul_step_hight = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t result_amount_align_32 = align_to_x( + (uint64_t)output_width * (uint64_t)args.image.channels, FILTER_ALIGN); + uint64_t result_amount_align_64 = align_to_x( + (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGN); + uint64_t image_calcu_height = + (uint64_t)args.kernel.height + + ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; + + uint64_t image_pad_left = args.image.channels * args.image.pad_width; + uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; + + uint64_t image_padleft_skipwindow = + (image_skip_window << 32) | image_pad_left; + + uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 | + (((uint64_t)args.kernel_reciprocal)); + + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { + ret = -EIO; + DLOG << "Conv Status Error!"; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; + } + + /*restart scale*/ + reg_writeq(output_scale, REG_SCALE_PARAMETER); + + reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); + reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); + + reg_writeq( + ((uint64_t)args.image.height) | (((uint64_t)args.image.width) 
<< 32), + REG_POOLING_IMAGE_PIXEL); + reg_writeq( + ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), + REG_POOLING_WINDOW_SIZE); + + reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), + REG_POOLING_RESULT_PIXEL); + + reg_writeq(((uint64_t)args.image.pad_height) | + (((uint64_t)args.image.pad_width) << 32), + REG_POOLING_PAD_PIXEL); + reg_writeq(((uint64_t)args.kernel.stride_h) | + (((uint64_t)args.kernel.stride_w) << 32), + REG_POOLING_STEP_PIXEL); + + reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); + + reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); + reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); + reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); + + reg_writeq(image_row_mul_pooling_hight, + REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); + reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); + reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); + + reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); + reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); + + reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); + + reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); + reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL); + + /*SDK刷Cache保证数据一致性*/ + + reg_writeq(cmd, REG_POOLING_CMD); + + DLOG << "before reg poll"; + if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { + g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; + ret = -EIO; + DLOG << "Pooling Wait Irq Timeout!"; + } + DLOG << "after reg poll"; + usleep(40); + + /*SDK 无效 Cache保证数据一致性*/ + + // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); + output_scale = reg_readq(REG_SCALE_PARAMETER); + output_scale = (output_scale << 32) | (output_scale >> 32); + fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 
2); + //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + + return ret; #endif return 0; } @@ -103,8 +614,73 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; #endif -#ifndef PADDLE_MOBILE_ZU5 - return 0; +#ifdef PADDLE_MOBILE_ZU5 + DLOG << "Conv"; + // return 0; + int ret = 0; + uint64_t output_scale = 0; + uint64_t timer_cnt = 0; + uint64_t image0_address_phy = 0; + uint64_t image1_address_phy = 0; + uint64_t output_address_phy = 0; + + uint64_t cmd = args.relu_enabled ? USE_RELU : 0; + uint64_t datalen = (uint64_t)args.image0.width * + (uint64_t)args.image0.height * + (uint64_t)args.image0.channels; + uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1; + + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { + ret = -EIO; + DLOG << "Conv Status Error!"; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; + } + + image0_address_phy = vaddr_to_paddr(args.image0.address); + image1_address_phy = vaddr_to_paddr(args.image1.address); + output_address_phy = vaddr_to_paddr(args.output.address); + + uint64_t image_amount_per_row = + align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, + IMAGE_ALIGN); + uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | + ((uint64_t)args.image0.width << 16) | + (uint64_t)args.image0.height; + + /*SDK刷Cache保证数据一致性*/ + + /*restart scale*/ + reg_writeq(output_scale, REG_SCALE_PARAMETER); + + reg_writeq(image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); + reg_writeq(image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); + reg_writeq(datalen, REG_EW_DATA_LEN); + reg_writeq(image_image_pixel, REG_EW_IMAGE_PIXEL); + reg_writeq(image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); + + reg_writeq(output_address_phy, REG_EW_RESULT_BASE_ADDR); + 
reg_writeq(coefficient, REG_EW_COEFFICIENT); + + reg_writeq(cmd, REG_EW_CMD); + + if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { + g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; + ret = -EIO; + DLOG << "EW Wait Irq Timeout!"; + } + usleep(40); + + /*SDK 无效 Cache保证数据一致性*/ + output_scale = reg_readq(REG_SCALE_PARAMETER); + output_scale = (output_scale << 32) | (output_scale >> 32); + fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); + + //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); + //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; #endif return 0; } @@ -126,8 +702,117 @@ int PerformBypass(const struct BypassArgs &args) { DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; #endif -#ifndef PADDLE_MOBILE_ZU5 - return 0; +#ifdef PADDLE_MOBILE_ZU5 + DLOG << "Bypass"; + // return 0; + struct fpga_pe *pe; + uint64_t output_scale = 0; + uint64_t timer_cnt = 0; + uint64_t cmd = 0; + uint64_t datalen = 0; + uint64_t input_address_phy = 0; + uint64_t output_address_phy = 0; + uint8_t data_cell_in = 0; + uint8_t data_cell_out = 0; + + int ret = 0; + + datalen = (uint64_t)args.image.width * (uint64_t)args.image.height * + (uint64_t)args.image.channels; + datalen = align_to_x(datalen, 16); + + input_address_phy = vaddr_to_paddr(args.image.address); + output_address_phy = vaddr_to_paddr(args.output.address); + DLOG << "input_phy:" << input_address_phy; + DLOG << "output_phy:" << output_address_phy; + + switch (args.input_data_type) { + case DATA_TYPE_FP16: { + switch (args.output_data_type) { + case DATA_TYPE_FP16: + data_cell_in = SIZE_FP16; + data_cell_out = SIZE_FP16; + cmd = CMD_FP16_TO_FP16; + break; + + case DATA_TYPE_FP32: + data_cell_in = SIZE_FP16; + data_cell_out = SIZE_FP32; + cmd = CMD_FP16_TO_FP32; + break; + + default: + break; + } + } break; + + case 
DATA_TYPE_FP32: { + switch (args.output_data_type) { + case DATA_TYPE_FP16: + data_cell_in = SIZE_FP32; + data_cell_out = SIZE_FP16; + cmd = CMD_FP32_TO_FP16; + break; + + case DATA_TYPE_FP32: + data_cell_in = SIZE_FP32; + data_cell_out = SIZE_FP32; + cmd = CMD_FP32_TO_FP32; + break; + + default: + break; + } + } break; + + default: + break; + } + if (cmd != CMD_FP16_TO_FP16 && cmd != CMD_FP16_TO_FP32 && + cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32) { + return -EFAULT; + } + if ((data_cell_in != SIZE_FP16 && data_cell_in != SIZE_FP32) || + (data_cell_out != SIZE_FP16 && data_cell_out != SIZE_FP32)) { + return -EFAULT; + } + + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status) { + ret = -EIO; + DLOG << "Bypass Status Error!"; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; + } + + /*restart scale*/ + reg_writeq(output_scale, REG_SCALE_PARAMETER); + + reg_writeq(input_address_phy, REG_CONVERT_SRC_ADDR); + reg_writeq(output_address_phy, REG_CONVERT_DST_ADDR); + reg_writeq(datalen, REG_CONVERT_LENGTH); + + /*SDK刷Cache保证数据一致性*/ + reg_writeq(cmd, REG_CONVERT_CMD); + + DLOG << "before reg poll"; + if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) { + g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR; + ret = -EIO; + DLOG << "BYPASS Wait Irq Timeout!"; + } + DLOG << "after reg poll"; + usleep(40); + + /*SDK 无效 Cache保证数据一致性*/ + output_scale = reg_readq(REG_SCALE_PARAMETER); + output_scale = (output_scale << 32) | (output_scale >> 32); + fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); + + //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); + //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; #endif return 0; @@ -138,11 +823,14 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) { DLOG << "=============ComputeFpgaConcat==========="; DLOG << " 
Image_num: " << args.image_num << " out_address:" << args.image_out - << " out_scale_address:" << args.scale_out; + << " out_scale_address:" << args.scale_out + << " out_channel:" << args.out_channel; DLOG << " image_height:" << args.height << " image_width:" << args.width; for (int i = 0; i < args.image_num; i++) { DLOG << " " << i << "th: "; - DLOG << " channel_num:" << args.channel_num[i] + DLOG << " channel_num:" + << args.channel_num[i] + // << " aligned_channel_num:" << args.aligned_channel_num[i] << " image_address:" << args.images_in[i] << " image_scale_address:" << args.scales_in[i]; } @@ -154,6 +842,82 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) { return 0; } +void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel, + int sub_height, int sub_width, int omit_size) { + int origin_h = sub_height * sub_conv_n; + int origin_w = sub_width * sub_conv_n; + int align_origin_w = align_to_x(origin_w * channel, 16); + int deconv_h = origin_h - 2 * omit_size; + int deconv_w = origin_w - 2 * omit_size; + int deconv_row_len = deconv_w * channel; + int align_deconv_row_len = align_to_x(deconv_row_len, 16); + half *ptr_tmp = *data_in; + half *ptr_deconv = + (half *)fpga_malloc(num * align_deconv_row_len * deconv_h * sizeof(half)); + memset(ptr_deconv, 0, num * align_deconv_row_len * deconv_h * sizeof(half)); + int deconv_idx = 0; + for (int nn = 0; nn < num; ++nn) { + for (int hh = 0; hh < origin_h; ++hh) { + int hx = (hh % sub_conv_n); + half *sub_t = ptr_tmp + hx * sub_height * align_origin_w; // sub(hx,:); + + int hi = (hh / sub_conv_n); + + if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; + + // for (int ww = 0; ww < origin_w; ++ww){ + + // if((ww < omit_size) )// || (ww >= (origin_w-omit_size)) + // continue; + + int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w + + omit_size * channel); + + fpga_copy(ptr_deconv + deconv_idx, sub_t + sidx, + sizeof(half) * deconv_row_len); + deconv_idx += 
align_deconv_row_len; + //} + } + } + + *data_in = ptr_deconv; + fpga_free(ptr_tmp); +} +int ComputeFpgaDeconv(const struct DeconvArgs &args) { +#ifdef FPGA_TEST_MODE + DLOG << "=============ComputeFPGADeConv==========="; + DLOG << " filter_num:" << args.filter_num + << " group_num:" << args.group_num + << " sub_conv_num:" << args.sub_conv_num; +#endif + + int sub_conv_num = args.sub_conv_num; + + for (int i = 0; i < sub_conv_num; i++) { + //#if CPU_SIMULATE + + //#else + ComputeBasicConv(args.conv_args[i]); + //#endif + } + + if (sub_conv_num > 1) { + float max_scale = -1.0; + for (int i = 0; i < sub_conv_num; i++) { + float ptr_scale = (args.conv_args[i].output.scale_address)[0]; + if (ptr_scale > max_scale) { + args.output.scale_address[0] = max_scale = ptr_scale; // keep running max + args.output.scale_address[1] = + (args.conv_args[i].output.scale_address)[1]; + } + } + deconv_post_process((half **)(&(args.output.address)), args.sub_conv_num, 1, + args.filter_num, (args.sub_output_height), + (args.sub_output_width), args.omit_size); + } + return 0; +} + int ComputeFPGASplit(const struct SplitArgs &args) { #ifdef FPGA_PRINT_MODE DLOG << "=============ComputeFpgaSplit==========="; @@ -173,6 +937,5 @@ int ComputeFPGASplit(const struct SplitArgs &args) { args.height, args.width); return 0; } - } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/common/driver.cpp b/src/fpga/common/driver.cpp index 8c59ac14fb..2f592fe45d 100644 --- a/src/fpga/common/driver.cpp +++ b/src/fpga/common/driver.cpp @@ -137,11 +137,13 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { for (i = 0; i < timeout; i++) { if (val == reg_readq(reg)) { + std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg + << std::endl; break; } } - if (i <= timeout) { + if (i < timeout) { return 0; } else { return -1; @@ -153,6 +155,12 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); unsigned int nr = (unsigned 
int)_nr; int ret = 0; + DLOG << size; + DLOG << _nr; + DLOG << nr; + + uint64_t a_size = FPGA_PAGE_SIZE * nr; + DLOG << a_size; pthread_mutex_lock(&memory->mutex); @@ -166,6 +174,7 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { *addr = address_ofset; } else { + DLOG << "memory request failed!"; ret = -ENOMEM; } @@ -282,7 +291,7 @@ uint64_t vaddr_to_paddr(void *address) { if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { paddr = iter->second; } else { - DLOG << "Invalid pointer"; + DLOG << "Invalid pointer: " << address; } return paddr; @@ -348,6 +357,11 @@ void fpga_free_driver(void *ptr) { fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos, g_fpgainfo.memory_info->nr[pos]); pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex); + + auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr); + if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { + g_fpgainfo.fpga_vaddr2paddr_map.erase(iter); + } } else { DLOG << "Invalid pointer"; } diff --git a/src/fpga/common/driver.h b/src/fpga/common/driver.h index 2dad07ec52..c204370be7 100644 --- a/src/fpga/common/driver.h +++ b/src/fpga/common/driver.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include +#include #include #include @@ -44,7 +45,7 @@ const int PE_IDX_POOLING = 1; const int PE_IDX_EW = 2; const int PE_IDX_BYPASS = 3; -enum pe_status { IDLE = 0, BUSY = 1 }; +enum pe_status { IDLE = 0, BUSY = 1, ERROR = 2 }; struct MemoryCacheArgs { void *offset; @@ -58,7 +59,7 @@ struct MemoryCacheArgs { struct fpga_pe { char type_name[MAX_TYPE_NAME_LENTH + 1]; struct pe_data_s *outer; - pe_status status; // 0=idle 1=busy -1=fail + pe_status status; uint64_t interrupt_cnt; }; @@ -106,6 +107,8 @@ inline uint64_t reg_readq(uint32_t offset) { uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT offset); // NOLINT + // DLOG << "read end"; + usleep(10); return value; } @@ -114,6 +117,8 @@ inline void reg_writeq(uint64_t value, uint32_t offset) { // DLOG << "offset : " << offset << ", value : " << value; *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT offset) = value; + // DLOG << "write end"; + usleep(10); } int open_device_driver(); diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index b3f619f2f2..fdda65afda 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -74,12 +74,21 @@ struct ConcatArgs { void* image_out; float* scale_out; uint32_t* channel_num; - // uint32_t* aligned_channel_num; - // uint32_t out_channel; + uint32_t* aligned_channel_num; + uint32_t out_channel; uint32_t height; uint32_t width; }; +struct SplitConvArgs { + uint32_t split_num; + uint32_t group_num; + uint32_t filter_num; + struct ImageOutputArgs output; + struct ConvArgs* conv_arg; + struct ConcatArgs concat_arg; +}; + struct SplitArgs { uint32_t image_num; int16_t* image_in; @@ -91,15 +100,6 @@ struct SplitArgs { uint32_t width; }; -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_arg; - struct ConcatArgs concat_arg; -}; - struct PoolingArgs { int16_t mode; 
// mode: 0:max, 1:avg int16_t kernel_reciprocal; @@ -127,7 +127,14 @@ struct BypassArgs { }; struct DeconvArgs { - struct ConvArgs conv_arg; + uint32_t sub_conv_num; + uint32_t group_num; + uint32_t filter_num; + uint32_t omit_size; + uint32_t sub_output_width; + uint32_t sub_output_height; + struct ImageOutputArgs output; + struct ConvArgs* conv_args; }; static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } -- GitLab