/*
 * Patch summary (derived only from the hunks below).
 *
 * src/fpga/V1/bias_scale.cpp - format_bias_array:
 *   - The aligned buffer is now allocated and zeroed as int16_t
 *     (num_after_align elements) instead of float.
 *   - The first num_before_align entries are filled with
 *     fp32_2_fp16(ptr_unaligned[i]); the old memcpy path and the
 *     "num < 16" wrap-around replication (i % num) are removed.
 *   - *bias_array is set to the int16_t buffer cast to float*, then the old
 *     buffer is freed.
 *
 * src/fpga/V1/filter.cpp / filter.h:
 *   - align_element_nw is replaced by align_element_n. It aligns `num` to
 *     FILTER_ELEMENT_ALIGNMENT for every (h, w) position, instead of aligning
 *     num * width per row. Padding elements stay zero from the memset; the
 *     former replicate branch (nw % unalign_nw) is removed.
 *   - format_dwconv_filter calls align_element_n and flushes
 *     align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * height * width *
 *     sizeof(int16_t) bytes (previously align_to_x(num * width, ...) *
 *     height * sizeof(char)).
 *
 * src/fpga/V1/pe.cpp / src/fpga/common/pe.h:
 *   - Adds the REG_DWCONV_* register offset macros (0xe00, 0xe08, 0xe10,
 *     0xe18) and the new function ComputeDWConv(const DWconvArgs&).
 *   - When PADDLE_MOBILE_ZU5 is defined, ComputeDWConv converts the image,
 *     output, filter and bias addresses with vaddr_to_paddr, takes
 *     g_fpgainfo.pe_data->mutex, writes the REG_POOLING_* and REG_DWCONV_*
 *     registers, writes cmd (args.relu_enabled) to REG_DWCONV_CMD, and polls
 *     REG_INTERRUPT for INTERRUPT_POOLING. It then reads REG_SCALE_PARAMETER,
 *     swaps its two 32-bit halves, and copies sizeof(float) * 2 bytes to
 *     args.output.scale_address. It returns -EIO on a status error or poll
 *     timeout, otherwise 0. Without PADDLE_MOBILE_ZU5 it returns 0.
 *
 * src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp:
 *   - Compute now calls fpga::ComputeDWConv(param.FpgaDwconvArgs()) when
 *     param.Groups() == param.Output()->dims()[1]; the call was previously
 *     commented out.
 *
 * src/operators/kernel/fpga/V1/reshape_kernel.cpp:
 *   - Init now calls param->Out()->ShareDataWith(*param->InputX()).
 *
 * NOTE(review): this text looks like an extracted patch with line breaks
 *   collapsed, template argument lists missing, and the parameter written as
 *   "¶m" (probably a mis-encoded "&param") - confirm against the original
 *   patch before applying.
 * NOTE(review): ComputeDWConv checks and sets the PE_IDX_POOLING status and
 *   logs "Conv Status Error!" / "Pooling Wait Irq Timeout!"; presumably the
 *   depthwise path reuses the pooling engine - TODO confirm the messages are
 *   intended.
 * NOTE(review): `timer_cnt` is declared in ComputeDWConv but not used in the
 *   visible code.
 * NOTE(review): after the bias change, the float* stored in *bias_array
 *   refers to int16_t data; verify that every consumer expects that layout.
 */
diff --git a/src/fpga/V1/bias_scale.cpp b/src/fpga/V1/bias_scale.cpp index 215ba8c04f361d2aedc645fc202d90231e91b6c7..ffb5303c854bc0ac96ce05441a199232ac22d54b 100644 --- a/src/fpga/V1/bias_scale.cpp +++ b/src/fpga/V1/bias_scale.cpp @@ -86,20 +86,15 @@ void format_bias_array(float **bias_array, int num) { float *ptr_unaligned = *bias_array; int num_before_align = num; int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT); - float *ptr_aligned = - (float *)fpga_malloc(num_after_align * sizeof(float)); // NOLINT + int16_t *ptr_aligned = + (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT - memset(ptr_aligned, 0, num_after_align * sizeof(float)); - if (num < 16) { - memcpy(ptr_aligned, ptr_unaligned, num * sizeof(float)); - for (int i = num; i < num_after_align; i++) { - ptr_aligned[i] = ptr_unaligned[i % num]; - } - } else { - memcpy(ptr_aligned, ptr_unaligned, num * sizeof(float)); + memset(ptr_aligned, 0, num_after_align * sizeof(int16_t)); + for (int i = 0; i < num_before_align; i++) { + ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]); } + *bias_array = (float *)ptr_aligned; // NOLINT fpga_free(ptr_unaligned); - *bias_array = ptr_aligned; } } // namespace bias_scale diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp old mode 100644 new mode 100755 index 28fcbc3a201b9f17c3888404cd949dd5b35817ef..197448d515d67459b280bf33a14b8f8419970fc2 --- a/src/fpga/V1/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -292,34 +292,25 @@ void convert_to_hwn(int16_t **data_in, int num, int height, int width) { fpga_free(tmp); } -void align_element_nw(int16_t **data_in, int num, int height, int width) { - int unalign_nw = num * width; - int align_nw = align_to_x(num * width, FILTER_ELEMENT_ALIGNMENT); - if (unalign_nw == align_nw) { +void align_element_n(int16_t **data_in, int num, int height, int width) { + int unalign_n = num; + int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT); + if (unalign_n == align_n) { return; } else { int16_t *tmp = 
*data_in; - int num_element = height * align_nw; + int num_element = height * width * align_n; int16_t *data_tmp = (int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT memset(data_tmp, 0, num_element * sizeof(int16_t)); - if (unalign_nw >= FILTER_ELEMENT_ALIGNMENT) { - for (int h = 0; h < height; h++) { - int offset_unalign = h * unalign_nw; - int offset_align = h * align_nw; - for (int nw = 0; nw < unalign_nw; nw++) { - data_tmp[offset_align + nw] = *((*data_in) + offset_unalign + nw); - } - } - } else { - for (int h = 0; h < height; h++) { - int offset_unalign = h * unalign_nw; - int offset_align = h * align_nw; - for (int nw = 0; nw < align_nw; nw++) { - data_tmp[offset_align + nw] = - *((*data_in) + offset_unalign + nw % unalign_nw); + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + int offset_unalign = h * width * unalign_n + w * unalign_n; + int offset_align = h * width * align_n + w * align_n; + for (int n = 0; n < unalign_n; n++) { + data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n); } } } @@ -351,9 +342,9 @@ void format_dwconv_filter(float **data_in, int num, int height, int width, quantize_to_fp16(data_in, num, height, width, scale_ptr); int16_t **quantize_data = (int16_t **)data_in; // NOLINT convert_to_hwn(quantize_data, num, height, width); - align_element_nw(quantize_data, num, height, width); - fpga_flush(*quantize_data, align_to_x(num * width, FILTER_ELEMENT_ALIGNMENT) * - height * sizeof(char)); + align_element_n(quantize_data, num, height, width); + fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * + height * width * sizeof(int16_t)); } } // namespace filter } // namespace fpga diff --git a/src/fpga/V1/filter.h b/src/fpga/V1/filter.h old mode 100644 new mode 100755 index c4f44fc72c5010c5a338498f39bda7e9cd594e64..4812a75af2af97047f4b46a5dc7fdb9dfa11b456 --- a/src/fpga/V1/filter.h +++ b/src/fpga/V1/filter.h @@ -39,7 +39,7 @@ void format_fc_filter(float** data_in, int num, int 
channel, int height, int width, int group_num, float max); void convert_to_hwn(int16_t** data_in, int num, int height, int width); -void align_element_nw(int16_t** data_in, int num, int height, int width); +void align_element_n(int16_t** data_in, int num, int height, int width); void quantize_to_fp16(float** data_in, int num, int height, int width, float* scale_ptr); void format_dwconv_filter(float** data_in, int num, int height, int width, diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index 2791cc71a0c7f1323bb5ae603d9c766f777269ca..728d502af8c7ac1a37d200901e11ccc2913521b5 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -159,6 +159,12 @@ using namespace std; // NOLINT #define REG_EW_IMAGE_PIXEL 0x0F30 #define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38 +/*dwconv*/ +#define REG_DWCONV_FILTER_BASE_ADDR 0xe08 +#define REG_DWCONV_FILTER_SHAPE 0xe10 +#define REG_DWCONV_FILTER_N_ALIGN 0xe18 +#define REG_DWCONV_CMD 0xe00 + int ComputeFpgaConv(const struct SplitConvArgs &args) { // ComputeBasicConv(args.conv_arg[0]); #ifdef FPGA_PRINT_MODE @@ -746,6 +752,162 @@ int ComputeFPGASplit(const struct SplitArgs &args) { args.height, args.width); return 0; } // ComputeFPGASplit +int ComputeDWConv(const struct DWconvArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeDWConv==========="; + DLOG << " mode:" << args.relu_enabled; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " filter_address:" << args.filter_address + << " bias_address:" << args.bias_address; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << 
args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifdef PADDLE_MOBILE_ZU5 + DLOG << "DWConv"; + // return 0; + uint64_t output_scale = 0; + uint64_t timer_cnt = 0; + int ret = 0; + uint64_t cmd = args.relu_enabled; + uint64_t image_physical_address = 0; + uint64_t output_physical_address = 0; + uint64_t filter_physical_address = 0; + uint64_t bias_physical_address = 0; + + image_physical_address = vaddr_to_paddr(args.image.address); + output_physical_address = vaddr_to_paddr(args.output.address); + filter_physical_address = vaddr_to_paddr(args.filter_address); + bias_physical_address = vaddr_to_paddr(args.bias_address); + uint64_t filter_N_align = + align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT); + uint64_t filter_amount_per_row_align = + filter_N_align * (uint64_t)args.kernel.width; + uint64_t filter_amount_align = filter_N_align * (uint64_t)args.kernel.width * + (uint64_t)args.kernel.height; + + uint32_t output_height = (uint32_t)( + (args.image.height + args.image.pad_height * 2 - args.kernel.height) / + args.kernel.stride_h + + 1); + uint32_t output_width = (uint32_t)( + (args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + + 1); + uint64_t image_amount_per_row = + align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, + IMAGE_ALIGNMENT); + uint64_t image_one_pad_per_row = + align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, + FILTER_ELEMENT_ALIGNMENT) + + (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; + uint64_t image_two_pad_per_row = align_to_x( + ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * + (uint64_t)args.image.channels, + IMAGE_ALIGNMENT); + uint64_t image_row_mul_pooling_hight = + image_amount_per_row * (uint64_t)args.kernel.height; + uint64_t image_row_mul_pad_hight = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_row_mul_step_hight = + image_amount_per_row * 
(uint64_t)args.kernel.stride_h; + uint64_t result_amount_align_32 = + align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, + FILTER_ELEMENT_ALIGNMENT); + uint64_t result_amount_align_64 = align_to_x( + (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); + uint64_t image_calcu_height = + (uint64_t)args.kernel.height + + ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; + uint64_t image_pad_left = args.image.channels * args.image.pad_width; + uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; + + uint64_t image_padleft_skipwindow = + (image_skip_window << 32) | image_pad_left; + + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { + ret = -EIO; + DLOG << "Conv Status Error!"; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; + } + + /*restart scale*/ + reg_writeq(output_scale, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); + reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); + reg_writeq((bias_physical_address << 32 | filter_physical_address), + REG_DWCONV_FILTER_BASE_ADDR); + reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32), + REG_DWCONV_FILTER_SHAPE); + reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN); + + reg_writeq( + ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), + REG_POOLING_IMAGE_PIXEL); + reg_writeq( + ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), + REG_POOLING_WINDOW_SIZE); + + reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), + REG_POOLING_RESULT_PIXEL); + + reg_writeq(((uint64_t)args.image.pad_height) | + (((uint64_t)args.image.pad_width) << 32), + REG_POOLING_PAD_PIXEL); + reg_writeq(((uint64_t)args.kernel.stride_h) | + (((uint64_t)args.kernel.stride_w) << 32), + REG_POOLING_STEP_PIXEL); + + reg_writeq((uint64_t)args.image.channels, 
REG_POOLING_CHANNEL_NUMBER); + + reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); + reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); + reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); + + reg_writeq(image_row_mul_pooling_hight, + REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); + reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); + reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); + + reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); + reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); + + reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); + + reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); + + /*SDK flushes the cache to guarantee data consistency*/ + + reg_writeq(cmd, REG_DWCONV_CMD); + + DLOG << "before reg poll"; + if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { + g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; + ret = -EIO; + DLOG << "Pooling Wait Irq Timeout!"; + } + DLOG << "after reg poll"; + + // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); + output_scale = reg_readq(REG_SCALE_PARAMETER); + output_scale = (output_scale << 32) | (output_scale >> 32); + fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; +#endif + return 0; +} } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/common/pe.h b/src/fpga/common/pe.h index 38a5ff92f4e301ce2da7d2111e50be6047186ab3..9f2800428e431ea302d6cd33685e8ff1dcdc2751 100644 --- a/src/fpga/common/pe.h +++ b/src/fpga/common/pe.h @@ -27,6 +27,6 @@ int ComputeFpgaConv(const struct SplitConvArgs& args); int ComputeFPGAConcat(const struct ConcatArgs& args); int ComputeFPGASplit(const struct SplitArgs& args); int ComputeFpgaDeconv(const struct DeconvArgs& args); - +int ComputeDWConv(const 
struct DWconvArgs& args); } // 
namespace fpga } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp index b19db3cffc25282d3414a4da26bbd951494a8f86..567b42ef08b7047225013ac48eb95262a353082a 100644 --- a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp @@ -83,7 +83,7 @@ template <> void ConvAddBNReluKernel::Compute( const FusionConvAddBNReluParam ¶m) { if (param.Groups() == param.Output()->dims()[1]) { - // fpga::ComputeFpgaConv(param.FpgaDwconvArgs()); + fpga::ComputeDWConv(param.FpgaDwconvArgs()); } else { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/V1/reshape_kernel.cpp b/src/operators/kernel/fpga/V1/reshape_kernel.cpp index bef69df54a8a2a26c9eea1491bb08d13201ccd1a..f5495e6d005f7f7c14ebd3d290ea9be02b9f0951 100644 --- a/src/operators/kernel/fpga/V1/reshape_kernel.cpp +++ b/src/operators/kernel/fpga/V1/reshape_kernel.cpp @@ -21,6 +21,7 @@ namespace operators { template <> bool ReshapeKernel::Init(ReshapeParam *param) { + param->Out()->ShareDataWith(*param->InputX()); return true; }