From e407c66cdc64c42b4068f8b212a9d06d34e426ab Mon Sep 17 00:00:00 2001 From: qnqinan Date: Fri, 15 Nov 2019 17:55:03 +0800 Subject: [PATCH] update FPGA v2 pe cpp file and ew kernel files, test=develop --- mobile/src/fpga/V2/api.cpp | 2 +- mobile/src/fpga/V2/pe.cpp | 455 ++++++++---------- .../fpga/V2/anchor_generator_kernel.cpp | 2 +- .../kernel/fpga/V2/elementwise_add_kernel.cpp | 33 +- .../fpga/V2/elementwise_add_relu_kernel.cpp | 31 +- 5 files changed, 259 insertions(+), 264 deletions(-) mode change 100644 => 100755 mobile/src/fpga/V2/pe.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp diff --git a/mobile/src/fpga/V2/api.cpp b/mobile/src/fpga/V2/api.cpp index f39d012e08..1a90cb5bdc 100644 --- a/mobile/src/fpga/V2/api.cpp +++ b/mobile/src/fpga/V2/api.cpp @@ -623,7 +623,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->concat_arg.images_in[i] = (int8_t *)arg->conv_arg[i].output.address; // NOLINT - arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; + arg->concat_arg.scales_in[i] = out->scale; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; expand_conv_arg(&arg->conv_arg[i]); diff --git a/mobile/src/fpga/V2/pe.cpp b/mobile/src/fpga/V2/pe.cpp old mode 100644 new mode 100755 index aa150e0c6c..3b8f2d2a71 --- a/mobile/src/fpga/V2/pe.cpp +++ b/mobile/src/fpga/V2/pe.cpp @@ -109,7 +109,7 @@ using namespace std; // NOLINT #define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868 #define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870 #define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878 -#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880 +#define REG_POOLING_RESULT_AMOUNT_ALIGN_16 0x880 #define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888 #define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898 #define REG_POOLING_MODE_RECIPROCAL 0x890 @@ -270,10 +270,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { args.driver.filter_pad_width_mul_channel, REG_CONV_REG1); - reg_writeq((args.driver.stride_h << 48) | (args.driver.skip_window << 28) | - (args.driver.filter_row << 8) | - (args.driver.filter_height << 4) | args.driver.filter_width, - REG_CONV_REG2); + reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) | + (args.driver.filter_row << 10) | + (args.driver.filter_height << 5) | args.driver.filter_width, + REG_CONV_REG2); reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) | (args.driver.prog_full_cnt << 16) | @@ -358,7 +358,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { << " out_scale_address:" << args.output.scale_address; #endif #ifdef PADDLE_MOBILE_ZU5 - DLOG << "Polling"; // return 0; uint64_t output_scale = 0; uint64_t timer_cnt = 0; @@ -367,65 +366,73 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; - // uint64_t reg_ActivationArgs = 0; - // active function:{none,leakeyrelu,sigmoid,tanh} - // ActivationArgs active_args; - // active_args.activation_type = LEAKYRELU; - // active_args.activation_type = args.output.activation.activation_type; - - // active_args.leaky_relu_negative_slope = - // args.output.activation.leaky_relu_negative_slope; - - // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - // active_args.leaky_relu_negative_slope; - - // DLOG << " activation_type:" << active_args.activation_type - // << " leaky_relu_negative_slope:" - // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - - image_physical_address = vaddr_to_paddr_driver(args.image.address); - output_physical_address = vaddr_to_paddr_driver(args.output.address); - uint32_t output_height = (uint32_t)( + image_physical_address = vaddr_to_paddr(args.image.address); + output_physical_address = vaddr_to_paddr(args.output.address); + uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64); + uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); + uint64_t output_height = (uint64_t)( (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( + args.kernel.stride_h + 1); + uint64_t output_width = (uint64_t)( (args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1); + args.kernel.stride_w + 1); + uint64_t image_amount_per_row = align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 | - (((uint64_t)args.kernel_reciprocal)); - + uint64_t image_one_pad_per_row = (uint64_t)args.image.width * + (uint64_t)args.image.channels +(uint64_t)args.image.pad_width * + (uint64_t)args.image.channels; + + uint64_t result_amount_align_32 = align_to_x((uint64_t)output_width * + (uint64_t)args.image.channels, 32); + uint64_t result_addr_row = + (result_amount_align_32 << 32) | output_physical_address; + uint64_t row_padding_down = + (uint64_t)args.image.height + (uint64_t)args.image.pad_height; + uint64_t kernel_width_sub1 = + (uint64_t)args.kernel.width - 1; + uint64_t kernel_padding_step = row_padding_down | + ((uint64_t)args.image.pad_height << 16) | + ((uint64_t)args.kernel.stride_h << 24) | + ((uint64_t)kernel_width_sub1<<32) | + ((uint64_t)args.kernel.height << 40) | + ((uint64_t)(args.kernel.height-1) << 48); + uint64_t image_calcu_height = (uint64_t)args.kernel.height + + (output_height - 1) * (uint64_t)args.kernel.stride_h; + uint64_t result_size_calcu_height = (output_height - 1) | + ((output_width - 1) << 16) | (image_calcu_height << 32); + uint64_t col_padding_down = ((uint64_t)args.image.width + + (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; + + uint64_t image_row_col_padding_down = + image_amount_per_row | (col_padding_down << 32); + uint64_t image_rowXpadding_h = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_rowXstep_h = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t image_rowXpad_h_rowXstep_h = + image_rowXpadding_h | (image_rowXstep_h << 32); + uint64_t channelXpad_w = + (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; + uint64_t channelXstep_w = + (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; + uint64_t channelXpad_w_channelXstep_w = + channelXpad_w | (channelXstep_w << 32); + uint64_t filter_row_align = + C_align_32 * (uint64_t)args.kernel.width; + uint64_t sub_filter_amount_align = C_align_32 * + (uint64_t)args.kernel.width * (uint64_t)args.kernel.height; + uint64_t mult_factor = 0; + float average_reciprocal = args.kernel_reciprocal; + uint32_t* kernel_reciprocal; + kernel_reciprocal =(reinterpret_cast(&average_reciprocal)); + if (args.mode == 1) + mult_factor = (uint64_t)(*kernel_reciprocal) | + ((uint64_t)1 << 32) | ((uint64_t)1 << 40); + else + mult_factor = + (uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { ret = -EIO; @@ -434,40 +441,20 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { return ret; } - // reg_writeq(reg_ActivationArgs, - // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - - // reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL); - reg_writeq(cmd, REG_POOLING_CMD); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq((uint64_t)args.image.channels, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); + reg_writeq(mult_factor, 0x840); // dw donot care + reg_writeq(channelXpad_w_channelXstep_w, 0x848); + if (args.mode == 1) + cmd = (uint64_t)4; + else + cmd = (uint64_t)8; + reg_writeq(cmd, 0x800); DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { @@ -478,14 +465,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { } DLOG << "after reg poll"; - // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - // output_scale = reg_readq(REG_SCALE_PARAMETER); - // output_scale = (output_scale << 32) | (output_scale >> 32); - // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - - // active_args.activation_type = NONE; - // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; @@ -518,19 +497,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #endif #ifdef PADDLE_MOBILE_ZU5 int ret = 0; - uint64_t output_scale = 0; - // uint64_t reg_ActivationArgs = 0; - // ActivationArgs active_args; - // active_args.activation_type = args.output.activation.activation_type; - // active_args.leaky_relu_negative_slope = - // args.output.activation.leaky_relu_negative_slope; - // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - // active_args.leaky_relu_negative_slope; - // DLOG << " activation_type:" << active_args.activation_type - // << " leaky_relu_negative_slope:" - // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { @@ -540,18 +507,46 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { return ret; } - // reg_writeq(reg_ActivationArgs, - // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); - reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); - reg_writeq(args.driver.datalen, REG_EW_DATA_LEN); - reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL); - reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); - reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR); - reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT); - reg_writeq(args.driver.cmd, REG_EW_CMD); + uint64_t image0_physical_address = 0; + uint64_t image1_physical_address = 0; + uint64_t image_physical_address = 0; + uint64_t output_physical_address = 0; + image0_physical_address = vaddr_to_paddr(args.image0.address); + image1_physical_address = vaddr_to_paddr(args.image1.address); + image_physical_address = + image0_physical_address | (image1_physical_address << 32); + output_physical_address = vaddr_to_paddr(args.output.address); + uint64_t image_amount_per_row = + align_to_x((uint64_t)args.image0.width * + (uint64_t)args.image0.channels, IMAGE_ALIGNMENT); + uint64_t result_addr_row = + output_physical_address | (image_amount_per_row << 32); + uint64_t kernel_padding_step = 0; + kernel_padding_step = ((uint64_t)args.image0.height * 2) | + ((uint64_t)2 << 24) | ((uint64_t)2 << 40) | ((uint64_t)1 << 48); + uint64_t result_size_calcu_height = ((uint64_t)args.image0.height - 1) | + ((image_amount_per_row / 32 - 1) << 16) | + (((uint64_t)args.image0.height * 2) << 32); + uint64_t image_row_col_padding_down = image_amount_per_row | + (image_amount_per_row << 32); + float quantParam = (args.output.scale_address)[0]; + uint32_t* ew_scale = reinterpret_cast(&quantParam); + uint64_t ew_scale_mult_factor = (*ew_scale) | + ((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq(32, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(((image_amount_per_row*2) << 32), 0x838); + reg_writeq(ew_scale_mult_factor, 0x840); // dw donot care + reg_writeq(((uint64_t)32 << 32), 0x848); + reg_writeq(0, 0x858); + uint64_t cmd = 0; + cmd = (uint64_t)2 | (((uint64_t)args.relu_enabled) << 8); + reg_writeq(cmd, 0x800); if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR; @@ -560,12 +555,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!"); } - // output_scale = reg_readq(REG_SCALE_PARAMETER); - // output_scale = (output_scale << 32) | (output_scale >> 32); - // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - // active_args.activation_type = NONE; - // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif @@ -870,7 +859,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { #endif } - if (sub_conv_num > 1) { + /*if (sub_conv_num > 1) { float max_scale = -1.0f; #ifdef COST_TIME_PRINT gettimeofday(&start, NULL); @@ -894,19 +883,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" << std::endl; #endif - - // fpga_flush(args.output.scale_address, 2 * sizeof(float)); - /*#ifdef COST_TIME_PRINT - gettimeofday(&start,NULL); - #endif - //deconv_post_process(args); - #ifdef COST_TIME_PRINT - gettimeofday(&end,NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv_post_process " << " cost time: " << - (dif_sec*1000000+dif_usec) << "us" << std::endl; #endif*/ - } + }*/ return 0; } // ComputeFpgaDeconv @@ -940,8 +917,8 @@ int ComputeDWConv(const struct DWconvArgs &args) { << " image_width:" << args.image.width << " pad_height:" << args.image.pad_height << " pad_width:" << args.image.pad_width; - DLOG << " filter_address:" << args.filter_address - << " bias_address:" << args.bias_address; + DLOG << " filter_address:" << args.filter_address; + //<< " bias_address:" << args.bias_address; DLOG << " kernel_height:" << args.kernel.height << " kernel_width:" << args.kernel.width << " stride_h:" << args.kernel.stride_h @@ -952,10 +929,8 @@ int ComputeDWConv(const struct DWconvArgs &args) { #ifdef PADDLE_MOBILE_ZU5 DLOG << "DWConv"; // return 0; - uint64_t output_scale = 0; uint64_t timer_cnt = 0; int ret = 0; - // uint64_t cmd = args.relu_enabled; uint64_t cmd = 0; uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; @@ -966,57 +941,69 @@ int ComputeDWConv(const struct DWconvArgs &args) { output_physical_address = vaddr_to_paddr(args.output.address); filter_physical_address = vaddr_to_paddr(args.filter_address); bias_physical_address = vaddr_to_paddr(args.bias_address); - uint64_t filter_N_align = - align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t filter_amount_per_row_align = - filter_N_align * (uint64_t)args.kernel.width; - uint64_t sub_filter_amount_align = filter_N_align * - (uint64_t)args.kernel.width * - (uint64_t)args.kernel.height; - uint64_t filter_amount_align = - sub_filter_amount_align * (uint64_t)args.sub_conv_num; - - uint32_t output_height = (uint32_t)( - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( - ((args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1) * - args.sub_conv_num); + uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64); + uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); + uint64_t output_height = (uint64_t) + ((args.image.height + args.image.pad_height * 2 - + args.kernel.height) / args.kernel.stride_h +1); + uint64_t output_width = (uint64_t) + (((args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + 1) * args.sub_conv_num); uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); + align_to_x((uint64_t)args.image.width * + (uint64_t)args.image.channels, IMAGE_ALIGNMENT); uint64_t image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - + (uint64_t)args.image.width * (uint64_t)args.image.channels + + (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; + + uint64_t result_amount_align_32 = align_to_x( + (uint64_t)output_width * (uint64_t)args.image.channels, 32); + uint64_t result_addr_row = + (result_amount_align_32 << 32) | output_physical_address; + uint64_t row_padding_down = + (uint64_t)args.image.height + (uint64_t)args.image.pad_height; + uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1; + uint64_t kernel_padding_step = row_padding_down | + ((uint64_t)args.image.pad_height << 16) | + ((uint64_t)args.kernel.stride_h << 24) | + ((uint64_t)kernel_width_sub1<<32) | + ((uint64_t)args.kernel.height << 40) | + ((uint64_t)(args.kernel.height-1) << 48); + uint64_t image_calcu_height = (uint64_t)args.kernel.height + + (output_height - 1) * (uint64_t)args.kernel.stride_h; + uint64_t result_size_calcu_height = (output_height - 1) | + ((output_width - 1) << 16) | (image_calcu_height << 32); + uint64_t col_padding_down = ((uint64_t)args.image.width + + (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; + + uint64_t image_row_col_padding_down = + image_amount_per_row | (col_padding_down << 32); + uint64_t image_rowXpadding_h = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_rowXstep_h = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t image_rowXpad_h_rowXstep_h = + image_rowXpadding_h | (image_rowXstep_h << 32); + uint64_t channelXpad_w = + (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; + uint64_t channelXstep_w = + (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; + uint64_t channelXpad_w_channelXstep_w = + channelXpad_w | (channelXstep_w << 32); + + uint64_t filter_row_align = + C_align_64 * (uint64_t)args.kernel.width; + uint64_t sub_filter_amount_align = C_align_64 * + (uint64_t)args.kernel.width * + (uint64_t)args.kernel.height; + uint64_t filter_amount_align = + sub_filter_amount_align * (uint64_t)args.sub_conv_num; + uint64_t filter_param = filter_row_align | (filter_amount_align << 16) | + (sub_filter_amount_align << 32) | + (((uint64_t)args.sub_conv_num -1) << 48); + uint64_t channel_parameter = + (uint64_t)args.image.channels | (C_align_64 << 16); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { ret = -EIO; @@ -1025,72 +1012,30 @@ int ComputeDWConv(const struct DWconvArgs &args) { return ret; } - /*restart scale*/ - reg_writeq(output_scale, REG_SCALE_PARAMETER); - - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq((bias_physical_address << 32 | filter_physical_address), - REG_DWCONV_FILTER_BASE_ADDR); - reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32), - REG_DWCONV_FILTER_SHAPE); - reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32), - REG_DWCONV_FILTER_SUBNUMBER); - reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN); - - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - - /*SDK刷Cache保证数据一致性*/ - - reg_writeq(cmd, REG_DWCONV_CMD); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq(channel_parameter, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); + reg_writeq(0, 0x840); + reg_writeq(channelXpad_w_channelXstep_w, 0x848); + reg_writeq(filter_physical_address, 0x850); + reg_writeq(filter_param, 0x858); + reg_writeq(((bias_physical_address+C_align_64*4) | + (bias_physical_address << 32)), 0x860); + cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8); + reg_writeq(cmd, 0x800); DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; ret = -EIO; - DLOG << "Pooling Wait Irq Timeout!"; + DLOG << "DWconv Wait Irq Timeout!"; PADDLE_MOBILE_ENFORCE(0, "DWConv Wait Irq Timeout"); } DLOG << "after reg poll"; - - // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - DLOG << "output_scale:" << output_scale; pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif diff --git a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp old mode 100644 new mode 100755 index 951fbb5f37..56cc8927f0 --- a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp @@ -37,7 +37,7 @@ bool AnchorGeneratorKernel::Init( int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23, -20, 39, 36, -43, -34, 59, 49, -63, -54, 79, 69, -96, -77, 112, 93, -137, -118, 153, - 134, -204, -188, 220, 204, -281, -395, 296, 441}; + 134, -204, -188, 220, 204, -281, -395, 296, 411}; int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103, 0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58, diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp old mode 100644 new mode 100755 index 43b9355c99..57ccf9f00d --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp @@ -12,12 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef ELEMENTWISEADD_OP - +#include #include "operators/kernel/elementwise_add_kernel.h" -#include -#include "fpga/V2/api.h" - namespace paddle_mobile { namespace operators { @@ -60,10 +57,36 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { return true; } +void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) { + int inputc = ewaddArgs.image0.channels; + int inputh = ewaddArgs.image0.height; + int inputw = ewaddArgs.image0.width; + float inScale0 = + (reinterpret_cast(ewaddArgs.image0.scale_address))[0]; + float inScale1 = + (reinterpret_cast(ewaddArgs.image1.scale_address))[0]; + float outScale = + (reinterpret_cast(ewaddArgs.output.scale_address))[0]; + int8_t* inPtr0 = reinterpret_cast(ewaddArgs.image0.address); + int8_t* inPtr1 = reinterpret_cast(ewaddArgs.image1.address); + int8_t* outPtr = reinterpret_cast(ewaddArgs.output.address); + int datasize = inputc * inputh * inputw; + float const0 = inScale0 / outScale; + float const1 = inScale1 / outScale; + fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); + fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); + for (int i = 0; i < datasize; i++) { + float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; + int tmpI = static_cast(round(tmpF)); + outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < -127 ? -127 : tmpI))); + } + fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); +} template <> void ElementwiseAddKernel::Compute( const ElementwiseAddParam ¶m) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + ComputeCPUEWAdd(param.FpgaArgs()); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp old mode 100644 new mode 100755 index 6d5ad50573..de60341874 --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef FUSION_ELEMENTWISEADDRELU_OP - +#include #include "operators/kernel/elementwise_add_relu_kernel.h" namespace paddle_mobile { @@ -58,10 +58,37 @@ bool ElementwiseAddReluKernel::Init( return true; } +void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) { + int inputc = ewaddArgs.image0.channels; + int inputh = ewaddArgs.image0.height; + int inputw = ewaddArgs.image0.width; + float inScale0 = + (reinterpret_cast(ewaddArgs.image0.scale_address))[0]; + float inScale1 = + (reinterpret_cast(ewaddArgs.image1.scale_address))[0]; + float outScale = + (reinterpret_cast(ewaddArgs.output.scale_address))[0]; + int8_t* inPtr0 = reinterpret_cast(ewaddArgs.image0.address); + int8_t* inPtr1 = reinterpret_cast(ewaddArgs.image1.address); + int8_t* outPtr = reinterpret_cast(ewaddArgs.output.address); + int datasize = inputc * inputh * inputw; + float const0 = inScale0 / outScale; + float const1 = inScale1 / outScale; + fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); + fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); + for (int i = 0; i < datasize; i++) { + float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; + int tmpI = static_cast(round(tmpF)); + outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < 0 ? 0 : tmpI))); + } + fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); +} + template <> void ElementwiseAddReluKernel::Compute( const ElementwiseAddReluParam ¶m) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + ComputeCPUEWAddRelu(param.FpgaArgs()); } } // namespace operators } // namespace paddle_mobile -- GitLab