提交 b5a0488a 编写于 作者: qnqinan's avatar qnqinan 提交者: zhangyang0701

update FPGA v2 pe cpp file and ew kernel files, test=develop , closes #2432 (#2433)

* update proposal and psroipool kernel file in FPGA V2 track

* update, test=develop

* update FPGA v2 pe cpp file and ew kernel files, test=develop
上级 ce21ff5d
...@@ -623,7 +623,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -623,7 +623,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->concat_arg.images_in[i] = arg->concat_arg.images_in[i] =
(int8_t *)arg->conv_arg[i].output.address; // NOLINT (int8_t *)arg->conv_arg[i].output.address; // NOLINT
arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; arg->concat_arg.scales_in[i] = out->scale;
arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
expand_conv_arg(&arg->conv_arg[i]); expand_conv_arg(&arg->conv_arg[i]);
......
...@@ -109,7 +109,7 @@ using namespace std; // NOLINT ...@@ -109,7 +109,7 @@ using namespace std; // NOLINT
#define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868 #define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868
#define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870 #define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870
#define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878 #define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878
#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880 #define REG_POOLING_RESULT_AMOUNT_ALIGN_16 0x880
#define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888 #define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888
#define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898 #define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898
#define REG_POOLING_MODE_RECIPROCAL 0x890 #define REG_POOLING_MODE_RECIPROCAL 0x890
...@@ -270,10 +270,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -270,10 +270,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
args.driver.filter_pad_width_mul_channel, args.driver.filter_pad_width_mul_channel,
REG_CONV_REG1); REG_CONV_REG1);
reg_writeq((args.driver.stride_h << 48) | (args.driver.skip_window << 28) | reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) |
(args.driver.filter_row << 8) | (args.driver.filter_row << 10) |
(args.driver.filter_height << 4) | args.driver.filter_width, (args.driver.filter_height << 5) | args.driver.filter_width,
REG_CONV_REG2); REG_CONV_REG2);
reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) | reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) |
(args.driver.prog_full_cnt << 16) | (args.driver.prog_full_cnt << 16) |
...@@ -358,7 +358,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -358,7 +358,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
<< " out_scale_address:" << args.output.scale_address; << " out_scale_address:" << args.output.scale_address;
#endif #endif
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
DLOG << "Polling";
// return 0; // return 0;
uint64_t output_scale = 0; uint64_t output_scale = 0;
uint64_t timer_cnt = 0; uint64_t timer_cnt = 0;
...@@ -367,65 +366,73 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -367,65 +366,73 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
uint64_t image_physical_address = 0; uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0; uint64_t output_physical_address = 0;
// uint64_t reg_ActivationArgs = 0; image_physical_address = vaddr_to_paddr(args.image.address);
// active function:{none,leakeyrelu,sigmoid,tanh} output_physical_address = vaddr_to_paddr(args.output.address);
// ActivationArgs active_args; uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64);
// active_args.activation_type = LEAKYRELU; uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32);
// active_args.activation_type = args.output.activation.activation_type; uint64_t output_height = (uint64_t)(
// active_args.leaky_relu_negative_slope =
// args.output.activation.leaky_relu_negative_slope;
// reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
// active_args.leaky_relu_negative_slope;
// DLOG << " activation_type:" << active_args.activation_type
// << " leaky_relu_negative_slope:"
// << active_args.leaky_relu_negative_slope;
// DLOG << " reg_ActivationArgs:" << reg_ActivationArgs;
image_physical_address = vaddr_to_paddr_driver(args.image.address);
output_physical_address = vaddr_to_paddr_driver(args.output.address);
uint32_t output_height = (uint32_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) / (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h + args.kernel.stride_h + 1);
1); uint64_t output_width = (uint64_t)(
uint32_t output_width = (uint32_t)(
(args.image.width + args.image.pad_width * 2 - args.kernel.width) / (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w + args.kernel.stride_w + 1);
1);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
IMAGE_ALIGNMENT); IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row = uint64_t image_one_pad_per_row = (uint64_t)args.image.width *
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, (uint64_t)args.image.channels +(uint64_t)args.image.pad_width *
FILTER_ELEMENT_ALIGNMENT) + (uint64_t)args.image.channels;
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t image_two_pad_per_row = align_to_x( uint64_t result_amount_align_32 = align_to_x((uint64_t)output_width *
((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * (uint64_t)args.image.channels, 32);
(uint64_t)args.image.channels, uint64_t result_addr_row =
IMAGE_ALIGNMENT); (result_amount_align_32 << 32) | output_physical_address;
uint64_t image_row_mul_pooling_hight = uint64_t row_padding_down =
image_amount_per_row * (uint64_t)args.kernel.height; (uint64_t)args.image.height + (uint64_t)args.image.pad_height;
uint64_t image_row_mul_pad_hight = uint64_t kernel_width_sub1 =
image_amount_per_row * (uint64_t)args.image.pad_height; (uint64_t)args.kernel.width - 1;
uint64_t image_row_mul_step_hight = uint64_t kernel_padding_step = row_padding_down |
image_amount_per_row * (uint64_t)args.kernel.stride_h; ((uint64_t)args.image.pad_height << 16) |
uint64_t result_amount_align_32 = ((uint64_t)args.kernel.stride_h << 24) |
align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, ((uint64_t)kernel_width_sub1<<32) |
FILTER_ELEMENT_ALIGNMENT); ((uint64_t)args.kernel.height << 40) |
uint64_t result_amount_align_64 = align_to_x( ((uint64_t)(args.kernel.height-1) << 48);
(uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); uint64_t image_calcu_height = (uint64_t)args.kernel.height +
uint64_t image_calcu_height = (output_height - 1) * (uint64_t)args.kernel.stride_h;
(uint64_t)args.kernel.height + uint64_t result_size_calcu_height = (output_height - 1) |
((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; ((output_width - 1) << 16) | (image_calcu_height << 32);
uint64_t image_pad_left = args.image.channels * args.image.pad_width; uint64_t col_padding_down = ((uint64_t)args.image.width +
uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels;
uint64_t image_padleft_skipwindow =
(image_skip_window << 32) | image_pad_left; uint64_t image_row_col_padding_down =
uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 | image_amount_per_row | (col_padding_down << 32);
(((uint64_t)args.kernel_reciprocal)); uint64_t image_rowXpadding_h =
image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_rowXstep_h =
image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t image_rowXpad_h_rowXstep_h =
image_rowXpadding_h | (image_rowXstep_h << 32);
uint64_t channelXpad_w =
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
uint64_t channelXstep_w =
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
uint64_t channelXpad_w_channelXstep_w =
channelXpad_w | (channelXstep_w << 32);
uint64_t filter_row_align =
C_align_32 * (uint64_t)args.kernel.width;
uint64_t sub_filter_amount_align = C_align_32 *
(uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
uint64_t mult_factor = 0;
float average_reciprocal = args.kernel_reciprocal;
uint32_t* kernel_reciprocal;
kernel_reciprocal =(reinterpret_cast<uint32_t*>(&average_reciprocal));
if (args.mode == 1)
mult_factor = (uint64_t)(*kernel_reciprocal) |
((uint64_t)1 << 32) | ((uint64_t)1 << 40);
else
mult_factor =
(uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO; ret = -EIO;
...@@ -434,40 +441,20 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -434,40 +441,20 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
return ret; return ret;
} }
// reg_writeq(reg_ActivationArgs, reg_writeq(image_physical_address, 0x808);
// REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion reg_writeq(result_addr_row, 0x810);
reg_writeq(kernel_padding_step, 0x818);
// reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(result_size_calcu_height, 0x820);
reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); reg_writeq((uint64_t)args.image.channels, 0x828);
reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); reg_writeq(image_row_col_padding_down, 0x830);
reg_writeq( reg_writeq(image_rowXpad_h_rowXstep_h, 0x838);
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), reg_writeq(mult_factor, 0x840); // dw donot care
REG_POOLING_IMAGE_PIXEL); reg_writeq(channelXpad_w_channelXstep_w, 0x848);
reg_writeq( if (args.mode == 1)
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), cmd = (uint64_t)4;
REG_POOLING_WINDOW_SIZE); else
reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), cmd = (uint64_t)8;
REG_POOLING_RESULT_PIXEL); reg_writeq(cmd, 0x800);
reg_writeq(((uint64_t)args.image.pad_height) |
(((uint64_t)args.image.pad_width) << 32),
REG_POOLING_PAD_PIXEL);
reg_writeq(((uint64_t)args.kernel.stride_h) |
(((uint64_t)args.kernel.stride_w) << 32),
REG_POOLING_STEP_PIXEL);
reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER);
reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW);
reg_writeq(image_row_mul_pooling_hight,
REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT);
reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT);
reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT);
reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32);
reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64);
reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT);
reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW);
reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL);
reg_writeq(cmd, REG_POOLING_CMD);
DLOG << "before reg poll"; DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
...@@ -478,14 +465,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -478,14 +465,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
} }
DLOG << "after reg poll"; DLOG << "after reg poll";
// *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
// output_scale = reg_readq(REG_SCALE_PARAMETER);
// output_scale = (output_scale << 32) | (output_scale >> 32);
// fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
// active_args.activation_type = NONE;
// reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
...@@ -518,19 +497,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -518,19 +497,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#endif #endif
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
int ret = 0; int ret = 0;
uint64_t output_scale = 0;
// uint64_t reg_ActivationArgs = 0;
// ActivationArgs active_args;
// active_args.activation_type = args.output.activation.activation_type;
// active_args.leaky_relu_negative_slope =
// args.output.activation.leaky_relu_negative_slope;
// reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
// active_args.leaky_relu_negative_slope;
// DLOG << " activation_type:" << active_args.activation_type
// << " leaky_relu_negative_slope:"
// << active_args.leaky_relu_negative_slope;
// DLOG << " reg_ActivationArgs:" << reg_ActivationArgs;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
...@@ -540,18 +507,46 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -540,18 +507,46 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
return ret; return ret;
} }
// reg_writeq(reg_ActivationArgs,
// REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion
reg_writeq(output_scale, REG_SCALE_PARAMETER); uint64_t image0_physical_address = 0;
reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); uint64_t image1_physical_address = 0;
reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); uint64_t image_physical_address = 0;
reg_writeq(args.driver.datalen, REG_EW_DATA_LEN); uint64_t output_physical_address = 0;
reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL); image0_physical_address = vaddr_to_paddr(args.image0.address);
reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); image1_physical_address = vaddr_to_paddr(args.image1.address);
reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR); image_physical_address =
reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT); image0_physical_address | (image1_physical_address << 32);
reg_writeq(args.driver.cmd, REG_EW_CMD); output_physical_address = vaddr_to_paddr(args.output.address);
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width *
(uint64_t)args.image0.channels, IMAGE_ALIGNMENT);
uint64_t result_addr_row =
output_physical_address | (image_amount_per_row << 32);
uint64_t kernel_padding_step = 0;
kernel_padding_step = ((uint64_t)args.image0.height * 2) |
((uint64_t)2 << 24) | ((uint64_t)2 << 40) | ((uint64_t)1 << 48);
uint64_t result_size_calcu_height = ((uint64_t)args.image0.height - 1) |
((image_amount_per_row / 32 - 1) << 16) |
(((uint64_t)args.image0.height * 2) << 32);
uint64_t image_row_col_padding_down = image_amount_per_row |
(image_amount_per_row << 32);
float quantParam = (args.output.scale_address)[0];
uint32_t* ew_scale = reinterpret_cast<uint32_t*>(&quantParam);
uint64_t ew_scale_mult_factor = (*ew_scale) |
((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40);
reg_writeq(image_physical_address, 0x808);
reg_writeq(result_addr_row, 0x810);
reg_writeq(kernel_padding_step, 0x818);
reg_writeq(result_size_calcu_height, 0x820);
reg_writeq(32, 0x828);
reg_writeq(image_row_col_padding_down, 0x830);
reg_writeq(((image_amount_per_row*2) << 32), 0x838);
reg_writeq(ew_scale_mult_factor, 0x840); // dw donot care
reg_writeq(((uint64_t)32 << 32), 0x848);
reg_writeq(0, 0x858);
uint64_t cmd = 0;
cmd = (uint64_t)2 | (((uint64_t)args.relu_enabled) << 8);
reg_writeq(cmd, 0x800);
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR; g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR;
...@@ -560,12 +555,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -560,12 +555,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!"); PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!");
} }
// output_scale = reg_readq(REG_SCALE_PARAMETER);
// output_scale = (output_scale << 32) | (output_scale >> 32);
// fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
// active_args.activation_type = NONE;
// reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
#endif #endif
...@@ -870,7 +859,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { ...@@ -870,7 +859,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#endif #endif
} }
if (sub_conv_num > 1) { /*if (sub_conv_num > 1) {
float max_scale = -1.0f; float max_scale = -1.0f;
#ifdef COST_TIME_PRINT #ifdef COST_TIME_PRINT
gettimeofday(&start, NULL); gettimeofday(&start, NULL);
...@@ -894,19 +883,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { ...@@ -894,19 +883,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
<< " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
<< std::endl; << std::endl;
#endif #endif
}*/
// fpga_flush(args.output.scale_address, 2 * sizeof(float));
/*#ifdef COST_TIME_PRINT
gettimeofday(&start,NULL);
#endif
//deconv_post_process(args);
#ifdef COST_TIME_PRINT
gettimeofday(&end,NULL);
dif_sec = end.tv_sec - start.tv_sec;
dif_usec = end.tv_usec - start.tv_usec;
std::cout << "deconv_post_process " << " cost time: " <<
(dif_sec*1000000+dif_usec) << "us" << std::endl; #endif*/
}
return 0; return 0;
} // ComputeFpgaDeconv } // ComputeFpgaDeconv
...@@ -940,8 +917,8 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -940,8 +917,8 @@ int ComputeDWConv(const struct DWconvArgs &args) {
<< " image_width:" << args.image.width << " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height << " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width; << " pad_width:" << args.image.pad_width;
DLOG << " filter_address:" << args.filter_address DLOG << " filter_address:" << args.filter_address;
<< " bias_address:" << args.bias_address; //<< " bias_address:" << args.bias_address;
DLOG << " kernel_height:" << args.kernel.height DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width << " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h << " stride_h:" << args.kernel.stride_h
...@@ -952,10 +929,8 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -952,10 +929,8 @@ int ComputeDWConv(const struct DWconvArgs &args) {
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
DLOG << "DWConv"; DLOG << "DWConv";
// return 0; // return 0;
uint64_t output_scale = 0;
uint64_t timer_cnt = 0; uint64_t timer_cnt = 0;
int ret = 0; int ret = 0;
// uint64_t cmd = args.relu_enabled;
uint64_t cmd = 0; uint64_t cmd = 0;
uint64_t image_physical_address = 0; uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0; uint64_t output_physical_address = 0;
...@@ -966,57 +941,69 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -966,57 +941,69 @@ int ComputeDWConv(const struct DWconvArgs &args) {
output_physical_address = vaddr_to_paddr(args.output.address); output_physical_address = vaddr_to_paddr(args.output.address);
filter_physical_address = vaddr_to_paddr(args.filter_address); filter_physical_address = vaddr_to_paddr(args.filter_address);
bias_physical_address = vaddr_to_paddr(args.bias_address); bias_physical_address = vaddr_to_paddr(args.bias_address);
uint64_t filter_N_align = uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64);
align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT); uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32);
uint64_t filter_amount_per_row_align = uint64_t output_height = (uint64_t)
filter_N_align * (uint64_t)args.kernel.width; ((args.image.height + args.image.pad_height * 2 -
uint64_t sub_filter_amount_align = filter_N_align * args.kernel.height) / args.kernel.stride_h +1);
(uint64_t)args.kernel.width * uint64_t output_width = (uint64_t)
(uint64_t)args.kernel.height; (((args.image.width + args.image.pad_width * 2 - args.kernel.width) /
uint64_t filter_amount_align = args.kernel.stride_w + 1) * args.sub_conv_num);
sub_filter_amount_align * (uint64_t)args.sub_conv_num;
uint32_t output_height = (uint32_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h +
1);
uint32_t output_width = (uint32_t)(
((args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1) *
args.sub_conv_num);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, align_to_x((uint64_t)args.image.width *
IMAGE_ALIGNMENT); (uint64_t)args.image.channels, IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row = uint64_t image_one_pad_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, (uint64_t)args.image.width * (uint64_t)args.image.channels +
FILTER_ELEMENT_ALIGNMENT) + (uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t image_two_pad_per_row = align_to_x( uint64_t result_amount_align_32 = align_to_x(
((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * (uint64_t)output_width * (uint64_t)args.image.channels, 32);
(uint64_t)args.image.channels, uint64_t result_addr_row =
IMAGE_ALIGNMENT); (result_amount_align_32 << 32) | output_physical_address;
uint64_t image_row_mul_pooling_hight = uint64_t row_padding_down =
image_amount_per_row * (uint64_t)args.kernel.height; (uint64_t)args.image.height + (uint64_t)args.image.pad_height;
uint64_t image_row_mul_pad_hight = uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1;
image_amount_per_row * (uint64_t)args.image.pad_height; uint64_t kernel_padding_step = row_padding_down |
uint64_t image_row_mul_step_hight = ((uint64_t)args.image.pad_height << 16) |
image_amount_per_row * (uint64_t)args.kernel.stride_h; ((uint64_t)args.kernel.stride_h << 24) |
uint64_t result_amount_align_32 = ((uint64_t)kernel_width_sub1<<32) |
align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, ((uint64_t)args.kernel.height << 40) |
FILTER_ELEMENT_ALIGNMENT); ((uint64_t)(args.kernel.height-1) << 48);
uint64_t result_amount_align_64 = align_to_x( uint64_t image_calcu_height = (uint64_t)args.kernel.height +
(uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); (output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t image_calcu_height = uint64_t result_size_calcu_height = (output_height - 1) |
(uint64_t)args.kernel.height + ((output_width - 1) << 16) | (image_calcu_height << 32);
((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; uint64_t col_padding_down = ((uint64_t)args.image.width +
uint64_t image_pad_left = args.image.channels * args.image.pad_width; (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels;
uint64_t image_skip_window = args.image.channels * args.kernel.stride_w;
uint64_t image_row_col_padding_down =
uint64_t image_padleft_skipwindow = image_amount_per_row | (col_padding_down << 32);
(image_skip_window << 32) | image_pad_left; uint64_t image_rowXpadding_h =
image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_rowXstep_h =
image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t image_rowXpad_h_rowXstep_h =
image_rowXpadding_h | (image_rowXstep_h << 32);
uint64_t channelXpad_w =
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
uint64_t channelXstep_w =
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
uint64_t channelXpad_w_channelXstep_w =
channelXpad_w | (channelXstep_w << 32);
uint64_t filter_row_align =
C_align_64 * (uint64_t)args.kernel.width;
uint64_t sub_filter_amount_align = C_align_64 *
(uint64_t)args.kernel.width *
(uint64_t)args.kernel.height;
uint64_t filter_amount_align =
sub_filter_amount_align * (uint64_t)args.sub_conv_num;
uint64_t filter_param = filter_row_align | (filter_amount_align << 16) |
(sub_filter_amount_align << 32) |
(((uint64_t)args.sub_conv_num -1) << 48);
uint64_t channel_parameter =
(uint64_t)args.image.channels | (C_align_64 << 16);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO; ret = -EIO;
...@@ -1025,72 +1012,30 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -1025,72 +1012,30 @@ int ComputeDWConv(const struct DWconvArgs &args) {
return ret; return ret;
} }
/*restart scale*/ reg_writeq(image_physical_address, 0x808);
reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(result_addr_row, 0x810);
reg_writeq(kernel_padding_step, 0x818);
reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); reg_writeq(result_size_calcu_height, 0x820);
reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); reg_writeq(channel_parameter, 0x828);
reg_writeq((bias_physical_address << 32 | filter_physical_address), reg_writeq(image_row_col_padding_down, 0x830);
REG_DWCONV_FILTER_BASE_ADDR); reg_writeq(image_rowXpad_h_rowXstep_h, 0x838);
reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32), reg_writeq(0, 0x840);
REG_DWCONV_FILTER_SHAPE); reg_writeq(channelXpad_w_channelXstep_w, 0x848);
reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32), reg_writeq(filter_physical_address, 0x850);
REG_DWCONV_FILTER_SUBNUMBER); reg_writeq(filter_param, 0x858);
reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN); reg_writeq(((bias_physical_address+C_align_64*4) |
(bias_physical_address << 32)), 0x860);
reg_writeq( cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8);
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), reg_writeq(cmd, 0x800);
REG_POOLING_IMAGE_PIXEL);
reg_writeq(
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
REG_POOLING_WINDOW_SIZE);
reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32),
REG_POOLING_RESULT_PIXEL);
reg_writeq(((uint64_t)args.image.pad_height) |
(((uint64_t)args.image.pad_width) << 32),
REG_POOLING_PAD_PIXEL);
reg_writeq(((uint64_t)args.kernel.stride_h) |
(((uint64_t)args.kernel.stride_w) << 32),
REG_POOLING_STEP_PIXEL);
reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER);
reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW);
reg_writeq(image_row_mul_pooling_hight,
REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT);
reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT);
reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT);
reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32);
reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64);
reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT);
reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW);
/*SDK刷Cache保证数据一致性*/
reg_writeq(cmd, REG_DWCONV_CMD);
DLOG << "before reg poll"; DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
ret = -EIO; ret = -EIO;
DLOG << "Pooling Wait Irq Timeout!"; DLOG << "DWconv Wait Irq Timeout!";
PADDLE_MOBILE_ENFORCE(0, "DWConv Wait Irq Timeout"); PADDLE_MOBILE_ENFORCE(0, "DWConv Wait Irq Timeout");
} }
DLOG << "after reg poll"; DLOG << "after reg poll";
// *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
DLOG << "output_scale:" << output_scale;
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
#endif #endif
......
...@@ -37,7 +37,7 @@ bool AnchorGeneratorKernel<FPGA, float>::Init( ...@@ -37,7 +37,7 @@ bool AnchorGeneratorKernel<FPGA, float>::Init(
int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23, int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23,
-20, 39, 36, -43, -34, 59, 49, -63, -54, -20, 39, 36, -43, -34, 59, 49, -63, -54,
79, 69, -96, -77, 112, 93, -137, -118, 153, 79, 69, -96, -77, 112, 93, -137, -118, 153,
134, -204, -188, 220, 204, -281, -395, 296, 441}; 134, -204, -188, 220, 204, -281, -395, 296, 411};
int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103, int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103,
0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58, 0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58,
......
...@@ -12,12 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef ELEMENTWISEADD_OP #ifdef ELEMENTWISEADD_OP
#include <math.h>
#include "operators/kernel/elementwise_add_kernel.h" #include "operators/kernel/elementwise_add_kernel.h"
#include <string>
#include "fpga/V2/api.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -60,10 +57,36 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) { ...@@ -60,10 +57,36 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
return true; return true;
} }
// CPU implementation of the quantized element-wise add.
//
// Reads two int8 input images and writes an int8 output image, all taken from
// `ewaddArgs`. Each input is weighted by (its scale / output scale), using the
// first float stored at the respective scale_address; the weighted sum is
// rounded and saturated to [-127, 127].
//
// The element count is image0.channels * image0.height * image0.width; image1
// and the output are indexed over the same range.
// NOTE(review): the buffers are walked as densely packed data with no per-row
// alignment padding -- confirm this matches the actual buffer layout.
void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) {
  const int channelCount = ewaddArgs.image0.channels;
  const int rowCount = ewaddArgs.image0.height;
  const int colCount = ewaddArgs.image0.width;
  const int elementCount = channelCount * rowCount * colCount;

  // Only the first float behind each scale_address is consulted.
  const float scaleOut =
      *reinterpret_cast<float*>(ewaddArgs.output.scale_address);
  const float weight0 =
      *reinterpret_cast<float*>(ewaddArgs.image0.scale_address) / scaleOut;
  const float weight1 =
      *reinterpret_cast<float*>(ewaddArgs.image1.scale_address) / scaleOut;

  int8_t* const src0 = reinterpret_cast<int8_t*>(ewaddArgs.image0.address);
  int8_t* const src1 = reinterpret_cast<int8_t*>(ewaddArgs.image1.address);
  int8_t* const dst = reinterpret_cast<int8_t*>(ewaddArgs.output.address);

  const auto byteCount = elementCount * sizeof(int8_t);

  // Presumably drops stale cached copies of the inputs before the CPU reads
  // them -- confirm against the fpga_invalidate implementation.
  fpga::fpga_invalidate(src0, byteCount);
  fpga::fpga_invalidate(src1, byteCount);

  for (int idx = 0; idx < elementCount; ++idx) {
    const float blended = src0[idx] * weight0 + src1[idx] * weight1;
    int quantized = static_cast<int>(round(blended));
    // Saturate symmetrically; -128 is never produced.
    if (quantized > 127) {
      quantized = 127;
    } else if (quantized < -127) {
      quantized = -127;
    }
    dst[idx] = static_cast<int8_t>(quantized);
  }

  // Presumably writes the CPU-produced output back to memory for later
  // consumers -- confirm against the fpga_flush implementation.
  fpga::fpga_flush(dst, byteCount);
}
template <> template <>
void ElementwiseAddKernel<FPGA, float>::Compute( void ElementwiseAddKernel<FPGA, float>::Compute(
const ElementwiseAddParam<FPGA> &param) { const ElementwiseAddParam<FPGA> &param) {
fpga::ComputeFpgaEWAdd(param.FpgaArgs()); // fpga::ComputeFpgaEWAdd(param.FpgaArgs());
ComputeCPUEWAdd(param.FpgaArgs());
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef FUSION_ELEMENTWISEADDRELU_OP #ifdef FUSION_ELEMENTWISEADDRELU_OP
#include <math.h>
#include "operators/kernel/elementwise_add_relu_kernel.h" #include "operators/kernel/elementwise_add_relu_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -58,10 +58,37 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init( ...@@ -58,10 +58,37 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
return true; return true;
} }
// CPU implementation of the quantized element-wise add fused with ReLU.
//
// Reads two int8 input images and writes an int8 output image, all taken from
// `ewaddArgs`. Each input is weighted by (its scale / output scale), using the
// first float stored at the respective scale_address; the weighted sum is
// rounded and then clamped to [0, 127], which applies the ReLU and the int8
// saturation in one step.
//
// The element count is image0.channels * image0.height * image0.width; image1
// and the output are indexed over the same range.
// NOTE(review): the buffers are walked as densely packed data with no per-row
// alignment padding -- confirm this matches the actual buffer layout.
void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) {
  const int channelCount = ewaddArgs.image0.channels;
  const int rowCount = ewaddArgs.image0.height;
  const int colCount = ewaddArgs.image0.width;
  const int elementCount = channelCount * rowCount * colCount;

  // Only the first float behind each scale_address is consulted.
  const float scaleOut =
      *reinterpret_cast<float*>(ewaddArgs.output.scale_address);
  const float weight0 =
      *reinterpret_cast<float*>(ewaddArgs.image0.scale_address) / scaleOut;
  const float weight1 =
      *reinterpret_cast<float*>(ewaddArgs.image1.scale_address) / scaleOut;

  int8_t* const src0 = reinterpret_cast<int8_t*>(ewaddArgs.image0.address);
  int8_t* const src1 = reinterpret_cast<int8_t*>(ewaddArgs.image1.address);
  int8_t* const dst = reinterpret_cast<int8_t*>(ewaddArgs.output.address);

  const auto byteCount = elementCount * sizeof(int8_t);

  // Presumably drops stale cached copies of the inputs before the CPU reads
  // them -- confirm against the fpga_invalidate implementation.
  fpga::fpga_invalidate(src0, byteCount);
  fpga::fpga_invalidate(src1, byteCount);

  for (int idx = 0; idx < elementCount; ++idx) {
    const float blended = src0[idx] * weight0 + src1[idx] * weight1;
    int quantized = static_cast<int>(round(blended));
    // Upper bound is int8 saturation; lower bound of 0 is the ReLU.
    if (quantized > 127) {
      quantized = 127;
    } else if (quantized < 0) {
      quantized = 0;
    }
    dst[idx] = static_cast<int8_t>(quantized);
  }

  // Presumably writes the CPU-produced output back to memory for later
  // consumers -- confirm against the fpga_flush implementation.
  fpga::fpga_flush(dst, byteCount);
}
template <> template <>
void ElementwiseAddReluKernel<FPGA, float>::Compute( void ElementwiseAddReluKernel<FPGA, float>::Compute(
const ElementwiseAddReluParam<FPGA> &param) { const ElementwiseAddReluParam<FPGA> &param) {
fpga::ComputeFpgaEWAdd(param.FpgaArgs()); // fpga::ComputeFpgaEWAdd(param.FpgaArgs());
ComputeCPUEWAddRelu(param.FpgaArgs());
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册