From a1cc931d4af44728a11d3ebfeb125d46b3a9637a Mon Sep 17 00:00:00 2001 From: jameswu2014 <545426914@qq.com> Date: Wed, 8 May 2019 19:30:17 +0800 Subject: [PATCH] V2-conv-hellocase pass & V1 verify-pass (#1608) --- src/fpga/V2/api.cpp | 150 ++++++----- src/fpga/V2/api.h | 24 +- src/fpga/V2/pe.cpp | 238 ++++++++++-------- src/fpga/common/fpga_common.cpp | 6 +- src/fpga/common/fpga_common.h | 67 +++-- src/framework/executor.cpp | 4 +- .../kernel/fpga/V2/conv_add_bn_kernel.cpp | 11 +- .../fpga/V2/conv_add_bn_relu_kernel.cpp | 20 +- .../kernel/fpga/V2/conv_add_kernel.cpp | 5 +- .../kernel/fpga/V2/conv_add_relu_kernel.cpp | 5 +- .../kernel/fpga/V2/conv_bn_kernel.cpp | 5 +- .../kernel/fpga/V2/conv_bn_relu_kernel.cpp | 13 +- src/operators/kernel/fpga/V2/conv_kernel.cpp | 5 +- .../kernel/fpga/V2/conv_transpose_kernel.cpp | 13 +- .../kernel/fpga/V2/deconv_add_bn_kernel.cpp | 13 +- .../fpga/V2/deconv_add_bn_relu_kernel.cpp | 13 +- .../kernel/fpga/V2/deconv_add_kernel.cpp | 13 +- .../kernel/fpga/V2/deconv_add_relu_kernel.cpp | 13 +- .../kernel/fpga/V2/deconv_bn_relu_kernel.cpp | 13 +- src/operators/kernel/fpga/V2/feed_kernel.cpp | 1 - .../kernel/fpga/V2/fusion_fc_kernel.cpp | 5 +- .../kernel/fpga/V2/fusion_fc_relu_kernel.cpp | 5 +- 22 files changed, 355 insertions(+), 287 deletions(-) diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp index be95caa5d8..9e7c3bb6c3 100644 --- a/src/fpga/V2/api.cpp +++ b/src/fpga/V2/api.cpp @@ -22,6 +22,7 @@ limitations under the License. */ namespace paddle_mobile { namespace fpga { +#define USE_RELU 1 #define USE_BIAS 2 void format_image(framework::Tensor *image_tensor) { @@ -301,7 +302,9 @@ void expand_conv_arg(ConvArgs *arg) { ConvArgs args = *arg; auto fpga_bias_scale_len = - align_to_x(args.filter_num / args.group_num, 8) * args.group_num; + align_to_x(args.filter_num / args.group_num, BS_NUM_ALIGNMENT) * + args.group_num; + fpga_bias_scale_len = fpga_bias_scale_len / BIAS_SCALE_DMA_NUM; auto output_height = (args.image.height + args.image.pad_height * 2 - args.kernel.height) / @@ -325,7 +328,7 @@ void expand_conv_arg(ConvArgs *arg) { auto output_amount_per_row = align_to_x( (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num, - IMAGE_ALIGNMENT); + RESULT_ALIGNMENT); // find the opt partition strategy uint64_t res_win; @@ -335,10 +338,10 @@ void expand_conv_arg(ConvArgs *arg) { (args.image.channels * (args.kernel.width + (res_win - 1) * args.kernel.stride_w)), IMAGE_ALIGNMENT) / - 16 + + IMAGE_ALIGNMENT + 1) * args.kernel.height > - 2048) { + 256) { break; } } @@ -350,6 +353,7 @@ void expand_conv_arg(ConvArgs *arg) { if (((res_win % 2) != 0) && (res_win != 1)) { res_win = res_win - 1; } + PADDLE_MOBILE_ENFORCE(res_win >= 2, "window too bigger than fpga volume"); res_fit = res_win; auto block_num = (output_width + res_fit - 1) / res_fit; @@ -375,14 +379,14 @@ void expand_conv_arg(ConvArgs *arg) { align_to_x((args.image.channels * (args.kernel.width + (block_len - 1) * args.kernel.stride_w)), IMAGE_ALIGNMENT) / - 16 + + IMAGE_ALIGNMENT + 1; auto image_block_len_last = align_to_x( (args.image.channels * (args.kernel.width + (block_last - 1) * args.kernel.stride_w)), IMAGE_ALIGNMENT) / - 16 + + IMAGE_ALIGNMENT + 1; auto image_win_cnt = block_len; auto image_win_cnt_last = block_last; @@ -395,46 +399,85 @@ void expand_conv_arg(ConvArgs *arg) { (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) : 0; - // auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; - auto cmd = 0UL | USE_BIAS; + auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; + // auto cmd = 0UL | USE_BIAS; auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) | ((args.deconv_tx_param.sub_conv_num) << 8) | ((args.deconv_tx_param.omit_size) << 0); - (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); - (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); - (*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address); - (*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address) + - args.deconv_tx_param.out_addr_offset; - (*arg).driver.output_height = output_height; - (*arg).driver.output_width = output_width; + (*arg).driver.filter_per_group = filter_per_group; (*arg).driver.channel_per_group = channel_per_group; - (*arg).driver.image_amount_per_row = image_amount_per_row; (*arg).driver.image_one_pad_per_row = image_one_pad_per_row; - (*arg).driver.filter_amount_all = filter_amount_all; - (*arg).driver.output_amount_per_row = output_amount_per_row; + (*arg).driver.deconv_param = deconv_param; + // new + (*arg).driver.col_padding_up = args.image.pad_width * args.image.channels; + (*arg).driver.col_padding_down = image_one_pad_per_row; + (*arg).driver.row_padding_up = args.image.pad_height; + (*arg).driver.row_padding_down = args.image.pad_height + args.image.height; (*arg).driver.image_block_amount_per_row = image_block_amount_per_row; (*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel; + (*arg).driver.image_win_cnt = image_win_cnt; + (*arg).driver.image_win_cnt_last = image_win_cnt_last; + (*arg).driver.filter_row = args.kernel.width * args.image.channels; + (*arg).driver.filter_width = args.kernel.width; + (*arg).driver.filter_height = args.kernel.height; + (*arg).driver.skip_window = args.image.channels * args.kernel.stride_w; + (*arg).driver.stride_h = args.kernel.stride_h; + (*arg).driver.filter_amount_all = filter_amount_all; + (*arg).driver.prog_full_cnt = prog_full_cnt; + (*arg).driver.filter_align = args.filter_num / (4 * PE_COLUMN) + + (((args.filter_num % (4 * PE_COLUMN))) ? 1 : 0); + (*arg).driver.filter_num = args.filter_num; + (*arg).driver.output_width = output_width; + (*arg).driver.output_amount_per_row = output_amount_per_row; + (*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad; + (*arg).driver.cal_res_num = output_height / ROW_PARALLEL_NUM + + ((output_height % ROW_PARALLEL_NUM) ? 1 : 0) - 1; + (*arg).driver.last_cal_res_row_num = + (output_height % (ROW_PARALLEL_NUM)) + ? (output_height % (ROW_PARALLEL_NUM)) + : (ROW_PARALLEL_NUM); + + (*arg).driver.post_prog_full_cnt = post_prog_full_cnt; + (*arg).driver.deconv_skip_row = + ROW_PARALLEL_NUM * + args.deconv_tx_param.sub_conv_num; // paralvl*deconv_group + (*arg).driver.deconv_res_skip_row = + args.deconv_tx_param.sub_conv_num * + output_amount_per_row; // deconv_group * result_amount_per_row + (*arg).driver.deconv_ena = args.deconv_tx_param.deconv_en; + (*arg).driver.deconv_dump = args.deconv_tx_param.omit_size; + (*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address) + + args.deconv_tx_param.out_addr_offset; + (*arg).driver.output_height = output_height; + (*arg).driver.result_amount_per_row_multi_para = + output_amount_per_row / RESULT_ALIGNMENT * + (args.deconv_tx_param.deconv_en ? (*arg).driver.deconv_skip_row + : ROW_PARALLEL_NUM); + (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); + (*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len; + (*arg).driver.filter_amount_whole = filter_amount_all; + (*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address); + (*arg).driver.filters_amount_whole = + filter_amount_all * (*arg).driver.filter_align * (4 * PE_COLUMN); + (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); + (*arg).driver.image_hight = args.image.height; + (*arg).driver.image_amount_per_row = image_amount_per_row; (*arg).driver.image_amount_per_row_multi_win_first = image_amount_per_row_multi_win_first; (*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win; + (*arg).driver.filter_pad_hight = args.image.pad_height; (*arg).driver.image_block_num = image_block_num; (*arg).driver.image_block_len = image_block_len; (*arg).driver.image_block_len_last = image_block_len_last; - (*arg).driver.image_win_cnt = image_win_cnt; - (*arg).driver.image_win_cnt_last = image_win_cnt_last; - (*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad; - (*arg).driver.prog_full_cnt = prog_full_cnt; - (*arg).driver.post_prog_full_cnt = post_prog_full_cnt; - (*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len; + (*arg).driver.cmd = cmd; - (*arg).driver.deconv_param = deconv_param; } // expand_conv_arg() void expand_EW_arg(EWAddArgs *arg) { EWAddArgs args = *arg; - uint64_t cmd = 0; + uint64_t cmd = args.relu_enabled ? USE_RELU : 0; uint64_t datalen = (uint64_t)args.image0.width * (uint64_t)args.image0.height * (uint64_t)args.image0.channels; @@ -462,10 +505,8 @@ void expand_EW_arg(EWAddArgs *arg) { void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, framework::Tensor *out, framework::Tensor *filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int group_num, - int stride_h, int stride_w, int padding_h, int padding_w, - float *bs_ptr) { + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float *bs_ptr) { auto input_ptr = input->data(); auto filter_ptr = filter->data(); auto out_ptr = out->data(); @@ -473,6 +514,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->group_num = (uint32_t)group_num; // Either group_num or split_num = 1; + PADDLE_MOBILE_ENFORCE(group_num == 1, "group_num is not equal to 1"); arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1; arg->filter_num = (uint32_t)filter->dims()[0]; arg->output.address = out_ptr; @@ -511,9 +553,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, filter->dims()[3])); for (int i = 0; i < n; i++) { - arg->conv_arg[i].output.activation.activation_type = activation_enable; - arg->conv_arg[i].output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; + arg->conv_arg[i].relu_enabled = relu_enabled; arg->conv_arg[i].group_num = (uint32_t)group_num; arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; @@ -585,9 +625,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, framework::Tensor *out, framework::Tensor *filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int group_num, - int stride_h, int stride_w, int padding_h, int padding_w, + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float *bs_ptr) { auto input_ptr = input->data(); auto filter_ptr = filter->data(); @@ -713,12 +752,14 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, } for (int j = 0; j < split_num; ++j) { - arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type = - activation_enable; - arg->split_conv_args[i] - ->conv_arg[j] - .output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; + // arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type + // = + // activation_enable; + // arg->split_conv_args[i] + // ->conv_arg[j] + // .output.activation.leaky_relu_negative_slope = + // leaky_relu_negative_slope; + arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled; arg->split_conv_args[i]->conv_arg[j].group_num = (uint32_t)group_num; arg->split_conv_args[i]->conv_arg[j].kernel.width = @@ -831,16 +872,14 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, framework::Tensor *out, framework::Tensor *filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int stride_h, - int stride_w, int padding_h, int padding_w, - float *bias_ptr) { + bool relu_enabled, int stride_h, int stride_w, + int padding_h, int padding_w, float *bias_ptr) { auto filter_ptr = filter->data(); auto input_ptr = input->data(); auto output_ptr = out->mutable_data(); arg->sub_conv_num = 1; - arg->output.activation.activation_type = activation_enable; - arg->output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; + arg->relu_enabled = relu_enabled; + // arg->output.activation.activation_type = activation_enable; arg->bias_address = bias_ptr; arg->filter_address = filter_ptr; arg->kernel.height = (uint32_t)filter->dims()[2]; @@ -860,10 +899,8 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, framework::Tensor *out, framework::Tensor *filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int stride_h, - int stride_w, int padding_h, int padding_w, - float *bias_ptr) { + bool relu_enabled, int stride_h, int stride_w, + int padding_h, int padding_w, float *bias_ptr) { auto filter_ptr = filter->data(); auto input_ptr = input->data(); @@ -913,10 +950,11 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, arg->dw_conv_args.push_back(std::make_shared()); arg->dw_conv_args[i]->sub_conv_num = sub_conv_num; - // arg->dw_conv_args[i]->relu_enabled = relu_enabled; - arg->dw_conv_args[i]->output.activation.activation_type = activation_enable; - arg->dw_conv_args[i]->output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; + arg->dw_conv_args[i]->relu_enabled = relu_enabled; + // arg->dw_conv_args[i]->output.activation.activation_type = + // activation_enable; + // arg->dw_conv_args[i]->output.activation.leaky_relu_negative_slope = + // leaky_relu_negative_slope; arg->dw_conv_args[i]->bias_address = bias_ptr; arg->dw_conv_args[i]->filter_address = diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h index c8774f6ab2..d8674c4401 100644 --- a/src/fpga/V2/api.h +++ b/src/fpga/V2/api.h @@ -48,28 +48,20 @@ void format_concat_output(framework::Tensor* out, int height, int width, void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, framework::Tensor* out, framework::Tensor* filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int group_num, - int stride_h, int stride_w, int padding_h, int padding_w, - float* bs_ptr); + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float* bs_ptr); void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input, framework::Tensor* out, framework::Tensor* filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int group_num, - int stride_h, int stride_w, int padding_h, int padding_w, - float* bs_ptr); + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float* bs_ptr); void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input, framework::Tensor* out, framework::Tensor* filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int stride_h, - int stride_w, int padding_h, int padding_w, - float* bias_ptr); + bool relu_enabled, int stride_h, int stride_w, + int padding_h, int padding_w, float* bias_ptr); void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input, framework::Tensor* out, framework::Tensor* filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int stride_h, - int stride_w, int padding_h, int padding_w, - float* bs_ptr); + bool relu_enabled, int stride_h, int stride_w, + int padding_h, int padding_w, float* bs_ptr); void format_deconv_filter(framework::Tensor* filter_tensor, float max_value, int group_num, int stride); diff --git a/src/fpga/V2/pe.cpp b/src/fpga/V2/pe.cpp index 5bcaa9321b..cc9d8d20cd 100644 --- a/src/fpga/V2/pe.cpp +++ b/src/fpga/V2/pe.cpp @@ -115,6 +115,19 @@ using namespace std; // NOLINT /*conv*/ #define REG_CONV_CMD 0xC00 +#define REG_CONV_REG0 0xC08 +#define REG_CONV_REG1 0xC10 +#define REG_CONV_REG2 0xC18 +#define REG_CONV_REG3 0xC20 +#define REG_CONV_REG4 0xC28 +#define REG_CONV_REG5 0xC30 +#define REG_CONV_REG6 0xC38 +#define REG_CONV_REG7 0xC40 +#define REG_CONV_REG8 0xC48 +#define REG_CONV_REG9 0xC50 +#define REG_CONV_REG10 0xC58 +#define REG_CONV_REG11 0xC60 + #define REG_CONV_IMAGE_BASE_ADDR 0xC08 #define REG_CONV_FILTER_BASE_ADDR 0xC10 #define REG_CONV_SB_BASE_ADDR 0xC18 @@ -194,7 +207,7 @@ int ComputeFpgaConv(const struct SplitConvArgs &args) { int ComputeBasicConv(const struct ConvArgs &args) { #ifdef FPGA_PRINT_MODE DLOG << "======Compute Basic Conv======"; - // DLOG << " relu_enabled:" << args.relu_enabled + DLOG << " relu_enabled:" << args.relu_enabled; DLOG << " sb_address:" << args.sb_address << " filter_address:" << args.filter_address << " filter_num:" << args.filter_num @@ -218,23 +231,23 @@ int ComputeBasicConv(const struct ConvArgs &args) { int ret = 0; uint64_t output_scale = 0; - uint64_t reg_ActivationArgs = 0; + // uint64_t reg_ActivationArgs = 0; // active function:{none,leakeyrelu,sigmoid,tanh} - ActivationArgs active_args; + // ActivationArgs active_args; // active_args.activation_type = LEAKYRELU; - active_args.activation_type = args.output.activation.activation_type; + // active_args.activation_type = args.output.activation.activation_type; - active_args.leaky_relu_negative_slope = - args.output.activation.leaky_relu_negative_slope; + // active_args.leaky_relu_negative_slope = + // args.output.activation.leaky_relu_negative_slope; - reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - active_args.leaky_relu_negative_slope; + // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | + // active_args.leaky_relu_negative_slope; - DLOG << " activation_type:" << active_args.activation_type - << " leaky_relu_negative_slope:" - << active_args.leaky_relu_negative_slope; - DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; + // DLOG << " activation_type:" << active_args.activation_type + // << " leaky_relu_negative_slope:" + // << active_args.leaky_relu_negative_slope; + // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { @@ -243,63 +256,71 @@ int ComputeBasicConv(const struct ConvArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } + // new + reg_writeq((args.driver.row_padding_down << 45) | + (args.driver.row_padding_up << 34) | + (args.driver.col_padding_down << 17) | + args.driver.col_padding_up, + REG_CONV_REG0); + + reg_writeq((args.driver.image_win_cnt_last << 50) | + (args.driver.image_win_cnt << 39) | + (args.driver.image_block_amount_per_row << 20) | + args.driver.filter_pad_width_mul_channel, + REG_CONV_REG1); + + reg_writeq((args.driver.stride_h << 48) | (args.driver.skip_window << 28) | + (args.driver.filter_row << 8) | + (args.driver.filter_height << 4) | args.driver.filter_width, + REG_CONV_REG2); + + reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) | + (args.driver.prog_full_cnt << 16) | + args.driver.filter_amount_all, + REG_CONV_REG3); + + reg_writeq((args.driver.post_prog_full_cnt << 54) | + (args.driver.last_cal_res_row_num << 50) | + (args.driver.cal_res_num << 39) | + (args.driver.res_row_data_align4_pad << 35) | + (args.driver.output_amount_per_row << 16) | + args.driver.output_width, + REG_CONV_REG4); + + reg_writeq((args.driver.deconv_dump << 40) | (args.driver.deconv_ena << 39) | + (args.driver.deconv_res_skip_row << 7) | + args.driver.deconv_skip_row, + REG_CONV_REG5); + + reg_writeq((args.driver.result_amount_per_row_multi_para << 43) | + (args.driver.output_height << 32) | + args.driver.output_address_phy, + REG_CONV_REG6); + + reg_writeq((args.driver.filter_amount_whole << 48) | + (args.driver.fpga_bias_scale_len << 32) | + args.driver.sb_address_phy, + REG_CONV_REG7); - reg_writeq(reg_ActivationArgs, - REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_CONV_IMAGE_PIXEL); reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_CONV_FILTER_PIXEL); - - uint64_t output_height_fraction = - args.driver.output_height / ROW_PARALLEL_NUM; - uint64_t output_height_remainder = - args.driver.output_height % ROW_PARALLEL_NUM; - reg_writeq(args.driver.output_height | (output_height_fraction << 16) | - (output_height_remainder << 26) | - (args.driver.output_width << 32), - REG_CONV_RESULT_PIXEL); - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_CONV_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_CONV_STEP_PIXEL); - reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER); - reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER); - reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER); - reg_writeq(*(uint64_t *)args.image.scale_address, // NOLINT - REG_CONV_IMAGE_SCALE); - reg_writeq(*(uint64_t *)args.filter_scale_address, // NOLINT - REG_CONV_FILTER_SCALE); - reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR); - reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR); - reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR); - reg_writeq(args.driver.output_address_phy, REG_CONV_RESULT_BASE_ADDR); - reg_writeq(args.driver.filter_per_group, REG_CONV_FILTER_PER_GROUP); - reg_writeq(args.driver.channel_per_group, REG_CONV_CHANNEL_PER_GROUP); - reg_writeq(args.driver.image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW); - reg_writeq(args.driver.image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(args.driver.filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL); - reg_writeq(args.driver.output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW); - reg_writeq(args.driver.image_block_amount_per_row, 0xca8); - reg_writeq(args.driver.filter_pad_width_mul_channel, 0xcb0); - reg_writeq(args.driver.image_amount_per_row_multi_win_first, 0xcb8); - reg_writeq(args.driver.image_amount_per_row_multi_win, 0xcc0); - reg_writeq(args.driver.image_block_num, 0xcc8); - reg_writeq(args.driver.image_block_len, 0xcd0); - reg_writeq(args.driver.image_block_len_last, 0xcd8); - reg_writeq(args.driver.image_win_cnt, 0xce0); - reg_writeq(args.driver.image_win_cnt_last, 0xce8); - reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8); - reg_writeq(args.driver.prog_full_cnt, 0xd08); - reg_writeq(args.driver.post_prog_full_cnt, 0xd10); - reg_writeq(args.driver.deconv_param, 0xd18); - reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20); + (args.driver.filters_amount_whole << 32) | args.driver.filter_address_phy, + REG_CONV_REG8); + + reg_writeq((args.driver.image_amount_per_row << 43) | + (args.driver.image_hight << 32) | + args.driver.image_address_phy, + REG_CONV_REG9); + + reg_writeq((args.driver.filter_pad_hight << 46) | + (args.driver.image_amount_per_row_multi_win << 23) | + args.driver.image_amount_per_row_multi_win_first, + REG_CONV_REG10); + + reg_writeq((args.driver.image_block_num << 48) | + (args.driver.image_block_len << 24) | + args.driver.image_block_len_last, + REG_CONV_REG11); + reg_writeq(args.driver.cmd, REG_CONV_CMD); if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; @@ -307,12 +328,7 @@ int ComputeBasicConv(const struct ConvArgs &args) { DLOG << "Conv Wait Irq Timeout!"; PADDLE_MOBILE_ENFORCE(0, "Conv Wait Irq Timeout"); } - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - - active_args.activation_type = NONE; - reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); + DLOG << "after reg poll"; pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); @@ -350,22 +366,22 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; - uint64_t reg_ActivationArgs = 0; + // uint64_t reg_ActivationArgs = 0; // active function:{none,leakeyrelu,sigmoid,tanh} - ActivationArgs active_args; + // ActivationArgs active_args; // active_args.activation_type = LEAKYRELU; - active_args.activation_type = args.output.activation.activation_type; + // active_args.activation_type = args.output.activation.activation_type; - active_args.leaky_relu_negative_slope = - args.output.activation.leaky_relu_negative_slope; + // active_args.leaky_relu_negative_slope = + // args.output.activation.leaky_relu_negative_slope; - reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - active_args.leaky_relu_negative_slope; + // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | + // active_args.leaky_relu_negative_slope; - DLOG << " activation_type:" << active_args.activation_type - << " leaky_relu_negative_slope:" - << active_args.leaky_relu_negative_slope; - DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; + // DLOG << " activation_type:" << active_args.activation_type + // << " leaky_relu_negative_slope:" + // << active_args.leaky_relu_negative_slope; + // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; image_physical_address = vaddr_to_paddr_driver(args.image.address); output_physical_address = vaddr_to_paddr_driver(args.output.address); @@ -417,10 +433,10 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { return ret; } - reg_writeq(reg_ActivationArgs, - REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion + // reg_writeq(reg_ActivationArgs, + // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - reg_writeq(output_scale, REG_SCALE_PARAMETER); + // reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); reg_writeq( @@ -462,12 +478,12 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { DLOG << "after reg poll"; // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); + // output_scale = reg_readq(REG_SCALE_PARAMETER); + // output_scale = (output_scale << 32) | (output_scale >> 32); + // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - active_args.activation_type = NONE; - reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); + // active_args.activation_type = NONE; + // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); @@ -479,7 +495,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #ifdef FPGA_PRINT_MODE DLOG << "=============ComputeFpgaEWAdd==========="; - // DLOG << " relu_enabled:" << args.relu_enabled + DLOG << " relu_enabled:" << args.relu_enabled; DLOG << " const0:" << fp16_2_fp32(int16_t(args.const0)) << " const1:" << fp16_2_fp32(int16_t(args.const1)); DLOG << " image0_address:" << args.image0.address @@ -503,17 +519,17 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { int ret = 0; uint64_t output_scale = 0; - uint64_t reg_ActivationArgs = 0; - ActivationArgs active_args; - active_args.activation_type = args.output.activation.activation_type; - active_args.leaky_relu_negative_slope = - args.output.activation.leaky_relu_negative_slope; - reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - active_args.leaky_relu_negative_slope; - DLOG << " activation_type:" << active_args.activation_type - << " leaky_relu_negative_slope:" - << active_args.leaky_relu_negative_slope; - DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; + // uint64_t reg_ActivationArgs = 0; + // ActivationArgs active_args; + // active_args.activation_type = args.output.activation.activation_type; + // active_args.leaky_relu_negative_slope = + // args.output.activation.leaky_relu_negative_slope; + // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | + // active_args.leaky_relu_negative_slope; + // DLOG << " activation_type:" << active_args.activation_type + // << " leaky_relu_negative_slope:" + // << active_args.leaky_relu_negative_slope; + // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { @@ -523,8 +539,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { return ret; } - reg_writeq(reg_ActivationArgs, - REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion + // reg_writeq(reg_ActivationArgs, + // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); @@ -543,11 +559,11 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!"); } - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - active_args.activation_type = NONE; - reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); + // output_scale = reg_readq(REG_SCALE_PARAMETER); + // output_scale = (output_scale << 32) | (output_scale >> 32); + // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); + // active_args.activation_type = NONE; + // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp index 5ac45847dc..52ad3565b3 100644 --- a/src/fpga/common/fpga_common.cpp +++ b/src/fpga/common/fpga_common.cpp @@ -200,10 +200,10 @@ uint64_t vaddr_to_paddr(void *address) { } uint32_t paddle_mobile_version() { - uint32_t v_master = 35; - uint32_t v_slave = 35; + uint32_t v_master = 52; + uint32_t v_slave = 52; - uint32_t first = 1, second = 2, fourth_master = 1, fourth_slave = 2; + uint32_t first = 1, second = 2, fourth_master = 1, fourth_slave = 1; uint32_t master = first << 24 | second << 16 | v_master << 8 | fourth_master; uint32_t slave = first << 24 | second << 16 | v_slave << 8 | fourth_slave; diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index f13492a2d1..f75add11c3 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -32,8 +32,12 @@ limitations under the License. */ #define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32 #define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16 #define BS_NUM_ALIGNMENT (8) +#define BIAS_SCALE_DMA_NUM (4) +#define RESULT_ALIGNMENT (32) +#define PE_COLUMN (8) +#define ROW_PARALLEL_NUM (2) #define BIAS_NUM_ALIGNMENT (16) -#define ROW_PARALLEL_NUM (3) + #endif namespace paddle_mobile { @@ -89,37 +93,59 @@ struct ImageOutputArgs { }; struct ConvDriverParam { - uint64_t image_address_phy; - uint64_t filter_address_phy; - uint64_t sb_address_phy; - uint64_t output_address_phy; - - uint64_t output_height; - uint64_t output_width; uint64_t filter_per_group; uint64_t channel_per_group; - uint64_t image_amount_per_row; uint64_t image_one_pad_per_row; - uint64_t filter_amount_all; - uint64_t output_amount_per_row; + uint64_t deconv_param; + + uint64_t col_padding_up; + uint64_t col_padding_down; + uint64_t row_padding_up; + uint64_t row_padding_down; uint64_t image_block_amount_per_row; uint64_t filter_pad_width_mul_channel; - uint64_t image_amount_per_row_multi_win_first; - uint64_t image_amount_per_row_multi_win; - uint64_t image_block_num; - uint64_t image_block_len; - uint64_t image_block_len_last; uint64_t image_win_cnt; uint64_t image_win_cnt_last; - uint64_t res_row_data_align4_pad; + uint64_t filter_row; + uint64_t filter_width; + uint64_t filter_height; + uint64_t skip_window; + uint64_t stride_h; + uint64_t filter_amount_all; uint64_t prog_full_cnt; + uint64_t filter_align; + uint64_t filter_num; + uint64_t output_width; + uint64_t output_amount_per_row; + uint64_t res_row_data_align4_pad; + uint64_t cal_res_num; + uint64_t last_cal_res_row_num; uint64_t post_prog_full_cnt; + uint64_t deconv_skip_row; // paralvl*deconv_group + uint64_t deconv_res_skip_row; // deconv_group * result_amount_per_row + uint64_t deconv_ena; + uint64_t deconv_dump; + uint64_t output_address_phy; + uint64_t output_height; + uint64_t result_amount_per_row_multi_para; + uint64_t sb_address_phy; uint64_t fpga_bias_scale_len; - uint64_t cmd; + uint64_t filter_amount_whole; + uint64_t filter_address_phy; + uint64_t filters_amount_whole; + uint64_t image_address_phy; + uint64_t image_hight; + uint64_t image_amount_per_row; + uint64_t image_amount_per_row_multi_win_first; + uint64_t image_amount_per_row_multi_win; + uint64_t filter_pad_hight; + uint64_t image_block_num; + uint64_t image_block_len; + uint64_t image_block_len_last; - uint64_t deconv_param; + uint64_t cmd; }; struct EWAddDriverParam { @@ -141,6 +167,7 @@ struct DeconvTxParm { }; struct ConvArgs { + bool relu_enabled; void* sb_address; // scale and bias void* filter_address; float* filter_scale_address; @@ -209,6 +236,7 @@ struct PoolingArgs { }; struct EWAddArgs { + bool relu_enabled; uint32_t const0; // output0 = const0 x input0 + const1 x input1; uint32_t const1; struct ImageInputArgs image0; @@ -238,6 +266,7 @@ struct DeconvArgs { }; struct DWconvArgs { uint32_t sub_conv_num; + bool relu_enabled; void* bias_address; void* filter_address; struct KernelArgs kernel; diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index a4b74f9606..5a6c29e5b5 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include "framework/executor.h" #include +#include #include #include #include "common/enforce.h" @@ -638,7 +639,8 @@ std::map LoadQuantValFromFile(std::string filename) { std::ifstream in; in.open(filename, std::ios::in); if (!in.is_open()) { - std::cout << "open File Failed." << std::endl; + // std::cout << "open File Failed." << std::endl; + DLOG << "open File Failed."; exit(-1); } diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp index 9289339123..2e4a8871fc 100644 --- a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp @@ -22,6 +22,7 @@ namespace operators { template <> bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { + bool relu_enabled = false; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; @@ -34,7 +35,7 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); @@ -64,10 +65,10 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); delete new_scale; diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp index 270dbf459f..8bf1ead85c 100644 --- a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp @@ -23,9 +23,9 @@ namespace operators { template <> bool ConvAddBNReluKernel::Init( FusionConvAddBNReluParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; + bool relu_enabled = true; + // paddle_mobile::fpga::ActivationType activation_enable = + // paddle_mobile::fpga::LEAKYRELU; auto input = const_cast(param->Input()); auto bias = param->Bias(); auto bias_ptr = bias->data(); @@ -34,7 +34,7 @@ bool ConvAddBNReluKernel::Init( const int groups = param->Groups(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); vector paddings = param->Paddings(); vector strides = param->Strides(); auto bn_mean_ptr = param->InputMean()->data(); @@ -70,17 +70,17 @@ bool ConvAddBNReluKernel::Init( if (groups == channel) { fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); fpga::DWconvArgs dwconv_arg = {0}; - fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, strides[0], strides[1], - paddings[0], paddings[1], new_bias_ptr); + fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, relu_enabled, + strides[0], strides[1], paddings[0], paddings[1], + new_bias_ptr); param->SetFpgaArgs(dwconv_arg); fpga::fpga_free(bs_ptr); } else { fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), strides[0], - strides[1], paddings[0], paddings[1], bs_ptr); + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), strides[0], strides[1], paddings[0], + paddings[1], bs_ptr); param->SetFpgaArgs(conv_arg); } delete new_scale; diff --git a/src/operators/kernel/fpga/V2/conv_add_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp index 2557dc262e..d0a08abdda 100644 --- a/src/operators/kernel/fpga/V2/conv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp @@ -31,7 +31,7 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); @@ -45,8 +45,7 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), + fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(), param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); diff --git a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp index 793a3de414..508e835b67 100644 --- a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp @@ -31,7 +31,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); @@ -45,8 +45,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), + fpga::fill_split_arg(&conv_arg, input, out, filter, true, param->Groups(), param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); diff --git a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp index 74615b6bdd..d3de98705e 100644 --- a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp @@ -30,7 +30,7 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -56,8 +56,7 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), + fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(), param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); diff --git a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp index 9ed709e3e6..9ea962c111 100644 --- a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp @@ -29,7 +29,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -58,17 +58,16 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { if (groups == channel) { fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); fpga::DWconvArgs dwconv_arg = {0}; - fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Strides()[0], - param->Strides()[1], param->Paddings()[0], - param->Paddings()[1], new_bias_ptr); + fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, true, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], + new_bias_ptr); param->SetFpgaArgs(dwconv_arg); fpga::fpga_free(bs_ptr); } else { fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), + fpga::fill_split_arg(&conv_arg, input, out, filter, true, param->Groups(), param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); diff --git a/src/operators/kernel/fpga/V2/conv_kernel.cpp b/src/operators/kernel/fpga/V2/conv_kernel.cpp index 0bc70a9472..9a003543d5 100644 --- a/src/operators/kernel/fpga/V2/conv_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_kernel.cpp @@ -29,7 +29,7 @@ bool ConvKernel::Init(ConvParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); int channel = out->dims()[1]; auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT @@ -40,8 +40,7 @@ bool ConvKernel::Init(ConvParam *param) { fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), + fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(), param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); diff --git a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp index acc19401c8..c09e1ced8a 100644 --- a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp @@ -31,7 +31,7 @@ bool ConvTransposeKernel::Init(ConvTransposeParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); int channel = out->dims()[1]; @@ -58,8 +58,7 @@ bool ConvTransposeKernel::Init(ConvTransposeParam *param) { fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false, param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(DWDeconv_arg); @@ -70,10 +69,10 @@ bool ConvTransposeKernel::Init(ConvTransposeParam *param) { } fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(deconv_arg); } return true; diff --git a/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp index 614974b2ac..1dcb5d7d41 100644 --- a/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp @@ -33,7 +33,7 @@ bool DeconvAddBNKernel::Init(FusionDeconvAddBNParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; @@ -61,8 +61,7 @@ bool DeconvAddBNKernel::Init(FusionDeconvAddBNParam *param) { fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false, param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(DWDeconv_arg); @@ -73,10 +72,10 @@ bool DeconvAddBNKernel::Init(FusionDeconvAddBNParam *param) { } fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(deconv_arg); } return true; diff --git a/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp index 972dbdef63..4c8b4ec3c2 100644 --- a/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp @@ -34,7 +34,7 @@ bool DeconvAddBNReluKernel::Init( auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; @@ -62,8 +62,7 @@ bool DeconvAddBNReluKernel::Init( fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true, param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(DWDeconv_arg); @@ -74,10 +73,10 @@ bool DeconvAddBNReluKernel::Init( } fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(deconv_arg); } return true; diff --git a/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp index 9ce3319762..179d58ac99 100644 --- a/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp @@ -33,7 +33,7 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; @@ -61,8 +61,7 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false, param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(DWDeconv_arg); @@ -73,10 +72,10 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { } fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(deconv_arg); } diff --git a/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp index 50ae9764ea..c7e728a169 100644 --- a/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp @@ -34,7 +34,7 @@ bool DeconvAddReluKernel::Init( auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; @@ -57,8 +57,7 @@ bool DeconvAddReluKernel::Init( fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true, param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(DWDeconv_arg); @@ -69,10 +68,10 @@ bool DeconvAddReluKernel::Init( } fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(deconv_arg); } return true; diff --git a/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp index a1e69f57b9..081087b7ad 100644 --- a/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp @@ -35,7 +35,7 @@ bool DeconvBNReluKernel::Init( auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; + float Sf = fpga::filter_find_max(filter); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -80,18 +80,17 @@ bool DeconvBNReluKernel::Init( fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true, param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(DWDeconv_arg); } else { fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(deconv_arg); } delete new_scale; diff --git a/src/operators/kernel/fpga/V2/feed_kernel.cpp b/src/operators/kernel/fpga/V2/feed_kernel.cpp index a706c48e12..ec47dca386 100644 --- a/src/operators/kernel/fpga/V2/feed_kernel.cpp +++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp @@ -44,7 +44,6 @@ void FeedKernel::Compute(const FeedParam ¶m) { } fpga::format_image(input); output->ShareDataWith(*input); - input->external_data = nullptr; } template class FeedKernel; diff --git a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp index 6f4e096112..4767b08e73 100644 --- a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp @@ -20,6 +20,7 @@ namespace operators { template <> bool FusionFcKernel::Init(FusionFcParam *param) { + bool relu_enabled = false; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; @@ -58,8 +59,8 @@ bool FusionFcKernel::Init(FusionFcParam *param) { fpga::format_ofm(out); fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable, - leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr); + fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, + 0, 0, bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp b/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp index bc4fc96829..9748327355 100644 --- a/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp @@ -20,6 +20,7 @@ namespace operators { template <> bool FusionFcReluKernel::Init(FusionFcReluParam *param) { + bool relu_enabled = false; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; @@ -58,8 +59,8 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { fpga::format_ofm(out); fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable, - leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr); + fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, + 0, 0, bs_ptr); param->SetFpgaArgs(conv_arg); return true; } -- GitLab