diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp index 03d305930a88da3ccc43a8234827f34cfb12cc69..f1d19364f89cfa7118397ab7f33db66c3a78785d 100644 --- a/src/fpga/V2/api.cpp +++ b/src/fpga/V2/api.cpp @@ -495,6 +495,8 @@ void expand_EW_arg(EWAddArgs *arg) { uint64_t image_amount_per_row = align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, IMAGE_ALIGNMENT); + uint64_t image_amount_per_row_p = align_to_x( + (uint64_t)args.image0.width * (uint64_t)args.image0.channels, 16); uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | ((uint64_t)args.image0.width << 16) | (uint64_t)args.image0.height; @@ -503,7 +505,8 @@ void expand_EW_arg(EWAddArgs *arg) { (*arg).driver.image1_address_phy = image1_address_phy; (*arg).driver.datalen = datalen; (*arg).driver.image_image_pixel = image_image_pixel; - (*arg).driver.image_amount_per_row = image_amount_per_row; + (*arg).driver.image_amount_per_row = + (uint64_t)image_amount_per_row | (uint64_t)(image_amount_per_row_p << 32); (*arg).driver.output_address_phy = output_address_phy; (*arg).driver.coefficient = coefficient; (*arg).driver.cmd = cmd; diff --git a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp index 5e426e1cc3706bb5b385fc642a26e13f6a0a5d91..43b9355c99be4a22781cac10309a24c7dd3ac76c 100644 --- a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp +++ b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp @@ -25,9 +25,6 @@ template <> bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; auto *input_x = const_cast(param->InputX()); auto input_x_ptr = input_x->data(); auto input_y_ptr = input_y->data(); @@ -39,11 +36,9 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { float C1 = Si_1 / So; float C2 = Si_2 / So; fpga::EWAddArgs ewaddArgs = {0}; - ewaddArgs.output.activation.activation_type = activation_enable; - ewaddArgs.output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; ewaddArgs.const0 = fpga::fp32_2_fp16(C1); ewaddArgs.const1 = fpga::fp32_2_fp16(C2); + ewaddArgs.relu_enabled = 0; ewaddArgs.image0.address = input_x_ptr; ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; ewaddArgs.image0.scale_address = input_x->scale; diff --git a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp index 4c477b83af83c1ece9d2eec03a20317c14c09d90..6d5ad505732f58cfc9a50f8627a07956cd96d45c 100644 --- a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp @@ -21,9 +21,6 @@ namespace operators { template <> bool ElementwiseAddReluKernel::Init( ElementwiseAddReluParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; auto *input_x = const_cast(param->InputX()); auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); @@ -37,9 +34,7 @@ bool ElementwiseAddReluKernel::Init( float C1 = Si_1 / So; float C2 = Si_2 / So; fpga::EWAddArgs ewaddArgs = {0}; - ewaddArgs.output.activation.activation_type = activation_enable; - ewaddArgs.output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; + ewaddArgs.relu_enabled = 1; ewaddArgs.const0 = fpga::fp32_2_fp16(C1); ewaddArgs.const1 = fpga::fp32_2_fp16(C2); ewaddArgs.image0.address = input_x_ptr; diff --git a/src/operators/kernel/fpga/V2/feed_kernel.cpp b/src/operators/kernel/fpga/V2/feed_kernel.cpp index 03ec75925e197343579383c3f427048654e9cf68..b797b3faf8c9f659bfa3caab7ee5a759f997ce88 100644 --- a/src/operators/kernel/fpga/V2/feed_kernel.cpp +++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp @@ -44,8 +44,19 @@ void FeedKernel::Compute(const FeedParam ¶m) { return; } fpga::format_image(input); - output->ShareDataWith(*input); - input->external_data = nullptr; + + auto output_ptr = output->data(); + int channel = output->dims()[1]; + int height = output->dims()[2]; + int width = output->dims()[3]; + int size = fpga::align_to_x(channel * width, IMAGE_ALIGNMENT) * height; + auto input_ptr = input->data(); + fpga::fpga_invalidate(input_ptr, size * sizeof(int8_t)); + memcpy(output_ptr, input_ptr, size * sizeof(int8_t)); + + fpga::fpga_flush(output_ptr, + fpga::align_to_x(channel * width, IMAGE_ALIGNMENT) * height * + sizeof(int8_t)); } template class FeedKernel; diff --git a/src/operators/kernel/fpga/V2/proposal_kernel.cpp b/src/operators/kernel/fpga/V2/proposal_kernel.cpp index a0cc4ab61dc3728e70bb4d85875fe77fd851205d..ecc2577bd6ba9f8f21d4cccb94bdc27466b4a5d1 100644 --- a/src/operators/kernel/fpga/V2/proposal_kernel.cpp +++ b/src/operators/kernel/fpga/V2/proposal_kernel.cpp @@ -380,37 +380,54 @@ void ProposalKernel::Compute(const ProposalParam ¶m) { auto bbox_tmp_data = bbox_tmp->data(); int64_t amount_per_side = score_width * score_height; int idx = 0; - fpga::fpga_invalidate(input_score_data, score_height * score_width * - score_channels * sizeof(int8_t)); + int alignedCW = + fpga::align_to_x(score_width * score_channels, IMAGE_ALIGNMENT); + int unalignedCW = score_width * score_channels; + fpga::fpga_invalidate(input_score_data, + score_height * alignedCW * sizeof(int8_t)); for (int h = 0; h < score_height; h++) { for (int w = 0; w < score_width; w++) { for (int c = 0; c < score_channels; c++) { - idx++; - *(score_tmp_data + c * amount_per_side + score_width * h + w) = - (*(input_score_data++)); + if (alignedCW == unalignedCW) { + *(score_tmp_data + c * amount_per_side + score_width * h + w) = + (*(input_score_data++)); + } else { + idx = h * alignedCW + w * score_channels + c; + *(score_tmp_data + c * amount_per_side + score_width * h + w) = + input_score_data[idx]; + } } } } amount_per_side = bbox_width * bbox_height; - fpga::fpga_invalidate(input_bbox_data, bbox_height * bbox_width * - bbox_channels * sizeof(int8_t)); + alignedCW = fpga::align_to_x(bbox_width * bbox_channels, IMAGE_ALIGNMENT); + unalignedCW = bbox_width * bbox_channels; + fpga::fpga_invalidate(input_bbox_data, + bbox_height * alignedCW * sizeof(int8_t)); for (int h = 0; h < bbox_height; h++) { for (int w = 0; w < bbox_width; w++) { for (int c = 0; c < bbox_channels; c++) { - idx++; - *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) = - (*(input_bbox_data++)); + if (alignedCW == unalignedCW) { + *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) = + (*(input_bbox_data++)); + } else { + idx = h * alignedCW + w * bbox_channels + c; + *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) = + input_bbox_data[idx]; + } } } } auto score_tensor = param.float_score.get(); for (int i = 0; i < score_height * score_width * score_channels; i++) { - score_tensor->data()[i] = score_tmp_data[i] * input_score->scale[0]; + score_tensor->data()[i] = + score_tmp_data[i] / 127.0 * input_score->scale[0]; } auto bbox_tensor = param.float_bbox.get(); for (int i = 0; i < bbox_height * bbox_width * bbox_channels; i++) { - bbox_tensor->data()[i] = bbox_tmp_data[i] * input_bbox->scale[0]; + bbox_tensor->data()[i] = + bbox_tmp_data[i] / 127.0 * input_bbox->scale[0]; } auto *scores = param.float_score.get(); auto *bbox_deltas = param.float_bbox.get(); diff --git a/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp index b1aed80b30911468daea69fcfaee95b352124eab..b8b5202e27369a74430aa130db68501ff6891eec 100644 --- a/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp +++ b/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp @@ -103,7 +103,7 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { auto float_input_tensor = param.float_input.get(); auto float_input_data = float_input_tensor->data(); for (int i = 0; i < float_input_tensor->numel(); i++) { - float_input_data[i] = input_data[i] * Si; + float_input_data[i] = input_data[i] / 127.0 * Si; } auto* in = float_input_tensor; diff --git a/src/operators/kernel/fpga/V2/slice_kernel.cpp b/src/operators/kernel/fpga/V2/slice_kernel.cpp index a8412904000278a6e879f06e8bfa41e528cbcf72..a1500ecdb0246d4c7235de490437945ec381d5a4 100644 --- a/src/operators/kernel/fpga/V2/slice_kernel.cpp +++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp @@ -25,7 +25,7 @@ bool SliceKernel::Init(SliceParam* param) { fpga::format_ofm(output); DLOG << "input: " << param->input_; DLOG << "output: " << param->output_; - if (param->input_->type() != type_id()) { + if (param->input_->type() != type_id()) { DLOG << "wrong type"; } return true; diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp index d8b58b9d6deac74651e54b0fae5ae08b2ad9f7ec..07bf3bb807eb3d23300f77403847fb0e0e4ff3aa 100755 --- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp @@ -123,8 +123,8 @@ void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { float_out->Resize(in_x->dims()); float_out->init(type_id().hash_code()); fpga::format_ofm(float_out.get()); - auto float_out_data = float_out->data(); math::SoftmaxFuntor()(in_x, float_out.get()); + auto float_out_data = float_out->data(); int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT); for (int i = 0; i < dataNum; i++) { float tmp_out = float_out_data[i] * 127;