diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp
index f2562f9c53902a790849cf4bdac2e462031c02f6..83e68c51cf5bf6a0bc5bf07b96143b08b3b593e5 100644
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -40,8 +40,6 @@ void format_image(framework::Tensor *image_tensor) {
 void format_ofm(framework::Tensor *ofm_tensor) {
   if (ofm_tensor->type() == type_id<float>()) {
     format_fp32_ofm(ofm_tensor);
-  } else if (ofm_tensor->type() == type_id<half>()) {
-    format_fp16_ofm(ofm_tensor);
   } else {
     format_int8_ofm(ofm_tensor);
   }
@@ -85,24 +83,6 @@ void format_int8_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
   fpga::fpga_flush(p, memory_size);
 }
 
-void format_fp16_ofm(framework::Tensor *ofm_tensor) {
-  auto dims = ofm_tensor->dims();
-  size_t memory_size = 0;
-  if (dims.size() == 4) {
-    auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
-    memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
-                  sizeof(half);
-  } else if (dims.size() == 2) {
-    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
-  } else {
-    DLOG << "Wrong ofm dimension";
-  }
-  auto p = fpga_malloc(memory_size);
-  ofm_tensor->reset_data_ptr(p);
-  ofm_tensor->set_type(type_id<half>().hash_code());
-  ofm_tensor->fpga_data_num = memory_size / sizeof(half);
-}
-
 void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
@@ -208,9 +188,6 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
 
   deconv_filter::DWDconv_format_filter(&new_data, num, channel, height, width,
                                        scale_ptr, stride);
-  // framework::DDim dims_new =
-  //     framework::make_ddim({num, 1, height, width});
-  // filter_tensor->Resize(dims_new);
   filter_tensor->reset_data_ptr(new_data);
   filter_tensor->set_type(type_id<int8_t>().hash_code());
 }
@@ -314,13 +291,13 @@ void format_DWDeconv_data(framework::Tensor *filter_tensor,
                           framework::Tensor *ofm_tensor, float **bs_ptr,
                           int group, int sub_conv_n) {
   int channel = ofm_tensor->dims()[1];
-  // dw-deconv
   format_DWDconv_filter(
       filter_tensor,
       (reinterpret_cast<float *>(*bs_ptr) + sub_conv_n * channel),
       sub_conv_n);
   format_bias_array(bs_ptr, channel);
   format_ofm(ofm_tensor);
 }
+
 void expand_conv_arg(ConvArgs *arg) {
   ConvArgs args = *arg;
@@ -458,7 +435,6 @@ void expand_conv_arg(ConvArgs *arg) {
 
 void expand_EW_arg(EWAddArgs *arg) {
   EWAddArgs args = *arg;
-  // uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
   uint64_t cmd = 0;
   uint64_t datalen = (uint64_t)args.image0.width *
                      (uint64_t)args.image0.height *
diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h
index 01d900602f933459622d64790acebe090a41e294..c8774f6ab2da2c773297add227f2d5e99893815e 100644
--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -26,8 +26,6 @@ void format_image(framework::Tensor* image_tensor);
 void format_ofm(framework::Tensor* ofm_tensor);
 void format_int8_ofm(framework::Tensor* ofm_tensor);
 void format_int8_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
-void format_fp16_ofm(framework::Tensor* ofm_tensor);  // only allocate memory
-void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
 void format_fp32_ofm(framework::Tensor* ofm_tensor);
 
 float filter_find_max(framework::Tensor* filter_tensor);
diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp
index 8cbcc74c83714d21c6a0f5d11bd14a2298552d41..dc3c3356e838c88023d0efa1c40bf6f910aece89 100644
--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
@@ -100,7 +100,7 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
       align_each_in_area_cw =
           align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
       memcpy(
-          (int16_t *)image_out + tmp_channel +  // NOLINT
+          (int8_t *)image_out + tmp_channel +  // NOLINT
              k * align_each_out_area_cw_differ,
          images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw,
          channel_num[i] * sizeof(int8_t));
diff --git a/src/fpga/V2/image.h b/src/fpga/V2/image.h
index f5fe6493916f9b8ca55a4ccebcc614f3c839ef4b..11988ee11d070c6d91a79cdd682c3c2bc2f84570 100644
--- a/src/fpga/V2/image.h
+++ b/src/fpga/V2/image.h
@@ -26,10 +26,6 @@ void convert_to_hwc(float** data_in, int channel, int height, int width,
                     int num = 1);
 void convert_to_chw(float** data_in, int channel, int height, int width,
                     int num = 1);
-// template <typename Dtype>
-// void align_element_conv(Dtype** data_in, int height, int cw);
-// template <typename T>
-// void format_image(T** data_in, int channel, int height, int width);
 template <typename Dtype>
 void align_element_conv(Dtype** data_in, int height, int cw);
 template <typename T>
diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp
index 57bd162f02566ccb7b4cb5efa54c245abc51c350..5ac45847dcea320b249b192c66018a46523250f7 100644
--- a/src/fpga/common/fpga_common.cpp
+++ b/src/fpga/common/fpga_common.cpp
@@ -97,7 +97,7 @@ float fp16_2_fp32(int16_t fp16_num) {
   } else if (se_fp16 < 63) {
     e_fp32 = 0x80000000 + ((se_fp16 - 32) << 23);
     offset = 1024;
-  } else {  // se_fp16 == 63
+  } else {
     e_fp32 = 0xC7800000;
     offset = 1024;
   }
diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h
index a1532c6c876a02931164f61860f06be73ffba409..f13492a2d158065ae937de040bfc41d41ce22d5e 100644
--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -209,7 +209,6 @@ struct PoolingArgs {
 };
 struct EWAddArgs {
-  // bool relu_enabled;
   uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
   uint32_t const1;
   struct ImageInputArgs image0;
diff --git a/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp
index d744ae2c07810ae89418641799a37ea978d14139..d1138d06bbed8ac8435e0a671a7683229d237da5 100644
--- a/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp
@@ -35,7 +35,7 @@ bool ElementwiseMulKernel<FPGA, float>::Init(ElementwiseMulParam<FPGA> *param) {
   fpga::format_fp32_ofm(&(param->float_out));
 
   auto *out = param->Out();
-  fpga::format_fp16_ofm(out);
+  fpga::format_ofm(out);
 
   return true;
 }
diff --git a/src/operators/kernel/fpga/V2/feed_kernel.cpp b/src/operators/kernel/fpga/V2/feed_kernel.cpp
index 28559b2b4bb96404febf5cf65a75e264166df20f..5b918092f7966de60365d7ba5b706574c4ff178f 100644
--- a/src/operators/kernel/fpga/V2/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp
@@ -23,14 +23,14 @@ bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
   int col = param->Col();
   DLOG << "col = " << col;
   auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
-  input->init(type_id<float>().hash_code());
-  input->Resize(output->dims());
   if (output->dims().size() != 4) {
+    input->init(type_id<float>().hash_code());
     return true;
   }
-
-  fpga::format_fp16_ofm(output);
+  input->init(type_id<int8_t>().hash_code());
+  input->Resize(output->dims());
+  fpga::format_ofm(output);
   return true;
 }
 
@@ -39,15 +39,6 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   auto output = param.Out();
   int col = param.Col();
   auto input = const_cast<LoDTensor *>(&param.InputX()->at(col));
-  kTypeId_t input_type = input->type();
-
-  if (input_type == type_id<float>()) {
-    input->init(type_id<float>().hash_code());
-  } else {
-    input->init(type_id<int8_t>().hash_code());
-  }
-  input->Resize(output->dims());
-
   if (output->dims().size() != 4) {
     size_t size = output->numel() * sizeof(float);
     auto output_ptr = output->data<float>();
@@ -58,49 +49,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
     input->external_data = nullptr;
     return;
   }
-  fpga::format_image(input);
-  auto output_ptr = output->data<half>();
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
-  if (input_type == type_id<float>()) {
-    auto input_ptr = input->data<float>();
-    auto external_ptr = reinterpret_cast<float *>(input->external_data);
-    float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
-
-    args.input_data_type = fpga::DATA_TYPE_FP32;
-    args.output_data_type = fpga::DATA_TYPE_FP16;
-    args.input_layout_type = fpga::LAYOUT_CHW;
-    args.output_layout_type = fpga::LAYOUT_HWC;
-    args.image.address = p_data;
-    args.image.channels = (uint32_t)input->dims()[1];
-    args.image.height = (uint32_t)input->dims()[2];
-    args.image.width = (uint32_t)input->dims()[3];
-    args.image.pad_height = 0;
-    args.image.pad_width = 0;
-    args.output.address = output_ptr;
-    args.output.scale_address = output->scale;
-    fpga::PerformBypass(args);
-    input->external_data = nullptr;
-  } else {
-    auto input_ptr = input->data<int8_t>();
-    auto external_ptr = reinterpret_cast<int8_t *>(input->external_data);
-    int8_t *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
-
-    args.input_data_type = fpga::DATA_TYPE_INT8;
-    args.output_data_type = fpga::DATA_TYPE_FP16;
-    args.input_layout_type = fpga::LAYOUT_CHW;
-    args.output_layout_type = fpga::LAYOUT_HWC;
-    args.image.address = p_data;
-    args.image.channels = (uint32_t)input->dims()[1];
-    args.image.height = (uint32_t)input->dims()[2];
-    args.image.width = (uint32_t)input->dims()[3];
-    args.image.pad_height = 0;
-    args.image.pad_width = 0;
-    args.output.address = output_ptr;
-    args.output.scale_address = output->scale;
-    fpga::PerformBypass(args);
-    input->external_data = nullptr;
-  }
+  output->ShareDataWith(*input);
 }
 template class FeedKernel<FPGA, float>;
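Note on the ofm ("output feature map") formatting used above: with the fp16 path gone, format_ofm dispatches on the tensor's element type, and the remaining int8/fp32 variants size the buffer so that each row's channel * width extent is padded to the FPGA's row alignment. A minimal standalone sketch of that sizing arithmetic, assuming align_to_x rounds up to a multiple of x and that IMAGE_ALIGNMENT (from src/fpga/common/fpga_common.h) is 16; the helper names below are illustrative, not the library's:

    #include <cstddef>
    #include <cstdint>

    // Round n up to the next multiple of x; mirrors fpga::align_to_x.
    static size_t align_to_x(size_t n, size_t x) { return (n + x - 1) / x * x; }

    // Stand-in for IMAGE_ALIGNMENT (assumed 16 here).
    static const size_t kImageAlignment = 16;

    // Bytes needed for an N x C x H x W int8 ofm laid out HWC, with each of
    // the N * H rows padded so C * W is aligned, as in the 4-D branch of
    // format_int8_ofm.
    size_t int8_ofm_bytes(size_t n, size_t c, size_t h, size_t w) {
      return n * h * align_to_x(c * w, kImageAlignment) * sizeof(int8_t);
    }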
diff --git a/src/operators/kernel/fpga/V2/fetch_kernel.cpp b/src/operators/kernel/fpga/V2/fetch_kernel.cpp
index 87ede2af1ab2fa3225c0cd3e75c3fe0c8c8fb509..10b7eb8c01681bbbc3a373f4c0a23888fedda974 100644
--- a/src/operators/kernel/fpga/V2/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fetch_kernel.cpp
@@ -21,18 +21,16 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
   int col = param->Col();
   DLOG << "col = " << col;
   auto output = &(param->Out()->at(col));
+  output->init(type_id<float>().hash_code());
+  output->Resize(input->dims());
   if (input->type() == type_id<float>()) {
     return true;
   }
-  output->init(type_id<float>().hash_code());
-  output->Resize(input->dims());
-  fpga::format_fp32_ofm(output);
+  auto aligned_output = param->aligned_out;
   int outC = 1;
-  int outH = 1;
   int outW = 1;
   if (output->dims().size() == 4) {
     outC = output->dims()[1];
-    outH = output->dims()[2];
     outW = output->dims()[3];
   } else {  // 2
     outC = output->dims()[1];
@@ -40,27 +38,10 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
   int unalignedCW = outC * outW;
   int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
   if (alignedCW != unalignedCW) {
-    param->aligned_out.Resize(input->dims());
-    param->aligned_out.mutable_data<float>(input->dims());
-    fpga::fpga_flush(param->aligned_out.data<float>(),
-                     outH * unalignedCW * sizeof(float));
+    aligned_output.init(type_id<float>().hash_code());
+    aligned_output.Resize(input->dims());
+    fpga::format_fp32_ofm(&aligned_output);
   }
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP32;
-  args.input_layout_type = fpga::LAYOUT_CHW;
-  args.output_layout_type = fpga::LAYOUT_HWC;
-  args.image.address = input->data<half>();
-  args.image.channels = (uint32_t)(input->fpga_data_num);
-  args.image.height = 1;
-  args.image.width = 1;
-  args.image.pad_height = 0;
-  args.image.pad_width = 0;
-  args.output.address = output->data<float>();
-  args.output.scale_address = output->scale;
-  param->fpga_bypass_args = args;
-
   return true;
 }
 
 void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
@@ -83,22 +64,22 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
     return;
   }
 
-  fpga::BypassArgs args = param.fpga_bypass_args;
-  auto input_address = (input->data<half>());
-  args.image.address = static_cast<void *>(input_address);
-  float *outdata_ptr =
-      reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
+  auto input_address = input->data<int8_t>();
+  float Si = input->scale[0];
+  auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
+  auto outdata_ptr = output->data<float>();
   const int num_th = 32;
-  if (output->fpga_data_num < num_th) {
-    fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half));
-
+  fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(int8_t));
+  if (input->fpga_data_num < num_th) {
     for (int idx = 0; idx < product(input->dims()); ++idx) {
-      outdata_ptr[idx] = fpga::fp16_2_fp32(input_address[idx]);
+      outdata_ptr[idx] = input_address[idx] * Si;
     }
+    fpga::fpga_flush(outdata_ptr, product(input->dims()) * sizeof(float));
     return;
   }
-
-  fpga::PerformBypass(args);
+  for (int idx = 0; idx < input->fpga_data_num; ++idx) {
+    aligned_ptr[idx] = input_address[idx] * Si;
+  }
   int outC = 1;
   int outH = 1;
   int outW = 1;
@@ -110,16 +91,15 @@
     outC = output->dims()[1];
   }
 
-  fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
-                        output->fpga_data_num * sizeof(float));
   int unalignedCW = outC * outW;
   int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
   if (unalignedCW != alignedCW) {
-    auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
-    dealign(outdata_ptr, aligned_ptr, outC, outH, outW);
-    memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
+    dealign(aligned_ptr, outdata_ptr, outC, outH, outW);
     fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
+    return;
   }
+  memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
+  fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
 }
 template class FetchKernel<FPGA, float>;
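Note on the fetch path above: it no longer round-trips through a PerformBypass fp16-to-fp32 conversion. Instead it multiplies each int8 value by the tensor's scale Si on the CPU, staging into the aligned buffer when row widths need de-aligning. The dealign body sits outside this hunk; the sketch below assumes it strips per-row padding from align_to_x(C*W, IMAGE_ALIGNMENT) back down to C*W, which matches how the buffers are sized (names here are illustrative):

    #include <cstdint>

    static int align_to_x(int n, int x) { return (n + x - 1) / x * x; }
    static const int kImageAlignment = 16;  // assumed IMAGE_ALIGNMENT

    // Dequantize FPGA int8 output: value_fp32 = q * Si, Si = tensor scale[0].
    void dequantize(const int8_t *in, float *out, int n, float Si) {
      for (int i = 0; i < n; ++i) out[i] = in[i] * Si;
    }

    // Assumed behavior of dealign(src, dst, c, h, w): keep the first c*w
    // floats of each aligned row and drop the padding tail.
    void dealign_sketch(const float *src, float *dst, int c, int h, int w) {
      const int cw = c * w;
      const int aligned_cw = align_to_x(cw, kImageAlignment);
      for (int row = 0; row < h; ++row) {
        for (int i = 0; i < cw; ++i) {
          dst[row * cw + i] = src[row * aligned_cw + i];
        }
      }
    }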
diff --git a/src/operators/kernel/fpga/V2/proposal_kernel.cpp b/src/operators/kernel/fpga/V2/proposal_kernel.cpp
index bd6703bb81f1a4b70f2a3406b312160116ad38f5..a0cc4ab61dc3728e70bb4d85875fe77fd851205d 100644
--- a/src/operators/kernel/fpga/V2/proposal_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/proposal_kernel.cpp
@@ -32,9 +32,6 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
   param->rpn_rois_->mutable_data<float>({total, 4});
   param->rpn_probs_->mutable_data<float>({total, 1});
 
-  // DLOG << *param->rpn_rois_;
-  // DLOG << *param->rpn_probs_;
-
   param->float_bbox = std::make_shared<Tensor>();
   param->float_bbox->Resize(param->bbox_deltas_->dims());
   param->float_bbox->init(type_id<float>().hash_code());
@@ -44,29 +41,7 @@
   param->float_score->init(type_id<float>().hash_code());
   fpga::format_fp32_ofm(param->float_score.get());
 
-  auto input = param->bbox_deltas_;
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-  args.input_layout_type = fpga::LAYOUT_HWC;
-  args.output_layout_type = fpga::LAYOUT_HWC;
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP32;
-  args.image.address = input->data<half>();
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = param->float_bbox->mutable_data<float>();
-  args.output.scale_address = param->float_bbox->scale;
-  param->bbox_arg = args;
-
-  input = param->scores_;
-  args.image.address = input->data<half>();
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = param->float_score->mutable_data<float>();
-  args.output.scale_address = param->float_score->scale;
-  param->score_arg = args;
-
+  auto input = param->scores_;
   param->score_index_ = std::make_shared<Tensor>();
   param->score_index_->mutable_data<int32_t>({input->numel()});
   auto score_index = param->score_index_->data<int32_t>();
@@ -136,51 +111,17 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
     T bbox_center_x = 0, bbox_center_y = 0;
     T bbox_width = 0, bbox_height = 0;
 
-    /*
-    if (variances) {
-      bbox_center_x =
-          variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
-          anchor_center_x;
-      bbox_center_y = variances_data[i * len + 1] *
-                          bbox_deltas_data[i * len + 1] * anchor_height +
-                      anchor_center_y;
-      bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
-                                            bbox_deltas_data[i * len + 2],
-                                        kBBoxClipDefault)) *
-                   anchor_width;
-      bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
-                                             bbox_deltas_data[i * len + 3],
-                                         kBBoxClipDefault)) *
-                    anchor_height;
-    } else {
-    */
     bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
     bbox_center_y =
         bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
-
-    /*
-    bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
-                                      kBBoxClipDefault)) *
-                 anchor_width;
-    bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
-                                       kBBoxClipDefault)) *
-                  anchor_height;
-    */
     bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
     bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
-    // }
 
     proposals_data[i * len] = bbox_center_x - bbox_width / 2;
     proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
-    /*
-    //wong
-    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
-    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
-    //wong
-    */
     proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
     proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
   }
-  // return proposals;
 }
 
 template <class T>
@@ -252,8 +193,6 @@ static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
 template <class T>
 static inline T BBoxArea(const T *box, bool normalized) {
   if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
     return static_cast<T>(0.);
   } else {
     const T w = box[2] - box[0];
@@ -351,9 +290,6 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
   Tensor index_t;
   index_t.Resize({scores_slice.numel()});
   int *index = index_t.mutable_data<int>();
-  /*for (int i = 0; i < scores_slice.numel(); ++i) {
-    index[i] = i;
-  }*/
   std::memcpy(index, score_index.data<int32_t>(),
               scores_slice.numel() * sizeof(int));
@@ -397,7 +333,6 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
     return std::make_pair(bbox_sel, scores_filter);
   }
 
-  // Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);
   Tensor keep_nms =
       NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta, post_nms_top_n);
@@ -408,8 +343,6 @@
   proposals.mutable_data<T>({keep_nms.numel(), 4});   // original
   scores_sel.mutable_data<T>({keep_nms.numel(), 1});  // original
 
-  // proposals.mutable_data<T>({post_nms_top_n, 4});  // wong
-  // scores_sel.mutable_data<T>({post_nms_top_n, 1});  // wong
   CPUGather<T>(bbox_sel, keep_nms, &proposals);
   CPUGather<T>(scores_filter, keep_nms, &scores_sel);
   return std::make_pair(proposals, scores_sel);
@@ -418,13 +351,11 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
 template <>
 void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
   auto input_score = param.scores_;
-  auto input_score_data = input_score->data<half>();
-  auto input_score_data_tmp = input_score->data<half>();
+  auto input_score_data = input_score->data<int8_t>();
   uint32_t score_n, score_height, score_width, score_channels;
 
   auto input_bbox = param.bbox_deltas_;
-  auto input_bbox_data = input_bbox->data<half>();
-  auto input_bbox_data_tmp = input_bbox->data<half>();
+  auto input_bbox_data = input_bbox->data<int8_t>();
   uint32_t bbox_n, bbox_height, bbox_width, bbox_channels;
 
   score_n = (uint32_t)(input_score->dims()[0]);
@@ -439,61 +370,48 @@
   std::shared_ptr<Tensor> score_tmp = std::make_shared<Tensor>();
   score_tmp->Resize(param.scores_->dims());
-  score_tmp->mutable_data<half>();
+  score_tmp->mutable_data<int8_t>();
 
   std::shared_ptr<Tensor> bbox_tmp = std::make_shared<Tensor>();
   bbox_tmp->Resize(param.bbox_deltas_->dims());
-  bbox_tmp->mutable_data<half>();
+  bbox_tmp->mutable_data<int8_t>();
 
-  auto score_tmp_data = score_tmp->data<half>();
-  auto bbox_tmp_data = bbox_tmp->data<half>();
+  auto score_tmp_data = score_tmp->data<int8_t>();
+  auto bbox_tmp_data = bbox_tmp->data<int8_t>();
   int64_t amount_per_side = score_width * score_height;
   int idx = 0;
-  fpga::fpga_invalidate(
-      input_score_data_tmp,
-      score_height * score_width * score_channels * sizeof(half));
+  fpga::fpga_invalidate(input_score_data, score_height * score_width *
+                                              score_channels * sizeof(int8_t));
   for (int h = 0; h < score_height; h++) {
     for (int w = 0; w < score_width; w++) {
       for (int c = 0; c < score_channels; c++) {
         idx++;
-        // DLOG << "wong input_score: "<<
-        // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
         *(score_tmp_data + c * amount_per_side + score_width * h + w) =
-            (*(input_score_data_tmp++));
+            (*(input_score_data++));
       }
     }
   }
   amount_per_side = bbox_width * bbox_height;
-  fpga::fpga_invalidate(input_bbox_data_tmp, bbox_height * bbox_width *
-                                                 bbox_channels * sizeof(half));
+  fpga::fpga_invalidate(input_bbox_data, bbox_height * bbox_width *
+                                             bbox_channels * sizeof(int8_t));
   for (int h = 0; h < bbox_height; h++) {
     for (int w = 0; w < bbox_width; w++) {
       for (int c = 0; c < bbox_channels; c++) {
         idx++;
-        // DLOG << "wong input_score: "<<
-        // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
         *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
-            (*(input_bbox_data_tmp++));
+            (*(input_bbox_data++));
       }
     }
   }
-  struct paddle_mobile::fpga::BypassArgs temp_score_arg;
-  struct paddle_mobile::fpga::BypassArgs temp_bbox_arg;
-  temp_score_arg = param.score_arg;
-  temp_score_arg.image.address = score_tmp->data<half>();
-
-  temp_bbox_arg = param.bbox_arg;
-  temp_bbox_arg.image.address = bbox_tmp->data<half>();
   auto score_tensor = param.float_score.get();
-  fpga::PerformBypass(param.score_arg);
-  fpga::fpga_invalidate(score_tensor->data<float>(),
-                        score_tensor->numel() * sizeof(float));
-
+  for (int i = 0; i < score_height * score_width * score_channels; i++) {
+    score_tensor->data<float>()[i] = score_tmp_data[i] * input_score->scale[0];
+  }
   auto bbox_tensor = param.float_bbox.get();
-  fpga::PerformBypass(param.bbox_arg);
-  fpga::fpga_invalidate(bbox_tensor->data<float>(),
-                        bbox_tensor->numel() * sizeof(float));
-
+  for (int i = 0; i < bbox_height * bbox_width * bbox_channels; i++) {
+    bbox_tensor->data<float>()[i] = bbox_tmp_data[i] * input_bbox->scale[0];
+  }
   auto *scores = param.float_score.get();
   auto *bbox_deltas = param.float_bbox.get();
   auto *im_info = param.im_info_;
@@ -507,7 +425,6 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
 
   int pre_nms_top_n = param.pre_nms_topn_;
   int post_nms_top_n = param.post_nms_topn_;
-  // DLOG << " param.post_nms_topn_ : " << param.post_nms_topn_;
   float nms_thresh = param.nms_thresh_ / 2.0f;
   float min_size = param.min_size_;
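Note on the proposal changes above: the Compute path now reorders the int8 HWC tensors to CHW and dequantizes in place with scale[0], and BoxCoder performs the decode unconditionally (variance scaling and the kBBoxClipDefault clamp stay deleted). For reference, the kept decode works out to the per-anchor arithmetic below. The anchor width/height derivation is not visible in the hunk; the usual x2 - x1 + 1 pixel convention is an assumption here:

    #include <cmath>

    // Decode one [dx, dy, dw, dh] delta against an [x1, y1, x2, y2] anchor.
    void decode_box(const float anchor[4], const float delta[4], float out[4]) {
      const float aw = anchor[2] - anchor[0] + 1.0f;  // assumed +1 convention
      const float ah = anchor[3] - anchor[1] + 1.0f;
      const float acx = anchor[0] + aw / 2.0f;        // anchor center
      const float acy = anchor[1] + ah / 2.0f;

      const float cx = delta[0] * aw + acx;           // bbox_center_x
      const float cy = delta[1] * ah + acy;           // bbox_center_y
      const float w = std::exp(delta[2]) * aw;        // no clamp at this revision
      const float h = std::exp(delta[3]) * ah;

      out[0] = cx - w / 2.0f;
      out[1] = cy - h / 2.0f;
      out[2] = cx + w / 2.0f;  // the "- 1" variant remains commented out
      out[3] = cy + h / 2.0f;
    }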
diff --git a/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
index 32851aa93d3394de145c0bede69e7789d9073682..b1aed80b30911468daea69fcfaee95b352124eab 100644
--- a/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
@@ -31,21 +31,6 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
   param->float_input = std::make_shared<Tensor>();
   param->float_input->mutable_data<float>(param->input_x_->dims());
 
-  // param->float_output = std::make_shared<Tensor>();
-
-  auto input = param->input_x_;
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-  args.input_layout_type = fpga::LAYOUT_HWC;
-  args.output_layout_type = fpga::LAYOUT_HWC;
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP32;
-  args.image.address = input->data<half>();
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = param->float_input->mutable_data<float>();
-  args.output.scale_address = param->float_input->scale;
-  param->input_arg = args;
 
   auto* rois = param->input_rois_;
   int rois_num = rois->dims()[0];
@@ -53,81 +38,11 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
       {rois_num, param->output_->dims()[1], param->output_->dims()[2],
       param->output_->dims()[3]});
   param->output_->Resize(dims_out_new);
-  // fpga::format_fp16_ofm(param->output_);
   param->output_->mutable_data<float>(dims_out_new);
-  //  auto output = param->float_output.get();
-  //  param->output_ = output;
-  /*    args.input_data_type = fpga::DATA_TYPE_FP32;
-    args.output_data_type = fpga::DATA_TYPE_FP16;
-    args.image.address = output->data<float>();
-    args.image.height = (uint32_t)output->dims()[2];
-    args.image.width = (uint32_t)output->dims()[3];
-    args.image.channels = (uint32_t)output->dims()[1] ;
-    args.output.address = param->output_->mutable_data<half>();
-    args.output.scale_address = param->output_->scale;
-    param->output_arg = args;*/
-
   return true;
 }
-/*
-  template <typename Dtype>
-  void PSROIPoolingForward(
-    const Dtype* bottom_data,
-    const int height, const int width, const int input_channel,
-    Dtype* top_data,
-    const int pooled_height, const int pooled_width, const int output_channel,
-    const Dtype* bottom_rois,
-    const Dtype Bin_size_h, const Dtype Bin_size_w, const Dtype roi_start_h,
-    const Dtype roi_start_w, const int pw, const int ph, const int roi_batch_ind)
-  {
-
-    int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
-    int wstart = floor(static_cast<Dtype>(pw)* Bin_size_w + roi_start_w);
-    int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
-    int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
-
-    hstart = std::min(std::max(hstart, 0), height);
-    hend = std::min(std::max(hend, 0), height);
-    wstart = std::min(std::max(wstart, 0), width);
-    wend = std::min(std::max(wend, 0), width);
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    float32x4_t sum_pixels_low_c= vdupq_n_f32(0);
-    float32x4_t sum_pixels_high_c= vdupq_n_f32(0);
-
-    if(!is_empty){
-      Dtype bin_area = (hend - hstart) * (wend - wstart);
-      float rev_bin_area = 1 / bin_area;
-      float32x4_t q_bin_area = vdupq_n_f32(rev_bin_area);
-      //static_cast<float>(bin_area)
-      float pixels_c[output_channel];
-
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          int pixel_offset = (h * width + w) * input_channel;
-          for(int output_c = 0; output_c < output_channel; output_c++){
-            int input_channel_offset = output_c * pooled_height *
-  pooled_width;
-            int input_bias = pixel_offset + input_channel_offset + ph *
-  pooled_width + pw;
-            pixels_c[output_c] = bottom_data[input_bias];
-          }
-          float32x4_t pixel_low_c = vld1q_f32(pixels_c);
-          float32x4_t pixel_high_c = vld1q_f32(pixels_c + 4);
-          sum_pixels_low_c = vaddq_f32(sum_pixels_low_c, pixel_low_c);
-          sum_pixels_high_c = vaddq_f32(sum_pixels_high_c, pixel_high_c);
-        }
-      }
-      sum_pixels_low_c = vmulq_f32(sum_pixels_low_c, q_bin_area);
-      sum_pixels_high_c = vmulq_f32(sum_pixels_high_c, q_bin_area);
-    }
-
-    int output_index_base = (ph * pooled_width + pw) * output_channel;
-    top_data += output_index_base;
-    vst1q_f32(top_data, sum_pixels_low_c);
-    top_data += 4;
-    vst1q_f32(top_data, sum_pixels_high_c);
-  }*/
 
 template <typename Dtype>
 void PSROIPoolingForward(const Dtype* bottom_data, const int height,
                          const int width, const int input_channel,
@@ -182,14 +97,18 @@ void PSROIPoolingForward(const Dtype* bottom_data, const int height,
 
 template <>
 void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
-  auto input_tensor = param.float_input.get();
-  fpga::PerformBypass(param.input_arg);
-  fpga::fpga_invalidate(input_tensor->data<float>(),
-                        input_tensor->numel() * sizeof(float));
+  auto input_tensor = param.input_x_;
+  auto input_data = input_tensor->data<int8_t>();
+  auto Si = input_tensor->scale[0];
+  auto float_input_tensor = param.float_input.get();
+  auto float_input_data = float_input_tensor->data<float>();
+  for (int i = 0; i < float_input_tensor->numel(); i++) {
+    float_input_data[i] = input_data[i] * Si;
+  }
 
-  auto* in = input_tensor;
+  auto* in = float_input_tensor;
   auto* rois = param.input_rois_;
-  auto* out = param.output_;  // param.float_output.get();
+  auto* out = param.output_;
 
   auto pooled_height = param.pooled_height_;
   auto pooled_width = param.pooled_width_;
@@ -205,14 +124,13 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
 
   auto data_nhwc = in->mutable_data<float>();
 
-  // fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
   framework::DDim dims_out_new = framework::make_ddim(
       {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
       (param.output_)->dims()[3]});
   (param.output_)->Resize(dims_out_new);
 
-  const float* input_data = data_nhwc;  // in->data<float>();
+  const float* input_data_tmp = data_nhwc;  // in->data<float>();
   framework::Tensor rois_batch_id_list;
   rois_batch_id_list.Resize({rois_num});
   auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
@@ -268,11 +186,11 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
 
     for (int ph = 0; ph < pooled_height; ph++) {
       for (int pw = 0; pw < pooled_width; pw++) {
-        PSROIPoolingForward(input_data, height, width, input_channels,
-                            offset_output_data, pooled_height,
-                            pooled_width, output_channels, input_rois,
-                            bin_size_h, bin_size_w, roi_start_h,
-                            roi_start_w, pw, ph, roi_batch_ind);
+        PSROIPoolingForward(
+            input_data_tmp, height, width, input_channels, offset_output_data,
+            pooled_height, pooled_width, output_channels, input_rois,
+            bin_size_h, bin_size_w, roi_start_h, roi_start_w, pw, ph,
+            roi_batch_ind);
       }
     }
   }
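Note on the PSROI pooling above: the bin geometry is the part both the kept PSROIPoolingForward and the deleted NEON draft share. Output cell (ph, pw) averages one ROI bin, clamped to the image, reading channel output_c from input channel output_c * pooled_height * pooled_width + ph * pooled_width + pw. A sketch of just that index math, with names taken from the deleted draft:

    #include <algorithm>
    #include <cmath>

    struct Bin {
      int hstart, hend, wstart, wend;
      bool is_empty;
    };

    // Compute the input-pixel window for output cell (ph, pw) of one ROI.
    Bin roi_bin(float roi_start_h, float roi_start_w, float bin_size_h,
                float bin_size_w, int ph, int pw, int height, int width) {
      Bin b;
      b.hstart = static_cast<int>(std::floor(ph * bin_size_h + roi_start_h));
      b.wstart = static_cast<int>(std::floor(pw * bin_size_w + roi_start_w));
      b.hend = static_cast<int>(std::ceil((ph + 1) * bin_size_h + roi_start_h));
      b.wend = static_cast<int>(std::ceil((pw + 1) * bin_size_w + roi_start_w));
      b.hstart = std::min(std::max(b.hstart, 0), height);  // clamp to image
      b.hend = std::min(std::max(b.hend, 0), height);
      b.wstart = std::min(std::max(b.wstart, 0), width);
      b.wend = std::min(std::max(b.wend, 0), width);
      b.is_empty = (b.hend <= b.hstart) || (b.wend <= b.wstart);
      return b;
    }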
diff --git a/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp b/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp
index e61f00a09a333249a72c8dddafbf9d36ffe9c8e7..2171432cba5700844ccd58fbb32ffcf23d3c132d 100644
--- a/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp
@@ -25,18 +25,18 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
       paddle_mobile::fpga::SIGMOID;
   int16_t leaky_relu_negative_slope = 0;
   auto input = const_cast<LoDTensor *>(param->InputX());
-  auto input_ptr = input->data<half>();
+  auto input_ptr = input->data<int8_t>();
   auto out = param->Out();
-  fpga::format_fp16_ofm(out);
+  fpga::format_ofm(out);
 
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP16;
+  fpga::BypassArgs args = {fpga::DATA_TYPE_INT8};
+  args.input_data_type = fpga::DATA_TYPE_INT8;
+  args.output_data_type = fpga::DATA_TYPE_INT8;
   args.image.address = input_ptr;
   args.image.height = 1;
   args.image.width = 1;
   args.image.channels = input->fpga_data_num;
-  args.output.address = out->data<half>();
+  args.output.address = out->data<int8_t>();
   args.output.scale_address = out->scale;
   args.output.activation.activation_type = activation_enable;
   args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
index ba86787c646c3fc67992c76f5ce34efdcb5bbe4a..91346fe8b3e2e245e1d3d8fb818353f432bb84c8 100755
--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
@@ -24,13 +24,13 @@ template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto input = const_cast<LoDTensor *>(param->InputX());
   auto dims = framework::vectorize(input->dims());
-  half *input_ptr;
+  int8_t *input_ptr;
   auto out = param->Out();
   if (input->type() == type_id<float>()) {
     out->Resize(framework::make_ddim(dims));
     out->mutable_data<float>(framework::make_ddim(dims));
   } else {
-    input_ptr = input->data<half>();
+    input_ptr = input->data<int8_t>();
   }
 
   auto float_input = new LoDTensor;
@@ -52,8 +52,6 @@
     out->mutable_data<float>(framework::make_ddim(dims));
     float_input->init(type_id<float>().hash_code());
     float_input->mutable_data<float>(framework::make_ddim(dims));
-    // fpga::format_fp32_ofm(float_input);
-    // fpga::format_fp32_ofm(out);
 
     fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
     args.input_layout_type = fpga::LAYOUT_HWC;
@@ -69,7 +67,7 @@
     param->SetFloatInput(float_input);
     param->SetFpgaArgs(args);
   } else {  // Use FPGA
-    fpga::format_fp16_ofm(out);
+    fpga::format_ofm(out);
     fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
     args.input_layout_type = fpga::LAYOUT_HWC;
     args.output_layout_type = fpga::LAYOUT_CHW;
@@ -91,7 +89,7 @@ template <>
 void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   auto *in_x = (param.InputX());
-  if (in_x->type() == type_id<half>()) {
+  if (in_x->type() == type_id<int8_t>()) {
     fpga::PerformBypass(param.FpgaArgs());
     if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
       Tensor *out = param.Out();
diff --git a/src/operators/kernel/fpga/V2/split_kernel.cpp b/src/operators/kernel/fpga/V2/split_kernel.cpp
index ccfe918963077150ff1ad3496b29040357bf30c7..af3fe9df00e8d8de5485793c9f4f1b887726f0fb 100644
--- a/src/operators/kernel/fpga/V2/split_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/split_kernel.cpp
@@ -36,7 +36,7 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
       fpga::fpga_malloc(image_num * sizeof(uint32_t)));
   DLOG << "input: " << in;
   for (int i = 0; i < image_num; i++) {
-    fpga::format_fp16_ofm(outs[i]);
+    fpga::format_ofm(outs[i]);
     DLOG << "output: " << outs[i];
     images_out[i] = outs[i]->mutable_data<int8_t>();
     scales_out[i] = outs[i]->scale;
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index c10f86f9870638ac4f1541fdf1a11c569e4e08a0..bfc9973f23c542264387c929cc54ebf16bc0ff3a 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1270,7 +1270,9 @@ class FetchParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 
  public:
+#ifdef PADDLE_MOBILE_FPGA_V1
  fpga::BypassArgs fpga_bypass_args;
+#endif
  Tensor aligned_out;
#endif
};
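Note on the convention the whole patch standardizes on: an int8 FPGA tensor carries a per-tensor scale in scale[0], and fetch, proposal, and psroi_pool all dequantize with a plain multiply instead of the old fp16 bypass. The quantize direction below (symmetric int8 with Si = max_abs / 127) is an assumption for illustration; the patch itself only ever consumes scale[0]:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Dequantize as the kernels above do: value_fp32 = q * Si.
    inline float dequantize(int8_t q, float Si) { return q * Si; }

    // Assumed inverse for illustration: clamp to the symmetric int8 range.
    inline int8_t quantize(float v, float Si) {
      float q = std::round(v / Si);
      return static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, q)));
    }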