diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index ffe5f18f5e64ac8ce43177c61c272c25b6923fda..c8746bc1f7d405098ba84724ba253aae5b7522f1 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "fpga/V1/api.h"
+#include
 #include "fpga/V1/bias_scale.h"
 #include "fpga/V1/deconv_filter.h"
 #include "fpga/V1/filter.h"
@@ -368,7 +369,8 @@ void expand_conv_arg(ConvArgs *arg) {
   auto filter_pad_width_mul_channel =
       args.image.pad_width * args.image.channels;
   auto image_amount_per_row_multi_win_first =
-      image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h - args.image.pad_height);
+      image_amount_per_row *
+      (ROW_PARALLEL_NUM * args.kernel.stride_h - args.image.pad_height);
   auto image_amount_per_row_multi_win =
       image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h);
 
diff --git a/src/fpga/common/driver.cpp b/src/fpga/common/driver.cpp
index 89a22ba955b011527799aeb4bef5a2cd303c7e3b..0774cab71e99ce28987e922e22d46ab9a63b1a93 100644
--- a/src/fpga/common/driver.cpp
+++ b/src/fpga/common/driver.cpp
@@ -26,6 +26,7 @@ limitations under the License. */
 #include
 #include
 #include
+#include
 #include "common/enforce.h"
 #include "fpga/common/driver.h"
@@ -147,8 +148,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
   }
 }
 
-
-
 void memory_release(struct fpga_memory *memory) {
   void *ptr = nullptr;
 
@@ -160,8 +159,6 @@ void memory_release(struct fpga_memory *memory) {
   }
 }
 
-
-
 uint64_t vaddr_to_paddr_driver(void *address) {
   uint64_t paddr = 0;
   auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address);
@@ -209,14 +206,14 @@ void *fpga_malloc_driver(size_t size) {
   struct MemoryVM2PHYArgs args;
   struct MemoryCacheArgs args_c;
 
-  //  memory_request(g_fpgainfo.memory_info, size, &phy_addr);
+  // memory_request(g_fpgainfo.memory_info, size, &phy_addr);
 
   ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                g_fpgainfo.fd_mem, FPGA_MEM_PHY_ADDR);
   PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
 
-  args.pVM= (void *)ret;
-  args.pPHY =(void *)0;
+  args.pVM = reinterpret_cast<void *>(ret);
+  args.pPHY = reinterpret_cast<void *>(0);
   do_ioctl(IOCTL_MEMORY_VM2PHY, &args);
   phy_addr = (uint64_t)args.pPHY;
 
@@ -237,9 +234,8 @@ void fpga_free_driver(void *ptr) {
     g_fpgainfo.fpga_addr2size_map.erase(iter);
     munmap(ptr, size);
 
-    p_addr = vaddr_to_paddr_driver(ptr);
-    pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
-
+    // p_addr = vaddr_to_paddr_driver(ptr);
+    // pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
 
     auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr);
     if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
@@ -299,7 +295,7 @@ int open_device_driver() {
   g_fpgainfo.FpgaRegVirAddr =
       (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE);  // NOLINT
 
-  //fpga_memory_add();
+  // fpga_memory_add();
 
   pl_init();
 
@@ -310,7 +306,7 @@ int close_device_driver() {
   pl_destroy();
   fpga_reg_free(g_fpgainfo.FpgaRegVirAddr);
   memory_release(g_fpgainfo.memory_info);
-
+
   return 0;
 }
 
diff --git a/src/fpga/common/driver.h b/src/fpga/common/driver.h
index 89f419accaae70a15a9af68ad097f214f828175b..87c68cbb5a1abe935b97ed9783785be65030ffff 100644
--- a/src/fpga/common/driver.h
+++ b/src/fpga/common/driver.h
@@ -53,15 +53,14 @@ struct MemoryCacheArgs {
 };
 
 struct MemoryVM2PHYArgs {
-  void* pVM;
-  void* pPHY;
+  void *pVM;
+  void *pPHY;
 };
 
 #define IOCTL_FPGA_MAGIC 'F'
 #define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
 #define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
-#define IOCTL_MEMORY_VM2PHY _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs)
-
+#define IOCTL_MEMORY_VM2PHY _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs)
 
 struct fpga_pe {
   char type_name[MAX_TYPE_NAME_LENTH + 1];
 
diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h
index cd9a29e34d730dbe4aadd44e9aa370eab5952691..24cbff3878aad14f564ed3e5c8b20fe6b90e474b 100644
--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -25,7 +25,7 @@ limitations under the License. */
 #define FILTER_ELEMENT_ALIGNMENT (16)  // Filter element number aligned to 16
 #define BS_NUM_ALIGNMENT (8)
 #define BIAS_NUM_ALIGNMENT (16)
-#define ROW_PARALLEL_NUM (3)
+#define ROW_PARALLEL_NUM (3)
 #endif
 
 namespace paddle_mobile {
diff --git a/src/operators/detection_ops.cpp b/src/operators/detection_ops.cpp
index f198711de27ade4049e40f01b05b6968c607243e..b87d1d3e80fd7945dd0cf4571041c18378e6ac1a 100644
--- a/src/operators/detection_ops.cpp
+++ b/src/operators/detection_ops.cpp
@@ -74,15 +74,14 @@ void RoiAlignPoolOp<Dtype, T>::InferShape() const {
   auto out_dims = this->param_.input_x_->dims();
   out_dims[0] = rois_dims[0];
-  //  out_dims[1] =
-  //  output_channels;  // input_dims[1] / (pooled_height * pooled_width);
+  // out_dims[1] =
+  //     output_channels;  // input_dims[1] / (pooled_height * pooled_width);
   out_dims[2] = pooled_height;
   out_dims[3] = pooled_width;
   this->param_.output_->Resize(out_dims);
 }
 #endif
 
-
 #ifdef ROI_PERSPECTIVE_OP
 template <typename Dtype, typename T>
 void RoiPerspectiveOp<Dtype, T>::InferShape() const {
diff --git a/src/operators/detection_ops.h b/src/operators/detection_ops.h
index 5b90ac3ee19a2523e368c6586ca04d7823f24131..3b3a54dc4ba2e99eabe2250de63f38c7c7744d47 100644
--- a/src/operators/detection_ops.h
+++ b/src/operators/detection_ops.h
@@ -38,7 +38,6 @@ DECLARE_OPERATOR(PSRoiPool, PSRoiPoolParam, PSRoiPoolKernel);
 DECLARE_OPERATOR(RoiAlignPool, RoiAlignPoolParam, RoiAlignPoolKernel);
 #endif
 
-
 #ifdef ROI_PERSPECTIVE_OP
 DECLARE_OPERATOR(RoiPerspective, RoiPerspectiveParam, RoiPerspectiveKernel);
 #endif
diff --git a/src/operators/kernel/detection_kernel.h b/src/operators/kernel/detection_kernel.h
index 93ed78b10eab71ed162feb50dcfa6e1b2af3e871..77c35b0253d06f2bc979861e53daeba815b46647 100644
--- a/src/operators/kernel/detection_kernel.h
+++ b/src/operators/kernel/detection_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include "framework/operator.h"
 #include "operators/op_param.h"
@@ -157,18 +158,20 @@ DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
 
 template <typename Dtype>
 class RoiAlignPoolParam : public OpParam {
  public:
-  RoiAlignPoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-                    const AttributeMap &attrs, const Scope *scope)
-      : OpParam(inputs, outputs, attrs, scope) {
+  RoiAlignPoolParam(const VariableNameMap &inputs,
+                    const VariableNameMap &outputs, const AttributeMap &attrs,
+                    Scope *scope)
+      : OpParam(inputs, outputs, attrs, scope) {
     input_x_ = OpParam::GetVarValue<framework::LoDTensor>("X", inputs, *scope);
     input_rois_ =
        OpParam::GetVarValue<framework::LoDTensor>("ROIs", inputs, *scope);
-    output_ = OpParam::GetVarValue<framework::LoDTensor>("Out", outputs, *scope);
+    output_ =
+        OpParam::GetVarValue<framework::LoDTensor>("Out", outputs, *scope);
     pooled_height_ = OpParam::GetAttr<int>("pooled_height", attrs);
     pooled_width_ = OpParam::GetAttr<int>("pooled_width", attrs);
     spatial_scale_ = OpParam::GetAttr<float>("spatial_scale", attrs);
-    sampling_ratio_ = OpParam::GetAttr<int>("sampling_ratio", attrs);
+    sampling_ratio_ = OpParam::GetAttr<int>("sampling_ratio", attrs);
   }
 
  public:
@@ -180,10 +183,9 @@ class RoiAlignPoolParam : public OpParam {
   float spatial_scale_;
   int sampling_ratio_;
 #ifdef PADDLE_MOBILE_FPGA
-  std::shared_ptr<Tensor> float_input, float_output;
-  fpga::BypassArgs input_arg, output_arg;
+  std::shared_ptr<Tensor> float_input, float_output;
+  fpga::BypassArgs input_arg, output_arg;
 #endif
-
 };
 
 DECLARE_KERNEL(RoiAlignPool, RoiAlignPoolParam);
diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
index c876a67e6ad762e169170e6942e6c95d4d97449c..d32375f1c66b8db5c3ae933ec5a1b00cdb508d5f 100644
--- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
@@ -56,7 +56,7 @@ void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
 }
 template <>
 void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
-  auto input = param.InputX();
+  auto input = const_cast<LoDTensor *>(param.InputX());
   if (input->type() == typeid(float)) {
     auto output = param.Out();
     output->ShareDataWith(*input);
@@ -73,15 +73,14 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
       reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
   fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
                         param.Out()->fpga_data_num * sizeof(float));
-
-  if(param.Out()->fpga_data_num != product(input->dims())){
-    float *data_tmp =
-        reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
-    dealign(outdata_ptr, data_tmp, outC, outH, outW);
-    memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float));
-    free(data_tmp);
+
+  if (param.Out()->fpga_data_num != product(input->dims())) {
+    float *data_tmp =
+        reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
+    dealign(outdata_ptr, data_tmp, outC, outH, outW);
+    memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float));
+    free(data_tmp);
   }
-
 }
 template class FetchKernel<FPGA, float>;
diff --git a/src/operators/kernel/fpga/V1/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp
index 72062193ede4054503e8c450be06ccb29475dd24..0bba15be7757ed3170402a47780e40cb94b9cfa0 100644
--- a/src/operators/kernel/fpga/V1/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp
@@ -74,10 +74,11 @@ void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
   auto *output = param.Output();
   auto in = input->data<float>();
   auto N = input->dims()[0];
-  output->Resize({N, output->dims()[1], output->dims()[2], output->dims()[3]});
+  output->Resize(
+      {N, output->dims()[1], output->dims()[2], output->dims()[3]});
   auto len = output->numel();
   auto out = output->mutable_data<float>();
-  int C = input->dims()[1], H = input->dims()[2],//N = input->dims()[0],
+  int C = input->dims()[1], H = input->dims()[2],  // N = input->dims()[0],
       W = input->dims()[3];
   int HW = H * W, CHW = C * H * W, WC = W * C;
 
diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp
index fe91612c760b2f43f1ad66ba5a046e16c8462db9..0489d86da5335b9abbc487f115875307b5d95990 100644
--- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp
@@ -65,13 +65,12 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
   args.output.scale_address = param->float_score->scale;
   param->score_arg = args;
 
-  param->score_index_= std::make_shared<Tensor>();
+  param->score_index_ = std::make_shared<Tensor>();
   param->score_index_->mutable_data<int32_t>({input->numel()});
   auto score_index = param->score_index_->data<int32_t>();
-  for (int i = 0; i < input->numel(); ++i){
-    score_index[i] = i;
+  for (int i = 0; i < input->numel(); ++i) {
+    score_index[i] = i;
   }
-
   return true;
 }
 
@@ -342,9 +341,8 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
     const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances,
     const Tensor &bbox_deltas_slice,  // [M, 4]
     const Tensor &scores_slice,       // [N, 1]
-    const Tensor &score_index,
-    int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
-    float eta) {
+    const Tensor &score_index, int pre_nms_top_n, int post_nms_top_n,
+    float nms_thresh, float min_size, float eta) {
   auto *scores_data = scores_slice.data<float>();
 
   // Sort index
@@ -354,8 +352,9 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
   /*for (int i = 0; i < scores_slice.numel(); ++i) {
     index[i] = i;
   }*/
-  std::memcpy(index,score_index.data<int32_t>(),scores_slice.numel()*sizeof(int) );
-
+  std::memcpy(index, score_index.data<int32_t>(),
+              scores_slice.numel() * sizeof(int));
+
   auto compare = [scores_data](const int64_t &i, const int64_t &j) {
     return scores_data[i] > scores_data[j];
   };
@@ -504,7 +503,7 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
   auto score_index = *(param.score_index_.get());
 
   int pre_nms_top_n = param.pre_nms_topn_;
-  int post_nms_top_n = 100;//param.post_nms_topn_;
+  int post_nms_top_n = 100;  // param.post_nms_topn_;
   float nms_thresh = param.nms_thresh_;
   float min_size = param.min_size_;
   float eta = param.eta_;
@@ -541,8 +540,8 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
     scores_slice.Resize({h_score * w_score * c_score, 1});
 
     std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage(
-        im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,score_index,
-        pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
+        im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
+        score_index, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
     Tensor &proposals = tensor_pair.first;
     Tensor &scores = tensor_pair.second;
 
diff --git a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
index 2eeedcf9a773b4572d76f4e8ec3efc1952b45da4..170d245c0212c06b8a25243a79c4f1bd25d314c4 100644
--- a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
@@ -1,124 +1,119 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PSROI_POOL_OP
-
-#include <cmath>
-#include <vector>
-#include "operators/kernel/detection_kernel.h"
-
-#include "fpga/V1/api.h"
-#include "fpga/V1/image.h"
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
-  auto dims = param->input_x_->dims();
-  PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
-                        "data not aligned");
-
-  param->float_input = std::make_shared<Tensor>();
-  param->float_input->mutable_data<float>(param->input_x_->dims());
-  // param->float_output = std::make_shared();
-
-  auto input = param->input_x_;
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-  args.input_layout_type = fpga::LAYOUT_HWC;
-  args.output_layout_type = fpga::LAYOUT_HWC;
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP32;
-  args.image.address = input->data<half>();
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = param->float_input->mutable_data<float>();
-  args.output.scale_address = param->float_input->scale;
-  param->input_arg = args;
-
-  auto* rois = param->input_rois_;
-  int rois_num = rois->dims()[0];
-  framework::DDim dims_out_new = framework::make_ddim(
-      {rois_num, param->output_->dims()[1], param->output_->dims()[2],
-       param->output_->dims()[3]});
-  param->output_->Resize(dims_out_new);
-  // fpga::format_fp16_ofm(param->output_);
-
-  param->output_->mutable_data<float>(dims_out_new);
-  // auto output = param->float_output.get();
-  // param->output_ = output;
-  /* args.input_data_type = fpga::DATA_TYPE_FP32;
-  args.output_data_type = fpga::DATA_TYPE_FP16;
-  args.image.address = output->data();
-  args.image.height = (uint32_t)output->dims()[2];
-  args.image.width = (uint32_t)output->dims()[3];
-  args.image.channels = (uint32_t)output->dims()[1] ;
-  args.output.address = param->output_->mutable_data();
-  args.output.scale_address = param->output_->scale;
-  param->output_arg = args;*/
-
-  return true;
-}
-
-template <typename Dtype>
-void PSROIPooling(
-const Dtype* bottom_data, const int channels,
-const int height, const int width,
-const int pooled_height, const int pooled_width,
-const Dtype* bottom_rois, const int output_dim,
-const int group_size, Dtype* top_data,
-int index, int nid,
-const Dtype Bin_size_h,
-const Dtype Bin_size_w,
-const Dtype roi_start_h,
-const Dtype roi_start_w,
-const int ctop, const int ph, const int roi_batch_ind)
-{
-  int pw = index;
-  int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
-  int wstart = floor(static_cast<Dtype>(pw)* Bin_size_w + roi_start_w);
-  int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
-  int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
-
-  // Add roi offsets and clip to input boundaries
-  hstart = std::min(std::max(hstart, 0), height);
-  hend = std::min(std::max(hend, 0), height);
-  wstart = std::min(std::max(wstart, 0), width);
-  wend = std::min(std::max(wend, 0), width);
-  bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-  int c = (ctop*group_size + ph)*group_size + pw;
-
-  Dtype bin_area = (hend - hstart)*(wend - wstart);
-  bottom_data += (roi_batch_ind * channels + c) * height * width;
-  Dtype out_sum = 0;
-  for (int h = hstart; h < hend; ++h) {
-    for (int w = wstart; w < wend; ++w) {
-      int bottom_index = h * width + w;
-      out_sum += bottom_data[bottom_index];
-    }
-  }
-
-  top_data[nid + index] = is_empty? 0. : out_sum/bin_area;
-
-}
-
-void convert_to_chw(float **data_in, int channel, int height, int width,
-                    int num) {
-  float* data_in_tmp = *data_in;
-  float *data_tmp =
-      (float *)fpga::fpga_malloc(channel * height * width * sizeof(float));  // NOLINT
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PSROI_POOL_OP
+
+#include <cmath>
+#include <memory>
+#include <vector>
+#include "operators/kernel/detection_kernel.h"
+
+#include "fpga/V1/api.h"
+#include "fpga/V1/image.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
+  auto dims = param->input_x_->dims();
+  PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
+                        "data not aligned");
+
+  param->float_input = std::make_shared<Tensor>();
+  param->float_input->mutable_data<float>(param->input_x_->dims());
+  // param->float_output = std::make_shared();
+
+  auto input = param->input_x_;
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = input->data<half>();
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.output.address = param->float_input->mutable_data<float>();
+  args.output.scale_address = param->float_input->scale;
+  param->input_arg = args;
+
+  auto* rois = param->input_rois_;
+  int rois_num = rois->dims()[0];
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, param->output_->dims()[1], param->output_->dims()[2],
+       param->output_->dims()[3]});
+  param->output_->Resize(dims_out_new);
+  // fpga::format_fp16_ofm(param->output_);
+
+  param->output_->mutable_data<float>(dims_out_new);
+  // auto output = param->float_output.get();
+  // param->output_ = output;
+  /* args.input_data_type = fpga::DATA_TYPE_FP32;
+  args.output_data_type = fpga::DATA_TYPE_FP16;
+  args.image.address = output->data();
+  args.image.height = (uint32_t)output->dims()[2];
+  args.image.width = (uint32_t)output->dims()[3];
+  args.image.channels = (uint32_t)output->dims()[1] ;
+  args.output.address = param->output_->mutable_data();
+  args.output.scale_address = param->output_->scale;
+  param->output_arg = args;*/
+
+  return true;
+}
+
+template <typename Dtype>
+void PSROIPooling(const Dtype* bottom_data, const int channels,
+                  const int height, const int width, const int pooled_height,
+                  const int pooled_width, const Dtype* bottom_rois,
+                  const int output_dim, const int group_size, Dtype* top_data,
+                  int index, int nid, const Dtype Bin_size_h,
+                  const Dtype Bin_size_w, const Dtype roi_start_h,
+                  const Dtype roi_start_w, const int ctop, const int ph,
+                  const int roi_batch_ind) {
+  int pw = index;
+  int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
+  int wstart = floor(static_cast<Dtype>(pw) * Bin_size_w + roi_start_w);
+  int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
+  int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
+
+  // Add roi offsets and clip to input boundaries
+  hstart = std::min(std::max(hstart, 0), height);
+  hend = std::min(std::max(hend, 0), height);
+  wstart = std::min(std::max(wstart, 0), width);
+  wend = std::min(std::max(wend, 0), width);
+  bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+  int c = (ctop * group_size + ph) * group_size + pw;
+
+  Dtype bin_area = (hend - hstart) * (wend - wstart);
+  bottom_data += (roi_batch_ind * channels + c) * height * width;
+  Dtype out_sum = 0;
+  for (int h = hstart; h < hend; ++h) {
+    for (int w = wstart; w < wend; ++w) {
+      int bottom_index = h * width + w;
+      out_sum += bottom_data[bottom_index];
+    }
+  }
+
+  top_data[nid + index] = is_empty ? 0. : out_sum / bin_area;
+}
+
+void convert_to_chw(float** data_in, int channel, int height, int width,
+                    int num) {
+  float* data_in_tmp = *data_in;
+  float* data_tmp = reinterpret_cast<float*>(
+      fpga::fpga_malloc(channel * height * width * sizeof(float)));  // NOLINT
   int64_t amount_per_side = width * height;
   for (int n = 0; n < num; n++) {
     for (int h = 0; h < height; h++) {
@@ -130,15 +125,15 @@ void convert_to_chw(float **data_in, int channel, int height, int width,
       }
     }
   }
-  *data_in = data_tmp;
-  fpga::fpga_free(data_in_tmp);
-}
-
-void convert_to_hwc(float **data_in, int channel, int height, int width,
-                    int num) {
-  float* data_in_tmp = *data_in;
-  float *data_tmp = reinterpret_cast<float *>(
-      fpga::fpga_malloc(num * channel * height * width * sizeof(float)));
+  *data_in = data_tmp;
+  fpga::fpga_free(data_in_tmp);
+}
+
+void convert_to_hwc(float** data_in, int channel, int height, int width,
+                    int num) {
+  float* data_in_tmp = *data_in;
+  float* data_tmp = reinterpret_cast<float*>(
+      fpga::fpga_malloc(num * channel * height * width * sizeof(float)));
   int64_t amount_per_row = width * channel;
   for (int n = 0; n < num; n++) {
     for (int c = 0; c < channel; c++) {
@@ -151,110 +146,116 @@ void convert_to_hwc(float **data_in, int channel, int height, int width,
       }
     }
   }
-  *data_in = data_tmp;
-  fpga::fpga_free(data_in_tmp);
-}
-
-
-template <>
-void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
-  auto input_tensor = param.float_input.get();
-  fpga::PerformBypass(param.input_arg);
-  fpga::fpga_invalidate(input_tensor->data<float>(),
-                        input_tensor->numel() * sizeof(float));
-
-  auto* in = input_tensor;
-  auto* rois = param.input_rois_;
-  auto* out = param.output_;  // param.float_output.get();
-
-  auto pooled_height = param.pooled_height_;
-  auto pooled_width = param.pooled_width_;
-  auto spatial_scale = param.spatial_scale_;
-  auto output_channels = param.output_channels_;
-
-  auto in_dims = in->dims();
-  int batch_size = in_dims[0];
-  int input_channels = in_dims[1];
-  int height = in_dims[2];
-  int width = in_dims[3];
-  int rois_num = rois->dims()[0];
-
-  auto data_nhwc = in->mutable_data<float>();
-  convert_to_chw(&data_nhwc, input_channels, height, width, 1);
-  framework::DDim dims_out_new = framework::make_ddim(
-      {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
-       (param.output_)->dims()[3]});
-  (param.output_)->Resize(dims_out_new);
-
-  const float* input_data = data_nhwc;  // in->data();
-  framework::Tensor rois_batch_id_list;
-  rois_batch_id_list.Resize({rois_num});
-  auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
-
-  PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");
-
-  auto rois_lod = rois->lod().back();
-  int rois_batch_size = rois_lod.size() - 1;
-  PADDLE_MOBILE_ENFORCE(
-      rois_batch_size == batch_size,
-      "the rois_batch_size and input(X) batch_size should be the same.");
-  int rois_num_with_lod = rois_lod[rois_batch_size];
-  PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num,
-                        "the rois_num from input and lod must be the same");
-
-  PADDLE_MOBILE_ENFORCE(
-      input_channels == output_channels * pooled_height * pooled_width,
-      "the channels of input X should equal the product of "
-      "output_channels x pooled_height x pooled_width");
-
-  // calculate batch id index for each roi according to LoD
-  //for (int n = 0; n < rois_batch_size; ++n) {
-    //for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-      //rois_batch_id_data[i] = n;
-   // }
-  //}
-  auto output_data = out->mutable_data<float>();
-  auto input_rois = rois->data<float>();
-
-  // calculate psroipooling, parallel processing can be implemented per ROI
-  for (int n = 0; n < rois_num; ++n) {
-    // [start, end) interval for spatial sampling
-    auto offset_input_rois = input_rois + n * 4;
-    auto roi_start_w = static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
-    auto roi_start_h = static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
-    auto roi_end_w = static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
-    auto roi_end_h = static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
-
-    // Force too small rois to be 1 x 1
-    auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f);  // avoid 0
-    auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
-
-    // Compute bin size w and h at input feature map
-    auto bin_size_h = roi_height / static_cast<float>(pooled_height);
-    auto bin_size_w = roi_width / static_cast<float>(pooled_width);
-
-    int roi_batch_ind = 0;//rois_batch_id_data[n];
-    //std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl;
-    for(int c = 0; c < output_channels; ++c){
-
-      for(int ph = 0; ph < pooled_height; ph++){
-        int index = pooled_width;
-        int nid = n * output_channels * pooled_height * pooled_width + c * pooled_width * pooled_height + ph * pooled_width;
-        for(int idx = 0; idx < index; idx++){
-          PSROIPooling(input_data,input_channels,height,width,pooled_height,pooled_width,
-              input_rois,output_channels,pooled_height,output_data, idx, nid, bin_size_h, bin_size_w, roi_start_h, roi_start_w, c, ph, roi_batch_ind);
-        }
-      }
-    }
-  }
-
-  convert_to_hwc(&output_data, output_channels, pooled_height,
-                 pooled_width, rois_num);
-  out->reset_data_ptr(output_data);
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif  // PSROI_POOL_OP
-
+  *data_in = data_tmp;
+  fpga::fpga_free(data_in_tmp);
+}
+
+template <>
+void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
+  auto input_tensor = param.float_input.get();
+  fpga::PerformBypass(param.input_arg);
+  fpga::fpga_invalidate(input_tensor->data<float>(),
+                        input_tensor->numel() * sizeof(float));
+
+  auto* in = input_tensor;
+  auto* rois = param.input_rois_;
+  auto* out = param.output_;  // param.float_output.get();
+
+  auto pooled_height = param.pooled_height_;
+  auto pooled_width = param.pooled_width_;
+  auto spatial_scale = param.spatial_scale_;
+  auto output_channels = param.output_channels_;
+
+  auto in_dims = in->dims();
+  int batch_size = in_dims[0];
+  int input_channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  int rois_num = rois->dims()[0];
+
+  auto data_nhwc = in->mutable_data<float>();
+  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width, 1);
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
+       (param.output_)->dims()[3]});
+  (param.output_)->Resize(dims_out_new);
+
+  float* input_data = data_nhwc;  // in->data();
+  // shared_ptr input_data(data_nhwc);
+  framework::Tensor rois_batch_id_list;
+  rois_batch_id_list.Resize({rois_num});
+  auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
+
+  PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");
+
+  auto rois_lod = rois->lod().back();
+  int rois_batch_size = rois_lod.size() - 1;
+  PADDLE_MOBILE_ENFORCE(
+      rois_batch_size == batch_size,
+      "the rois_batch_size and input(X) batch_size should be the same.");
+  int rois_num_with_lod = rois_lod[rois_batch_size];
+  PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num,
+                        "the rois_num from input and lod must be the same");
+
+  PADDLE_MOBILE_ENFORCE(
+      input_channels == output_channels * pooled_height * pooled_width,
+      "the channels of input X should equal the product of "
+      "output_channels x pooled_height x pooled_width");
+
+  // calculate batch id index for each roi according to LoD
+  // for (int n = 0; n < rois_batch_size; ++n) {
+  //  for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+  //    rois_batch_id_data[i] = n;
+  //  }
+  //}
+  auto output_data = out->mutable_data<float>();
+  auto input_rois = rois->data<float>();
+
+  // calculate psroipooling, parallel processing can be implemented per ROI
+  for (int n = 0; n < rois_num; ++n) {
+    // [start, end) interval for spatial sampling
+    auto offset_input_rois = input_rois + n * 4;
+    auto roi_start_w =
+        static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
+    auto roi_start_h =
+        static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
+    auto roi_end_w =
+        static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+    auto roi_end_h =
+        static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+    // Force too small rois to be 1 x 1
+    auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f);  // avoid 0
+    auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
+
+    // Compute bin size w and h at input feature map
+    auto bin_size_h = roi_height / static_cast<float>(pooled_height);
+    auto bin_size_w = roi_width / static_cast<float>(pooled_width);
+
+    int roi_batch_ind = 0;  // rois_batch_id_data[n];
+    // std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl;
+    for (int c = 0; c < output_channels; ++c) {
+      for (int ph = 0; ph < pooled_height; ph++) {
+        int index = pooled_width;
+        int nid = n * output_channels * pooled_height * pooled_width +
+                  c * pooled_width * pooled_height + ph * pooled_width;
+        for (int idx = 0; idx < index; idx++) {
+          PSROIPooling(input_data, input_channels, height, width,
+                       pooled_height, pooled_width, input_rois,
+                       output_channels, pooled_height, output_data, idx,
+                       nid, bin_size_h, bin_size_w, roi_start_h,
+                       roi_start_w, c, ph, roi_batch_ind);
+        }
+      }
+    }
+  }
+  fpga::fpga_free(input_data);
+  fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height,
+                              pooled_width, rois_num);
+  out->reset_data_ptr(output_data);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // PSROI_POOL_OP
diff --git a/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp b/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
index 92a76646c02c53121cfa26861e298330b41f8e95..ec8d19db800742693516e08215ccd3889ec86c37 100644
--- a/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
@@ -24,10 +24,8 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 
-
 template <>
 bool RoiAlignPoolKernel<FPGA, float>::Init(RoiAlignPoolParam<FPGA>* param) {
-
   auto dims = param->input_x_->dims();
   PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
                         "data not aligned");
@@ -58,11 +56,9 @@ bool RoiAlignPoolKernel<FPGA, float>::Init(RoiAlignPoolParam<FPGA>* param) {
 
   param->output_->mutable_data<float>(dims_out_new);
 
-
   return true;
 }
 
-
 template <typename T>
 struct PreCalc {
   int pos1;
@@ -77,30 +73,22 @@ struct PreCalc {
 
 template <typename T>
 void pre_calc_for_bilinear_interpolate(
-    const int height,
-    const int width,
-    const int pooled_height,
-    const int pooled_width,
-    const int iy_upper,
-    const int ix_upper,
-    T roi_start_h,
-    T roi_start_w,
-    T bin_size_h,
-    T bin_size_w,
-    int roi_bin_grid_h,
-    int roi_bin_grid_w,
-    std::vector<PreCalc<T>>& pre_calc) {
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int iy_upper, const int ix_upper,
+    T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
+    int roi_bin_grid_h, int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc) {  // NOLINT
   int pre_calc_index = 0;
   for (int ph = 0; ph < pooled_height; ph++) {
     for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
         const T yy = roi_start_h + ph * bin_size_h +
-            static_cast<T>(iy + .5f) * bin_size_h /
-                static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
+                     static_cast<T>(iy + .5f) * bin_size_h /
+                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
         for (int ix = 0; ix < ix_upper; ix++) {
           const T xx = roi_start_w + pw * bin_size_w +
-              static_cast<T>(ix + .5f) * bin_size_w /
-                  static_cast<T>(roi_bin_grid_w);
+                       static_cast<T>(ix + .5f) * bin_size_w /
+                           static_cast<T>(roi_bin_grid_w);
 
           T x = xx;
           T y = yy;
@@ -128,8 +116,8 @@ void pre_calc_for_bilinear_interpolate(
             x = 0;
           }
 
-          int y_low = (int)y;
-          int x_low = (int)x;
+          int y_low = static_cast<int>(y);
+          int x_low = static_cast<int>(x);
           int y_high;
           int x_high;
@@ -172,22 +160,13 @@ void pre_calc_for_bilinear_interpolate(
 }
 
 template <typename T>
-void ROIAlignForward(
-    const int nthreads,
-    const T* bottom_data,
-    const T& spatial_scale,
-    const int channels,
-    const int height,
-    const int width,
-    const int pooled_height,
-    const int pooled_width,
-    const int sampling_ratio,
-    const T* bottom_rois,
-    T* top_data) {
-
+void ROIAlignForward(const int nthreads, const T* bottom_data,
+                     const T& spatial_scale, const int channels,
+                     const int height, const int width, const int pooled_height,
+                     const int pooled_width, const int sampling_ratio,
+                     const T* bottom_rois, T* top_data) {
   int n_rois = nthreads / channels / pooled_width / pooled_height;
-
   for (int n = 0; n < n_rois; n++) {
     int index_n = n * channels * pooled_width * pooled_height;
@@ -195,8 +174,8 @@ void ROIAlignForward(
     const T* offset_bottom_rois = bottom_rois + n * 4;
     int roi_batch_ind = 0;
     // if (roi_cols == 5) {
-    //   roi_batch_ind = offset_bottom_rois[0];
-    //   offset_bottom_rois++;
+    //  roi_batch_ind = offset_bottom_rois[0];
+    //  offset_bottom_rois++;
     // }
 
     // Do not using rounding; this implementation detail is critical
@@ -217,70 +196,58 @@ void ROIAlignForward(
     // We use roi_bin_grid to sample the grid and mimic integral
     int roi_bin_grid_h = (sampling_ratio > 0)
-            ? sampling_ratio
-            : ceil(roi_height / pooled_height);  // e.g., = 2
+                             ? sampling_ratio
+                             : ceil(roi_height / pooled_height);  // e.g., = 2
     int roi_bin_grid_w =
         (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
 
     // We do average (integral) pooling inside a bin
-    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
+    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
 
     // we want to precalculate indeces and weights shared by all chanels,
     // this is the key point of optimiation
-    std::vector<PreCalc<T>> pre_calc(
-        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
+                                     pooled_width * pooled_height);
     pre_calc_for_bilinear_interpolate(
-        height,
-        width,
-        pooled_height,
-        pooled_width,
-        roi_bin_grid_h,
-        roi_bin_grid_w,
-        roi_start_h,
-        roi_start_w,
-        bin_size_h,
-        bin_size_w,
-        roi_bin_grid_h,
-        roi_bin_grid_w,
-        pre_calc);
-
-
-    for (int c = 0; c < channels; c++) {
-      int index_n_c = index_n + c * pooled_width * pooled_height;
-      const T* offset_bottom_data =
-          bottom_data + (roi_batch_ind * channels + c) * height * width;
-      int pre_calc_index = 0;
-
-      for (int ph = 0; ph < pooled_height; ph++) {
-        for (int pw = 0; pw < pooled_width; pw++) {
-          int index = index_n_c + ph * pooled_width + pw;
-
-          T output_val = 0.;
-          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-              PreCalc<T> pc = pre_calc[pre_calc_index];
-              output_val += pc.w1 * offset_bottom_data[pc.pos1] +
-                  pc.w2 * offset_bottom_data[pc.pos2] +
-                  pc.w3 * offset_bottom_data[pc.pos3] +
-                  pc.w4 * offset_bottom_data[pc.pos4];
-
-              pre_calc_index += 1;
-            }
+        height, width, pooled_height, pooled_width, roi_bin_grid_h,
+        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
+        roi_bin_grid_h, roi_bin_grid_w, pre_calc);
+
+    for (int c = 0; c < channels; c++) {
+      int index_n_c = index_n + c * pooled_width * pooled_height;
+      const T* offset_bottom_data =
+          bottom_data + (roi_batch_ind * channels + c) * height * width;
+      int pre_calc_index = 0;
+
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          int index = index_n_c + ph * pooled_width + pw;
+
+          T output_val = 0.;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              PreCalc<T> pc = pre_calc[pre_calc_index];
+              output_val += pc.w1 * offset_bottom_data[pc.pos1] +
+                            pc.w2 * offset_bottom_data[pc.pos2] +
+                            pc.w3 * offset_bottom_data[pc.pos3] +
+                            pc.w4 * offset_bottom_data[pc.pos4];
+
+              pre_calc_index += 1;
            }
-            output_val /= count;
+          }
+          output_val /= count;
 
-            top_data[index] = output_val;
-          }  // for pw
-        }  // for ph
-      }  // for c
-    }  // for n
+          top_data[index] = output_val;
+        }  // for pw
+      }    // for ph
+    }      // for c
+  }        // for n
 }
 
-
 template <>
-void RoiAlignPoolKernel<FPGA, float>::Compute(const RoiAlignPoolParam<FPGA>& param) {
-
-  auto input_tensor = param.float_input.get();
+void RoiAlignPoolKernel<FPGA, float>::Compute(
+    const RoiAlignPoolParam<FPGA>& param) {
+  auto input_tensor = param.float_input.get();
   fpga::PerformBypass(param.input_arg);
   fpga::fpga_invalidate(input_tensor->data<float>(),
                         input_tensor->numel() * sizeof(float));
@@ -308,23 +275,22 @@ void RoiAlignPoolKernel<FPGA, float>::Compute(const RoiAlignPoolParam<FPGA>& par
       {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
        (param.output_)->dims()[3]});
   (param.output_)->Resize(dims_out_new);
-
+
   const int index = input_channels * pooled_height * pooled_width * rois_num;
   auto rois_data = rois->data<float>();
   auto top_data = param.output_->mutable_data<float>();
-  for (int i = 0; i < index; ++i){
-    ROIAlignForward( index,data_nhwc,spatial_scale,input_channels,height,width,
-        pooled_height,pooled_width,sampe_ratio,rois_data,top_data);
+  for (int i = 0; i < index; ++i) {
+    ROIAlignForward(index, data_nhwc, spatial_scale, input_channels,
+                    height, width, pooled_height, pooled_width,
+                    sampe_ratio, rois_data, top_data);
   }
 
-  fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height,
+  fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height,
                               pooled_width, rois_num);
-  out->reset_data_ptr(top_data);
-
+  out->reset_data_ptr(top_data);
 }
 
 }  // namespace operators
 }  // namespace paddle_mobile
 
 #endif  // ROIALIGN_POOL_OP
-
diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
index 78d920a9602790898c36b3afe3871b95aae10689..116a9594ee45ce862d8d4f58990637a062dfb092 100644
--- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
@@ -105,7 +105,8 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   } else {
     if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
       Tensor *out = param.Out();
-      out->Resize({in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
+      out->Resize(
+          {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
       math::SoftmaxFuntor<CPU, float>()(in_x, out);
     }
   }
diff --git a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp
index a9734f8e4491a2f0b3b75e9e73cf997c5442f485..cc839a971ee7f827f150ecdfff0bd75e2a8aafe2 100644
--- a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp
@@ -44,8 +44,9 @@ void Transpose2Kernel<FPGA, float>::Compute(
     const Transpose2Param<FPGA> &param) {
   // Transpose2Compute(param);
   auto input = param.InputX();
   auto output = param.Out();
-
-  output->Resize({input->dims()[0], output->dims()[1], output->dims()[2], output->dims()[3]});
+
+  output->Resize({input->dims()[0], output->dims()[1], output->dims()[2],
+                  output->dims()[3]});
 }
 
 }  // namespace operators