From 2f507f7692cc0ce1be7fee1810fb591393c2339b Mon Sep 17 00:00:00 2001 From: jameswu2014 <545426914@qq.com> Date: Wed, 6 Mar 2019 06:28:30 -0800 Subject: [PATCH] 75percentParallel+kerneldriver+ROIALIGN+psroi-bug --- src/common/types.cpp | 2 + src/common/types.h | 1 + src/fpga/V1/api.cpp | 4 +- src/fpga/common/bitmap.cpp | 131 ----- src/fpga/common/bitmap.h | 37 -- src/fpga/common/driver.cpp | 145 +----- src/fpga/common/driver.h | 11 +- src/fpga/common/fpga_common.h | 1 + src/operators/detection_ops.cpp | 22 + src/operators/detection_ops.h | 5 + src/operators/kernel/detection_kernel.h | 38 ++ src/operators/kernel/fpga/V1/fetch_kernel.cpp | 18 +- src/operators/kernel/fpga/V1/pool_kernel.cpp | 4 +- .../kernel/fpga/V1/proposal_kernel.cpp | 21 +- .../kernel/fpga/V1/psroi_pool_kernel.cpp | 464 ++++++++++-------- .../kernel/fpga/V1/roialign_pool_kernel.cpp | 330 +++++++++++++ .../kernel/fpga/V1/softmax_kernel.cpp | 1 + .../kernel/fpga/V1/transpose2_kernel.cpp | 4 + tools/op.cmake | 4 + 19 files changed, 723 insertions(+), 520 deletions(-) delete mode 100644 src/fpga/common/bitmap.cpp delete mode 100644 src/fpga/common/bitmap.h create mode 100644 src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp diff --git a/src/common/types.cpp b/src/common/types.cpp index 170d262e98..20656acb20 100755 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -109,6 +109,7 @@ const char *G_OP_TYPE_SLICE = "slice"; const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator"; const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals"; const char *G_OP_TYPE_PSROI_POOL = "psroi_pool"; +const char *G_OP_TYPE_ROIALIGN_POOL = "roialign_pool"; const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform"; const char *G_OP_TYPE_PAD2D = "pad2d"; const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu"; @@ -213,6 +214,7 @@ std::unordered_map< {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"}, {"RpnRois", "RpnRoiProbs"}}}, {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}}, + {G_OP_TYPE_ROIALIGN_POOL, {{"X", "ROIs"}, {"Out"}}}, {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}}, {G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}}, diff --git a/src/common/types.h b/src/common/types.h index 45e86500ab..e3b5e52218 100755 --- a/src/common/types.h +++ b/src/common/types.h @@ -198,6 +198,7 @@ extern const char *G_OP_TYPE_SLICE; extern const char *G_OP_TYPE_ANCHOR_GENERATOR; extern const char *G_OP_TYPE_GENERATE_PROPOSALS; extern const char *G_OP_TYPE_PSROI_POOL; +extern const char *G_OP_TYPE_ROIALIGN_POOL; extern const char *G_OP_TYPE_ROI_PERSPECTIVE; extern const char *G_OP_TYPE_PAD2D; extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU; diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 0f9f96dc65..ffe5f18f5e 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -368,9 +368,9 @@ void expand_conv_arg(ConvArgs *arg) { auto filter_pad_width_mul_channel = args.image.pad_width * args.image.channels; auto image_amount_per_row_multi_win_first = - image_amount_per_row * (2 * args.kernel.stride_h - args.image.pad_height); + image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h - args.image.pad_height); auto image_amount_per_row_multi_win = - image_amount_per_row * (2 * args.kernel.stride_h); + image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h); auto image_block_num = block_num; auto image_block_len = diff --git a/src/fpga/common/bitmap.cpp b/src/fpga/common/bitmap.cpp deleted file mode 100644 index 9742a45599..0000000000 --- a/src/fpga/common/bitmap.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/common/bitmap.h" - -namespace fpga_bitmap { -void bitmap_set(uint64_t *map, unsigned int start, int len) { - uint64_t *p = map + BIT_WORD(start); - const unsigned int size = start + len; - int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); - uint64_t mask_to_set = BITMAP_FIRST_WORD_MASK(start); - - while (len - bits_to_set >= 0) { - *p |= mask_to_set; - len -= bits_to_set; - bits_to_set = BITS_PER_LONG; - mask_to_set = ~0UL; - p++; - } - if (len) { - mask_to_set &= BITMAP_LAST_WORD_MASK(size); - *p |= mask_to_set; - } -} - -void bitmap_clear(uint64_t *map, unsigned int start, int len) { - uint64_t *p = map + BIT_WORD(start); - const unsigned int size = start + len; - int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); - uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start); - - while (len - bits_to_clear >= 0) { - *p &= ~mask_to_clear; - len -= bits_to_clear; - bits_to_clear = BITS_PER_LONG; - mask_to_clear = ~0UL; - p++; - } - if (len) { - mask_to_clear &= BITMAP_LAST_WORD_MASK(size); - *p &= ~mask_to_clear; - } -} - -static uint64_t ffs(uint64_t data) { - uint64_t bit = 0; - int i = 0; - - for (i = 0; i < sizeof(data) * 8; i++) { - if (data & (1UL << i)) { - bit = i; - break; - } - } - - return bit; -} - -static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits, - uint64_t start, uint64_t invert) { - uint64_t tmp = 0; - - if (!nbits || start >= nbits) return nbits; - - tmp = addr[start / BITS_PER_LONG] ^ invert; - - /* Handle 1st word. */ - tmp &= BITMAP_FIRST_WORD_MASK(start); - start = round_down(start, BITS_PER_LONG); - - while (!tmp) { - start += BITS_PER_LONG; - if (start >= nbits) return nbits; - - tmp = addr[start / BITS_PER_LONG] ^ invert; - } - - return (start + ffs(tmp)) < nbits ? (start + ffs(tmp)) : nbits; -} - -uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size, - uint64_t offset) { - return _find_next_bit(addr, size, offset, ~0UL); -} - -uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) { - return _find_next_bit(addr, size, offset, 0UL); -} - -uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size, - uint64_t start, unsigned int nr, - uint64_t align_mask, - uint64_t align_offset) { - uint64_t index = 0; - uint64_t end = 0; - uint64_t i = 0; - -again: - index = find_next_zero_bit(map, size, start); - - /* Align allocation */ - index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; - - end = index + nr; - if (end > size) return end; - i = find_next_bit(map, end, index); - if (i < end) { - start = i + 1; - goto again; - } - - return index; -} - -uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size, - uint64_t start, unsigned int nr, - uint64_t align_mask) { - return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0); -} -} // namespace fpga_bitmap diff --git a/src/fpga/common/bitmap.h b/src/fpga/common/bitmap.h deleted file mode 100644 index 4cb1673d91..0000000000 --- a/src/fpga/common/bitmap.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#define BITS_PER_LONG 64 -#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) -#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) -#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) - -#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) -#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) - -#define round_down(x, y) ((x) & ~((y)-1)) - -namespace fpga_bitmap { -void bitmap_set(uint64_t *map, unsigned int start, int len); -void bitmap_clear(uint64_t *map, unsigned int start, int len); -uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size, - uint64_t start, unsigned int nr, - uint64_t align_mask); - -} // namespace fpga_bitmap diff --git a/src/fpga/common/driver.cpp b/src/fpga/common/driver.cpp index b1d3559dbb..89a22ba955 100644 --- a/src/fpga/common/driver.cpp +++ b/src/fpga/common/driver.cpp @@ -28,7 +28,6 @@ limitations under the License. */ #include #include "common/enforce.h" -#include "fpga/common/bitmap.h" #include "fpga/common/driver.h" namespace paddle_mobile { @@ -148,33 +147,7 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { } } -/*内存管理*/ -int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { - uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); - unsigned int nr = (unsigned int)_nr; - int ret = 0; - uint64_t a_size = FPGA_PAGE_SIZE * nr; - - pthread_mutex_lock(&memory->mutex); - unsigned int pos = (unsigned int)fpga_bitmap::bitmap_find_next_zero_area( - memory->bitmap, memory->page_num, 0, nr, 0); - if (pos <= memory->page_num) { - uint64_t address_ofset = - memory->mem_start + ((uint64_t)pos) * FPGA_PAGE_SIZE; - fpga_bitmap::bitmap_set(memory->bitmap, pos, nr); - memory->nr[pos] = nr; - - *addr = address_ofset; - } else { - DLOG << "memory request failed!"; - ret = -ENOMEM; - } - - pthread_mutex_unlock(&memory->mutex); - - return ret; -} void memory_release(struct fpga_memory *memory) { void *ptr = nullptr; @@ -187,96 +160,7 @@ void memory_release(struct fpga_memory *memory) { } } -int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) { - int rc = 0; - - uint64_t *bitmap = nullptr; - unsigned int *nr = nullptr; - - // 不允许多份memory创建,所以创建memory结构体不存在互斥 - // pthread_mutex_lock(&memory->mutex); - memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE); - memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG); - - bitmap = - (uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long); // NOLINT - if (!bitmap) { - rc = -EFAULT; - return rc; - } - memory->bitmap = bitmap; - - nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int)); - if (!nr) { - rc = -EFAULT; - free(bitmap); - return rc; - } - memory->nr = nr; - - memory->mem_start = FPGA_MEM_PHY_ADDR; - memory->mem_end = FPGA_MEM_SIZE; - // pthread_mutex_unlock(memory->mutex); - - return rc; -} - -int create_fpga_memory(struct fpga_memory **memory_info) { - int rc = 0; - - *memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory)); - if (*memory_info == NULL) { - rc = -EFAULT; - return rc; - } - pthread_mutex_init(&((*memory_info)->mutex), nullptr); - - rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE); - if (rc) { - free(*memory_info); - } - - return rc; -} - -int init_fpga_memory(struct fpga_memory *memory) { - int rc = 0; - - if (!memory) { - rc = -EFAULT; - return rc; - } - - fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num); - fpga_bitmap::bitmap_set(memory->bitmap, 0, 1); // NOTE reserve fpga page 0. - - return 0; -} - -void destroy_fpga_memory(struct fpga_memory *memory) { - if (memory) { - free(memory->nr); - free(memory->bitmap); - free(memory); - } -} - -int fpga_memory_add() { - int rc = 0; - - rc = create_fpga_memory(&g_fpgainfo.memory_info); - if (rc) { - return rc; - } - - rc = init_fpga_memory(g_fpgainfo.memory_info); - if (rc) { - destroy_fpga_memory(g_fpgainfo.memory_info); - return rc; - } - return 0; -} uint64_t vaddr_to_paddr_driver(void *address) { uint64_t paddr = 0; @@ -314,17 +198,28 @@ void *fpga_reg_free(void *ptr) { } } +static inline int do_ioctl(int64_t req, const void *arg) { + return ioctl(g_fpgainfo.fd_mem, req, arg); +} + void *fpga_malloc_driver(size_t size) { void *ret = nullptr; uint64_t phy_addr = 0; int i = 0; + struct MemoryVM2PHYArgs args; + struct MemoryCacheArgs args_c; - memory_request(g_fpgainfo.memory_info, size, &phy_addr); + // memory_request(g_fpgainfo.memory_info, size, &phy_addr); ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, - g_fpgainfo.fd_mem, phy_addr); + g_fpgainfo.fd_mem, FPGA_MEM_PHY_ADDR); PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1"); + args.pVM= (void *)ret; + args.pPHY =(void *)0; + do_ioctl(IOCTL_MEMORY_VM2PHY, &args); + phy_addr = (uint64_t)args.pPHY; + g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr)); g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size)); @@ -345,11 +240,6 @@ void fpga_free_driver(void *ptr) { p_addr = vaddr_to_paddr_driver(ptr); pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE; - /*clear bitmap*/ - pthread_mutex_lock(&g_fpgainfo.memory_info->mutex); - fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos, - g_fpgainfo.memory_info->nr[pos]); - pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex); auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr); if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { @@ -360,10 +250,6 @@ void fpga_free_driver(void *ptr) { } } -static inline int do_ioctl(int64_t req, const void *arg) { - return ioctl(g_fpgainfo.fd_mem, req, arg); -} - int fpga_flush_driver(void *address, size_t size) { struct MemoryCacheArgs args; uint64_t p_addr; @@ -413,7 +299,7 @@ int open_device_driver() { g_fpgainfo.FpgaRegVirAddr = (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE); // NOLINT - fpga_memory_add(); + //fpga_memory_add(); pl_init(); @@ -424,8 +310,7 @@ int close_device_driver() { pl_destroy(); fpga_reg_free(g_fpgainfo.FpgaRegVirAddr); memory_release(g_fpgainfo.memory_info); - destroy_fpga_memory(g_fpgainfo.memory_info); - + return 0; } diff --git a/src/fpga/common/driver.h b/src/fpga/common/driver.h index d35627cd46..89f419acca 100644 --- a/src/fpga/common/driver.h +++ b/src/fpga/common/driver.h @@ -31,8 +31,8 @@ namespace driver { #define FPGA_REG_PHY_ADDR 0x80000000 #define FPGA_REG_SIZE 0x1000 -#define FPGA_MEM_PHY_ADDR 0x40000000 -#define FPGA_MEM_SIZE 0x80000000 +#define FPGA_MEM_PHY_ADDR 0x20000000 +#define FPGA_MEM_SIZE 0x20000000 #define FPGA_PAGE_SIZE (16UL * 1024UL) @@ -52,9 +52,16 @@ struct MemoryCacheArgs { size_t size; }; +struct MemoryVM2PHYArgs { + void* pVM; + void* pPHY; +}; + #define IOCTL_FPGA_MAGIC 'F' #define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) #define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) +#define IOCTL_MEMORY_VM2PHY _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs) + struct fpga_pe { char type_name[MAX_TYPE_NAME_LENTH + 1]; diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index 898e76a654..cd9a29e34d 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -25,6 +25,7 @@ limitations under the License. */ #define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16 #define BS_NUM_ALIGNMENT (8) #define BIAS_NUM_ALIGNMENT (16) +#define ROW_PARALLEL_NUM (3) #endif namespace paddle_mobile { diff --git a/src/operators/detection_ops.cpp b/src/operators/detection_ops.cpp index 630b672225..f198711de2 100644 --- a/src/operators/detection_ops.cpp +++ b/src/operators/detection_ops.cpp @@ -65,6 +65,24 @@ void PSRoiPoolOp::InferShape() const { } #endif +#ifdef ROIALIGN_POOL_OP +template +void RoiAlignPoolOp::InferShape() const { + const auto &rois_dims = this->param_.input_rois_->dims(); + const int pooled_height = this->param_.pooled_height_; + const int pooled_width = this->param_.pooled_width_; + + auto out_dims = this->param_.input_x_->dims(); + out_dims[0] = rois_dims[0]; + // out_dims[1] = + // output_channels; // input_dims[1] / (pooled_height * pooled_width); + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + this->param_.output_->Resize(out_dims); +} +#endif + + #ifdef ROI_PERSPECTIVE_OP template void RoiPerspectiveOp::InferShape() const { @@ -110,4 +128,8 @@ REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp); #ifdef PSROI_POOL_OP REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp); #endif +#ifdef ROIALIGN_POOL_OP +REGISTER_OPERATOR_FPGA(roialign_pool, ops::RoiAlignPoolOp); +#endif + #endif diff --git a/src/operators/detection_ops.h b/src/operators/detection_ops.h index 38d0890756..5b90ac3ee1 100644 --- a/src/operators/detection_ops.h +++ b/src/operators/detection_ops.h @@ -34,6 +34,11 @@ DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel); DECLARE_OPERATOR(PSRoiPool, PSRoiPoolParam, PSRoiPoolKernel); #endif +#ifdef ROIALIGN_POOL_OP +DECLARE_OPERATOR(RoiAlignPool, RoiAlignPoolParam, RoiAlignPoolKernel); +#endif + + #ifdef ROI_PERSPECTIVE_OP DECLARE_OPERATOR(RoiPerspective, RoiPerspectiveParam, RoiPerspectiveKernel); #endif diff --git a/src/operators/kernel/detection_kernel.h b/src/operators/kernel/detection_kernel.h index 124bdbb04f..93ed78b10e 100644 --- a/src/operators/kernel/detection_kernel.h +++ b/src/operators/kernel/detection_kernel.h @@ -98,6 +98,8 @@ class ProposalParam : public OpParam { framework::Tensor *anchors_; framework::Tensor *variances_; + std::shared_ptr score_index_; + framework::LoDTensor *rpn_rois_; framework::LoDTensor *rpn_probs_; @@ -151,6 +153,42 @@ class PSRoiPoolParam : public OpParam { DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam); #endif +#ifdef ROIALIGN_POOL_OP +template +class RoiAlignPoolParam : public OpParam { + public: + RoiAlignPoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = OpParam::GetVarValue("X", inputs, *scope); + input_rois_ = + OpParam::GetVarValue("ROIs", inputs, *scope); + output_ = OpParam::GetVarValue("Out", outputs, *scope); + + pooled_height_ = OpParam::GetAttr("pooled_height", attrs); + pooled_width_ = OpParam::GetAttr("pooled_width", attrs); + spatial_scale_ = OpParam::GetAttr("spatial_scale", attrs); + sampling_ratio_ = OpParam::GetAttr("sampling_ratio", attrs); + } + + public: + framework::Tensor *input_x_; + framework::LoDTensor *input_rois_; + framework::Tensor *output_; + int pooled_height_; + int pooled_width_; + float spatial_scale_; + int sampling_ratio_; +#ifdef PADDLE_MOBILE_FPGA + std::shared_ptr float_input, float_output; + fpga::BypassArgs input_arg, output_arg; +#endif + +}; + +DECLARE_KERNEL(RoiAlignPool, RoiAlignPoolParam); +#endif + #ifdef ROI_PERSPECTIVE_OP template class RoiPerspectiveParam : public OpParam { diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp index 54fd12bfd3..c876a67e6a 100644 --- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp @@ -62,7 +62,10 @@ void FetchKernel::Compute(const FetchParam ¶m) { output->ShareDataWith(*input); return; } - fpga::PerformBypass(param.fpga_bypass_args); + fpga::BypassArgs args = param.fpga_bypass_args; + auto input_address = (input->data()); + args.image.address = static_cast(input_address); + fpga::PerformBypass(args); auto outC = param.Out()->dims()[1]; auto outH = param.Out()->dims()[2]; auto outW = param.Out()->dims()[3]; @@ -70,10 +73,15 @@ void FetchKernel::Compute(const FetchParam ¶m) { reinterpret_cast(param.fpga_bypass_args.output.address); fpga::fpga_invalidate(param.fpga_bypass_args.output.address, param.Out()->fpga_data_num * sizeof(float)); - float *data_tmp = - reinterpret_cast(malloc(outC * outH * outW * sizeof(float))); - dealign(outdata_ptr, data_tmp, outC, outH, outW); - memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float)); + + if(param.Out()->fpga_data_num != product(input->dims())){ + float *data_tmp = + reinterpret_cast(malloc(outC * outH * outW * sizeof(float))); + dealign(outdata_ptr, data_tmp, outC, outH, outW); + memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float)); + free(data_tmp); + } + } template class FetchKernel; diff --git a/src/operators/kernel/fpga/V1/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp index e3bcbd25ea..72062193ed 100644 --- a/src/operators/kernel/fpga/V1/pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp @@ -73,9 +73,11 @@ void PoolKernel::Compute(const PoolParam ¶m) { if (input->type() == typeid(float)) { auto *output = param.Output(); auto in = input->data(); + auto N = input->dims()[0]; + output->Resize({N, output->dims()[1], output->dims()[2], output->dims()[3]}); auto len = output->numel(); auto out = output->mutable_data(); - int N = input->dims()[0], C = input->dims()[1], H = input->dims()[2], + int C = input->dims()[1], H = input->dims()[2],//N = input->dims()[0], W = input->dims()[3]; int HW = H * W, CHW = C * H * W, WC = W * C; diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp index 3f0ba42f05..fe91612c76 100644 --- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp +++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp @@ -65,6 +65,14 @@ bool ProposalKernel::Init(ProposalParam *param) { args.output.scale_address = param->float_score->scale; param->score_arg = args; + param->score_index_= std::make_shared(); + param->score_index_->mutable_data({input->numel()}); + auto score_index = param->score_index_->data(); + for (int i = 0; i < input->numel(); ++i){ + score_index[i] = i; + } + + return true; } template @@ -334,6 +342,7 @@ std::pair ProposalForOneImage( const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances, const Tensor &bbox_deltas_slice, // [M, 4] const Tensor &scores_slice, // [N, 1] + const Tensor &score_index, int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, float eta) { auto *scores_data = scores_slice.data(); @@ -342,9 +351,11 @@ std::pair ProposalForOneImage( Tensor index_t; index_t.Resize({scores_slice.numel()}); int *index = index_t.mutable_data(); - for (int i = 0; i < scores_slice.numel(); ++i) { + /*for (int i = 0; i < scores_slice.numel(); ++i) { index[i] = i; - } + }*/ + std::memcpy(index,score_index.data(),scores_slice.numel()*sizeof(int) ); + auto compare = [scores_data](const int64_t &i, const int64_t &j) { return scores_data[i] > scores_data[j]; }; @@ -490,8 +501,10 @@ void ProposalKernel::Compute(const ProposalParam ¶m) { auto *rpn_rois = param.rpn_rois_; auto *rpn_roi_probs = param.rpn_probs_; + auto score_index = *(param.score_index_.get()); + int pre_nms_top_n = param.pre_nms_topn_; - int post_nms_top_n = param.post_nms_topn_; + int post_nms_top_n = 100;//param.post_nms_topn_; float nms_thresh = param.nms_thresh_; float min_size = param.min_size_; float eta = param.eta_; @@ -528,7 +541,7 @@ void ProposalKernel::Compute(const ProposalParam ¶m) { scores_slice.Resize({h_score * w_score * c_score, 1}); std::pair tensor_pair = ProposalForOneImage( - im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice, + im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,score_index, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); Tensor &proposals = tensor_pair.first; Tensor &scores = tensor_pair.second; diff --git a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp index 3309f9f7ee..2eeedcf9a7 100644 --- a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp @@ -1,212 +1,260 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PSROI_POOL_OP - -#include -#include -#include "operators/kernel/detection_kernel.h" - -#include "fpga/V1/api.h" -#include "fpga/V1/image.h" -namespace paddle_mobile { -namespace operators { - -template <> -bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { - auto dims = param->input_x_->dims(); - PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0, - "data not aligned"); - - param->float_input = std::make_shared(); - param->float_input->mutable_data(param->input_x_->dims()); - // param->float_output = std::make_shared(); - - auto input = param->input_x_; - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_HWC; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = input->data(); - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = param->float_input->mutable_data(); - args.output.scale_address = param->float_input->scale; - param->input_arg = args; - - auto* rois = param->input_rois_; - int rois_num = rois->dims()[0]; - framework::DDim dims_out_new = framework::make_ddim( - {rois_num, param->output_->dims()[1], param->output_->dims()[2], - param->output_->dims()[3]}); - param->output_->Resize(dims_out_new); - // fpga::format_fp16_ofm(param->output_); - - param->output_->mutable_data(dims_out_new); - // auto output = param->float_output.get(); - // param->output_ = output; - /* args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.image.address = output->data(); - args.image.height = (uint32_t)output->dims()[2]; - args.image.width = (uint32_t)output->dims()[3]; - args.image.channels = (uint32_t)output->dims()[1] ; - args.output.address = param->output_->mutable_data(); - args.output.scale_address = param->output_->scale; - param->output_arg = args;*/ - - return true; -} - -template -void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, - const Dtype* bottom_rois, const int output_dim, - const int group_size, Dtype* top_data, - // int* mapping_channel, - int index, int* rois_batch_id) { - // The output is in order (n, ctop, ph, pw) - // static int cnt = 0; - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int ctop = (index / pooled_width / pooled_height) % output_dim; - int n = index / pooled_width / pooled_height / output_dim; - - // [start, end) interval for spatial sampling - bottom_rois += n * 4; - int roi_batch_ind = rois_batch_id[n]; // bottom_rois[0]; - Dtype roi_start_w = static_cast(round(bottom_rois[0])) * spatial_scale; - Dtype roi_start_h = static_cast(round(bottom_rois[1])) * spatial_scale; - Dtype roi_end_w = - static_cast(round(bottom_rois[2]) + 1.) * spatial_scale; - Dtype roi_end_h = - static_cast(round(bottom_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - Dtype roi_width = std::max(roi_end_w - roi_start_w, 0.1f); // avoid 0 - Dtype roi_height = std::max(roi_end_h - roi_start_h, 0.1f); - - // Compute w and h at bottom - Dtype bin_size_h = roi_height / static_cast(pooled_height); - Dtype bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); - int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - int gw = pw; - int gh = ph; - int c = (ctop * group_size + gh) * group_size + gw; - - bottom_data += (roi_batch_ind * channels + c) * height * width; - Dtype out_sum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int bottom_index = h * width + w; - out_sum += bottom_data[bottom_index]; +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PSROI_POOL_OP + +#include +#include +#include "operators/kernel/detection_kernel.h" + +#include "fpga/V1/api.h" +#include "fpga/V1/image.h" +namespace paddle_mobile { +namespace operators { + +template <> +bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { + auto dims = param->input_x_->dims(); + PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0, + "data not aligned"); + + param->float_input = std::make_shared(); + param->float_input->mutable_data(param->input_x_->dims()); + // param->float_output = std::make_shared(); + + auto input = param->input_x_; + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_HWC; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->float_input->mutable_data(); + args.output.scale_address = param->float_input->scale; + param->input_arg = args; + + auto* rois = param->input_rois_; + int rois_num = rois->dims()[0]; + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, param->output_->dims()[1], param->output_->dims()[2], + param->output_->dims()[3]}); + param->output_->Resize(dims_out_new); + // fpga::format_fp16_ofm(param->output_); + + param->output_->mutable_data(dims_out_new); + // auto output = param->float_output.get(); + // param->output_ = output; + /* args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.image.address = output->data(); + args.image.height = (uint32_t)output->dims()[2]; + args.image.width = (uint32_t)output->dims()[3]; + args.image.channels = (uint32_t)output->dims()[1] ; + args.output.address = param->output_->mutable_data(); + args.output.scale_address = param->output_->scale; + param->output_arg = args;*/ + + return true; +} + +template +void PSROIPooling( +const Dtype* bottom_data, const int channels, +const int height, const int width, +const int pooled_height, const int pooled_width, +const Dtype* bottom_rois, const int output_dim, +const int group_size, Dtype* top_data, +int index, int nid, +const Dtype Bin_size_h, +const Dtype Bin_size_w, +const Dtype roi_start_h, +const Dtype roi_start_w, +const int ctop, const int ph, const int roi_batch_ind) +{ + int pw = index; + int hstart = floor(static_cast(ph) * Bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw)* Bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * Bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * Bin_size_w + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int c = (ctop*group_size + ph)*group_size + pw; + + Dtype bin_area = (hend - hstart)*(wend - wstart); + bottom_data += (roi_batch_ind * channels + c) * height * width; + Dtype out_sum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + out_sum += bottom_data[bottom_index]; + } + } + + top_data[nid + index] = is_empty? 0. : out_sum/bin_area; + +} + +void convert_to_chw(float **data_in, int channel, int height, int width, + int num) { + float* data_in_tmp = *data_in; + float *data_tmp = + (float *)fpga::fpga_malloc(channel * height * width * sizeof(float)); // NOLINT + int64_t amount_per_side = width * height; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + n * height * width * channel + c * amount_per_side + + width * h + w) = *((*data_in)++); + } + } } } - - Dtype bin_area = (hend - hstart) * (wend - wstart); - top_data[index] = is_empty ? 0. : out_sum / bin_area; -} -template <> -void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { - auto input_tensor = param.float_input.get(); - fpga::PerformBypass(param.input_arg); - fpga::fpga_invalidate(input_tensor->data(), - input_tensor->numel() * sizeof(float)); - - auto* in = input_tensor; - auto* rois = param.input_rois_; - auto* out = param.output_; // param.float_output.get(); - - auto pooled_height = param.pooled_height_; - auto pooled_width = param.pooled_width_; - auto spatial_scale = param.spatial_scale_; - auto output_channels = param.output_channels_; - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto data_nhwc = in->mutable_data(); - fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width); - framework::DDim dims_out_new = framework::make_ddim( - {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), - (param.output_)->dims()[3]}); - (param.output_)->Resize(dims_out_new); - - const float* input_data = data_nhwc; // in->data(); - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - auto rois_batch_id_data = rois_batch_id_list.mutable_data(); - - PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty"); - - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_MOBILE_ENFORCE( - rois_batch_size == batch_size, - "the rois_batch_size and input(X) batch_size should be the same."); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num, - "the rois_num from input and lod must be the same"); - - PADDLE_MOBILE_ENFORCE( - input_channels == output_channels * pooled_height * pooled_width, - "the channels of input X should equal the product of " - "output_channels x pooled_height x pooled_width"); - - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + *data_in = data_tmp; + fpga::fpga_free(data_in_tmp); +} + +void convert_to_hwc(float **data_in, int channel, int height, int width, + int num) { + float* data_in_tmp = *data_in; + float *data_tmp = reinterpret_cast( + fpga::fpga_malloc(num * channel * height * width * sizeof(float))); + int64_t amount_per_row = width * channel; + for (int n = 0; n < num; n++) { + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + int64_t offset_height = h * amount_per_row; + for (int w = 0; w < width; w++) { + *(data_tmp + n * channel * height * width + offset_height + + w * channel + c) = *((*data_in)++); + } + } } } - auto output_data = out->mutable_data(); - auto input_rois = rois->data(); - - // calculate psroipooling, parallel processing can be implemented per ROI - - int index = pooled_height * pooled_width * output_channels * rois_num; - for (int idx = 0; idx < index; idx++) { - PSROIPooling(input_data, spatial_scale, input_channels, height, - width, pooled_height, pooled_width, input_rois, - output_channels, pooled_height, output_data, idx, - rois_batch_id_data); - } - // - fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height, - pooled_width, rois_num); - out->reset_data_ptr(output_data); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // PSROI_POOL_OP + *data_in = data_tmp; + fpga::fpga_free(data_in_tmp); +} + + +template <> +void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { + auto input_tensor = param.float_input.get(); + fpga::PerformBypass(param.input_arg); + fpga::fpga_invalidate(input_tensor->data(), + input_tensor->numel() * sizeof(float)); + + auto* in = input_tensor; + auto* rois = param.input_rois_; + auto* out = param.output_; // param.float_output.get(); + + auto pooled_height = param.pooled_height_; + auto pooled_width = param.pooled_width_; + auto spatial_scale = param.spatial_scale_; + auto output_channels = param.output_channels_; + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto data_nhwc = in->mutable_data(); + convert_to_chw(&data_nhwc, input_channels, height, width, 1); + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), + (param.output_)->dims()[3]}); + (param.output_)->Resize(dims_out_new); + + const float* input_data = data_nhwc; // in->data(); + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + auto rois_batch_id_data = rois_batch_id_list.mutable_data(); + + PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty"); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_MOBILE_ENFORCE( + rois_batch_size == batch_size, + "the rois_batch_size and input(X) batch_size should be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num, + "the rois_num from input and lod must be the same"); + + PADDLE_MOBILE_ENFORCE( + input_channels == output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + // calculate batch id index for each roi according to LoD + //for (int n = 0; n < rois_batch_size; ++n) { + //for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + //rois_batch_id_data[i] = n; + // } + //} + auto output_data = out->mutable_data(); + auto input_rois = rois->data(); + + // calculate psroipooling, parallel processing can be implemented per ROI + for (int n = 0; n < rois_num; ++n) { + // [start, end) interval for spatial sampling + auto offset_input_rois = input_rois + n * 4; + auto roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + auto roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + auto roi_end_w = static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + auto roi_end_h = static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small rois to be 1 x 1 + auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0 + auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f); + + // Compute bin size w and h at input feature map + auto bin_size_h = roi_height / static_cast(pooled_height); + auto bin_size_w = roi_width / static_cast(pooled_width); + + int roi_batch_ind = 0;//rois_batch_id_data[n]; + //std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl; + for(int c = 0; c < output_channels; ++c){ + + for(int ph = 0; ph < pooled_height; ph++){ + int index = pooled_width; + int nid = n * output_channels * pooled_height * pooled_width + c * pooled_width * pooled_height + ph * pooled_width; + for(int idx = 0; idx < index; idx++){ + PSROIPooling(input_data,input_channels,height,width,pooled_height,pooled_width, + input_rois,output_channels,pooled_height,output_data, idx, nid, bin_size_h, bin_size_w, roi_start_h, roi_start_w, c, ph, roi_batch_ind); + } + } + } + } + + convert_to_hwc(&output_data, output_channels, pooled_height, + pooled_width, rois_num); + out->reset_data_ptr(output_data); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif // PSROI_POOL_OP + diff --git a/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp b/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp new file mode 100644 index 0000000000..92a76646c0 --- /dev/null +++ b/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp @@ -0,0 +1,330 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ROIALIGN_POOL_OP + +#include +#include +#include "operators/kernel/detection_kernel.h" + +#include "fpga/V1/api.h" +#include "fpga/V1/image.h" + +namespace paddle_mobile { +namespace operators { + + +template <> +bool RoiAlignPoolKernel::Init(RoiAlignPoolParam* param) { + + auto dims = param->input_x_->dims(); + PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0, + "data not aligned"); + + param->float_input = std::make_shared(); + param->float_input->mutable_data(param->input_x_->dims()); + + auto input = param->input_x_; + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_HWC; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->float_input->mutable_data(); + args.output.scale_address = param->float_input->scale; + param->input_arg = args; + + auto* rois = param->input_rois_; + int rois_num = rois->dims()[0]; + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, param->output_->dims()[1], param->output_->dims()[2], + param->output_->dims()[3]}); + param->output_->Resize(dims_out_new); + + param->output_->mutable_data(dims_out_new); + + + return true; +} + + +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indeces + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignForward( + const int nthreads, + const T* bottom_data, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, + T* top_data) { + + int n_rois = nthreads / channels / pooled_width / pooled_height; + + + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + // roi could have 4 or 5 columns + const T* offset_bottom_rois = bottom_rois + n * 4; + int roi_batch_ind = 0; + // if (roi_cols == 5) { + // roi_batch_ind = offset_bottom_rois[0]; + // offset_bottom_rois++; + // } + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[0] * spatial_scale; + T roi_start_h = offset_bottom_rois[1] * spatial_scale; + T roi_end_w = offset_bottom_rois[2] * spatial_scale; + T roi_end_h = offset_bottom_rois[3] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); + T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + // we want to precalculate indeces and weights shared by all chanels, + // this is the key point of optimiation + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + + pc.w4 * offset_bottom_data[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + top_data[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + + +template <> +void RoiAlignPoolKernel::Compute(const RoiAlignPoolParam& param) { + + auto input_tensor = param.float_input.get(); + fpga::PerformBypass(param.input_arg); + fpga::fpga_invalidate(input_tensor->data(), + input_tensor->numel() * sizeof(float)); + + auto* in = input_tensor; + auto* rois = param.input_rois_; + auto* out = param.output_; // param.float_output.get(); + + auto pooled_height = param.pooled_height_; + auto pooled_width = param.pooled_width_; + auto spatial_scale = param.spatial_scale_; + auto sampe_ratio = param.sampling_ratio_; + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto data_nhwc = in->mutable_data(); + + fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width); + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), + (param.output_)->dims()[3]}); + (param.output_)->Resize(dims_out_new); + + const int index = input_channels * pooled_height * pooled_width * rois_num; + auto rois_data = rois->data(); + auto top_data = param.output_->mutable_data(); + for (int i = 0; i < index; ++i){ + ROIAlignForward( index,data_nhwc,spatial_scale,input_channels,height,width, + pooled_height,pooled_width,sampe_ratio,rois_data,top_data); + } + + fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height, + pooled_width, rois_num); + out->reset_data_ptr(top_data); + +} + +} // namespace operators +} // namespace paddle_mobile + +#endif // ROIALIGN_POOL_OP + diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp index bbe5296582..78d920a960 100644 --- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp @@ -105,6 +105,7 @@ void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { } else { if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { Tensor *out = param.Out(); + out->Resize({in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]}); math::SoftmaxFuntor()(in_x, out); } } diff --git a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp index f74839f1fc..a9734f8e44 100644 --- a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp +++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp @@ -42,6 +42,10 @@ template <> void Transpose2Kernel::Compute( const Transpose2Param ¶m) { // Transpose2Compute(param); + auto input = param.InputX(); + auto output = param.Out(); + + output->Resize({input->dims()[0], output->dims()[1], output->dims()[2], output->dims()[3]}); } } // namespace operators diff --git a/tools/op.cmake b/tools/op.cmake index 3b613473df..83d972d3b2 100755 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -128,6 +128,7 @@ if (CON GREATER -1) set(FUSION_CONVADDBN_OP ON) set(RESHAPE2_OP ON) set(PSROI_POOL_OP ON) + set(ROIALIGN_POOL_OP ON) set(PROPOSAL_OP ON) set(ANCHOR_GENERATOR_OP ON) set(SLICE_OP ON) @@ -603,6 +604,9 @@ endif() if (PSROI_POOL_OP) add_definitions(-DPSROI_POOL_OP) endif() +if (ROIALIGN_POOL_OP) + add_definitions(-DROIALIGN_POOL_OP) +endif() if (ROI_PERSPECTIVE_OP) add_definitions(-DROI_PERSPECTIVE_OP) endif() -- GitLab