未验证 提交 2761ebe0 编写于 作者: Z zhangyang0701 提交者: GitHub

Merge pull request #1472 from jameswu2014/my-cool-stuff

75percentParallel+kerneldriver+ROIALIGN+psroi-bug for FPGA track
......@@ -109,6 +109,7 @@ const char *G_OP_TYPE_SLICE = "slice";
const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator";
const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals";
const char *G_OP_TYPE_PSROI_POOL = "psroi_pool";
const char *G_OP_TYPE_ROIALIGN_POOL = "roialign_pool";
const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform";
const char *G_OP_TYPE_PAD2D = "pad2d";
const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu";
......@@ -213,6 +214,7 @@ std::unordered_map<
{{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"},
{"RpnRois", "RpnRoiProbs"}}},
{G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}},
{G_OP_TYPE_ROIALIGN_POOL, {{"X", "ROIs"}, {"Out"}}},
{G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}},
......
......@@ -198,6 +198,7 @@ extern const char *G_OP_TYPE_SLICE;
extern const char *G_OP_TYPE_ANCHOR_GENERATOR;
extern const char *G_OP_TYPE_GENERATE_PROPOSALS;
extern const char *G_OP_TYPE_PSROI_POOL;
extern const char *G_OP_TYPE_ROIALIGN_POOL;
extern const char *G_OP_TYPE_ROI_PERSPECTIVE;
extern const char *G_OP_TYPE_PAD2D;
extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU;
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V1/api.h"
#include <memory>
#include "fpga/V1/bias_scale.h"
#include "fpga/V1/deconv_filter.h"
#include "fpga/V1/filter.h"
......@@ -368,9 +369,10 @@ void expand_conv_arg(ConvArgs *arg) {
auto filter_pad_width_mul_channel =
args.image.pad_width * args.image.channels;
auto image_amount_per_row_multi_win_first =
image_amount_per_row * (2 * args.kernel.stride_h - args.image.pad_height);
image_amount_per_row *
(ROW_PARALLEL_NUM * args.kernel.stride_h - args.image.pad_height);
auto image_amount_per_row_multi_win =
image_amount_per_row * (2 * args.kernel.stride_h);
image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h);
auto image_block_num = block_num;
auto image_block_len =
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/common/bitmap.h"
namespace fpga_bitmap {
// Sets `len` consecutive bits of `map`, starting at bit index `start`.
//
// `map` is an array of 64-bit words; bit `i` lives in word `i / 64` at
// position `i % 64`. A non-positive `len` is a no-op (previously a negative
// length could set unrelated bits). The caller must guarantee that the map
// holds at least `start + len` bits.
void bitmap_set(uint64_t *map, unsigned int start, int len) {
  constexpr unsigned int kBitsPerWord = 64;
  // Built from a 64-bit operand on purpose: `~0UL` is only 32 bits wide on
  // targets where `unsigned long` is a 32-bit type.
  constexpr uint64_t kAllOnes = ~static_cast<uint64_t>(0);

  if (len <= 0) {
    return;
  }

  uint64_t *word = map + start / kBitsPerWord;
  unsigned int offset = start % kBitsPerWord;
  unsigned int remaining = static_cast<unsigned int>(len);

  while (remaining > 0) {
    // Number of bits of the requested range that fall into the current word.
    const unsigned int room = kBitsPerWord - offset;
    const unsigned int take = remaining < room ? remaining : room;
    // A full-word run is special-cased because shifting by 64 is undefined.
    const uint64_t run = (take == kBitsPerWord)
                             ? kAllOnes
                             : ((static_cast<uint64_t>(1) << take) - 1);
    *word |= run << offset;

    remaining -= take;
    offset = 0;  // every word after the first starts at bit 0
    ++word;
  }
}
// Clears `len` consecutive bits of `map`, starting at bit index `start`.
//
// `map` is an array of 64-bit words; bit `i` lives in word `i / 64` at
// position `i % 64`. A non-positive `len` is a no-op (previously a negative
// length could clear unrelated bits). The caller must guarantee that the map
// holds at least `start + len` bits.
void bitmap_clear(uint64_t *map, unsigned int start, int len) {
  constexpr unsigned int kBitsPerWord = 64;
  // Built from a 64-bit operand on purpose: `~0UL` is only 32 bits wide on
  // targets where `unsigned long` is a 32-bit type.
  constexpr uint64_t kAllOnes = ~static_cast<uint64_t>(0);

  if (len <= 0) {
    return;
  }

  uint64_t *word = map + start / kBitsPerWord;
  unsigned int offset = start % kBitsPerWord;
  unsigned int remaining = static_cast<unsigned int>(len);

  while (remaining > 0) {
    // Number of bits of the requested range that fall into the current word.
    const unsigned int room = kBitsPerWord - offset;
    const unsigned int take = remaining < room ? remaining : room;
    // A full-word run is special-cased because shifting by 64 is undefined.
    const uint64_t run = (take == kBitsPerWord)
                             ? kAllOnes
                             : ((static_cast<uint64_t>(1) << take) - 1);
    *word &= ~(run << offset);

    remaining -= take;
    offset = 0;  // every word after the first starts at bit 0
    ++word;
  }
}
// Returns the zero-based index of the least-significant set bit of `data`.
//
// Returns 0 when `data` is 0, which is indistinguishable from "bit 0 is set";
// callers must rule out a zero word themselves.
static uint64_t ffs(uint64_t data) {
  const uint64_t bit_count = sizeof(data) * 8;
  for (uint64_t i = 0; i < bit_count; ++i) {
    // The probe is built from a 64-bit one: `1UL << i` is undefined for
    // i >= 32 on targets where `unsigned long` is 32 bits wide.
    if (data & (static_cast<uint64_t>(1) << i)) {
      return i;
    }
  }
  return 0;
}
// Shared search routine behind find_next_bit / find_next_zero_bit.
//
// Every word read from `addr` is XOR-ed with `invert` before being examined,
// so an all-zero `invert` searches for set bits and an all-ones `invert`
// searches for clear bits. Returns the index of the first matching bit at or
// after `start`, or `nbits` when there is none below `nbits`.
static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits,
                               uint64_t start, uint64_t invert) {
  if (nbits == 0 || start >= nbits) {
    return nbits;
  }

  // First word: discard the bits that lie below `start`.
  uint64_t word =
      (addr[start / BITS_PER_LONG] ^ invert) & BITMAP_FIRST_WORD_MASK(start);
  uint64_t base = round_down(start, BITS_PER_LONG);

  // Walk forward one word at a time until a candidate bit shows up.
  for (; word == 0; word = addr[base / BITS_PER_LONG] ^ invert) {
    base += BITS_PER_LONG;
    if (base >= nbits) {
      return nbits;
    }
  }

  // The hit may sit in the unused tail of the last word; clamp it to nbits.
  const uint64_t hit = base + ffs(word);
  return hit < nbits ? hit : nbits;
}
// Returns the index of the first clear bit in `addr` at or after `offset`,
// or `size` when every bit in [offset, size) is set.
uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size,
                            uint64_t offset) {
  // The inversion mask must cover all 64 bits of a word. `~0UL` would only
  // cover the low 32 bits on targets where `unsigned long` is 32 bits wide,
  // making the upper half of every word look permanently "set".
  const uint64_t invert_all = ~static_cast<uint64_t>(0);
  return _find_next_bit(addr, size, offset, invert_all);
}
// Returns the index of the first set bit in `addr` at or after `offset`,
// or `size` when every bit in [offset, size) is clear.
uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) {
  const uint64_t no_inversion = 0UL;  // examine the words as stored
  return _find_next_bit(addr, size, offset, no_inversion);
}
// Looks for a run of `nr` clear bits in the first `size` bits of `map`,
// beginning the search at bit `start`.
//
// Each candidate start index is adjusted with
// `__ALIGN_MASK(index + align_offset, align_mask) - align_offset` before the
// run is checked. Returns the first index of a suitable run; when the
// candidate run would extend past `size`, returns that run's end index
// (a value greater than `size`) to signal failure.
uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size,
                                        uint64_t start, unsigned int nr,
                                        uint64_t align_mask,
                                        uint64_t align_offset) {
  for (;;) {
    uint64_t candidate = find_next_zero_bit(map, size, start);
    /* Align allocation */
    candidate =
        __ALIGN_MASK(candidate + align_offset, align_mask) - align_offset;

    const uint64_t run_end = candidate + nr;
    if (run_end > size) {
      return run_end;
    }

    // Any set bit inside [candidate, run_end) disqualifies this candidate.
    const uint64_t blocker = find_next_bit(map, run_end, candidate);
    if (blocker >= run_end) {
      return candidate;
    }

    // Resume the search just past the bit that got in the way.
    start = blocker + 1;
  }
}
// Convenience form of bitmap_find_next_zero_area_off with an alignment
// offset of zero; see that function for the meaning of the return value.
uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
                                    uint64_t start, unsigned int nr,
                                    uint64_t align_mask) {
  const uint64_t no_align_offset = 0;
  return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask,
                                        no_align_offset);
}
} // namespace fpga_bitmap
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <stdio.h>
// Width in bits of one bitmap word. Bitmaps here are arrays of uint64_t, so
// this is 64 regardless of how wide `long` is on the target.
#define BITS_PER_LONG 64
// Index of the word that holds bit `nr`.
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
// Mask selecting bit (start % 64) and every higher bit of a word.
// The masks are built from `~0ULL` (at least 64 bits wide) instead of `~0UL`,
// which is only 32 bits wide on targets with a 32-bit `unsigned long`.
#define BITMAP_FIRST_WORD_MASK(start) \
  (~0ULL << ((start) & (BITS_PER_LONG - 1)))
// Mask selecting the low (nbits % 64) bits of a word; all 64 bits when nbits
// is a multiple of 64.
#define BITMAP_LAST_WORD_MASK(nbits) \
  (~0ULL >> (-(nbits) & (BITS_PER_LONG - 1)))
// Rounds `x` up so that none of the bits in `mask` remain set; `mask` is
// expected to have the form 2^k - 1.
// NOTE(review): identifiers beginning with a double underscore are reserved
// for the implementation; the names are kept so existing users still compile.
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
// Rounds `x` down to a multiple of `y`; `y` must be a power of two.
#define round_down(x, y) ((x) & ~((y)-1))
// Bit-array helpers operating on arrays of 64-bit words.
namespace fpga_bitmap {
// Sets `len` bits of `map`, starting at bit index `start`.
void bitmap_set(uint64_t *map, unsigned int start, int len);
// Clears `len` bits of `map`, starting at bit index `start`.
void bitmap_clear(uint64_t *map, unsigned int start, int len);
// Searches the first `size` bits of `map`, from bit `start` onwards, for a
// run of `nr` clear bits whose first index has been rounded up with
// `align_mask`. Returns the first index of the run, or a value greater than
// `size` when no run fits.
uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
uint64_t start, unsigned int nr,
uint64_t align_mask);
} // namespace fpga_bitmap
......@@ -26,9 +26,9 @@ limitations under the License. */
#include <fstream>
#include <iomanip>
#include <iostream>
#include <utility>
#include "common/enforce.h"
#include "fpga/common/bitmap.h"
#include "fpga/common/driver.h"
namespace paddle_mobile {
......@@ -148,34 +148,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
}
}
/* Memory management */
// Reserves enough contiguous FPGA pages to hold `size` bytes.
//
// On success the pages are marked used in `memory->bitmap`, the page count is
// recorded in `memory->nr` at the first page index, the start address
// (`mem_start` + page index * FPGA_PAGE_SIZE) is written to `*addr`, and 0 is
// returned. Returns -EINVAL for a zero-byte request and -ENOMEM when no
// sufficiently large free run exists; `*addr` is left untouched on failure.
int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
  const uint64_t pages = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
  if (pages == 0) {
    // A zero-page request would hand out an address without reserving it and
    // could index `nr` one past its end when the pool is full.
    DLOG << "memory request failed!";
    return -EINVAL;
  }
  if (pages > memory->page_num) {
    DLOG << "memory request failed!";
    return -ENOMEM;
  }
  const unsigned int nr = static_cast<unsigned int>(pages);
  int ret = 0;

  pthread_mutex_lock(&memory->mutex);

  // The search reports failure by returning an index beyond page_num; keep
  // the full 64-bit result so that signal cannot be lost to truncation.
  const uint64_t pos = fpga_bitmap::bitmap_find_next_zero_area(
      memory->bitmap, memory->page_num, 0, nr, 0);
  if (pos + nr <= memory->page_num) {
    fpga_bitmap::bitmap_set(memory->bitmap, static_cast<unsigned int>(pos),
                            nr);
    memory->nr[pos] = nr;
    *addr = memory->mem_start + pos * FPGA_PAGE_SIZE;
  } else {
    DLOG << "memory request failed!";
    ret = -ENOMEM;
  }

  pthread_mutex_unlock(&memory->mutex);

  return ret;
}
void memory_release(struct fpga_memory *memory) {
void *ptr = nullptr;
......@@ -187,97 +159,6 @@ void memory_release(struct fpga_memory *memory) {
}
}
// Fills in the bookkeeping of `memory` for a pool of `memory_size` bytes:
// the page count, a zeroed page bitmap and a zeroed per-page length table.
//
// Returns 0 on success and -EFAULT when an allocation fails. On failure
// `memory->bitmap` and `memory->nr` are both left as nullptr, never pointing
// at freed storage.
int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
  // Creating more than one memory pool is not allowed, so filling in the
  // structure needs no mutual exclusion.
  memory->bitmap = nullptr;
  memory->nr = nullptr;

  memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE);
  memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG);

  // calloc so that the bits beyond page_num in the last word start out zero.
  uint64_t *bitmap = static_cast<uint64_t *>(
      calloc(memory->page_num_long, sizeof(uint64_t)));
  if (!bitmap) {
    return -EFAULT;
  }

  unsigned int *nr = static_cast<unsigned int *>(
      calloc(memory->page_num, sizeof(unsigned int)));
  if (!nr) {
    free(bitmap);
    return -EFAULT;
  }

  // Publish the buffers only once both allocations have succeeded.
  memory->bitmap = bitmap;
  memory->nr = nr;

  memory->mem_start = FPGA_MEM_PHY_ADDR;
  // NOTE(review): mem_end is set to the pool size rather than
  // mem_start + size -- confirm this is what the users of mem_end expect.
  memory->mem_end = FPGA_MEM_SIZE;

  return 0;
}
int create_fpga_memory(struct fpga_memory **memory_info) {
int rc = 0;
*memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory));
if (*memory_info == NULL) {
rc = -EFAULT;
return rc;
}
pthread_mutex_init(&((*memory_info)->mutex), nullptr);
rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE);
if (rc) {
free(*memory_info);
}
return rc;
}
// Resets the page bitmap of `memory`: every page is marked free, then page 0
// is marked used. Returns -EFAULT for a null `memory`, 0 otherwise.
int init_fpga_memory(struct fpga_memory *memory) {
  if (memory == nullptr) {
    return -EFAULT;
  }

  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
  // NOTE reserve fpga page 0.
  fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);

  return 0;
}
void destroy_fpga_memory(struct fpga_memory *memory) {
if (memory) {
free(memory->nr);
free(memory->bitmap);
free(memory);
}
}
// Creates and initialises the global FPGA memory pool
// (`g_fpgainfo.memory_info`).
//
// Returns 0 on success or the error code of the failing step. After a failure
// `g_fpgainfo.memory_info` is nullptr rather than a pointer to freed storage.
int fpga_memory_add() {
  int rc = create_fpga_memory(&g_fpgainfo.memory_info);
  if (rc) {
    // Whatever create_fpga_memory left behind has already been released.
    g_fpgainfo.memory_info = nullptr;
    return rc;
  }

  rc = init_fpga_memory(g_fpgainfo.memory_info);
  if (rc) {
    destroy_fpga_memory(g_fpgainfo.memory_info);
    g_fpgainfo.memory_info = nullptr;
    return rc;
  }

  return 0;
}
uint64_t vaddr_to_paddr_driver(void *address) {
uint64_t paddr = 0;
auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address);
......@@ -314,17 +195,28 @@ void *fpga_reg_free(void *ptr) {
}
}
// Issues ioctl request `req` with argument `arg` on the FPGA memory device
// descriptor (`g_fpgainfo.fd_mem`) and returns ioctl's result unchanged.
static inline int do_ioctl(int64_t req, const void *arg) {
  const int status = ioctl(g_fpgainfo.fd_mem, req, arg);
  return status;
}
void *fpga_malloc_driver(size_t size) {
void *ret = nullptr;
uint64_t phy_addr = 0;
int i = 0;
struct MemoryVM2PHYArgs args;
struct MemoryCacheArgs args_c;
memory_request(g_fpgainfo.memory_info, size, &phy_addr);
// memory_request(g_fpgainfo.memory_info, size, &phy_addr);
ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
g_fpgainfo.fd_mem, phy_addr);
g_fpgainfo.fd_mem, FPGA_MEM_PHY_ADDR);
PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
args.pVM = reinterpret_cast<void *>(ret);
args.pPHY = reinterpret_cast<void *>(0);
do_ioctl(IOCTL_MEMORY_VM2PHY, &args);
phy_addr = (uint64_t)args.pPHY;
g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr));
g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size));
......@@ -342,14 +234,8 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo.fpga_addr2size_map.erase(iter);
munmap(ptr, size);
p_addr = vaddr_to_paddr_driver(ptr);
pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
/*clear bitmap*/
pthread_mutex_lock(&g_fpgainfo.memory_info->mutex);
fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
g_fpgainfo.memory_info->nr[pos]);
pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
// p_addr = vaddr_to_paddr_driver(ptr);
// pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr);
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
......@@ -360,10 +246,6 @@ void fpga_free_driver(void *ptr) {
}
}
// Issues ioctl request `req` with argument `arg` on the FPGA memory device
// descriptor (`g_fpgainfo.fd_mem`) and returns ioctl's result unchanged.
static inline int do_ioctl(int64_t req, const void *arg) {
  const int status = ioctl(g_fpgainfo.fd_mem, req, arg);
  return status;
}
int fpga_flush_driver(void *address, size_t size) {
struct MemoryCacheArgs args;
uint64_t p_addr;
......@@ -413,7 +295,7 @@ int open_device_driver() {
g_fpgainfo.FpgaRegVirAddr =
(uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE); // NOLINT
fpga_memory_add();
// fpga_memory_add();
pl_init();
......@@ -424,7 +306,6 @@ int close_device_driver() {
pl_destroy();
fpga_reg_free(g_fpgainfo.FpgaRegVirAddr);
memory_release(g_fpgainfo.memory_info);
destroy_fpga_memory(g_fpgainfo.memory_info);
return 0;
}
......
......@@ -31,8 +31,8 @@ namespace driver {
#define FPGA_REG_PHY_ADDR 0x80000000
#define FPGA_REG_SIZE 0x1000
#define FPGA_MEM_PHY_ADDR 0x40000000
#define FPGA_MEM_SIZE 0x80000000
#define FPGA_MEM_PHY_ADDR 0x20000000
#define FPGA_MEM_SIZE 0x20000000
#define FPGA_PAGE_SIZE (16UL * 1024UL)
......@@ -52,9 +52,15 @@ struct MemoryCacheArgs {
size_t size;
};
// Argument block for the IOCTL_MEMORY_VM2PHY request.
struct MemoryVM2PHYArgs {
// Virtual address being queried; fpga_malloc_driver passes the pointer it
// obtained from mmap64.
void *pVM;
// Physical address; fpga_malloc_driver zeroes it before the ioctl and reads
// it back afterwards (presumably filled in by the driver -- confirm).
void *pPHY;
};
#define IOCTL_FPGA_MAGIC 'F'
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
#define IOCTL_MEMORY_VM2PHY _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs)
struct fpga_pe {
char type_name[MAX_TYPE_NAME_LENTH + 1];
......
......@@ -25,6 +25,7 @@ limitations under the License. */
#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT (8)
#define BIAS_NUM_ALIGNMENT (16)
#define ROW_PARALLEL_NUM (3)
#endif
namespace paddle_mobile {
......
......@@ -65,6 +65,23 @@ void PSRoiPoolOp<DeviceType, T>::InferShape() const {
}
#endif
#ifdef ROIALIGN_POOL_OP
// Derives the output shape from the input feature map and the ROI list:
// dim 0 becomes the number of ROIs, dims 2 and 3 become the pooled height and
// width, and dim 1 is carried over from the input unchanged.
template <typename DeviceType, typename T>
void RoiAlignPoolOp<DeviceType, T>::InferShape() const {
  const auto &param = this->param_;
  const auto &roi_shape = param.input_rois_->dims();
  const int out_h = param.pooled_height_;
  const int out_w = param.pooled_width_;

  // Start from the input dims so that dim 1 is inherited as-is.
  auto shape = param.input_x_->dims();
  shape[0] = roi_shape[0];
  shape[2] = out_h;
  shape[3] = out_w;

  param.output_->Resize(shape);
}
#endif
#ifdef ROI_PERSPECTIVE_OP
template <typename DeviceType, typename T>
void RoiPerspectiveOp<DeviceType, T>::InferShape() const {
......@@ -110,4 +127,8 @@ REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp);
#ifdef PSROI_POOL_OP
REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp);
#endif
#ifdef ROIALIGN_POOL_OP
REGISTER_OPERATOR_FPGA(roialign_pool, ops::RoiAlignPoolOp);
#endif
#endif
......@@ -34,6 +34,10 @@ DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel);
DECLARE_OPERATOR(PSRoiPool, PSRoiPoolParam, PSRoiPoolKernel);
#endif
#ifdef ROIALIGN_POOL_OP
DECLARE_OPERATOR(RoiAlignPool, RoiAlignPoolParam, RoiAlignPoolKernel);
#endif
#ifdef ROI_PERSPECTIVE_OP
DECLARE_OPERATOR(RoiPerspective, RoiPerspectiveParam, RoiPerspectiveKernel);
#endif
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <memory>
#include <vector>
#include "framework/operator.h"
#include "operators/op_param.h"
......@@ -98,6 +99,8 @@ class ProposalParam : public OpParam {
framework::Tensor *anchors_;
framework::Tensor *variances_;
std::shared_ptr<Tensor> score_index_;
framework::LoDTensor *rpn_rois_;
framework::LoDTensor *rpn_probs_;
......@@ -151,6 +154,43 @@ class PSRoiPoolParam : public OpParam {
DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
#endif
#ifdef ROIALIGN_POOL_OP
template <typename Dtype>
class RoiAlignPoolParam : public OpParam {
public:
RoiAlignPoolParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
input_x_ = OpParam::GetVarValue<framework::LoDTensor>("X", inputs, *scope);
input_rois_ =
OpParam::GetVarValue<framework::LoDTensor>("ROIs", inputs, *scope);
output_ =
OpParam::GetVarValue<framework::LoDTensor>("Out", outputs, *scope);
pooled_height_ = OpParam::GetAttr<int>("pooled_height", attrs);
pooled_width_ = OpParam::GetAttr<int>("pooled_width", attrs);
spatial_scale_ = OpParam::GetAttr<float>("spatial_scale", attrs);
sampling_ratio_ = OpParam::GetAttr<float>("sampling_ratio", attrs);
}
public:
framework::Tensor *input_x_;
framework::LoDTensor *input_rois_;
framework::Tensor *output_;
int pooled_height_;
int pooled_width_;
float spatial_scale_;
int sampling_ratio_;
#ifdef PADDLE_MOBILE_FPGA
std::shared_ptr<Tensor> float_input, float_output;
fpga::BypassArgs input_arg, output_arg;
#endif
};
DECLARE_KERNEL(RoiAlignPool, RoiAlignPoolParam);
#endif
#ifdef ROI_PERSPECTIVE_OP
template <typename Dtype>
class RoiPerspectiveParam : public OpParam {
......
......@@ -56,13 +56,16 @@ void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
}
template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
auto input = param.InputX();
auto input = const_cast<Tensor *>(param.InputX());
if (input->type() == typeid(float)) {
auto output = param.Out();
output->ShareDataWith(*input);
return;
}
fpga::PerformBypass(param.fpga_bypass_args);
fpga::BypassArgs args = param.fpga_bypass_args;
auto input_address = (input->data<half>());
args.image.address = static_cast<void *>(input_address);
fpga::PerformBypass(args);
auto outC = param.Out()->dims()[1];
auto outH = param.Out()->dims()[2];
auto outW = param.Out()->dims()[3];
......@@ -70,10 +73,14 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
param.Out()->fpga_data_num * sizeof(float));
float *data_tmp =
reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
dealign(outdata_ptr, data_tmp, outC, outH, outW);
memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float));
if (param.Out()->fpga_data_num != product(input->dims())) {
float *data_tmp =
reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
dealign(outdata_ptr, data_tmp, outC, outH, outW);
memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float));
free(data_tmp);
}
}
template class FetchKernel<FPGA, float>;
......
......@@ -73,9 +73,12 @@ void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
if (input->type() == typeid(float)) {
auto *output = param.Output();
auto in = input->data<float>();
auto N = input->dims()[0];
output->Resize(
{N, output->dims()[1], output->dims()[2], output->dims()[3]});
auto len = output->numel();
auto out = output->mutable_data<float>();
int N = input->dims()[0], C = input->dims()[1], H = input->dims()[2],
int C = input->dims()[1], H = input->dims()[2], // N = input->dims()[0],
W = input->dims()[3];
int HW = H * W, CHW = C * H * W, WC = W * C;
......
......@@ -65,6 +65,13 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
args.output.scale_address = param->float_score->scale;
param->score_arg = args;
param->score_index_ = std::make_shared<Tensor>();
param->score_index_->mutable_data<int32_t>({input->numel()});
auto score_index = param->score_index_->data<int32_t>();
for (int i = 0; i < input->numel(); ++i) {
score_index[i] = i;
}
return true;
}
template <typename T>
......@@ -334,17 +341,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances,
const Tensor &bbox_deltas_slice, // [M, 4]
const Tensor &scores_slice, // [N, 1]
int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
float eta) {
const Tensor &score_index, int pre_nms_top_n, int post_nms_top_n,
float nms_thresh, float min_size, float eta) {
auto *scores_data = scores_slice.data<T>();
// Sort index
Tensor index_t;
index_t.Resize({scores_slice.numel()});
int *index = index_t.mutable_data<int>();
for (int i = 0; i < scores_slice.numel(); ++i) {
/*for (int i = 0; i < scores_slice.numel(); ++i) {
index[i] = i;
}
}*/
std::memcpy(index, score_index.data<int32_t>(),
scores_slice.numel() * sizeof(int));
auto compare = [scores_data](const int64_t &i, const int64_t &j) {
return scores_data[i] > scores_data[j];
};
......@@ -490,8 +500,10 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
auto *rpn_rois = param.rpn_rois_;
auto *rpn_roi_probs = param.rpn_probs_;
auto score_index = *(param.score_index_.get());
int pre_nms_top_n = param.pre_nms_topn_;
int post_nms_top_n = param.post_nms_topn_;
int post_nms_top_n = 100; // param.post_nms_topn_;
float nms_thresh = param.nms_thresh_;
float min_size = param.min_size_;
float eta = param.eta_;
......@@ -529,7 +541,7 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
score_index, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
Tensor &proposals = tensor_pair.first;
Tensor &scores = tensor_pair.second;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef PSROI_POOL_OP
#include <cmath>
#include <memory>
#include <vector>
#include "operators/kernel/detection_kernel.h"
......@@ -72,42 +73,20 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
}
template <typename Dtype>
void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width,
const Dtype* bottom_rois, const int output_dim,
const int group_size, Dtype* top_data,
// int* mapping_channel,
int index, int* rois_batch_id) {
// The output is in order (n, ctop, ph, pw)
// static int cnt = 0;
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int ctop = (index / pooled_width / pooled_height) % output_dim;
int n = index / pooled_width / pooled_height / output_dim;
// [start, end) interval for spatial sampling
bottom_rois += n * 4;
int roi_batch_ind = rois_batch_id[n]; // bottom_rois[0];
Dtype roi_start_w = static_cast<Dtype>(round(bottom_rois[0])) * spatial_scale;
Dtype roi_start_h = static_cast<Dtype>(round(bottom_rois[1])) * spatial_scale;
Dtype roi_end_w =
static_cast<Dtype>(round(bottom_rois[2]) + 1.) * spatial_scale;
Dtype roi_end_h =
static_cast<Dtype>(round(bottom_rois[3]) + 1.) * spatial_scale;
// Force too small ROIs to be 1x1
Dtype roi_width = std::max(roi_end_w - roi_start_w, 0.1f); // avoid 0
Dtype roi_height = std::max(roi_end_h - roi_start_h, 0.1f);
// Compute w and h at bottom
Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);
int hstart = floor(static_cast<Dtype>(ph) * bin_size_h + roi_start_h);
int wstart = floor(static_cast<Dtype>(pw) * bin_size_w + roi_start_w);
int hend = ceil(static_cast<Dtype>(ph + 1) * bin_size_h + roi_start_h);
int wend = ceil(static_cast<Dtype>(pw + 1) * bin_size_w + roi_start_w);
void PSROIPooling(const Dtype* bottom_data, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const Dtype* bottom_rois,
const int output_dim, const int group_size, Dtype* top_data,
int index, int nid, const Dtype Bin_size_h,
const Dtype Bin_size_w, const Dtype roi_start_h,
const Dtype roi_start_w, const int ctop, const int ph,
const int roi_batch_ind) {
int pw = index;
int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
int wstart = floor(static_cast<Dtype>(pw) * Bin_size_w + roi_start_w);
int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
// Add roi offsets and clip to input boundaries
hstart = std::min(std::max(hstart, 0), height);
hend = std::min(std::max(hend, 0), height);
......@@ -115,10 +94,9 @@ void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
wend = std::min(std::max(wend, 0), width);
bool is_empty = (hend <= hstart) || (wend <= wstart);
int gw = pw;
int gh = ph;
int c = (ctop * group_size + gh) * group_size + gw;
int c = (ctop * group_size + ph) * group_size + pw;
Dtype bin_area = (hend - hstart) * (wend - wstart);
bottom_data += (roi_batch_ind * channels + c) * height * width;
Dtype out_sum = 0;
for (int h = hstart; h < hend; ++h) {
......@@ -128,9 +106,50 @@ void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
}
}
Dtype bin_area = (hend - hstart) * (wend - wstart);
top_data[index] = is_empty ? 0. : out_sum / bin_area;
top_data[nid + index] = is_empty ? 0. : out_sum / bin_area;
}
// Re-lays `num` images from HWC order to CHW order.
//
// Reads num * height * width * channel floats from `*data_in`, writes them
// into a newly fpga_malloc'ed buffer in CHW order, frees the original buffer
// with fpga_free and stores the new buffer in `*data_in`.
void convert_to_chw(float** data_in, int channel, int height, int width,
                    int num) {
  float* const hwc_buffer = *data_in;
  const int64_t plane = static_cast<int64_t>(width) * height;  // one channel
  const int64_t image = plane * channel;                       // one image

  // The destination must hold all `num` images; sizing it for a single image
  // overflowed the buffer whenever num > 1.
  float* chw_buffer = reinterpret_cast<float*>(
      fpga::fpga_malloc(static_cast<size_t>(num * image) * sizeof(float)));

  const float* src = hwc_buffer;
  for (int n = 0; n < num; n++) {
    float* dst_image = chw_buffer + n * image;
    for (int h = 0; h < height; h++) {
      for (int w = 0; w < width; w++) {
        for (int c = 0; c < channel; c++) {
          // The source is consumed sequentially (channel fastest).
          dst_image[c * plane + static_cast<int64_t>(h) * width + w] = *src++;
        }
      }
    }
  }

  *data_in = chw_buffer;
  fpga::fpga_free(hwc_buffer);
}
// Re-lays `num` images from CHW order to HWC order.
//
// Reads the floats of `*data_in` sequentially (width fastest, then height,
// then channel), writes them into a newly fpga_malloc'ed buffer in HWC order,
// frees the original buffer with fpga_free and stores the new buffer in
// `*data_in`.
void convert_to_hwc(float** data_in, int channel, int height, int width,
                    int num) {
  float* const chw_buffer = *data_in;
  float* hwc_buffer = reinterpret_cast<float*>(
      fpga::fpga_malloc(num * channel * height * width * sizeof(float)));
  const int64_t row_stride = width * channel;  // floats per output row

  const float* src = chw_buffer;
  for (int n = 0; n < num; n++) {
    for (int c = 0; c < channel; c++) {
      for (int h = 0; h < height; h++) {
        const int64_t row_offset = h * row_stride;
        for (int w = 0; w < width; w++) {
          hwc_buffer[n * channel * height * width + row_offset + w * channel +
                     c] = *src++;
        }
      }
    }
  }

  *data_in = hwc_buffer;
  fpga::fpga_free(chw_buffer);
}
template <>
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto input_tensor = param.float_input.get();
......@@ -155,13 +174,14 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
int rois_num = rois->dims()[0];
auto data_nhwc = in->mutable_data<float>();
fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width, 1);
framework::DDim dims_out_new = framework::make_ddim(
{rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
(param.output_)->dims()[3]});
(param.output_)->Resize(dims_out_new);
const float* input_data = data_nhwc; // in->data<float>();
float* input_data = data_nhwc; // in->data<float>();
// shared_ptr<float> input_data(data_nhwc);
framework::Tensor rois_batch_id_list;
rois_batch_id_list.Resize({rois_num});
auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
......@@ -183,24 +203,53 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
"output_channels x pooled_height x pooled_width");
// calculate batch id index for each roi according to LoD
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
rois_batch_id_data[i] = n;
}
}
// for (int n = 0; n < rois_batch_size; ++n) {
// for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
// rois_batch_id_data[i] = n;
// }
//}
auto output_data = out->mutable_data<float>();
auto input_rois = rois->data<float>();
// calculate psroipooling, parallel processing can be implemented per ROI
int index = pooled_height * pooled_width * output_channels * rois_num;
for (int idx = 0; idx < index; idx++) {
PSROIPooling<float>(input_data, spatial_scale, input_channels, height,
width, pooled_height, pooled_width, input_rois,
output_channels, pooled_height, output_data, idx,
rois_batch_id_data);
for (int n = 0; n < rois_num; ++n) {
// [start, end) interval for spatial sampling
auto offset_input_rois = input_rois + n * 4;
auto roi_start_w =
static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
auto roi_start_h =
static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
auto roi_end_w =
static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
auto roi_end_h =
static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
// Force too small rois to be 1 x 1
auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0
auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
// Compute bin size w and h at input feature map
auto bin_size_h = roi_height / static_cast<float>(pooled_height);
auto bin_size_w = roi_width / static_cast<float>(pooled_width);
int roi_batch_ind = 0; // rois_batch_id_data[n];
// std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl;
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < pooled_height; ph++) {
int index = pooled_width;
int nid = n * output_channels * pooled_height * pooled_width +
c * pooled_width * pooled_height + ph * pooled_width;
for (int idx = 0; idx < index; idx++) {
PSROIPooling<float>(input_data, input_channels, height, width,
pooled_height, pooled_width, input_rois,
output_channels, pooled_height, output_data, idx,
nid, bin_size_h, bin_size_w, roi_start_h,
roi_start_w, c, ph, roi_batch_ind);
}
}
}
}
//
fpga::fpga_free(input_data);
fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height,
pooled_width, rois_num);
out->reset_data_ptr(output_data);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ROIALIGN_POOL_OP
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
#include "fpga/V1/api.h"
#include "fpga/V1/image.h"
namespace paddle_mobile {
namespace operators {
// Prepares the FPGA ROI-align kernel: allocates the float staging tensor,
// records the FP16 -> FP32 bypass descriptor for the input feature map in
// `param->input_arg`, and sizes/allocates the output for the current number
// of ROIs. Always returns true.
template <>
bool RoiAlignPoolKernel<FPGA, float>::Init(RoiAlignPoolParam<FPGA>* param) {
  auto dims = param->input_x_->dims();
  PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
                        "data not aligned");

  // Float tensor that receives the converted copy of the input.
  param->float_input = std::make_shared<Tensor>();
  param->float_input->mutable_data<float>(dims);

  auto source = param->input_x_;
  fpga::BypassArgs bypass = {fpga::DATA_TYPE_FP16};
  bypass.input_data_type = fpga::DATA_TYPE_FP16;
  bypass.output_data_type = fpga::DATA_TYPE_FP32;
  bypass.input_layout_type = fpga::LAYOUT_HWC;
  bypass.output_layout_type = fpga::LAYOUT_HWC;
  bypass.image.address = source->data<half>();
  bypass.image.channels = static_cast<uint32_t>(source->dims()[1]);
  bypass.image.height = static_cast<uint32_t>(source->dims()[2]);
  bypass.image.width = static_cast<uint32_t>(source->dims()[3]);
  bypass.output.address = param->float_input->mutable_data<float>();
  bypass.output.scale_address = param->float_input->scale;
  param->input_arg = bypass;

  // Dim 0 of the output follows the ROI count; the other dims are kept.
  auto* result = param->output_;
  int rois_num = param->input_rois_->dims()[0];
  framework::DDim dims_out_new = framework::make_ddim(
      {rois_num, result->dims()[1], result->dims()[2], result->dims()[3]});
  result->Resize(dims_out_new);
  result->mutable_data<float>(dims_out_new);

  return true;
}
// One precomputed bilinear-interpolation sample, as filled in by
// pre_calc_for_bilinear_interpolate: the four neighbouring positions and the
// weight applied to each. All eight fields are zero for a sample that falls
// outside the feature map.
template <typename T>
struct PreCalc {
// Flattened offsets (row * width + column) of the four neighbours:
// pos1 = (y_low, x_low), pos2 = (y_low, x_high),
// pos3 = (y_high, x_low), pos4 = (y_high, x_high).
int pos1;
int pos2;
int pos3;
int pos4;
// Interpolation weights paired with pos1..pos4 respectively.
T w1;
T w2;
T w3;
T w4;
};
template <typename T>
void pre_calc_for_bilinear_interpolate(
const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper, const int ix_upper,
T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w,
std::vector<PreCalc<T>>& pre_calc) { // NOLINT
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T x = xx;
T y = yy;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc<T> pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = static_cast<int>(y);
int x_low = static_cast<int>(x);
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indeces
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
template <typename T>
void ROIAlignForward(const int nthreads, const T* bottom_data,
const T& spatial_scale, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const int sampling_ratio,
const T* bottom_rois, T* top_data) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
// roi could have 4 or 5 columns
const T* offset_bottom_rois = bottom_rois + n * 4;
int roi_batch_ind = 0;
// if (roi_cols == 5) {
// roi_batch_ind = offset_bottom_rois[0];
// offset_bottom_rois++;
// }
// Do not using rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[0] * spatial_scale;
T roi_start_h = offset_bottom_rois[1] * spatial_scale;
T roi_end_w = offset_bottom_rois[2] * spatial_scale;
T roi_end_h = offset_bottom_rois[3] * spatial_scale;
// T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
// T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
// T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
// T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
// Force malformed ROIs to be 1x1
T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
// we want to precalculate indeces and weights shared by all chanels,
// this is the key point of optimiation
std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_bottom_data[pc.pos1] +
pc.w2 * offset_bottom_data[pc.pos2] +
pc.w3 * offset_bottom_data[pc.pos3] +
pc.w4 * offset_bottom_data[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
top_data[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
// Runs RoIAlign pooling on the CPU for the FPGA target.
//
// Steps:
//   1. Run the FPGA bypass described by param.input_arg and invalidate the
//      cache over the float input buffer so the CPU reads the fresh data.
//   2. Convert the input via fpga::image::convert_to_chw, which may re-point
//      data_nhwc (it receives the pointer by address).
//   3. Resize the output so its first dimension equals the number of ROIs,
//      then compute every output element with a single ROIAlignForward call.
//   4. Convert the result via fpga::image::convert_to_hwc and hand the
//      (possibly re-pointed) buffer back to the output tensor.
//
// NOTE(review): ownership of the buffers produced by convert_to_chw /
// convert_to_hwc is not visible from here - confirm nothing leaks.
template <>
void RoiAlignPoolKernel<FPGA, float>::Compute(
    const RoiAlignPoolParam<FPGA>& param) {
  auto input_tensor = param.float_input.get();
  fpga::PerformBypass(param.input_arg);
  fpga::fpga_invalidate(input_tensor->data<float>(),
                        input_tensor->numel() * sizeof(float));

  auto* in = input_tensor;
  auto* rois = param.input_rois_;
  auto* out = param.output_;  // param.float_output.get();
  auto pooled_height = param.pooled_height_;
  auto pooled_width = param.pooled_width_;
  auto spatial_scale = param.spatial_scale_;
  auto sampling_ratio = param.sampling_ratio_;

  auto in_dims = in->dims();
  int input_channels = in_dims[1];
  int height = in_dims[2];
  int width = in_dims[3];
  int rois_num = rois->dims()[0];

  auto data_nhwc = in->mutable_data<float>();
  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);

  // The ROI count is only known at run time, so fix up the leading dimension.
  framework::DDim dims_out_new = framework::make_ddim(
      {rois_num, out->dims()[1], out->dims()[2], out->dims()[3]});
  out->Resize(dims_out_new);

  const int total_outputs =
      input_channels * pooled_height * pooled_width * rois_num;
  auto rois_data = rois->data<float>();
  auto top_data = out->mutable_data<float>();

  // ROIAlignForward already iterates over every ROI, channel and bin and
  // overwrites (does not accumulate into) top_data, so one call produces the
  // complete result. The guard keeps the "nothing to do" case call-free.
  if (total_outputs > 0) {
    ROIAlignForward<float>(total_outputs, data_nhwc, spatial_scale,
                           input_channels, height, width, pooled_height,
                           pooled_width, sampling_ratio, rois_data, top_data);
  }

  fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height,
                              pooled_width, rois_num);
  out->reset_data_ptr(top_data);
}
} // namespace operators
} // namespace paddle_mobile
#endif // ROIALIGN_POOL_OP
......@@ -105,6 +105,8 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
} else {
if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
Tensor *out = param.Out();
out->Resize(
{in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
math::SoftmaxFuntor<CPU, float>()(in_x, out);
}
}
......
......@@ -42,6 +42,11 @@ template <>
// FPGA Transpose2 compute step: no data is moved here. The output keeps its
// dimensions 1..3 and only its leading dimension is updated to match the
// input's leading dimension.
void Transpose2Kernel<FPGA, float>::Compute(
    const Transpose2Param<FPGA> &param) {
  // Transpose2Compute<float>(param);
  auto src = param.InputX();
  auto dst = param.Out();
  const auto leading_dim = src->dims()[0];
  dst->Resize(
      {leading_dim, dst->dims()[1], dst->dims()[2], dst->dims()[3]});
}
} // namespace operators
......
......@@ -128,6 +128,7 @@ if (CON GREATER -1)
set(FUSION_CONVADDBN_OP ON)
set(RESHAPE2_OP ON)
set(PSROI_POOL_OP ON)
set(ROIALIGN_POOL_OP ON)
set(PROPOSAL_OP ON)
set(ANCHOR_GENERATOR_OP ON)
set(SLICE_OP ON)
......@@ -603,6 +604,9 @@ endif()
# For each enabled operator switch, pass a matching -D<NAME> preprocessor
# definition to the compiler (the RoiAlign kernel source, for example, is
# closed by an "#endif  // ROIALIGN_POOL_OP" guard).
if (PSROI_POOL_OP)
add_definitions(-DPSROI_POOL_OP)
endif()
# RoiAlign pooling operator.
if (ROIALIGN_POOL_OP)
add_definitions(-DROIALIGN_POOL_OP)
endif()
if (ROI_PERSPECTIVE_OP)
add_definitions(-DROI_PERSPECTIVE_OP)
endif()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册