Commit 2a4209e2 authored by qnqinan, committed by jameswu2014

update FPGA v2 kernel file including proposal, sigmoid and anchor generator, test=develop Closes #2369 (#2370)

* update proposal and psroipool kernel file in FPGA V2 track

* update, test=develop
Parent 3a647c37
@@ -359,7 +359,7 @@ void expand_conv_arg(ConvArgs *arg) {
   if (((res_win % 2) != 0) && (res_win != 1)) {
     res_win = res_win - 1;
   }
-  PADDLE_MOBILE_ENFORCE(res_win >= 2, "window too bigger than fpga volume");
+  // PADDLE_MOBILE_ENFORCE(res_win >= 2, "window too bigger than fpga volume");
   res_fit = res_win;
   auto block_num = (output_width + res_fit - 1) / res_fit;
@@ -885,7 +885,7 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
                      int padding_h, int padding_w, float *bias_ptr) {
   auto filter_ptr = filter->data<int16_t>();
   auto input_ptr = input->data<int8_t>();
-  auto output_ptr = out->mutable_data<int8_t>();
+  auto output_ptr = out->data<int8_t>();
   arg->sub_conv_num = 1;
   arg->relu_enabled = relu_enabled;
   // arg->output.activation.activation_type = activation_enable;
......
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "fpga/V2/bias_scale.h"
 #include <memory.h>
+#include <math.h>
 #include "fpga/common/fpga_common.h"

 namespace paddle_mobile {
@@ -55,10 +56,22 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
   *data_in = ptr_aligned;
 }

+void fixed_scale_bias_new(void *data_in, int data_len) {
+  int *data_tmp = static_cast<int *>(data_in);
+  for (int idx = 0; idx < data_len / 2; ++idx) {
+    float tmp = (static_cast<float *>(data_in))[idx];
+    data_tmp[idx] = static_cast<int>(round(tmp * pow(2.0, 23.0)));
+    tmp = (static_cast<float *>(data_in))[idx + data_len / 2];
+    data_tmp[idx + data_len / 2] = static_cast<int>(round(tmp * pow(2.0, 30.0)));
+  }
+  return;
+}
+
 void interleave(float **data_in, int num_after_alignment) {
+  // num_after_alignment: number of bias after alignment
   float *ptr_uninterleaved = *data_in;
+  // fixed_scale_bias_new(ptr_uninterleaved, 2 * num_after_alignment);
   float *ptr_interleaved =
       (float *)fpga_malloc(2 * num_after_alignment * sizeof(float));  // NOLINT
   int num = num_after_alignment / 4;
......
@@ -79,7 +79,8 @@ using namespace std;  // NOLINT
 #define REG_CONVERT_CMD 0x400
 #define REG_CONVERT_SRC_ADDR 0x408
 #define REG_CONVERT_DST_ADDR 0x410
-#define REG_CONVERT_LENGTH 0x418
+#define REG_CONVERT_RD_LENGTH 0x418
+#define REG_CONVERT_WR_LENGTH 0x420

 /*resize*/
 #define REG_RESIZE_CMD 0x600
@@ -693,7 +694,8 @@ int PerformBypass(const struct BypassArgs &args) {
   reg_writeq(output_scale, REG_SCALE_PARAMETER);
   reg_writeq(input_address_phy, REG_CONVERT_SRC_ADDR);
   reg_writeq(output_address_phy, REG_CONVERT_DST_ADDR);
-  reg_writeq(datalen, REG_CONVERT_LENGTH);
+  reg_writeq(datalen, REG_CONVERT_RD_LENGTH);
+  reg_writeq(datalen, REG_CONVERT_WR_LENGTH);
   reg_writeq(cmd, REG_CONVERT_CMD);
   DLOG << "before reg poll";
   if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) {
......
@@ -134,6 +134,7 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
   uint64_t i = 0;
   /* timeout accuracy to be confirmed */
   int64_t timeout = time * 6;
+  usleep(1);
   for (i = 0; i < timeout; i++) {
     if (val == reg_readq(reg)) {
......
@@ -45,9 +45,9 @@ bool AnchorGeneratorKernel<FPGA, float>::Init(
   if (offset > 0.6) {
     memcpy(anchors_offset, anchors_offset2, sizeof(anchors_offset));
-    std::cout << "anchor generator marker" << std::endl;
+    DLOG << "anchor generator marker";
   } else {
-    std::cout << "anchor generator rfcn" << std::endl;
+    DLOG << "anchor generator rfcn";
   }
   int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4);
......
@@ -30,16 +30,12 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
   int64_t batch = param->scores_->dims()[0];
   auto total = post_nms_top_n * batch;
   param->rpn_rois_->mutable_data<float>({total, 4});
-  param->rpn_probs_->mutable_data<float>({total, 1});
+  param->rpn_probs_->mutable_data<int8_t>({total, 1});

   param->float_bbox = std::make_shared<Tensor>();
   param->float_bbox->Resize(param->bbox_deltas_->dims());
   param->float_bbox->init(type_id<float>().hash_code());
   fpga::format_fp32_ofm(param->float_bbox.get());
-  param->float_score = std::make_shared<Tensor>();
-  param->float_score->Resize(param->scores_->dims());
-  param->float_score->init(type_id<float>().hash_code());
-  fpga::format_fp32_ofm(param->float_score.get());

   auto input = param->scores_;
   param->score_index_ = std::make_shared<Tensor>();
@@ -88,7 +84,7 @@ void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
 template <class T>
 static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
-                            Tensor *variances, Tensor *proposals) {
+                            Tensor *proposals) {
   T *proposals_data = proposals->mutable_data<T>();
   int64_t row = all_anchors->dims()[0];
@@ -96,10 +92,6 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
   auto *bbox_deltas_data = bbox_deltas->data<T>();
   auto *anchor_data = all_anchors->data<T>();
-  const T *variances_data = nullptr;
-  if (variances) {
-    variances_data = variances->data<T>();
-  }

   for (int64_t i = 0; i < row; ++i) {
     T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
@@ -244,10 +236,10 @@ static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
   // 4: [xmin ymin xmax ymax]
   int64_t box_size = bbox->dims()[1];

-  std::vector<T> scores_data(num_boxes);
-  std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
-  std::vector<std::pair<T, int>> sorted_indices =
-      GetSortedScoreIndex<T>(scores_data);
+  std::vector<int8_t> scores_data(num_boxes);
+  std::copy_n(scores->data<int8_t>(), num_boxes, scores_data.begin());
+  std::vector<std::pair<int8_t, int>> sorted_indices =
+      GetSortedScoreIndex<int8_t>(scores_data);

   std::vector<int> selected_indices;
   int selected_num = 0;
@@ -284,8 +276,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
     const Tensor &scores_slice,  // [N, 1]
     const Tensor &score_index, int pre_nms_top_n, int post_nms_top_n,
     float nms_thresh, float min_size, float eta) {
-  auto *scores_data = scores_slice.data<T>();
+  auto *scores_data = scores_slice.data<int8_t>();

   // Sort index
   Tensor index_t;
   index_t.Resize({scores_slice.numel()});
@@ -306,17 +297,17 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
   }

   Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
-  scores_sel.mutable_data<T>({index_t.numel(), 1});
+  scores_sel.mutable_data<int8_t>({index_t.numel(), 1});
   bbox_sel.mutable_data<T>({index_t.numel(), 4});
   anchor_sel.mutable_data<T>({index_t.numel(), 4});
   var_sel.mutable_data<T>({index_t.numel(), 4});
-  CPUGather<T>(scores_slice, index_t, &scores_sel);
+  CPUGather<int8_t>(scores_slice, index_t, &scores_sel);
   CPUGather<T>(bbox_deltas_slice, index_t, &bbox_sel);
   CPUGather<T>(anchors, index_t, &anchor_sel);

   Tensor proposals;
   proposals.mutable_data<T>({index_t.numel(), 4});
-  BoxCoder<T>(&anchor_sel, &bbox_sel, nullptr, &proposals);
+  BoxCoder<T>(&anchor_sel, &bbox_sel, &proposals);

   ClipTiledBoxes<T>(im_info_slice, &proposals);
@@ -325,10 +316,10 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
   Tensor scores_filter;
   bbox_sel.mutable_data<T>({keep.numel(), 4});
-  scores_filter.mutable_data<T>({keep.numel(), 1});
+  scores_filter.mutable_data<int8_t>({keep.numel(), 1});
   CPUGather<T>(proposals, keep, &bbox_sel);
-  CPUGather<T>(scores_sel, keep, &scores_filter);
+  CPUGather<int8_t>(scores_sel, keep, &scores_filter);

   if (nms_thresh <= 0) {
     return std::make_pair(bbox_sel, scores_filter);
   }
@@ -341,10 +332,10 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
   }

   proposals.mutable_data<T>({keep_nms.numel(), 4});   // original
-  scores_sel.mutable_data<T>({keep_nms.numel(), 1});  // original
+  scores_sel.mutable_data<int8_t>({keep_nms.numel(), 1});  // original
   CPUGather<T>(bbox_sel, keep_nms, &proposals);
-  CPUGather<T>(scores_filter, keep_nms, &scores_sel);
+  CPUGather<int8_t>(scores_filter, keep_nms, &scores_sel);

   return std::make_pair(proposals, scores_sel);
 }
@@ -368,69 +359,43 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
   bbox_height = (uint32_t)(input_bbox->dims()[2]);
   bbox_width = (uint32_t)(input_bbox->dims()[3]);

-  std::shared_ptr<Tensor> score_tmp = std::make_shared<Tensor>();
-  score_tmp->Resize(param.scores_->dims());
-  score_tmp->mutable_data<int8_t>();
-
-  std::shared_ptr<Tensor> bbox_tmp = std::make_shared<Tensor>();
-  bbox_tmp->Resize(param.bbox_deltas_->dims());
-  bbox_tmp->mutable_data<int8_t>();
-
-  auto score_tmp_data = score_tmp->data<int8_t>();
-  auto bbox_tmp_data = bbox_tmp->data<int8_t>();
-  int64_t amount_per_side = score_width * score_height;
-  int idx = 0;
   int alignedCW =
       fpga::align_to_x(score_width * score_channels, IMAGE_ALIGNMENT);
   int unalignedCW = score_width * score_channels;
   fpga::fpga_invalidate(input_score_data,
                         score_height * alignedCW * sizeof(int8_t));
+  Tensor score_tensor = *input_score;
   for (int h = 0; h < score_height; h++) {
     for (int w = 0; w < score_width; w++) {
-      for (int c = 0; c < score_channels; c++) {
-        if (alignedCW == unalignedCW) {
-          *(score_tmp_data + c * amount_per_side + score_width * h + w) =
-              (*(input_score_data++));
-        } else {
-          idx = h * alignedCW + w * score_channels + c;
-          *(score_tmp_data + c * amount_per_side + score_width * h + w) =
-              input_score_data[idx];
-        }
+      for (int c = 0; c < score_channels; ++c) {
+        int dstidx = h * unalignedCW + w * score_channels + c;
+        int srcidx = h * alignedCW + w * score_channels + c;
+        score_tensor.data<int8_t>()[dstidx] = input_score_data[srcidx];
       }
     }
   }

-  amount_per_side = bbox_width * bbox_height;
   alignedCW = fpga::align_to_x(bbox_width * bbox_channels, IMAGE_ALIGNMENT);
   unalignedCW = bbox_width * bbox_channels;
   fpga::fpga_invalidate(input_bbox_data,
                         bbox_height * alignedCW * sizeof(int8_t));
+  auto bbox_tensor = param.float_bbox.get();
   for (int h = 0; h < bbox_height; h++) {
     for (int w = 0; w < bbox_width; w++) {
-      for (int c = 0; c < bbox_channels; c++) {
-        if (alignedCW == unalignedCW) {
-          *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
-              (*(input_bbox_data++));
-        } else {
-          idx = h * alignedCW + w * bbox_channels + c;
-          *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
-              input_bbox_data[idx];
-        }
+      for (int c = 0; c < bbox_channels; ++c) {
+        int dstidx = h * unalignedCW + w * bbox_channels + c;
+        int srcidx = h * alignedCW + w * bbox_channels + c;
+        bbox_tensor->data<float>()[dstidx] =
+            (static_cast<int>(input_bbox_data[srcidx])) / 127.0 *
+            input_bbox->scale[0];
       }
     }
   }

-  auto score_tensor = param.float_score.get();
-  for (int i = 0; i < score_height * score_width * score_channels; i++) {
-    score_tensor->data<float>()[i] =
-        score_tmp_data[i] / 127.0 * input_score->scale[0];
-  }
-  auto bbox_tensor = param.float_bbox.get();
-  for (int i = 0; i < bbox_height * bbox_width * bbox_channels; i++) {
-    bbox_tensor->data<float>()[i] =
-        bbox_tmp_data[i] / 127.0 * input_bbox->scale[0];
-  }
-
-  auto *scores = param.float_score.get();
-  auto *bbox_deltas = param.float_bbox.get();
   auto *im_info = param.im_info_;
   auto anchors = *param.anchors_;
   auto variances = *param.variances_;
@@ -447,37 +412,23 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
   float min_size = param.min_size_;
   float eta = param.eta_;

-  auto &scores_dim = scores->dims();
-  int64_t num = scores_dim[0];
-  int64_t c_score = scores_dim[1];
-  int64_t h_score = scores_dim[2];
-  int64_t w_score = scores_dim[3];
-
-  auto &bbox_dim = bbox_deltas->dims();
-  int64_t c_bbox = bbox_dim[1];
-  int64_t h_bbox = bbox_dim[2];
-  int64_t w_bbox = bbox_dim[3];
   //
-  rpn_rois->mutable_data<float>({bbox_deltas->numel(), 4});
-  rpn_roi_probs->mutable_data<float>({scores->numel(), 1});
+  rpn_rois->mutable_data<float>({bbox_tensor->numel() / 4, 4});
+  rpn_roi_probs->mutable_data<int8_t>({input_score->numel() / 4, 1});

   framework::LoD lod;
   lod.resize(1);
   auto &lod0 = lod[0];
   lod0.push_back(0);
-  anchors.Resize({anchors.numel(), 4});
-  variances.Resize({variances.numel(), 4});
+  anchors.Resize({anchors.numel() / 4, 4});
+  variances.Resize({variances.numel() / 4, 4});
   int64_t num_proposals = 0;
-  for (int64_t i = 0; i < num; ++i) {
+  for (int64_t i = 0; i < score_n; ++i) {
     Tensor im_info_slice = im_info->Slice(i, i + 1);
     Tensor bbox_deltas_slice = (*bbox_tensor).Slice(i, i + 1);
-    Tensor scores_slice = (*score_tensor).Slice(i, i + 1);
-    bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox, 4});
-    scores_slice.Resize({h_score * w_score * c_score, 1});
+    Tensor scores_slice = score_tensor.Slice(i, i + 1);
+    bbox_deltas_slice.Resize({bbox_height * bbox_width * bbox_channels / 4, 4});
+    scores_slice.Resize({score_height * score_width * score_channels, 1});
     std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
         im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
         score_index, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
......
@@ -44,14 +44,14 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
 }

 template <typename Dtype>
-void PSROIPoolingForward(const Dtype* bottom_data, const int height,
+void PSROIPoolingForward(const int8_t* bottom_data, const int height,
                          const int width, const int input_channel,
                          Dtype* top_data, const int pooled_height,
                          const int pooled_width, const int output_channel,
                          const Dtype* bottom_rois, const Dtype Bin_size_h,
                          const Dtype Bin_size_w, const Dtype roi_start_h,
                          const Dtype roi_start_w, const int pw, const int ph,
-                         const int roi_batch_ind) {
+                         float scale, const int roi_batch_ind) {
   int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
   int wstart = floor(static_cast<Dtype>(pw) * Bin_size_w + roi_start_w);
   int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
@@ -64,11 +64,12 @@ void PSROIPoolingForward(const Dtype* bottom_data, const int height,
   wend = std::min(std::max(wend, 0), width);
   bool is_empty = (hend <= hstart) || (wend <= wstart);

-  float sum_pixels_c[output_channel] = {0};
-  float pixels_c[output_channel] = {0};
+  float avg_pixels_c[output_channel] = {0};
+  int sum_pixels_c[output_channel] = {0};
+  int8_t pixels_c[output_channel] = {0};
   if (!is_empty) {
     Dtype bin_area = (hend - hstart) * (wend - wstart);
-    float rec_bin_area = 1 / bin_area;
+    float scale_fuse = scale / bin_area;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
@@ -86,27 +87,21 @@ void PSROIPoolingForward(const Dtype* bottom_data, const int height,
       }
     }
     for (int output_c = 0; output_c < output_channel; output_c++) {
-      sum_pixels_c[output_c] *= rec_bin_area;
+      avg_pixels_c[output_c] = sum_pixels_c[output_c] * scale_fuse;
     }
   }

   int output_index_base = (ph * pooled_width + pw) * output_channel;
   top_data += output_index_base;
-  memcpy(top_data, sum_pixels_c, output_channel * 4);
+  memcpy(top_data, avg_pixels_c, output_channel * 4);
 }

 template <>
 void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
   auto input_tensor = param.input_x_;
   auto input_data = input_tensor->data<int8_t>();
-  auto Si = input_tensor->scale[0];
-  auto float_input_tensor = param.float_input.get();
-  auto float_input_data = float_input_tensor->data<float>();
-  for (int i = 0; i < float_input_tensor->numel(); i++) {
-    float_input_data[i] = input_data[i] / 127.0 * Si;
-  }
-  auto* in = float_input_tensor;
+  auto scale = input_tensor->scale[0] / 127.0;
+  fpga::fpga_invalidate(input_data, input_tensor->numel() * sizeof(int8_t));

   auto* rois = param.input_rois_;
   auto* out = param.output_;
@@ -115,22 +110,19 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
   auto spatial_scale = param.spatial_scale_;
   auto output_channels = param.output_channels_;

-  auto in_dims = in->dims();
+  auto in_dims = input_tensor->dims();
   int batch_size = in_dims[0];
   int input_channels = in_dims[1];
   int height = in_dims[2];
   int width = in_dims[3];
   int rois_num = rois->dims()[0];

-  auto data_nhwc = in->mutable_data<float>();
-
   framework::DDim dims_out_new = framework::make_ddim(
       {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
        (param.output_)->dims()[3]});
   (param.output_)->Resize(dims_out_new);

-  const float* input_data_tmp = data_nhwc;  // in->data<float>();
-
   framework::Tensor rois_batch_id_list;
   rois_batch_id_list.Resize({rois_num});
   auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
@@ -151,12 +143,7 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
       "the channels of input X should equal the product of "
       "output_channels x pooled_height x pooled_width");

   // calculate batch id index for each roi according to LoD
-  for (int n = 0; n < rois_batch_size; ++n) {
-    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-      rois_batch_id_data[i] = n;
-    }
-  }

   auto output_data = out->mutable_data<float>();
   auto input_rois = rois->data<float>();
@@ -187,10 +174,10 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
     for (int ph = 0; ph < pooled_height; ph++) {
       for (int pw = 0; pw < pooled_width; pw++) {
         PSROIPoolingForward<float>(
-            input_data_tmp, height, width, input_channels, offset_output_data,
+            input_data, height, width, input_channels, offset_output_data,
             pooled_height, pooled_width, output_channels, input_rois,
             bin_size_h, bin_size_w, roi_start_h, roi_start_w, pw, ph,
-            roi_batch_ind);
+            scale, roi_batch_ind);
       }
     }
   }
......
@@ -25,6 +25,7 @@ bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
   auto input = const_cast<LoDTensor *>(param->InputX());
   auto output = param->Out();
   auto shape = param->Shape();
+  output->scale[0] = input->scale[0];
   auto num_in = framework::product(input->dims());
   auto num_shape = framework::product(framework::make_ddim(shape));
......
@@ -81,6 +81,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   auto w = 1;
   auto c = 1;
   if (dims.size() == 4) {
+    n = dims[0];
     h = dims[1];
     w = dims[2];
     c = dims[3];
@@ -90,6 +91,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
       h = 1;
     }
   } else if (dims.size() == 2) {
+    n = dims[0];
     c = dims[1];
   }
   if ((c == 2) && (in_x->type() == type_id<int8_t>())) {
......