Unverified · Commit ceda3f4e authored by qnqinan, committed by GitHub

Update kernels and related files for static quantization in the FPGA v2 track; fixes #1589 (#1590)

* update concat and split kernels and related files in the FPGA v2 (v3) track

* update

* update

* update kernels and related files in the FPGA v2 track

* update

* update

* update kernels and related files for static quantization in the FPGA v2 track

* update
Parent 15794c0a
......@@ -40,8 +40,6 @@ void format_image(framework::Tensor *image_tensor) {
void format_ofm(framework::Tensor *ofm_tensor) {
if (ofm_tensor->type() == type_id<float>()) {
format_fp32_ofm(ofm_tensor);
} else if (ofm_tensor->type() == type_id<half>()) {
format_fp16_ofm(ofm_tensor);
} else {
format_int8_ofm(ofm_tensor);
}
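With the fp16 branch deleted, `format_ofm` now dispatches on the tensor's element type between fp32 and int8 only. A minimal standalone sketch of the post-change dispatch, using a simplified hypothetical `Tensor` (the real helpers live in the `fpga` namespace):

```cpp
#include <iostream>

enum class DType { FP32, INT8 };

struct Tensor { DType dtype; };  // simplified stand-in for framework::Tensor

void format_fp32_ofm(Tensor*) { std::cout << "format fp32 ofm\n"; }
void format_int8_ofm(Tensor*) { std::cout << "format int8 ofm\n"; }

// With the fp16 branch removed, anything that is not fp32
// is treated as an int8 output feature map.
void format_ofm(Tensor* t) {
  if (t->dtype == DType::FP32) {
    format_fp32_ofm(t);
  } else {
    format_int8_ofm(t);
  }
}

int main() {
  Tensor a{DType::FP32}, b{DType::INT8};
  format_ofm(&a);
  format_ofm(&b);
}
```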
......@@ -85,24 +83,6 @@ void format_int8_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
fpga::fpga_flush(p, memory_size);
}
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
sizeof(half);
} else if (dims.size() == 2) {
memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
} else {
DLOG << "Wrong ofm dimension";
}
auto p = fpga_malloc(memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(type_id<half>().hash_code());
ofm_tensor->fpga_data_num = memory_size / sizeof(half);
}
void format_fp32_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
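The deleted `format_fp16_ofm` sized its buffer as `num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half)`; the surviving fp32/int8 variants follow the same row-alignment rule. A minimal sketch of that rule, with an assumed placeholder alignment (the real `IMAGE_ALIGNMENT` constant comes from the FPGA driver headers):

```cpp
#include <cstdint>
#include <cstdio>

constexpr size_t kImageAlignment = 16;  // assumed value, in elements

// Round n up to the next multiple of x.
size_t align_to_x(size_t n, size_t x) { return (n + x - 1) / x * x; }

int main() {
  // Every (channel * width) row is padded so the hardware can
  // transfer whole aligned rows.
  size_t num = 1, channel = 3, height = 224, width = 224;
  size_t bytes = num * height *
                 align_to_x(channel * width, kImageAlignment) * sizeof(int8_t);
  std::printf("ofm buffer: %zu bytes\n", bytes);
}
```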
......@@ -208,9 +188,6 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
deconv_filter::DWDconv_format_filter(&new_data, num, channel, height, width,
scale_ptr, stride);
// framework::DDim dims_new =
// framework::make_ddim({num, 1, height, width});
// filter_tensor->Resize(dims_new);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(type_id<int16_t>().hash_code());
}
......@@ -314,13 +291,13 @@ void format_DWDeconv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float **bs_ptr,
int group, int sub_conv_n) {
int channel = ofm_tensor->dims()[1];
// dw-deconv
format_DWDconv_filter(
filter_tensor,
(reinterpret_cast<float *>(*bs_ptr) + sub_conv_n * channel), sub_conv_n);
format_bias_array(bs_ptr, channel);
format_ofm(ofm_tensor);
}
void expand_conv_arg(ConvArgs *arg) {
ConvArgs args = *arg;
......@@ -458,7 +435,6 @@ void expand_conv_arg(ConvArgs *arg) {
void expand_EW_arg(EWAddArgs *arg) {
EWAddArgs args = *arg;
// uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
uint64_t cmd = 0;
uint64_t datalen = (uint64_t)args.image0.width *
(uint64_t)args.image0.height *
......
......@@ -26,8 +26,6 @@ void format_image(framework::Tensor* image_tensor);
void format_ofm(framework::Tensor* ofm_tensor);
void format_int8_ofm(framework::Tensor* ofm_tensor);
void format_int8_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
void format_fp32_ofm(framework::Tensor* ofm_tensor);
float filter_find_max(framework::Tensor* filter_tensor);
......
......@@ -100,7 +100,7 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy(
(int16_t *)image_out + tmp_channel + // NOLINT
(int8_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ,
images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int8_t));
......
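The one-character change above (`int16_t *` to `int8_t *`) matters because pointer arithmetic scales by the pointee size: with the old cast, `+ tmp_channel` advanced twice as many bytes as intended into the int8 output image. A small demonstration of the difference:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  alignas(8) unsigned char buf[64] = {};
  void* image_out = buf;
  int offset = 10;

  auto* as_i16 = (int16_t*)image_out + offset;  // lands 20 bytes past buf
  auto* as_i8 = (int8_t*)image_out + offset;    // lands 10 bytes past buf

  std::printf("int16_t* step: %td bytes\n", (unsigned char*)as_i16 - buf);
  std::printf("int8_t*  step: %td bytes\n", (unsigned char*)as_i8 - buf);
}
```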
......@@ -26,10 +26,6 @@ void convert_to_hwc(float** data_in, int channel, int height, int width,
int num = 1);
void convert_to_chw(float** data_in, int channel, int height, int width,
int num = 1);
// template <typename Dtype>
// void align_element_conv(Dtype** data_in, int height, int cw);
// template <typename T>
// void format_image(T** data_in, int channel, int height, int width);
template <typename Dtype>
void align_element_conv(Dtype** data_in, int height, int cw);
template <typename Dtype>
......
......@@ -97,7 +97,7 @@ float fp16_2_fp32(int16_t fp16_num) {
} else if (se_fp16 < 63) {
e_fp32 = 0x80000000 + ((se_fp16 - 32) << 23);
offset = 1024;
} else { // se_fp16 == 63
} else {
e_fp32 = 0xC7800000;
offset = 1024;
}
......
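For context on what `fp16_2_fp32` is doing (re-biasing the exponent and widening the mantissa), here is a reference IEEE-754 half-to-float conversion as a standalone sketch; the in-tree routine applies the same idea to its own FPGA-specific encoding, so the constants differ:

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

float half_to_float(uint16_t h) {
  uint32_t sign = (uint32_t)(h >> 15) << 31;
  uint32_t exp = (h >> 10) & 0x1F;
  uint32_t mant = h & 0x3FF;
  uint32_t bits;
  if (exp == 0) {
    if (mant == 0) {
      bits = sign;  // signed zero
    } else {
      // Subnormal half: normalize into the float range.
      exp = 127 - 15 + 1;
      while (!(mant & 0x400)) { mant <<= 1; --exp; }
      mant &= 0x3FF;
      bits = sign | (exp << 23) | (mant << 13);
    }
  } else if (exp == 0x1F) {
    bits = sign | 0x7F800000 | (mant << 13);  // inf / NaN
  } else {
    bits = sign | ((exp - 15 + 127) << 23) | (mant << 13);  // re-bias
  }
  float f;
  std::memcpy(&f, &bits, sizeof f);
  return f;
}

int main() {
  std::printf("%f\n", half_to_float(0x3C00));  // 1.0
  std::printf("%f\n", half_to_float(0xC000));  // -2.0
}
```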
......@@ -209,7 +209,6 @@ struct PoolingArgs {
};
struct EWAddArgs {
// bool relu_enabled;
uint32_t const0; // output0 = const0 x input0 + const1 x input1;
uint32_t const1;
struct ImageInputArgs image0;
......
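The struct comment documents the operation the hardware performs: `output0 = const0 x input0 + const1 x input1`. A scalar model of that elementwise add (names taken from the comment; the real op runs on the FPGA over whole images):

```cpp
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> in0 = {1, 2, 3}, in1 = {10, 20, 30};
  float const0 = 1.0f, const1 = 0.5f;
  std::vector<float> out(in0.size());
  for (size_t i = 0; i < out.size(); ++i)
    out[i] = const0 * in0[i] + const1 * in1[i];
  for (float v : out) std::printf("%g ", v);  // 6 12 18
}
```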
......@@ -35,7 +35,7 @@ bool ElementwiseMulKernel<FPGA, float>::Init(ElementwiseMulParam<FPGA> *param) {
fpga::format_fp32_ofm(&(param->float_out));
auto *out = param->Out();
fpga::format_fp16_ofm(out);
fpga::format_ofm(out);
return true;
}
......
......@@ -23,14 +23,14 @@ bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
int col = param->Col();
DLOG << "col = " << col;
auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
input->init(type_id<float>().hash_code());
input->Resize(output->dims());
if (output->dims().size() != 4) {
input->init(type_id<float>().hash_code());
return true;
}
fpga::format_fp16_ofm(output);
input->init(type_id<int8_t>().hash_code());
input->Resize(output->dims());
fpga::format_ofm(output);
return true;
}
......@@ -39,15 +39,6 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
auto output = param.Out();
int col = param.Col();
auto input = const_cast<LoDTensor *>(&param.InputX()->at(col));
kTypeId_t input_type = input->type();
if (input_type == type_id<float>()) {
input->init(type_id<float>().hash_code());
} else {
input->init(type_id<int8_t>().hash_code());
}
input->Resize(output->dims());
if (output->dims().size() != 4) {
size_t size = output->numel() * sizeof(float);
auto output_ptr = output->data<float>();
......@@ -58,49 +49,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
input->external_data = nullptr;
return;
}
fpga::format_image(input);
auto output_ptr = output->data<half>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
if (input_type == type_id<float>()) {
auto input_ptr = input->data<float>();
auto external_ptr = reinterpret_cast<float *>(input->external_data);
float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = p_data;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
} else {
auto input_ptr = input->data<int8_t>();
auto external_ptr = reinterpret_cast<int8_t *>(input->external_data);
int8_t *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
args.input_data_type = fpga::DATA_TYPE_INT8;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = p_data;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
}
output->ShareDataWith(*input);
}
template class FeedKernel<FPGA, float>;
......
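The rewritten feed path above no longer runs a fp32/int8-to-fp16 bypass: with statically quantized int8 inputs, the output tensor simply aliases the already-formatted input buffer via `ShareDataWith`. A minimal sketch of that zero-copy aliasing, using a hypothetical simplified `Tensor` (C++17 for `shared_ptr<T[]>`):

```cpp
#include <cstdint>
#include <cstdio>
#include <memory>

struct Tensor {
  std::shared_ptr<int8_t[]> data;
  size_t numel = 0;
  void ShareDataWith(const Tensor& other) {  // alias, no copy
    data = other.data;
    numel = other.numel;
  }
};

int main() {
  Tensor input;
  input.numel = 4;
  input.data = std::shared_ptr<int8_t[]>(new int8_t[4]{1, 2, 3, 4});

  Tensor output;
  output.ShareDataWith(input);  // feed output is the input itself
  std::printf("%d\n", output.data[2]);  // 3
}
```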
......@@ -21,18 +21,16 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
int col = param->Col();
DLOG << "col = " << col;
auto output = &(param->Out()->at(col));
output->init(type_id<float>().hash_code());
output->Resize(input->dims());
if (input->type() == type_id<float>()) {
return true;
}
output->init(type_id<float>().hash_code());
output->Resize(input->dims());
fpga::format_fp32_ofm(output);
auto aligned_output = param->aligned_out;
int outC = 1;
int outH = 1;
int outW = 1;
if (output->dims().size() == 4) {
outC = output->dims()[1];
outH = output->dims()[2];
outW = output->dims()[3];
} else { // 2
outC = output->dims()[1];
......@@ -40,27 +38,10 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
if (alignedCW != unalignedCW) {
param->aligned_out.Resize(input->dims());
param->aligned_out.mutable_data<float>(input->dims());
fpga::fpga_flush(param->aligned_out.data<float>(),
outH * unalignedCW * sizeof(float));
aligned_output.init(type_id<float>().hash_code());
aligned_output.Resize(input->dims());
fpga::format_fp32_ofm(&aligned_output);
}
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = input->data<half>();
args.image.channels = (uint32_t)(input->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output->data<float>();
args.output.scale_address = output->scale;
param->fpga_bypass_args = args;
return true;
}
void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
......@@ -83,22 +64,22 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
return;
}
fpga::BypassArgs args = param.fpga_bypass_args;
auto input_address = (input->data<half>());
args.image.address = static_cast<void *>(input_address);
float *outdata_ptr =
reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
auto input_address = input->data<int8_t>();
float Si = input->scale[0];
auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
auto outdata_ptr = output->data<float>();
const int num_th = 32;
if (output->fpga_data_num < num_th) {
fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half));
fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(int8_t));
if (input->fpga_data_num < num_th) {
for (int idx = 0; idx < product(input->dims()); ++idx) {
outdata_ptr[idx] = fpga::fp16_2_fp32(input_address[idx]);
outdata_ptr[idx] = input_address[idx] * Si;
}
fpga::fpga_flush(outdata_ptr, product(input->dims()) * sizeof(float));
return;
}
fpga::PerformBypass(args);
for (int idx = 0; idx < input->fpga_data_num; ++idx) {
aligned_ptr[idx] = input_address[idx] * Si;
}
int outC = 1;
int outH = 1;
int outW = 1;
......@@ -110,16 +91,15 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
outC = output->dims()[1];
}
fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
output->fpga_data_num * sizeof(float));
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
if (unalignedCW != alignedCW) {
auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
dealign(outdata_ptr, aligned_ptr, outC, outH, outW);
memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
dealign(aligned_ptr, outdata_ptr, outC, outH, outW);
fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
return;
}
memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
}
template class FetchKernel<FPGA, float>;
......
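The new fetch path converts int8 back to float on the CPU by multiplying by the per-tensor input scale `Si`, then strips the hardware row alignment with `dealign`. A sketch under assumptions: `dealign`'s body is inferred from its signature, and the alignment value is a placeholder:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

constexpr int kImageAlignment = 16;  // assumed alignment, in elements
int align_to_x(int n, int x) { return (n + x - 1) / x * x; }

// Copy each h-row, keeping only the first c*w valid floats per row.
void dealign(const float* src, float* dst, int c, int h, int w) {
  int unaligned_cw = c * w;
  int aligned_cw = align_to_x(unaligned_cw, kImageAlignment);
  for (int row = 0; row < h; ++row)
    std::memcpy(dst + row * unaligned_cw, src + row * aligned_cw,
                unaligned_cw * sizeof(float));
}

int main() {
  int c = 3, h = 2, w = 5;                              // unaligned CW = 15
  int aligned_cw = align_to_x(c * w, kImageAlignment);  // padded to 16
  float Si = 0.05f;                                     // per-tensor scale

  std::vector<int8_t> fpga_out(h * aligned_cw, 20);     // quantized output
  std::vector<float> aligned(h * aligned_cw), out(h * c * w);

  for (size_t i = 0; i < fpga_out.size(); ++i)
    aligned[i] = fpga_out[i] * Si;                      // dequantize
  dealign(aligned.data(), out.data(), c, h, w);
  std::printf("out[0] = %g\n", out[0]);                 // 1.0
}
```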
......@@ -32,9 +32,6 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
param->rpn_rois_->mutable_data<float>({total, 4});
param->rpn_probs_->mutable_data<float>({total, 1});
// DLOG << *param->rpn_rois_;
// DLOG << *param->rpn_probs_;
param->float_bbox = std::make_shared<Tensor>();
param->float_bbox->Resize(param->bbox_deltas_->dims());
param->float_bbox->init(type_id<float>().hash_code());
......@@ -44,29 +41,7 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
param->float_score->init(type_id<float>().hash_code());
fpga::format_fp32_ofm(param->float_score.get());
auto input = param->bbox_deltas_;
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_HWC;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_bbox->mutable_data<float>();
args.output.scale_address = param->float_bbox->scale;
param->bbox_arg = args;
input = param->scores_;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_score->mutable_data<float>();
args.output.scale_address = param->float_score->scale;
param->score_arg = args;
auto input = param->scores_;
param->score_index_ = std::make_shared<Tensor>();
param->score_index_->mutable_data<int32_t>({input->numel()});
auto score_index = param->score_index_->data<int32_t>();
......@@ -136,51 +111,17 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
T bbox_center_x = 0, bbox_center_y = 0;
T bbox_width = 0, bbox_height = 0;
/*
if (variances) {
bbox_center_x =
variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width
+ anchor_center_x; bbox_center_y = variances_data[i * len + 1] *
bbox_deltas_data[i * len + 1] * anchor_height +
anchor_center_y;
bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2],
kBBoxClipDefault)) *
anchor_width;
bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3],
kBBoxClipDefault)) *
anchor_height;
} else {
*/
bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
bbox_center_y =
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
/*
bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
kBBoxClipDefault)) *
anchor_width;
bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
kBBoxClipDefault)) *
anchor_height;
*/
bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
// }
proposals_data[i * len] = bbox_center_x - bbox_width / 2;
proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
/*
//wong
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
//wong
*/
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
}
// return proposals;
}
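A worked example of the decode kept above: the deltas shift the anchor center and exp-scale its size, and the corners are center ± size/2 (the "- 1" pixel convention is the commented-out variant). A minimal sketch with made-up numbers:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // Anchor as [xmin, ymin, xmax, ymax].
  float ax0 = 0, ay0 = 0, ax1 = 16, ay1 = 16;
  float aw = ax1 - ax0, ah = ay1 - ay0;
  float acx = ax0 + aw / 2, acy = ay0 + ah / 2;

  float dx = 0.1f, dy = -0.1f, dw = 0.2f, dh = 0.0f;  // bbox deltas

  float cx = dx * aw + acx;     // shifted center
  float cy = dy * ah + acy;
  float w = std::exp(dw) * aw;  // exp-scaled size
  float h = std::exp(dh) * ah;

  std::printf("[%g, %g, %g, %g]\n",
              cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2);
}
```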
template <class T>
......@@ -252,8 +193,6 @@ static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
template <class T>
static inline T BBoxArea(const T *box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are is invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
......@@ -351,9 +290,6 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
Tensor index_t;
index_t.Resize({scores_slice.numel()});
int *index = index_t.mutable_data<int>();
/*for (int i = 0; i < scores_slice.numel(); ++i) {
index[i] = i;
}*/
std::memcpy(index, score_index.data<int32_t>(),
scores_slice.numel() * sizeof(int));
......@@ -397,7 +333,6 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
return std::make_pair(bbox_sel, scores_filter);
}
// Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);
Tensor keep_nms =
NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta, post_nms_top_n);
......@@ -408,8 +343,6 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
proposals.mutable_data<T>({keep_nms.numel(), 4}); // original
scores_sel.mutable_data<T>({keep_nms.numel(), 1}); // original
// proposals.mutable_data<T>({post_nms_top_n, 4}); // wong
// scores_sel.mutable_data<T>({post_nms_top_n, 1}); // wong
CPUGather<T>(bbox_sel, keep_nms, &proposals);
CPUGather<T>(scores_filter, keep_nms, &scores_sel);
return std::make_pair(proposals, scores_sel);
......@@ -418,13 +351,11 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
template <>
void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
auto input_score = param.scores_;
auto input_score_data = input_score->data<half>();
auto input_score_data_tmp = input_score->data<half>();
auto input_score_data = input_score->data<int8_t>();
uint32_t score_n, score_height, score_width, score_channels;
auto input_bbox = param.bbox_deltas_;
auto input_bbox_data = input_bbox->data<half>();
auto input_bbox_data_tmp = input_bbox->data<half>();
auto input_bbox_data = input_bbox->data<int8_t>();
uint32_t bbox_n, bbox_height, bbox_width, bbox_channels;
score_n = (uint32_t)(input_score->dims()[0]);
......@@ -439,61 +370,48 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
std::shared_ptr<Tensor> score_tmp = std::make_shared<Tensor>();
score_tmp->Resize(param.scores_->dims());
score_tmp->mutable_data<half>();
score_tmp->mutable_data<int8_t>();
std::shared_ptr<Tensor> bbox_tmp = std::make_shared<Tensor>();
bbox_tmp->Resize(param.bbox_deltas_->dims());
bbox_tmp->mutable_data<half>();
bbox_tmp->mutable_data<int8_t>();
auto score_tmp_data = score_tmp->data<half>();
auto bbox_tmp_data = bbox_tmp->data<half>();
auto score_tmp_data = score_tmp->data<int8_t>();
auto bbox_tmp_data = bbox_tmp->data<int8_t>();
int64_t amount_per_side = score_width * score_height;
int idx = 0;
fpga::fpga_invalidate(
input_score_data_tmp,
score_height * score_width * score_channels * sizeof(half));
fpga::fpga_invalidate(input_score_data, score_height * score_width *
score_channels * sizeof(int8_t));
for (int h = 0; h < score_height; h++) {
for (int w = 0; w < score_width; w++) {
for (int c = 0; c < score_channels; c++) {
idx++;
// DLOG << "wong input_score: "<<
// paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
*(score_tmp_data + c * amount_per_side + score_width * h + w) =
(*(input_score_data_tmp++));
(*(input_score_data++));
}
}
}
amount_per_side = bbox_width * bbox_height;
fpga::fpga_invalidate(input_bbox_data_tmp, bbox_height * bbox_width *
bbox_channels * sizeof(half));
fpga::fpga_invalidate(input_bbox_data, bbox_height * bbox_width *
bbox_channels * sizeof(int8_t));
for (int h = 0; h < bbox_height; h++) {
for (int w = 0; w < bbox_width; w++) {
for (int c = 0; c < bbox_channels; c++) {
idx++;
// DLOG << "wong input_score: "<<
// paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
(*(input_bbox_data_tmp++));
(*(input_bbox_data++));
}
}
}
struct paddle_mobile::fpga::BypassArgs temp_score_arg;
struct paddle_mobile::fpga::BypassArgs temp_bbox_arg;
temp_score_arg = param.score_arg;
temp_score_arg.image.address = score_tmp->data<half>();
temp_bbox_arg = param.bbox_arg;
temp_bbox_arg.image.address = bbox_tmp->data<half>();
auto score_tensor = param.float_score.get();
fpga::PerformBypass(param.score_arg);
fpga::fpga_invalidate(score_tensor->data<float>(),
score_tensor->numel() * sizeof(float));
for (int i = 0; i < score_height * score_width * score_channels; i++) {
score_tensor->data<float>()[i] = score_tmp_data[i] * input_score->scale[0];
}
auto bbox_tensor = param.float_bbox.get();
fpga::PerformBypass(param.bbox_arg);
fpga::fpga_invalidate(bbox_tensor->data<float>(),
bbox_tensor->numel() * sizeof(float));
for (int i = 0; i < bbox_height * bbox_width * bbox_channels; i++) {
bbox_tensor->data<float>()[i] = bbox_tmp_data[i] * input_bbox->scale[0];
}
auto *scores = param.float_score.get();
auto *bbox_deltas = param.float_bbox.get();
auto *im_info = param.im_info_;
......@@ -507,7 +425,6 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
int pre_nms_top_n = param.pre_nms_topn_;
int post_nms_top_n = param.post_nms_topn_;
// DLOG << " param.post_nms_topn_ : " << param.post_nms_topn_;
float nms_thresh = param.nms_thresh_ / 2.0f;
float min_size = param.min_size_;
......
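The nested loops in the rewritten `Compute` above perform a layout shuffle: the FPGA emits HWC, the proposal math wants CHW, so each sequentially-read element is scattered to `c * (H*W) + h * W + w` before being dequantized by the input scale. A compact sketch of that reorder:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  int H = 2, W = 2, C = 3;
  std::vector<int8_t> hwc(H * W * C), chw(H * W * C);
  for (int i = 0; i < H * W * C; ++i) hwc[i] = (int8_t)i;

  const int8_t* src = hwc.data();  // read HWC in order, scatter to CHW
  for (int h = 0; h < H; ++h)
    for (int w = 0; w < W; ++w)
      for (int c = 0; c < C; ++c)
        chw[c * H * W + h * W + w] = *src++;

  // Channel 0 now holds hwc[0], hwc[3], hwc[6], hwc[9].
  std::printf("chw[0..3] = %d %d %d %d\n", chw[0], chw[1], chw[2], chw[3]);
}
```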
......@@ -31,21 +31,6 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
param->float_input = std::make_shared<Tensor>();
param->float_input->mutable_data<float>(param->input_x_->dims());
// param->float_output = std::make_shared<Tensor>();
auto input = param->input_x_;
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_HWC;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_input->mutable_data<float>();
args.output.scale_address = param->float_input->scale;
param->input_arg = args;
auto* rois = param->input_rois_;
int rois_num = rois->dims()[0];
......@@ -53,81 +38,11 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
{rois_num, param->output_->dims()[1], param->output_->dims()[2],
param->output_->dims()[3]});
param->output_->Resize(dims_out_new);
// fpga::format_fp16_ofm(param->output_);
param->output_->mutable_data<float>(dims_out_new);
// auto output = param->float_output.get();
// param->output_ = output;
/* args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.image.address = output->data<float>();
args.image.height = (uint32_t)output->dims()[2];
args.image.width = (uint32_t)output->dims()[3];
args.image.channels = (uint32_t)output->dims()[1] ;
args.output.address = param->output_->mutable_data<half>();
args.output.scale_address = param->output_->scale;
param->output_arg = args;*/
return true;
}
/*
template <typename Dtype>
void PSROIPoolingForward(
const Dtype* bottom_data,
const int height, const int width, const int input_channel,
Dtype* top_data,
const int pooled_height, const int pooled_width, const int output_channel,
const Dtype* bottom_rois,
const Dtype Bin_size_h, const Dtype Bin_size_w, const Dtype roi_start_h,
const Dtype roi_start_w, const int pw, const int ph, const int roi_batch_ind)
{
int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
int wstart = floor(static_cast<Dtype>(pw)* Bin_size_w + roi_start_w);
int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
hstart = std::min(std::max(hstart, 0), height);
hend = std::min(std::max(hend, 0), height);
wstart = std::min(std::max(wstart, 0), width);
wend = std::min(std::max(wend, 0), width);
bool is_empty = (hend <= hstart) || (wend <= wstart);
float32x4_t sum_pixels_low_c= vdupq_n_f32(0);
float32x4_t sum_pixels_high_c= vdupq_n_f32(0);
if(!is_empty){
Dtype bin_area = (hend - hstart) * (wend - wstart);
float rev_bin_area = 1 / bin_area;
float32x4_t q_bin_area = vdupq_n_f32(rev_bin_area);
//static_cast<float>(bin_area) float pixels_c[output_channel];
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int pixel_offset = (h * width + w) * input_channel;
for(int output_c = 0; output_c < output_channel; output_c++){
int input_channel_offset = output_c * pooled_height *
pooled_width; int input_bias = pixel_offset + input_channel_offset + ph *
pooled_width + pw; pixels_c[output_c] = bottom_data[input_bias];
}
float32x4_t pixel_low_c = vld1q_f32(pixels_c);
float32x4_t pixel_high_c = vld1q_f32(pixels_c + 4);
sum_pixels_low_c = vaddq_f32(sum_pixels_low_c, pixel_low_c);
sum_pixels_high_c = vaddq_f32(sum_pixels_high_c, pixel_high_c);
}
}
sum_pixels_low_c = vmulq_f32(sum_pixels_low_c, q_bin_area);
sum_pixels_high_c = vmulq_f32(sum_pixels_high_c, q_bin_area);
}
int output_index_base = (ph * pooled_width + pw) * output_channel;
top_data += output_index_base;
vst1q_f32(top_data, sum_pixels_low_c);
top_data += 4;
vst1q_f32(top_data, sum_pixels_high_c);
}*/
template <typename Dtype>
void PSROIPoolingForward(const Dtype* bottom_data, const int height,
const int width, const int input_channel,
......@@ -182,14 +97,18 @@ void PSROIPoolingForward(const Dtype* bottom_data, const int height,
template <>
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto input_tensor = param.float_input.get();
fpga::PerformBypass(param.input_arg);
fpga::fpga_invalidate(input_tensor->data<float>(),
input_tensor->numel() * sizeof(float));
auto input_tensor = param.input_x_;
auto input_data = input_tensor->data<int8_t>();
auto Si = input_tensor->scale[0];
auto float_input_tensor = param.float_input.get();
auto float_input_data = float_input_tensor->data<float>();
for (int i = 0; i < float_input_tensor->numel(); i++) {
float_input_data[i] = input_data[i] * Si;
}
auto* in = input_tensor;
auto* in = float_input_tensor;
auto* rois = param.input_rois_;
auto* out = param.output_; // param.float_output.get();
auto* out = param.output_;
auto pooled_height = param.pooled_height_;
auto pooled_width = param.pooled_width_;
......@@ -205,14 +124,13 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto data_nhwc = in->mutable_data<float>();
// fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
framework::DDim dims_out_new = framework::make_ddim(
{rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
(param.output_)->dims()[3]});
(param.output_)->Resize(dims_out_new);
const float* input_data = data_nhwc; // in->data<float>();
const float* input_data_tmp = data_nhwc; // in->data<float>();
framework::Tensor rois_batch_id_list;
rois_batch_id_list.Resize({rois_num});
auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
......@@ -268,11 +186,11 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
PSROIPoolingForward<float>(input_data, height, width, input_channels,
offset_output_data, pooled_height,
pooled_width, output_channels, input_rois,
bin_size_h, bin_size_w, roi_start_h,
roi_start_w, pw, ph, roi_batch_ind);
PSROIPoolingForward<float>(
input_data_tmp, height, width, input_channels, offset_output_data,
pooled_height, pooled_width, output_channels, input_rois,
bin_size_h, bin_size_w, roi_start_h, roi_start_w, pw, ph,
roi_batch_ind);
}
}
}
......
......@@ -25,18 +25,18 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
paddle_mobile::fpga::SIGMOID;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->InputX());
auto input_ptr = input->data<half>();
auto input_ptr = input->data<int8_t>();
auto out = param->Out();
fpga::format_fp16_ofm(out);
fpga::format_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP16;
fpga::BypassArgs args = {fpga::DATA_TYPE_INT8};
args.input_data_type = fpga::DATA_TYPE_INT8;
args.output_data_type = fpga::DATA_TYPE_INT8;
args.image.address = input_ptr;
args.image.height = 1;
args.image.width = 1;
args.image.channels = input->fpga_data_num;
args.output.address = out->data<half>();
args.output.address = out->data<int8_t>();
args.output.scale_address = out->scale;
args.output.activation.activation_type = activation_enable;
args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
......
......@@ -24,13 +24,13 @@ template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX());
auto dims = framework::vectorize(input->dims());
half *input_ptr;
int8_t *input_ptr;
auto out = param->Out();
if (input->type() == type_id<float>()) {
out->Resize(framework::make_ddim(dims));
out->mutable_data<float>(framework::make_ddim(dims));
} else {
input_ptr = input->data<half>();
input_ptr = input->data<int8_t>();
}
auto float_input = new LoDTensor;
......@@ -52,8 +52,6 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
out->mutable_data<float>(framework::make_ddim(dims));
float_input->init(type_id<float>().hash_code());
float_input->mutable_data<float>(framework::make_ddim(dims));
// fpga::format_fp32_ofm(float_input);
// fpga::format_fp32_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
......@@ -69,7 +67,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
} else { // Use FPGA
fpga::format_fp16_ofm(out);
fpga::format_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
......@@ -91,7 +89,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
template <>
void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
auto *in_x = (param.InputX());
if (in_x->type() == type_id<half>()) {
if (in_x->type() == type_id<int8_t>()) {
fpga::PerformBypass(param.FpgaArgs());
if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
Tensor *out = param.Out();
......
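For the CPU fallback path that the softmax kernel keeps, a numerically stable reference of softmax on dequantized int8 values (a sketch only; the scale value and shapes are illustrative):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int8_t> q = {10, 20, 30};
  float Si = 0.1f;  // assumed per-tensor input scale

  std::vector<float> x(q.size()), y(q.size());
  float max_v = -1e30f;
  for (size_t i = 0; i < q.size(); ++i) {
    x[i] = q[i] * Si;  // dequantize
    max_v = std::max(max_v, x[i]);
  }
  float sum = 0;
  for (size_t i = 0; i < x.size(); ++i)
    sum += (y[i] = std::exp(x[i] - max_v));  // subtract max for stability
  for (float& v : y) v /= sum;
  std::printf("%g %g %g\n", y[0], y[1], y[2]);
}
```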
......@@ -36,7 +36,7 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
fpga::fpga_malloc(image_num * sizeof(uint32_t)));
DLOG << "input: " << in;
for (int i = 0; i < image_num; i++) {
fpga::format_fp16_ofm(outs[i]);
fpga::format_ofm(outs[i]);
DLOG << "output: " << outs[i];
images_out[i] = outs[i]->mutable_data<int8_t>();
scales_out[i] = outs[i]->scale;
......
......@@ -1270,7 +1270,9 @@ class FetchParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
public:
#ifdef PADDLE_MOBILE_FPGA_V1
fpga::BypassArgs fpga_bypass_args;
#endif
Tensor aligned_out;
#endif
};
......