提交 a10c01f7 编写于 作者: qnqinan 提交者: jameswu2014

update proposal kernel and other kernels in FPGA v2 track, fixed #1676 (#1677)

* update concat and split kernel and related files in FPGA v2(v3) track

* update

* update

* update kernel and related files in FPGA v2 track

* update

* update

* update kernel and related files for static quantization in FPGA v2 track

* update

* update feed and fetch kernel in FPGA v2 track

* update io file

* update feed fetch and softmax kernel in FPGA v2 track

* update proposal kernel and other kernels in FPGA v2 track
上级 6126d29c
......@@ -495,6 +495,8 @@ void expand_EW_arg(EWAddArgs *arg) {
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
IMAGE_ALIGNMENT);
uint64_t image_amount_per_row_p = align_to_x(
(uint64_t)args.image0.width * (uint64_t)args.image0.channels, 16);
uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
((uint64_t)args.image0.width << 16) |
(uint64_t)args.image0.height;
......@@ -503,7 +505,8 @@ void expand_EW_arg(EWAddArgs *arg) {
(*arg).driver.image1_address_phy = image1_address_phy;
(*arg).driver.datalen = datalen;
(*arg).driver.image_image_pixel = image_image_pixel;
(*arg).driver.image_amount_per_row = image_amount_per_row;
(*arg).driver.image_amount_per_row =
(uint64_t)image_amount_per_row | (uint64_t)(image_amount_per_row_p << 32);
(*arg).driver.output_address_phy = output_address_phy;
(*arg).driver.coefficient = coefficient;
(*arg).driver.cmd = cmd;
......
......@@ -25,9 +25,6 @@ template <>
bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<int8_t>();
auto input_y_ptr = input_y->data<int8_t>();
......@@ -39,11 +36,9 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
float C1 = Si_1 / So;
float C2 = Si_2 / So;
fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.const0 = fpga::fp32_2_fp16(C1);
ewaddArgs.const1 = fpga::fp32_2_fp16(C2);
ewaddArgs.relu_enabled = 0;
ewaddArgs.image0.address = input_x_ptr;
ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
ewaddArgs.image0.scale_address = input_x->scale;
......
......@@ -21,9 +21,6 @@ namespace operators {
template <>
bool ElementwiseAddReluKernel<FPGA, float>::Init(
ElementwiseAddReluParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
......@@ -37,9 +34,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
float C1 = Si_1 / So;
float C2 = Si_2 / So;
fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.relu_enabled = 1;
ewaddArgs.const0 = fpga::fp32_2_fp16(C1);
ewaddArgs.const1 = fpga::fp32_2_fp16(C2);
ewaddArgs.image0.address = input_x_ptr;
......
......@@ -44,8 +44,19 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
return;
}
fpga::format_image(input);
output->ShareDataWith(*input);
input->external_data = nullptr;
auto output_ptr = output->data<int8_t>();
int channel = output->dims()[1];
int height = output->dims()[2];
int width = output->dims()[3];
int size = fpga::align_to_x(channel * width, IMAGE_ALIGNMENT) * height;
auto input_ptr = input->data<int8_t>();
fpga::fpga_invalidate(input_ptr, size * sizeof(int8_t));
memcpy(output_ptr, input_ptr, size * sizeof(int8_t));
fpga::fpga_flush(output_ptr,
fpga::align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
sizeof(int8_t));
}
template class FeedKernel<FPGA, float>;
......
......@@ -380,37 +380,54 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
auto bbox_tmp_data = bbox_tmp->data<int8_t>();
int64_t amount_per_side = score_width * score_height;
int idx = 0;
fpga::fpga_invalidate(input_score_data, score_height * score_width *
score_channels * sizeof(int8_t));
int alignedCW =
fpga::align_to_x(score_width * score_channels, IMAGE_ALIGNMENT);
int unalignedCW = score_width * score_channels;
fpga::fpga_invalidate(input_score_data,
score_height * alignedCW * sizeof(int8_t));
for (int h = 0; h < score_height; h++) {
for (int w = 0; w < score_width; w++) {
for (int c = 0; c < score_channels; c++) {
idx++;
*(score_tmp_data + c * amount_per_side + score_width * h + w) =
(*(input_score_data++));
if (alignedCW == unalignedCW) {
*(score_tmp_data + c * amount_per_side + score_width * h + w) =
(*(input_score_data++));
} else {
idx = h * alignedCW + w * score_channels + c;
*(score_tmp_data + c * amount_per_side + score_width * h + w) =
input_score_data[idx];
}
}
}
}
amount_per_side = bbox_width * bbox_height;
fpga::fpga_invalidate(input_bbox_data, bbox_height * bbox_width *
bbox_channels * sizeof(int8_t));
alignedCW = fpga::align_to_x(bbox_width * bbox_channels, IMAGE_ALIGNMENT);
unalignedCW = bbox_width * bbox_channels;
fpga::fpga_invalidate(input_bbox_data,
bbox_height * alignedCW * sizeof(int8_t));
for (int h = 0; h < bbox_height; h++) {
for (int w = 0; w < bbox_width; w++) {
for (int c = 0; c < bbox_channels; c++) {
idx++;
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
(*(input_bbox_data++));
if (alignedCW == unalignedCW) {
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
(*(input_bbox_data++));
} else {
idx = h * alignedCW + w * bbox_channels + c;
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
input_bbox_data[idx];
}
}
}
}
auto score_tensor = param.float_score.get();
for (int i = 0; i < score_height * score_width * score_channels; i++) {
score_tensor->data<float>()[i] = score_tmp_data[i] * input_score->scale[0];
score_tensor->data<float>()[i] =
score_tmp_data[i] / 127.0 * input_score->scale[0];
}
auto bbox_tensor = param.float_bbox.get();
for (int i = 0; i < bbox_height * bbox_width * bbox_channels; i++) {
bbox_tensor->data<float>()[i] = bbox_tmp_data[i] * input_bbox->scale[0];
bbox_tensor->data<float>()[i] =
bbox_tmp_data[i] / 127.0 * input_bbox->scale[0];
}
auto *scores = param.float_score.get();
auto *bbox_deltas = param.float_bbox.get();
......
......@@ -103,7 +103,7 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto float_input_tensor = param.float_input.get();
auto float_input_data = float_input_tensor->data<float>();
for (int i = 0; i < float_input_tensor->numel(); i++) {
float_input_data[i] = input_data[i] * Si;
float_input_data[i] = input_data[i] / 127.0 * Si;
}
auto* in = float_input_tensor;
......
......@@ -25,7 +25,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
fpga::format_ofm(output);
DLOG << "input: " << param->input_;
DLOG << "output: " << param->output_;
if (param->input_->type() != type_id<half>()) {
if (param->input_->type() != type_id<int8_t>()) {
DLOG << "wrong type";
}
return true;
......
......@@ -123,8 +123,8 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
float_out->Resize(in_x->dims());
float_out->init(type_id<float>().hash_code());
fpga::format_ofm(float_out.get());
auto float_out_data = float_out->data<float>();
math::SoftmaxFuntor<CPU, float>()(in_x, float_out.get());
auto float_out_data = float_out->data<float>();
int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
for (int i = 0; i < dataNum; i++) {
float tmp_out = float_out_data[i] * 127;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册