提交 a10c01f7 编写于 作者: qnqinan 提交者: jameswu2014

Update proposal kernel and other kernels in FPGA v2 track; fixes #1676 (#1677)

* update concat and split kernels and related files in FPGA v2 (v3) track

* update

* update

* update kernel and related files in FPGA v2 track

* update

* update

* update kernel and related files for static quantization in FPGA v2 track

* update

* update feed and fetch kernel in FPGA v2 track

* update io file

* update feed fetch and softmax kernel in FPGA v2 track

* update proposal kernel and other kernels in FPGA v2 track
上级 6126d29c
...@@ -495,6 +495,8 @@ void expand_EW_arg(EWAddArgs *arg) { ...@@ -495,6 +495,8 @@ void expand_EW_arg(EWAddArgs *arg) {
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
IMAGE_ALIGNMENT); IMAGE_ALIGNMENT);
uint64_t image_amount_per_row_p = align_to_x(
(uint64_t)args.image0.width * (uint64_t)args.image0.channels, 16);
uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
((uint64_t)args.image0.width << 16) | ((uint64_t)args.image0.width << 16) |
(uint64_t)args.image0.height; (uint64_t)args.image0.height;
...@@ -503,7 +505,8 @@ void expand_EW_arg(EWAddArgs *arg) { ...@@ -503,7 +505,8 @@ void expand_EW_arg(EWAddArgs *arg) {
(*arg).driver.image1_address_phy = image1_address_phy; (*arg).driver.image1_address_phy = image1_address_phy;
(*arg).driver.datalen = datalen; (*arg).driver.datalen = datalen;
(*arg).driver.image_image_pixel = image_image_pixel; (*arg).driver.image_image_pixel = image_image_pixel;
(*arg).driver.image_amount_per_row = image_amount_per_row; (*arg).driver.image_amount_per_row =
(uint64_t)image_amount_per_row | (uint64_t)(image_amount_per_row_p << 32);
(*arg).driver.output_address_phy = output_address_phy; (*arg).driver.output_address_phy = output_address_phy;
(*arg).driver.coefficient = coefficient; (*arg).driver.coefficient = coefficient;
(*arg).driver.cmd = cmd; (*arg).driver.cmd = cmd;
......
...@@ -25,9 +25,6 @@ template <> ...@@ -25,9 +25,6 @@ template <>
bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) { bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
auto *input_y = const_cast<LoDTensor *>(param->InputY()); auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out(); auto *out = param->Out();
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX()); auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<int8_t>(); auto input_x_ptr = input_x->data<int8_t>();
auto input_y_ptr = input_y->data<int8_t>(); auto input_y_ptr = input_y->data<int8_t>();
...@@ -39,11 +36,9 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) { ...@@ -39,11 +36,9 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
float C1 = Si_1 / So; float C1 = Si_1 / So;
float C2 = Si_2 / So; float C2 = Si_2 / So;
fpga::EWAddArgs ewaddArgs = {0}; fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.const0 = fpga::fp32_2_fp16(C1); ewaddArgs.const0 = fpga::fp32_2_fp16(C1);
ewaddArgs.const1 = fpga::fp32_2_fp16(C2); ewaddArgs.const1 = fpga::fp32_2_fp16(C2);
ewaddArgs.relu_enabled = 0;
ewaddArgs.image0.address = input_x_ptr; ewaddArgs.image0.address = input_x_ptr;
ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
ewaddArgs.image0.scale_address = input_x->scale; ewaddArgs.image0.scale_address = input_x->scale;
......
...@@ -21,9 +21,6 @@ namespace operators { ...@@ -21,9 +21,6 @@ namespace operators {
template <> template <>
bool ElementwiseAddReluKernel<FPGA, float>::Init( bool ElementwiseAddReluKernel<FPGA, float>::Init(
ElementwiseAddReluParam<FPGA> *param) { ElementwiseAddReluParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX()); auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY()); auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out(); auto *out = param->Out();
...@@ -37,9 +34,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init( ...@@ -37,9 +34,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
float C1 = Si_1 / So; float C1 = Si_1 / So;
float C2 = Si_2 / So; float C2 = Si_2 / So;
fpga::EWAddArgs ewaddArgs = {0}; fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.output.activation.activation_type = activation_enable; ewaddArgs.relu_enabled = 1;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.const0 = fpga::fp32_2_fp16(C1); ewaddArgs.const0 = fpga::fp32_2_fp16(C1);
ewaddArgs.const1 = fpga::fp32_2_fp16(C2); ewaddArgs.const1 = fpga::fp32_2_fp16(C2);
ewaddArgs.image0.address = input_x_ptr; ewaddArgs.image0.address = input_x_ptr;
......
...@@ -44,8 +44,19 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) { ...@@ -44,8 +44,19 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
return; return;
} }
fpga::format_image(input); fpga::format_image(input);
output->ShareDataWith(*input);
input->external_data = nullptr; auto output_ptr = output->data<int8_t>();
int channel = output->dims()[1];
int height = output->dims()[2];
int width = output->dims()[3];
int size = fpga::align_to_x(channel * width, IMAGE_ALIGNMENT) * height;
auto input_ptr = input->data<int8_t>();
fpga::fpga_invalidate(input_ptr, size * sizeof(int8_t));
memcpy(output_ptr, input_ptr, size * sizeof(int8_t));
fpga::fpga_flush(output_ptr,
fpga::align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
sizeof(int8_t));
} }
template class FeedKernel<FPGA, float>; template class FeedKernel<FPGA, float>;
......
...@@ -380,37 +380,54 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) { ...@@ -380,37 +380,54 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
auto bbox_tmp_data = bbox_tmp->data<int8_t>(); auto bbox_tmp_data = bbox_tmp->data<int8_t>();
int64_t amount_per_side = score_width * score_height; int64_t amount_per_side = score_width * score_height;
int idx = 0; int idx = 0;
fpga::fpga_invalidate(input_score_data, score_height * score_width * int alignedCW =
score_channels * sizeof(int8_t)); fpga::align_to_x(score_width * score_channels, IMAGE_ALIGNMENT);
int unalignedCW = score_width * score_channels;
fpga::fpga_invalidate(input_score_data,
score_height * alignedCW * sizeof(int8_t));
for (int h = 0; h < score_height; h++) { for (int h = 0; h < score_height; h++) {
for (int w = 0; w < score_width; w++) { for (int w = 0; w < score_width; w++) {
for (int c = 0; c < score_channels; c++) { for (int c = 0; c < score_channels; c++) {
idx++; if (alignedCW == unalignedCW) {
*(score_tmp_data + c * amount_per_side + score_width * h + w) = *(score_tmp_data + c * amount_per_side + score_width * h + w) =
(*(input_score_data++)); (*(input_score_data++));
} else {
idx = h * alignedCW + w * score_channels + c;
*(score_tmp_data + c * amount_per_side + score_width * h + w) =
input_score_data[idx];
}
} }
} }
} }
amount_per_side = bbox_width * bbox_height; amount_per_side = bbox_width * bbox_height;
fpga::fpga_invalidate(input_bbox_data, bbox_height * bbox_width * alignedCW = fpga::align_to_x(bbox_width * bbox_channels, IMAGE_ALIGNMENT);
bbox_channels * sizeof(int8_t)); unalignedCW = bbox_width * bbox_channels;
fpga::fpga_invalidate(input_bbox_data,
bbox_height * alignedCW * sizeof(int8_t));
for (int h = 0; h < bbox_height; h++) { for (int h = 0; h < bbox_height; h++) {
for (int w = 0; w < bbox_width; w++) { for (int w = 0; w < bbox_width; w++) {
for (int c = 0; c < bbox_channels; c++) { for (int c = 0; c < bbox_channels; c++) {
idx++; if (alignedCW == unalignedCW) {
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) = *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
(*(input_bbox_data++)); (*(input_bbox_data++));
} else {
idx = h * alignedCW + w * bbox_channels + c;
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
input_bbox_data[idx];
}
} }
} }
} }
auto score_tensor = param.float_score.get(); auto score_tensor = param.float_score.get();
for (int i = 0; i < score_height * score_width * score_channels; i++) { for (int i = 0; i < score_height * score_width * score_channels; i++) {
score_tensor->data<float>()[i] = score_tmp_data[i] * input_score->scale[0]; score_tensor->data<float>()[i] =
score_tmp_data[i] / 127.0 * input_score->scale[0];
} }
auto bbox_tensor = param.float_bbox.get(); auto bbox_tensor = param.float_bbox.get();
for (int i = 0; i < bbox_height * bbox_width * bbox_channels; i++) { for (int i = 0; i < bbox_height * bbox_width * bbox_channels; i++) {
bbox_tensor->data<float>()[i] = bbox_tmp_data[i] * input_bbox->scale[0]; bbox_tensor->data<float>()[i] =
bbox_tmp_data[i] / 127.0 * input_bbox->scale[0];
} }
auto *scores = param.float_score.get(); auto *scores = param.float_score.get();
auto *bbox_deltas = param.float_bbox.get(); auto *bbox_deltas = param.float_bbox.get();
......
...@@ -103,7 +103,7 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) { ...@@ -103,7 +103,7 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto float_input_tensor = param.float_input.get(); auto float_input_tensor = param.float_input.get();
auto float_input_data = float_input_tensor->data<float>(); auto float_input_data = float_input_tensor->data<float>();
for (int i = 0; i < float_input_tensor->numel(); i++) { for (int i = 0; i < float_input_tensor->numel(); i++) {
float_input_data[i] = input_data[i] * Si; float_input_data[i] = input_data[i] / 127.0 * Si;
} }
auto* in = float_input_tensor; auto* in = float_input_tensor;
......
...@@ -25,7 +25,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) { ...@@ -25,7 +25,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
fpga::format_ofm(output); fpga::format_ofm(output);
DLOG << "input: " << param->input_; DLOG << "input: " << param->input_;
DLOG << "output: " << param->output_; DLOG << "output: " << param->output_;
if (param->input_->type() != type_id<half>()) { if (param->input_->type() != type_id<int8_t>()) {
DLOG << "wrong type"; DLOG << "wrong type";
} }
return true; return true;
......
...@@ -123,8 +123,8 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) { ...@@ -123,8 +123,8 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
float_out->Resize(in_x->dims()); float_out->Resize(in_x->dims());
float_out->init(type_id<float>().hash_code()); float_out->init(type_id<float>().hash_code());
fpga::format_ofm(float_out.get()); fpga::format_ofm(float_out.get());
auto float_out_data = float_out->data<float>();
math::SoftmaxFuntor<CPU, float>()(in_x, float_out.get()); math::SoftmaxFuntor<CPU, float>()(in_x, float_out.get());
auto float_out_data = float_out->data<float>();
int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT); int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
for (int i = 0; i < dataNum; i++) { for (int i = 0; i < dataNum; i++) {
float tmp_out = float_out_data[i] * 127; float tmp_out = float_out_data[i] * 127;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册