提交 a10c01f7 编写于 作者: qnqinan 提交者: jameswu2014

update proposal kernel and other kernels in FPGA v2 track, fixed #1676 (#1677)

* update concat and split kernel and related files in FPGA v2(v3) track

* update

* update

* update kernel and related files in FPGA v2 track

* update

* update

* update kernel and related files for static quantization in FPGA v2 track

* update

* update feed and fetch kernel in FPGA v2 track

* update io file

* update feed fetch and softmax kernel in FPGA v2 track

* update proposal kernel and other kernels in FPGA v2 track
上级 6126d29c
......@@ -495,6 +495,8 @@ void expand_EW_arg(EWAddArgs *arg) {
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
IMAGE_ALIGNMENT);
uint64_t image_amount_per_row_p = align_to_x(
(uint64_t)args.image0.width * (uint64_t)args.image0.channels, 16);
uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
((uint64_t)args.image0.width << 16) |
(uint64_t)args.image0.height;
......@@ -503,7 +505,8 @@ void expand_EW_arg(EWAddArgs *arg) {
(*arg).driver.image1_address_phy = image1_address_phy;
(*arg).driver.datalen = datalen;
(*arg).driver.image_image_pixel = image_image_pixel;
(*arg).driver.image_amount_per_row = image_amount_per_row;
(*arg).driver.image_amount_per_row =
(uint64_t)image_amount_per_row | (uint64_t)(image_amount_per_row_p << 32);
(*arg).driver.output_address_phy = output_address_phy;
(*arg).driver.coefficient = coefficient;
(*arg).driver.cmd = cmd;
......
......@@ -25,9 +25,6 @@ template <>
bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<int8_t>();
auto input_y_ptr = input_y->data<int8_t>();
......@@ -39,11 +36,9 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
float C1 = Si_1 / So;
float C2 = Si_2 / So;
fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.const0 = fpga::fp32_2_fp16(C1);
ewaddArgs.const1 = fpga::fp32_2_fp16(C2);
ewaddArgs.relu_enabled = 0;
ewaddArgs.image0.address = input_x_ptr;
ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
ewaddArgs.image0.scale_address = input_x->scale;
......
......@@ -21,9 +21,6 @@ namespace operators {
template <>
bool ElementwiseAddReluKernel<FPGA, float>::Init(
ElementwiseAddReluParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
......@@ -37,9 +34,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
float C1 = Si_1 / So;
float C2 = Si_2 / So;
fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.relu_enabled = 1;
ewaddArgs.const0 = fpga::fp32_2_fp16(C1);
ewaddArgs.const1 = fpga::fp32_2_fp16(C2);
ewaddArgs.image0.address = input_x_ptr;
......
......@@ -44,8 +44,19 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
return;
}
fpga::format_image(input);
output->ShareDataWith(*input);
input->external_data = nullptr;
auto output_ptr = output->data<int8_t>();
int channel = output->dims()[1];
int height = output->dims()[2];
int width = output->dims()[3];
int size = fpga::align_to_x(channel * width, IMAGE_ALIGNMENT) * height;
auto input_ptr = input->data<int8_t>();
fpga::fpga_invalidate(input_ptr, size * sizeof(int8_t));
memcpy(output_ptr, input_ptr, size * sizeof(int8_t));
fpga::fpga_flush(output_ptr,
fpga::align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
sizeof(int8_t));
}
template class FeedKernel<FPGA, float>;
......
......@@ -380,37 +380,54 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
auto bbox_tmp_data = bbox_tmp->data<int8_t>();
int64_t amount_per_side = score_width * score_height;
int idx = 0;
fpga::fpga_invalidate(input_score_data, score_height * score_width *
score_channels * sizeof(int8_t));
int alignedCW =
fpga::align_to_x(score_width * score_channels, IMAGE_ALIGNMENT);
int unalignedCW = score_width * score_channels;
fpga::fpga_invalidate(input_score_data,
score_height * alignedCW * sizeof(int8_t));
for (int h = 0; h < score_height; h++) {
for (int w = 0; w < score_width; w++) {
for (int c = 0; c < score_channels; c++) {
idx++;
*(score_tmp_data + c * amount_per_side + score_width * h + w) =
(*(input_score_data++));
if (alignedCW == unalignedCW) {
*(score_tmp_data + c * amount_per_side + score_width * h + w) =
(*(input_score_data++));
} else {
idx = h * alignedCW + w * score_channels + c;
*(score_tmp_data + c * amount_per_side + score_width * h + w) =
input_score_data[idx];
}
}
}
}
amount_per_side = bbox_width * bbox_height;
fpga::fpga_invalidate(input_bbox_data, bbox_height * bbox_width *
bbox_channels * sizeof(int8_t));
alignedCW = fpga::align_to_x(bbox_width * bbox_channels, IMAGE_ALIGNMENT);
unalignedCW = bbox_width * bbox_channels;
fpga::fpga_invalidate(input_bbox_data,
bbox_height * alignedCW * sizeof(int8_t));
for (int h = 0; h < bbox_height; h++) {
for (int w = 0; w < bbox_width; w++) {
for (int c = 0; c < bbox_channels; c++) {
idx++;
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
(*(input_bbox_data++));
if (alignedCW == unalignedCW) {
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
(*(input_bbox_data++));
} else {
idx = h * alignedCW + w * bbox_channels + c;
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
input_bbox_data[idx];
}
}
}
}
auto score_tensor = param.float_score.get();
for (int i = 0; i < score_height * score_width * score_channels; i++) {
score_tensor->data<float>()[i] = score_tmp_data[i] * input_score->scale[0];
score_tensor->data<float>()[i] =
score_tmp_data[i] / 127.0 * input_score->scale[0];
}
auto bbox_tensor = param.float_bbox.get();
for (int i = 0; i < bbox_height * bbox_width * bbox_channels; i++) {
bbox_tensor->data<float>()[i] = bbox_tmp_data[i] * input_bbox->scale[0];
bbox_tensor->data<float>()[i] =
bbox_tmp_data[i] / 127.0 * input_bbox->scale[0];
}
auto *scores = param.float_score.get();
auto *bbox_deltas = param.float_bbox.get();
......
......@@ -103,7 +103,7 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto float_input_tensor = param.float_input.get();
auto float_input_data = float_input_tensor->data<float>();
for (int i = 0; i < float_input_tensor->numel(); i++) {
float_input_data[i] = input_data[i] * Si;
float_input_data[i] = input_data[i] / 127.0 * Si;
}
auto* in = float_input_tensor;
......
......@@ -25,7 +25,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
fpga::format_ofm(output);
DLOG << "input: " << param->input_;
DLOG << "output: " << param->output_;
if (param->input_->type() != type_id<half>()) {
if (param->input_->type() != type_id<int8_t>()) {
DLOG << "wrong type";
}
return true;
......
......@@ -123,8 +123,8 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
float_out->Resize(in_x->dims());
float_out->init(type_id<float>().hash_code());
fpga::format_ofm(float_out.get());
auto float_out_data = float_out->data<float>();
math::SoftmaxFuntor<CPU, float>()(in_x, float_out.get());
auto float_out_data = float_out->data<float>();
int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
for (int i = 0; i < dataNum; i++) {
float tmp_out = float_out_data[i] * 127;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册