Unverified · Commit cf542116 · authored by qnqinan, committed by GitHub

update feed fetch and softmax kernel in FPGA v2 track, fixed #1615 (#1616)

* update concat and split kernel and related files in FPGA v2(v3) track

* update

* update

* update kernel and related files in FPGA v2 track

* update

* update

* update kernel and related files for static quantization in FPGA v2 track

* update

* update feed and fetch kernel in FPGA v2 track

* update io file

* update feed fetch and softmax kernel in FPGA v2 track
Parent 3a9016d7
@@ -50,11 +50,13 @@ void format_int8_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
   if (dims.size() == 4) {
-    auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1],
+         height = dims[2], width = dims[3];
     memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
                   sizeof(int8_t);
   } else if (dims.size() == 2) {
-    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(int8_t);
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1];
+    memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(int8_t);
   } else {
     DLOG << "Wrong ofm dimension";
   }
@@ -68,11 +70,13 @@ void format_int8_ofm(framework::Tensor *ofm_tensor) {
 void format_int8_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
   size_t memory_size = 0;
   if (dims.size() == 4) {
-    auto channel = dims[1], height = dims[2], width = dims[3];
-    memory_size =
-        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(int8_t);
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1],
+         height = dims[2], width = dims[3];
+    memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
+                  sizeof(int8_t);
   } else if (dims.size() == 2) {
-    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(int8_t);
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1];
+    memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(int8_t);
   } else {
     DLOG << "Wrong ofm dimension";
   }
@@ -87,11 +91,13 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
   if (dims.size() == 4) {
-    auto channel = dims[1], height = dims[2], width = dims[3];
-    memory_size =
-        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(float);
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1],
+         height = dims[2], width = dims[3];
+    memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
+                  sizeof(float);
   } else if (dims.size() == 2) {
-    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float);
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1];
+    memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(float);
   } else {
     DLOG << "Wrong ofm dimension";
   }
......
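The three hunks above make the same change to output-feature-map sizing: the batch dimension now enters the byte count, with dims[0] == 0 treated as a batch of 1, and the 2-D branch is padded the same way as the 4-D one. Assuming align_to_x(n, x) rounds n up to the next multiple of x and that IMAGE_ALIGNMENT is the FPGA row alignment (16 is used here as a stand-in), the arithmetic works out as in this small self-contained sketch:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    constexpr int kImageAlignment = 16;  // stand-in for IMAGE_ALIGNMENT

    // Assumed behaviour of align_to_x(): round n up to the next multiple of x.
    inline int align_to_x(int n, int x) { return (n + x - 1) / x * x; }

    int main() {
      int num = 1, channel = 3, height = 224, width = 224;
      // Mirrors the 4-D branch above: num * height * aligned(channel * width) bytes.
      std::size_t memory_size = static_cast<std::size_t>(num) * height *
                                align_to_x(channel * width, kImageAlignment) *
                                sizeof(int8_t);
      std::printf("int8 ofm bytes: %zu\n", memory_size);  // 224 * 672 = 150528
      return 0;
    }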
@@ -21,6 +21,7 @@ template <>
 bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
   auto output = param->Out();
   if (output->dims().size() != 4) {
+    output->init(type_id<float>().hash_code());
     return true;
   }
   fpga::format_ofm(output);
@@ -44,6 +45,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   }
   fpga::format_image(input);
   output->ShareDataWith(*input);
+  input->external_data = nullptr;
 }
 template class FeedKernel<FPGA, float>;
......
@@ -23,9 +23,7 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
   auto output = &(param->Out()->at(col));
   output->init(type_id<float>().hash_code());
   output->mutable_data<float>(input->dims());
-  if (input->type() == type_id<float>()) {
-    return true;
-  }
   auto aligned_output = param->aligned_out;
   int outC = 1;
   int outW = 1;
@@ -61,13 +59,33 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   auto input = const_cast<LoDTensor *>(param.InputX());
   int col = param.Col();
   auto output = &param.Out()->at(col);
+  auto outdata_ptr = const_cast<float *>(output->data<float>());
+  int outC = 1;
+  int outH = 1;
+  int outW = 1;
+  if (output->dims().size() == 4) {
+    outC = output->dims()[1];
+    outH = output->dims()[2];
+    outW = output->dims()[3];
+  } else {  // 2
+    outC = output->dims()[1];
+  }
+  int unalignedCW = outC * outW;
+  int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
   if (input->type() == type_id<float>()) {
-    output->ShareDataWith(*input);
+    if (unalignedCW == alignedCW) {
+      output->ShareDataWith(*input);
+    } else {
+      auto input_address = input->data<float>();
+      dealign(input_address, outdata_ptr, outC, outH, outW);
+      fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
+    }
     return;
   }
   auto input_address = input->data<int8_t>();
   float Si = input->scale[0];
-  auto outdata_ptr = const_cast<float *>(output->data<float>());
   const int num_th = 32;
   fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(int8_t));
   if (input->fpga_data_num < num_th) {
@@ -77,21 +95,11 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
     fpga::fpga_flush(outdata_ptr, product(input->dims()) * sizeof(float));
     return;
   }
-  int outC = 1;
-  int outH = 1;
-  int outW = 1;
-  if (output->dims().size() == 4) {
-    outC = output->dims()[1];
-    outH = output->dims()[2];
-    outW = output->dims()[3];
-  } else {  // 2
-    outC = output->dims()[1];
-  }
-  int unalignedCW = outC * outW;
-  int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
   auto aligned_out = param.aligned_out.get();
   if (unalignedCW != alignedCW) {
     auto aligned_ptr = aligned_out->data<float>();
+    fpga::fpga_invalidate(aligned_ptr, (input->fpga_data_num) * sizeof(float));
     for (int idx = 0; idx < input->fpga_data_num; ++idx) {
       aligned_ptr[idx] = input_address[idx] * Si;
     }
......
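In the new float path above, the output only shares the input buffer when outC * outW is already a multiple of IMAGE_ALIGNMENT; otherwise dealign() strips the per-row padding before the flush. dealign() itself is defined elsewhere in the fetch kernel source and is not part of this diff; a hedged sketch of what such a copy could look like, with stand-in names for the alignment helpers:

    #include <cstring>

    static const int kImageAlignment = 16;  // stand-in for IMAGE_ALIGNMENT
    static int align_to_x(int n, int x) { return (n + x - 1) / x * x; }

    // Hypothetical dealign(): copy each of the outH rows, keeping only the first
    // outC * outW floats and skipping the alignment padding at the end of each row.
    static void dealign(const float *in, float *out, int outC, int outH, int outW) {
      int unalignedCW = outC * outW;                             // useful floats per row
      int alignedCW = align_to_x(unalignedCW, kImageAlignment);  // padded row stride
      for (int h = 0; h < outH; ++h) {
        std::memcpy(out + h * unalignedCW, in + h * alignedCW,
                    unalignedCW * sizeof(float));
      }
    }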
@@ -24,16 +24,11 @@ template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto input = const_cast<LoDTensor *>(param->InputX());
   auto dims = framework::vectorize(input->dims());
-  int8_t *input_ptr;
-  auto out = param->Out();
-  if (input->type() == type_id<float>()) {
-    out->Resize(framework::make_ddim(dims));
-    out->mutable_data<float>(framework::make_ddim(dims));
-  } else {
-    input_ptr = input->data<int8_t>();
-  }
-  auto float_input = new LoDTensor;
+  auto out = param->Out();
+  out->Resize(framework::make_ddim(dims));
+  out->mutable_data<int8_t>(framework::make_ddim(dims));
+  fpga::format_ofm(out);
   PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
                         "Softmax should have 4-order input");
@@ -45,28 +40,10 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
     dims[1] = 1;
   }
   input->Resize(framework::make_ddim(dims));
-  float_input->Resize(framework::make_ddim(dims));
-  if (channel != 2) {  // Use CPU
-    out->Resize(framework::make_ddim(dims));
-    out->mutable_data<float>(framework::make_ddim(dims));
-    float_input->init(type_id<float>().hash_code());
-    float_input->mutable_data<float>(framework::make_ddim(dims));
-    fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-    args.input_layout_type = fpga::LAYOUT_HWC;
-    args.output_layout_type = fpga::LAYOUT_CHW;
-    args.input_data_type = fpga::DATA_TYPE_FP16;
-    args.output_data_type = fpga::DATA_TYPE_FP32;
-    args.image.address = input_ptr;
-    args.image.height = (uint32_t)dims[1] * dims[0];
-    args.image.width = (uint32_t)dims[2];
-    args.image.channels = (uint32_t)dims[3];
-    args.output.address = float_input->data<float>();
-    args.output.scale_address = float_input->scale;
-    param->SetFloatInput(float_input);
-    param->SetFpgaArgs(args);
-  } else {  // Use FPGA
+  if ((channel == 2) && (input->type() == type_id<int8_t>())) {
+    auto input_ptr = input->data<int8_t>();
+    float Si = input->scale[0];
+    int16_t slope = fpga::fp32_2_fp16(Si / 127);
     fpga::format_ofm(out);
     fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
     args.input_layout_type = fpga::LAYOUT_HWC;
@@ -77,10 +54,28 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
     args.image.height = (uint32_t)input->dims()[1];
     args.image.width = (uint32_t)input->dims()[2];
     args.image.channels = (uint32_t)input->dims()[3];
-    args.output.address = out->data<half>();
+    args.output.address = out->data<int8_t>();
     args.output.scale_address = out->scale;
     args.output.activation.activation_type = fpga::SOFTMAX;
+    args.output.activation.leaky_relu_negative_slope = slope;
     param->SetFpgaArgs(args);
+  } else if (input->type() == type_id<int8_t>()) {
+    auto float_input_x = param->float_input_x_;
+    float_input_x = std::make_shared<Tensor>();
+    float_input_x->Resize(input->dims());
+    float_input_x->init(type_id<float>().hash_code());
+    fpga::format_ofm(float_input_x.get());
+    auto float_out = param->float_out;
+    float_out = std::make_shared<Tensor>();
+    float_out->Resize(input->dims());
+    float_out->init(type_id<float>().hash_code());
+    fpga::format_ofm(float_out.get());
+  } else {
+    auto float_out = param->float_out;
+    float_out = std::make_shared<Tensor>();
+    float_out->Resize(input->dims());
+    float_out->init(type_id<float>().hash_code());
+    fpga::format_ofm(float_out.get());
   }
   return true;
@@ -89,24 +84,54 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
 template <>
 void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   auto *in_x = (param.InputX());
-  if (in_x->type() == type_id<int8_t>()) {
+  auto dims = in_x->dims();
+  auto n = dims[0];
+  auto h = dims[1];
+  auto w = dims[2];
+  auto c = dims[3];
+  if ((c == 2) && (in_x->type() == type_id<int8_t>())) {
     fpga::PerformBypass(param.FpgaArgs());
-    if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
-      Tensor *out = param.Out();
-      Tensor *in_x2 = param.FloatInput();
-      fpga::fpga_invalidate(in_x2->data<float>(),
-                            in_x2->numel() * sizeof(float));
-      math::SoftmaxFuntor<CPU, float>()(in_x2, out);
-      fpga::fpga_flush(out->data<float>(), out->memory_size());
-    }
+  } else if (in_x->type() == type_id<int8_t>()) {
+    auto in_data = in_x->data<int8_t>();
+    float Si = in_x->scale[0];
+    Tensor *out = param.Out();
+    out->Resize(
+        {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
+    auto out_data = out->data<int8_t>();
+    auto float_input_x = param.float_input_x_;
+    auto float_input_x_data = float_input_x->data<float>();
+    int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
+    for (int i = 0; i < dataNum; i++) {
+      float_input_x_data[i] = in_data[i] * Si / 127;
+    }
+    auto float_out = param.float_out;
+    auto float_out_data = float_out->data<float>();
+    math::SoftmaxFuntor<CPU, float>()(float_input_x.get(), float_out.get());
+    for (int i = 0; i < dataNum; i++) {
+      float tmp_out = float_out_data[i] * 127;
+      out_data[i] = tmp_out < 0 ? (signed char)(tmp_out - 0.5)
+                                : (signed char)(tmp_out + 0.5);
+    }
+    fpga::fpga_flush(out_data, dataNum * sizeof(int8_t));
   } else {
-    if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
-      Tensor *out = param.Out();
-      out->Resize(
-          {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
-      math::SoftmaxFuntor<CPU, float>()(in_x, out);
-    }
+    Tensor *out = param.Out();
+    out->Resize(
+        {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
+    auto out_data = out->data<int8_t>();
+    auto float_out = param.float_out;
+    float_out = std::make_shared<Tensor>();
+    float_out->Resize(in_x->dims());
+    float_out->init(type_id<float>().hash_code());
+    fpga::format_ofm(float_out.get());
+    auto float_out_data = float_out->data<float>();
+    math::SoftmaxFuntor<CPU, float>()(in_x, float_out.get());
+    int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
+    for (int i = 0; i < dataNum; i++) {
+      float tmp_out = float_out_data[i] * 127;
+      out_data[i] = tmp_out < 0 ? (signed char)(tmp_out - 0.5)
+                                : (signed char)(tmp_out + 0.5);
+    }
+    fpga::fpga_flush(out_data, dataNum * sizeof(int8_t));
   }
 }
......
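With the softmax output now kept in int8, the CPU fallback paths above dequantize with the input scale (in_data[i] * Si / 127), run the float softmax, then requantize by scaling by 127 and rounding to nearest. A small self-contained sketch of that round-trip under the same symmetric-scale assumption (helper names are illustrative, not from the source):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Dequantize int8 data with its symmetric scale, as the kernel does before
    // calling the CPU softmax: real_value = q * Si / 127.
    std::vector<float> dequantize(const std::vector<int8_t> &q, float Si) {
      std::vector<float> out(q.size());
      for (std::size_t i = 0; i < q.size(); ++i) out[i] = q[i] * Si / 127.0f;
      return out;
    }

    // Requantize the float softmax result back to int8 with round-to-nearest,
    // matching the (signed char)(tmp_out +/- 0.5) pattern in the kernel.
    std::vector<int8_t> requantize(const std::vector<float> &f) {
      std::vector<int8_t> out(f.size());
      for (std::size_t i = 0; i < f.size(); ++i) {
        float v = f[i] * 127.0f;  // softmax outputs lie in [0, 1]
        out[i] = static_cast<int8_t>(v < 0 ? v - 0.5f : v + 0.5f);
      }
      return out;
    }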
@@ -1101,6 +1101,8 @@ class SoftmaxParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
+#ifdef PADDLE_MOBILE_FPGA_V1
+
  private:
   std::shared_ptr<GType> float_input_x_;
   fpga::BypassArgs fpga_bypass_args;
@@ -1112,6 +1114,18 @@ class SoftmaxParam : public OpParam {
   void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); }
   const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
   void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+#else
+
+ private:
+  fpga::BypassArgs fpga_bypass_args;
+
+ public:
+  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
+  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+
+ public:
+  std::shared_ptr<Tensor> float_input_x_, float_out;
+#endif
 #endif
 };
 #endif
......