Commit 56f4465b authored by qnqinan, committed by jameswu2014

update v1 and v3 kernel file in FPGA track, fixed #1696 (#1697)

* update concat and split kernel and related files in FPGA v2(v3) track

* update

* update

* update kernel and related files in FPGA v2 track

* update

* update

* update kernel and related files for static quantization in FPGA v2 track

* update

* update feed and fetch kernel in FPGA v2 track

* update io file

* update feed fetch and softmax kernel in FPGA v2 track

* update proposal kernel and other kernels in FPGA v2 track

* update fetch and softmax kernel in fpga v2 track

* update v1 and v3 kernel file in FPGA track
Parent 8fca2857
@@ -35,19 +35,23 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto float_input = new LoDTensor;
-  PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
-                        "Softmax should have 4-order input");
-  auto channel = dims[3];
-  if (channel == 1) {  // This input is generated by FC op, dims = [N C 1 1]
-    PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op");
-    dims[3] = dims[1];
-    dims[1] = 1;
+  int input_n = 1, input_c = 1, input_h = 1, input_w = 1;
+  if (dims.size() == 4) {
+    input_h = dims[1];
+    input_w = dims[2];
+    input_c = dims[3];
+    if (input_c == 1) {  // This input is generated by FC op, dims = [N C 1 1]
+      PADDLE_MOBILE_ENFORCE(input_w == 1, "Softmax input must come from FC op");
+      input_c = dims[1];
+      input_h = 1;
+    }
+  } else if (dims.size() == 2) {
+    input_c = dims[1];
   }
   input->Resize(framework::make_ddim(dims));
   float_input->Resize(framework::make_ddim(dims));
-  if (channel == 2 && input->type() == type_id<half>()) {  // Use FPGA
+  if (input_c == 2 && input->type() == type_id<half>()) {  // Use FPGA
     fpga::format_fp16_ofm(out);
     fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
     args.input_layout_type = fpga::LAYOUT_HWC;
@@ -55,9 +59,9 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
     args.input_data_type = fpga::DATA_TYPE_FP16;
     args.output_data_type = fpga::DATA_TYPE_FP16;
     args.image.address = input_ptr;
-    args.image.height = (uint32_t)input->dims()[1];
-    args.image.width = (uint32_t)input->dims()[2];
-    args.image.channels = (uint32_t)input->dims()[3];
+    args.image.height = input_h;
+    args.image.width = input_w;
+    args.image.channels = input_c;
     args.output.address = out->data<half>();
     args.output.scale_address = out->scale;
     args.output.activation.activation_type = fpga::SOFTMAX;
@@ -67,8 +71,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
     out->mutable_data<float>(framework::make_ddim(dims));
     float_input->init(type_id<float>().hash_code());
     float_input->mutable_data<float>(framework::make_ddim(dims));
-    // fpga::format_fp32_ofm(float_input);
-    // fpga::format_fp32_ofm(out);
+    fpga::format_fp32_ofm(float_input);
+    fpga::format_fp32_ofm(out);
     fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
     args.input_layout_type = fpga::LAYOUT_HWC;
@@ -76,9 +80,9 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
     args.input_data_type = fpga::DATA_TYPE_FP16;
     args.output_data_type = fpga::DATA_TYPE_FP32;
     args.image.address = input_ptr;
-    args.image.height = (uint32_t)dims[1] * dims[0];
-    args.image.width = (uint32_t)dims[2];
-    args.image.channels = (uint32_t)dims[3];
+    args.image.height = input_h;
+    args.image.width = input_w;
+    args.image.channels = input_c;
     args.output.address = float_input->data<float>();
     args.output.scale_address = float_input->scale;
     param->SetFloatInput(float_input);
@@ -91,6 +95,23 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
 template <>
 void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   auto *in_x = (param.InputX());
+  auto dims = in_x->dims();
+  auto n = 1;
+  auto h = 1;
+  auto w = 1;
+  auto c = 1;
+  if (dims.size() == 4) {
+    h = dims[1];
+    w = dims[2];
+    c = dims[3];
+    if (c == 1) {  // This input is generated by FC op, dims = [N C 1 1]
+      PADDLE_MOBILE_ENFORCE(w == 1, "Softmax input must come from FC op");
+      c = dims[1];
+      h = 1;
+    }
+  } else if (dims.size() == 2) {
+    c = dims[1];
+  }
   if (in_x->type() == type_id<half>()) {
     fpga::PerformBypass(param.FpgaArgs());
     if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
@@ -105,8 +126,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   } else {
     if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
       Tensor *out = param.Out();
-      out->Resize(
-          {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
+      out->Resize({n, h, w, c});
       math::SoftmaxFuntor<CPU, float>()(in_x, out);
     }
   }
......
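For readers following the shape handling introduced above: the kernel now accepts both 4-D [N, H, W, C] and 2-D [N, C] inputs, and folds FC outputs of shape [N, C, 1, 1] into the channel dimension before filling the bypass arguments. Below is a minimal standalone sketch of that mapping; the HWC struct and the std::vector-based dims are illustrative stand-ins for the framework's DDim, not part of the patch (batch is left at 1, as in the patch).

```cpp
#include <cstdint>
#include <vector>

// Illustrative only: mirrors the dims -> (h, w, c) mapping added in Init/Compute.
struct HWC {
  uint32_t h = 1, w = 1, c = 1;
};

inline HWC MapSoftmaxDims(const std::vector<int64_t> &dims) {
  HWC s;
  if (dims.size() == 4) {  // [N, H, W, C] layout used by the FPGA track
    s.h = static_cast<uint32_t>(dims[1]);
    s.w = static_cast<uint32_t>(dims[2]);
    s.c = static_cast<uint32_t>(dims[3]);
    if (s.c == 1) {  // FC output, dims = [N, C, 1, 1]: treat dims[1] as channels
      s.c = static_cast<uint32_t>(dims[1]);
      s.h = 1;
    }
  } else if (dims.size() == 2) {  // [N, C] input
    s.c = static_cast<uint32_t>(dims[1]);
  }
  return s;
}
```

With this mapping both the fp16 bypass arguments and the CPU fallback see a consistent (h, w, c) triple regardless of the input rank.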
@@ -19,13 +19,10 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 template <>
 bool ConvAddBNReluKernel<FPGA, float>::Init(
     FusionConvAddBNReluParam<FPGA> *param) {
   bool relu_enabled = true;
-  // paddle_mobile::fpga::ActivationType activation_enable =
-  //     paddle_mobile::fpga::LEAKYRELU;
   auto input = const_cast<LoDTensor *>(param->Input());
   auto bias = param->Bias();
   auto bias_ptr = bias->data<float>();
@@ -42,6 +39,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
   auto bn_scale_ptr = param->InputScale()->data<float>();
   auto bn_bias_ptr = param->InputBias()->data<float>();
   const float epsilon = param->Epsilon();
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
                             bias->dims()[0] == param->InputBias()->dims()[0],
                         "Output channel should be equal to bias number");
@@ -75,6 +73,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
                               new_bias_ptr);
     param->SetFpgaArgs(dwconv_arg);
     fpga::fpga_free(bs_ptr);
+    delete new_scale;
   } else {
     fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
     fpga::SplitConvArgs conv_arg = {0};
@@ -82,9 +81,10 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
                            param->Groups(), strides[0], strides[1], paddings[0],
                            paddings[1], bs_ptr);
     param->SetFpgaArgs(conv_arg);
+  }
   delete new_scale;
   delete new_bias;
-  }
   return true;
 }
......
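The last two hunks above rearrange the manual cleanup of the temporary new_scale and new_bias tensors around the depthwise/regular convolution branches. Below is a hedged sketch of the same ownership concern expressed with std::unique_ptr, which makes the cleanup independent of the branch structure; the Tensor type, function name, and branch condition are placeholders for illustration, not the real paddle-mobile API.

```cpp
#include <memory>

// Placeholder tensor type, illustration only; not the real paddle-mobile Tensor.
struct Tensor {
  explicit Tensor(int n) : buf_(new float[n]()) {}
  ~Tensor() { delete[] buf_; }
  float *data() { return buf_; }

 private:
  float *buf_;
};

// Sketch: owning the temporary scale/bias tensors with std::unique_ptr releases
// them on every path out of Init(), so neither convolution branch needs its own
// delete statements.
bool InitScaleBias(bool depthwise_path, int channel) {
  auto new_scale = std::make_unique<Tensor>(channel);
  auto new_bias = std::make_unique<Tensor>(channel);
  if (depthwise_path) {
    // ... fill depthwise-conv args with new_scale->data() / new_bias->data() ...
  } else {
    // ... format conv data and fill split-conv args ...
  }
  return true;  // new_scale and new_bias are freed automatically here
}
```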
@@ -114,6 +114,7 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
     output->ShareDataWith(*input);
     framework::LoD lod = input->lod();
     output->set_lod(lod);
+    output->scale[0] = input->scale[0];
     return;
   }
......
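The single added line pairs the buffer aliasing done by ShareDataWith with a copy of the quantization scale, so downstream kernels that dequantize the shared data keep using the producer's scale. A toy sketch of that invariant follows; QuantTensor is an illustrative stand-in for the framework's LoDTensor, whose scale field the patch touches.

```cpp
#include <memory>

// Toy model of the invariant maintained by the added line: when the output
// merely aliases the input's buffer, the per-tensor quantization scale must be
// copied explicitly, or consumers dequantize with a stale factor.
struct QuantTensor {
  std::shared_ptr<signed char> data;  // aliased quantized payload
  float scale[2] = {1.0f, 0.0f};      // scale[0]: dequantization factor

  void ShareDataWith(const QuantTensor &other) { data = other.data; }
};

inline void Reshape2InPlace(const QuantTensor &input, QuantTensor *output) {
  output->ShareDataWith(input);       // reuse the payload, no copy
  output->scale[0] = input.scale[0];  // keep the scale consistent (the fix above)
}
```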
@@ -21,11 +21,12 @@ namespace operators {
 template <>
 bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
-  paddle_mobile::fpga::ActivationType activation_enable =
-      paddle_mobile::fpga::SIGMOID;
-  int16_t leaky_relu_negative_slope = 0;
   auto input = const_cast<LoDTensor *>(param->InputX());
   auto input_ptr = input->data<int8_t>();
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::SIGMOID;
+  int16_t leaky_relu_negative_slope =
+      fpga::fp32_2_fp16(input->scale[0] / 127.0);
   auto out = param->Out();
   fpga::format_ofm(out);
@@ -47,6 +48,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
 template <>
 void SigmoidKernel<FPGA, float>::Compute(const SigmoidParam<FPGA> &param) {
   fpga::PerformBypass(param.FpgaArgs());
+  param.Out()->scale[0] = 127.0;
 }
 }  // namespace operators
......
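Two int8-related details change in this kernel: the bypass argument's leaky_relu_negative_slope now carries the input's dequantization factor scale[0] / 127 (converted to fp16 by fpga::fp32_2_fp16), and the output scale is pinned to 127.0 after the bypass runs. The division by 127 matches the convention visible in the softmax CPU fallback below (real_value = int8_value * scale / 127); here is a small hedged sketch of that arithmetic, with function names chosen only for illustration.

```cpp
#include <cstdint>

// Hedged sketch of the int8 convention used by these kernels:
//   real_value = int8_value * scale / 127
// so the per-element factor handed to the FPGA bypass is scale / 127
// (the real kernel converts it to fp16 with fpga::fp32_2_fp16).
inline float DequantizeInt8(int8_t q, float scale) {
  return static_cast<float>(q) * scale / 127.0f;
}

inline float SigmoidBypassSlopeFp32(float input_scale) {
  return input_scale / 127.0f;  // fed through fp32_2_fp16 in the patch
}
```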
@@ -28,17 +28,22 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto out = param->Out();
   out->Resize(framework::make_ddim(dims));
-  PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
-                        "Softmax should have 4-order input");
-  auto channel = dims[3];
-  if (channel == 1) {  // This input is generated by FC op, dims = [N C 1 1]
-    PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op");
-    dims[3] = dims[1];
-    dims[1] = 1;
+  int input_c = 1, input_h = 1, input_w = 1;
+  if (dims.size() == 4) {
+    input_h = dims[1];
+    input_w = dims[2];
+    input_c = dims[3];
+    if (input_c == 1) {  // This input is generated by FC op, dims = [N C 1 1]
+      PADDLE_MOBILE_ENFORCE(input_w == 1, "Softmax input must come from FC op");
+      input_c = dims[1];
+      input_h = 1;
+    }
+  } else if (dims.size() == 2) {
+    input_c = dims[1];
   }
   input->Resize(framework::make_ddim(dims));
-  if ((channel == 2) && (input->type() == type_id<int8_t>())) {
+  if ((input_c == 2) && (input->type() == type_id<int8_t>())) {
     auto input_ptr = input->data<int8_t>();
     float Si = input->scale[0];
     int16_t slope = fpga::fp32_2_fp16(Si / 127);
@@ -50,22 +55,14 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
     args.input_data_type = fpga::DATA_TYPE_FP16;
     args.output_data_type = fpga::DATA_TYPE_FP16;
     args.image.address = input_ptr;
-    args.image.height = (uint32_t)input->dims()[1];
-    args.image.width = (uint32_t)input->dims()[2];
-    args.image.channels = (uint32_t)input->dims()[3];
+    args.image.height = input_h;
+    args.image.width = input_w;
+    args.image.channels = input_c;
     args.output.address = out->data<int8_t>();
     args.output.scale_address = out->scale;
     args.output.activation.activation_type = fpga::SOFTMAX;
     args.output.activation.leaky_relu_negative_slope = slope;
     param->SetFpgaArgs(args);
-  } else if (input->type() == type_id<int8_t>()) {
-    auto float_input_x = param->float_input_x_;
-    float_input_x = std::make_shared<Tensor>();
-    float_input_x->Resize(input->dims());
-    float_input_x->init(type_id<float>().hash_code());
-    fpga::format_ofm(float_input_x.get());
-    out->mutable_data<float>(framework::make_ddim(dims));
-    fpga::format_ofm(out);
   } else {
     out->mutable_data<float>(framework::make_ddim(dims));
     fpga::format_ofm(out);
@@ -78,36 +75,45 @@ template <>
 void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   auto *in_x = (param.InputX());
   auto dims = in_x->dims();
-  auto n = dims[0];
-  auto h = dims[1];
-  auto w = dims[2];
-  auto c = dims[3];
+  auto n = 1;
+  auto h = 1;
+  auto w = 1;
+  auto c = 1;
+  if (dims.size() == 4) {
+    h = dims[1];
+    w = dims[2];
+    c = dims[3];
+    if (c == 1) {  // This input is generated by FC op, dims = [N C 1 1]
+      PADDLE_MOBILE_ENFORCE(w == 1, "Softmax input must come from FC op");
+      c = dims[1];
+      h = 1;
+    }
+  } else if (dims.size() == 2) {
+    c = dims[1];
+  }
   if ((c == 2) && (in_x->type() == type_id<int8_t>())) {
     fpga::PerformBypass(param.FpgaArgs());
   } else if (in_x->type() == type_id<int8_t>()) {
     auto in_data = in_x->data<int8_t>();
     float Si = in_x->scale[0];
     Tensor *out = param.Out();
-    out->Resize(
-        {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
+    out->Resize({n, h, w, c});
     auto float_input_x = param.float_input_x_;
+    float_input_x = std::make_shared<Tensor>();
+    float_input_x->Resize(in_x->dims());
+    float_input_x->init(type_id<float>().hash_code());
+    fpga::format_fp32_ofm(float_input_x.get());
     auto float_input_x_data = float_input_x->data<float>();
     int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
     for (int i = 0; i < dataNum; i++) {
       float_input_x_data[i] = in_data[i] * Si / 127;
     }
     math::SoftmaxFuntor<CPU, float>()(float_input_x.get(), out);
+    auto out_data = out->data<float>();
+    fpga::fpga_flush(out_data, dataNum * sizeof(float));
   } else {
     Tensor *out = param.Out();
-    out->Resize(
-        {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
+    out->Resize({n, h, w, c});
     math::SoftmaxFuntor<CPU, float>()(in_x, out);
+    int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
+    auto out_data = out->data<float>();
+    fpga::fpga_flush(out_data, dataNum * sizeof(float));
   }
 }
......
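The Compute hunk above completes the CPU fallback for int8 inputs that the two-channel FPGA path does not cover: allocate a float staging tensor, dequantize each element with Si / 127, run the CPU softmax functor, and flush the result buffer for the FPGA-visible output. Below is a compact, self-contained sketch of that fallback over a flat buffer; the IMAGE_ALIGNMENT padding and fpga_flush details of the real kernel are deliberately omitted, and the function is illustrative rather than the kernel's actual code.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

// Illustrative CPU fallback mirroring the int8 branch above: dequantize with
// scale / 127, then apply a numerically stable softmax over each row of
// `channels` values. The real kernel also pads w * c to IMAGE_ALIGNMENT and
// flushes the output buffer with fpga::fpga_flush.
inline std::vector<float> SoftmaxInt8Fallback(const std::vector<int8_t> &in,
                                              float scale, int channels) {
  std::vector<float> out(in.size(), 0.0f);
  for (std::size_t row = 0; row + channels <= in.size(); row += channels) {
    float max_v = -std::numeric_limits<float>::infinity();
    for (int c = 0; c < channels; ++c) {
      out[row + c] = in[row + c] * scale / 127.0f;  // dequantize
      max_v = std::max(max_v, out[row + c]);
    }
    float sum = 0.0f;
    for (int c = 0; c < channels; ++c) {
      out[row + c] = std::exp(out[row + c] - max_v);
      sum += out[row + c];
    }
    for (int c = 0; c < channels; ++c) {
      out[row + c] /= sum;
    }
  }
  return out;
}
```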