Commit b574d803 authored by qnqinan, committed by GitHub

Merge pull request #1487 from qnqinan/develop

add int8-to-fp16 convert function in the FPGA track; fixes #1486
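For context, a minimal host-side sketch of the element-wise conversion the new bypass command asks the hardware for. The `int8_to_fp16` helper below is illustrative only (not part of this patch); it relies on every int8 value being exactly representable in binary16, so no rounding is needed, and it uses the GCC/Clang `__builtin_clz` intrinsic.

```cpp
#include <cstdint>

// Widen one int8 value to an IEEE 754 binary16 bit pattern (exact for [-128, 127]).
inline uint16_t int8_to_fp16(int8_t v) {
  if (v == 0) return 0;                                        // +0.0
  uint16_t sign = v < 0 ? 0x8000 : 0x0000;
  int mag = v < 0 ? -static_cast<int>(v) : v;                  // 1 .. 128
  int msb = 31 - __builtin_clz(static_cast<unsigned>(mag));    // position of the leading bit
  uint16_t exp = static_cast<uint16_t>(15 + msb);              // biased 5-bit exponent
  uint16_t mant = static_cast<uint16_t>((mag << (10 - msb)) & 0x3FF);  // drop the implicit 1
  return sign | static_cast<uint16_t>(exp << 10) | mant;
}
```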
......@@ -28,14 +28,26 @@ namespace fpga {
void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3];
std::type_index input_type = image_tensor->type();
if (input_type == typeid(float)) {
auto data_ptr = image_tensor->data<float>();
auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
image::format_image(&p_data, channel, height, width);
image::format_image<float>(&p_data, channel, height, width);
if (p_data != data_ptr && external_ptr == nullptr) {
image_tensor->reset_data_ptr(p_data);
}
} else {
auto data_ptr = image_tensor->data<int8_t>();
auto external_ptr = reinterpret_cast<int8_t *>(image_tensor->external_data);
int8_t *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
image::format_image<int8_t>(&p_data, channel, height, width);
if (p_data != data_ptr && external_ptr == nullptr) {
image_tensor->reset_data_ptr(p_data);
}
}
}
void format_ofm(framework::Tensor *ofm_tensor) {
......
......@@ -13,9 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V1/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
......@@ -58,37 +55,6 @@ void convert_to_chw(float **data_in, int channel, int height, int width,
*data_in = data_tmp;
}
void align_element_conv(float **data_in, int height, int cw) {
int h = 0;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
float *data_tmp =
(float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT
memset(data_tmp, 0, height * align_cw * sizeof(float));
for (h = 0; h < height; h++) {
memcpy((void *)(data_tmp + h * align_cw), // NOLINT
(void *)(*data_in + h * cw), // NOLINT
cw * sizeof(float));
}
*data_in = data_tmp;
}
void format_image(float **data_in, int channel, int height, int width) {
// convert_to_hwc(data_in, channel, height, width);
int cw = channel * width;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *hwc_temp = *data_in;
align_element_conv(data_in, height, channel * width);
fpga_free(hwc_temp);
}
fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
sizeof(float));
}
void concat_images(int16_t **images_in, float **scales_in, void *image_out,
float *scale_out, int image_num, uint32_t *channel_num,
int height, int width) {
......
......@@ -14,8 +14,10 @@ limitations under the License. */
#pragma once
#include <memory.h>
#include <algorithm>
#include <cstdint>
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
namespace image {
......@@ -24,10 +26,42 @@ void convert_to_hwc(float** data_in, int channel, int height, int width,
int num = 1);
void convert_to_chw(float** data_in, int channel, int height, int width,
int num = 1);
// template <typename Dtype>
// void align_element_conv(Dtype** data_in, int height, int cw);
// template <typename T>
// void format_image(T** data_in, int channel, int height, int width);
void align_element_conv(float** data_in, int height, int cw);
void format_image(float** data_in, int channel, int height, int width);
template <typename Dtype>
void align_element_conv(Dtype** data_in, int height, int cw);
template <typename Dtype>
void align_element_conv(Dtype** data_in, int height, int cw) {
int h = 0;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
Dtype* data_tmp =
(Dtype*)fpga_malloc(height * align_cw * sizeof(Dtype));  // NOLINT
memset(data_tmp, 0, height * align_cw * sizeof(Dtype));
for (h = 0; h < height; h++) {
memcpy((void*)(data_tmp + h * align_cw),  // NOLINT
(void*)(*data_in + h * cw),  // NOLINT
cw * sizeof(Dtype));
}
*data_in = data_tmp;
}
template <typename T>
void format_image(T** data_in, int channel, int height, int width) {
int cw = channel * width;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
T* hwc_temp = *data_in;
align_element_conv(data_in, height, channel * width);
fpga_free(hwc_temp);
}
fpga_flush(*data_in,
align_to_x(channel * width, IMAGE_ALIGNMENT) * height * sizeof(T));
}
// Concat featuremaps along channel direction
void concat_images(int16_t** images_in, float** scales_in, void* image_out,
float* scale_out, int image_num, uint32_t* channel_num,
......
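As a usage sketch, a caller can now align an int8 HWC image in place through the templated `format_image`. The buffer must come from `fpga_malloc` (declared in fpga_common.h) so the helper can free it and substitute an aligned copy when `channel * width` is not a multiple of `IMAGE_ALIGNMENT`; the 3x224x224 shape below is just an example.

```cpp
#include <cstdint>
#include "fpga/V1/image.h"

void align_int8_input() {
  int channel = 3, height = 224, width = 224;  // example shape
  auto* img = reinterpret_cast<int8_t*>(
      paddle_mobile::fpga::fpga_malloc(channel * height * width * sizeof(int8_t)));
  // ... fill img with HWC int8 pixel data ...
  paddle_mobile::fpga::image::format_image<int8_t>(&img, channel, height, width);
  // img may now point to a new buffer whose rows are zero-padded to IMAGE_ALIGNMENT.
}
```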
......@@ -38,10 +38,12 @@ using namespace std; // NOLINT
#define CMD_FP16_TO_FP32 1
#define CMD_FP32_TO_FP16 2
#define CMD_FP32_TO_FP32 3
#define CMD_INT8_TO_FP16 4
// bypass macro
#define SIZE_FP16 2
#define SIZE_FP32 4
#define SIZE_INT8 1
#define PE_IRQ_TIMEOUT 1000000
......@@ -607,6 +609,16 @@ int PerformBypass(const struct BypassArgs &args) {
}
} break;
case DATA_TYPE_INT8: {
if (args.output_data_type != DATA_TYPE_FP16) {
DLOG << "error:Output Datetype error,not DATA_TYPE_FP16: "
<< args.output_data_type;
}
data_cell_in = SIZE_INT8;
data_cell_out = SIZE_FP16;
cmd = CMD_INT8_TO_FP16;
} break;
case DATA_TYPE_FP32: {
switch (args.output_data_type) {
case DATA_TYPE_FP16:
......@@ -630,10 +642,13 @@ int PerformBypass(const struct BypassArgs &args) {
break;
}
if (cmd != CMD_FP16_TO_FP16 && cmd != CMD_FP16_TO_FP32 &&
cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32) {
cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32 &&
cmd != CMD_INT8_TO_FP16) {
// std::cout<< " err back Error1!" <<std::endl;
return -EFAULT;
}
if ((data_cell_in != SIZE_FP16 && data_cell_in != SIZE_FP32) ||
if ((data_cell_in != SIZE_FP16 && data_cell_in != SIZE_FP32 &&
data_cell_in != SIZE_INT8) ||
(data_cell_out != SIZE_FP16 && data_cell_out != SIZE_FP32)) {
return -EFAULT;
}
......
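The conversions PerformBypass now accepts can be summarized as a (input cell size, output cell size) pair per command; a sketch, assuming the CMD_* and SIZE_* macros above are visible (CMD_FP16_TO_FP16 is the pre-existing pass-through command defined outside this hunk):

```cpp
#include <cstddef>
#include <utility>

// Element sizes selected for each bypass command; the new int8 path must pair
// an int8 input with an fp16 output. Any other command is rejected with -EFAULT.
inline std::pair<size_t, size_t> bypass_cell_sizes(int cmd) {
  switch (cmd) {
    case CMD_FP16_TO_FP32: return {SIZE_FP16, SIZE_FP32};
    case CMD_FP32_TO_FP16: return {SIZE_FP32, SIZE_FP16};
    case CMD_FP32_TO_FP32: return {SIZE_FP32, SIZE_FP32};
    case CMD_INT8_TO_FP16: return {SIZE_INT8, SIZE_FP16};  // new int8 path
    default:               return {SIZE_FP16, SIZE_FP16};  // CMD_FP16_TO_FP16
  }
}
```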
......@@ -31,6 +31,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace fpga {
enum DataType {
DATA_TYPE_INT8 = 2,
DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0,
};
......
......@@ -20,13 +20,10 @@ namespace operators {
template <>
bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
auto output = param->Out();
auto input = const_cast<LoDTensor *>(param->InputX());
input->init(typeid(float));
input->Resize(output->dims());
if (output->dims().size() != 4) {
return true;
}
fpga::format_fp16_ofm(output);
return true;
}
......@@ -35,6 +32,14 @@ template <>
void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
auto output = param.Out();
auto input = const_cast<LoDTensor *>(param.InputX());
std::type_index input_type = input->type();
if (input_type == typeid(float)) {
input->init(typeid(float));
} else { // input_type == typeid(int8_t)
input->init(typeid(int8_t));
}
input->Resize(output->dims());
if (output->dims().size() != 4) {
size_t size = output->numel() * sizeof(float);
......@@ -48,14 +53,13 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
}
fpga::format_image(input);
auto output_ptr = output->data<half>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
if (input_type == typeid(float)) {
auto input_ptr = input->data<float>();
auto external_ptr = reinterpret_cast<float *>(input->external_data);
float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
auto output_ptr = output->data<half>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
......@@ -69,8 +73,27 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
} else { // input_type == typeid(int8_t)
auto input_ptr = input->data<int8_t>();
auto external_ptr = reinterpret_cast<int8_t *>(input->external_data);
int8_t *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
args.input_data_type = fpga::DATA_TYPE_INT8;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = p_data;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
}
}
template class FeedKernel<FPGA, float>;
......
......@@ -2554,13 +2554,13 @@ class FusionDeconvBNReluParam : public ConvTransposeParam<Dtype> {
public:
FusionDeconvBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
const AttributeMap &attrs, Scope *scope)
: ConvTransposeParam<Dtype>(inputs, outputs, attrs, scope) {
output_ = OpParam::OutFrom<GType>(outputs, scope);
input_bias_ = OpParam::InputBiasFrom<GType>(inputs, scope);
input_mean_ = OpParam::InputMeanFrom<GType>(inputs, scope);
input_scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
input_variance_ = OpParam::InputVarianceFrom<GType>(inputs, scope);
output_ = OpParam::OutFrom<GType>(outputs, *scope);
input_bias_ = OpParam::InputBiasFrom<GType>(inputs, *scope);
input_mean_ = OpParam::InputMeanFrom<GType>(inputs, *scope);
input_scale_ = OpParam::InputScaleFrom<GType>(inputs, *scope);
input_variance_ = OpParam::InputVarianceFrom<GType>(inputs, *scope);
epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
momentum_ = OpParam::GetAttr<float>("momentum", attrs);
}
......
......@@ -138,6 +138,8 @@ if (CON GREATER -1)
set(CONV_TRANSPOSE_OP ON)
set(FUSION_DECONVADDBNRELU_OP ON)
set(FUSION_DECONVADDBN_OP ON)
set(FUSION_DECONVBNRELU_OP ON)
set(CONV_OP ON)
set(ELEMENTWISEMUL_OP ON)
set(FUSION_FCRELU_OP ON)
set(RELU_OP ON)
......@@ -616,6 +618,9 @@ endif()
if (FUSION_DECONVADDBNRELU_OP)
add_definitions(-DFUSION_DECONVADDBNRELU_OP)
endif()
if (FUSION_DECONVBNRELU_OP)
add_definitions(-DFUSION_DECONVBNRELU_OP)
endif()
if (FUSION_DECONVADDBN_OP)
add_definitions(-DFUSION_DECONVADDBN_OP)
endif()
......