Commit 491737d0 authored by qnqinan, committed by GitHub

update kernel and related files for static quantization in FPGA v2 track, fixed #1589 (#1590)

* update concat and split kernel and related files in FPGA v2(v3) track

* update

* update

* update kernel and related files in FPGA v2 track

* update

* update

* update kernel and related files for static quantization in FPGA v2 track

* update
Parent 343ca535
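This commit switches the FPGA v2 output path from fp16 to statically quantized int8. Throughout the diff, fp16 bypass conversions are removed and replaced by CPU-side dequantization of the form float = int8 * scale, where the per-tensor scale lives in Tensor::scale[0] (see the fetch and PS-ROI-pool kernels below, e.g. "outdata_ptr[idx] = input_address[idx] * Si"). A minimal sketch of that convention, not part of the commit itself:

#include <cstdint>
#include <vector>

// Minimal sketch of the symmetric, per-tensor static quantization used in
// this commit: values are stored as int8 plus one float scale (scale[0]),
// and dequantization is simply q * scale.
std::vector<float> dequantize_int8(const int8_t* q, size_t n, float scale) {
  std::vector<float> out(n);
  for (size_t i = 0; i < n; ++i) {
    out[i] = static_cast<float>(q[i]) * scale;
  }
  return out;
}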
......@@ -40,8 +40,6 @@ void format_image(framework::Tensor *image_tensor) {
void format_ofm(framework::Tensor *ofm_tensor) {
if (ofm_tensor->type() == type_id<float>()) {
format_fp32_ofm(ofm_tensor);
} else if (ofm_tensor->type() == type_id<half>()) {
format_fp16_ofm(ofm_tensor);
} else {
format_int8_ofm(ofm_tensor);
}
......@@ -85,24 +83,6 @@ void format_int8_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
fpga::fpga_flush(p, memory_size);
}
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
sizeof(half);
} else if (dims.size() == 2) {
memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
} else {
DLOG << "Wrong ofm dimension";
}
auto p = fpga_malloc(memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(type_id<half>().hash_code());
ofm_tensor->fpga_data_num = memory_size / sizeof(half);
}
void format_fp32_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
......@@ -208,9 +188,6 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
deconv_filter::DWDconv_format_filter(&new_data, num, channel, height, width,
scale_ptr, stride);
// framework::DDim dims_new =
// framework::make_ddim({num, 1, height, width});
// filter_tensor->Resize(dims_new);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(type_id<int16_t>().hash_code());
}
......@@ -314,13 +291,13 @@ void format_DWDeconv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float **bs_ptr,
int group, int sub_conv_n) {
int channel = ofm_tensor->dims()[1];
// dw-deconv
format_DWDconv_filter(
filter_tensor,
(reinterpret_cast<float *>(*bs_ptr) + sub_conv_n * channel), sub_conv_n);
format_bias_array(bs_ptr, channel);
format_ofm(ofm_tensor);
}
void expand_conv_arg(ConvArgs *arg) {
ConvArgs args = *arg;
......@@ -458,7 +435,6 @@ void expand_conv_arg(ConvArgs *arg) {
void expand_EW_arg(EWAddArgs *arg) {
EWAddArgs args = *arg;
// uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
uint64_t cmd = 0;
uint64_t datalen = (uint64_t)args.image0.width *
(uint64_t)args.image0.height *
......
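The removed format_fp16_ofm above allocated an aligned fp16 output buffer; format_ofm now dispatches float/int8 instead of float/fp16. The int8 counterpart format_int8_ofm is declared but its body is not shown in this diff, so the following is only a hedged sketch of what it presumably does, mirroring the removed fp16 version and reusing the helpers from this file (align_to_x rounding up to a multiple of IMAGE_ALIGNMENT, fpga_malloc, etc.):

// Assumed shape of an int8 OFM allocator, modelled on the removed
// format_fp16_ofm. Not the actual body of format_int8_ofm.
void format_int8_ofm_sketch(framework::Tensor *ofm_tensor) {
  auto dims = ofm_tensor->dims();
  size_t memory_size = 0;
  if (dims.size() == 4) {
    auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
    memory_size = num * height *
                  align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(int8_t);
  } else if (dims.size() == 2) {
    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(int8_t);
  } else {
    DLOG << "Wrong ofm dimension";
  }
  auto p = fpga_malloc(memory_size);
  ofm_tensor->reset_data_ptr(p);
  ofm_tensor->set_type(type_id<int8_t>().hash_code());
  ofm_tensor->fpga_data_num = memory_size / sizeof(int8_t);
}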
......@@ -26,8 +26,6 @@ void format_image(framework::Tensor* image_tensor);
void format_ofm(framework::Tensor* ofm_tensor);
void format_int8_ofm(framework::Tensor* ofm_tensor);
void format_int8_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
void format_fp32_ofm(framework::Tensor* ofm_tensor);
float filter_find_max(framework::Tensor* filter_tensor);
......
......@@ -100,7 +100,7 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy(
(int16_t *)image_out + tmp_channel + // NOLINT
(int8_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ,
images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int8_t));
......
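concat_images now copies int8 (rather than int16) values into the output image, whose rows are padded to align_to_x(channel * width, IMAGE_ALIGNMENT). The full indexing and the handling of the per-input scales_in are not visible in this hunk, so the following is only a simplified standalone illustration of channel concatenation for HWC int8 images with aligned row strides (scale reconciliation omitted):

#include <cstdint>
#include <cstring>

// Simplified sketch of HWC channel concat with aligned "CW" row strides.
// align_up mimics align_to_x: round x up to a multiple of n.
static int align_up(int x, int n) { return (x + n - 1) / n * n; }

void concat_hwc_int8(const int8_t* const* images, const int* channels,
                     int image_num, int height, int width, int alignment,
                     int8_t* out) {
  int total_channel = 0;
  for (int i = 0; i < image_num; ++i) total_channel += channels[i];
  const int out_cw = align_up(total_channel * width, alignment);
  for (int h = 0; h < height; ++h) {
    int channel_offset = 0;
    for (int i = 0; i < image_num; ++i) {
      const int in_cw = align_up(channels[i] * width, alignment);
      for (int w = 0; w < width; ++w) {
        // Copy this input's channel slice for pixel (h, w) into the output.
        std::memcpy(out + h * out_cw + w * total_channel + channel_offset,
                    images[i] + h * in_cw + w * channels[i],
                    channels[i] * sizeof(int8_t));
      }
      channel_offset += channels[i];
    }
  }
}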
......@@ -26,10 +26,6 @@ void convert_to_hwc(float** data_in, int channel, int height, int width,
int num = 1);
void convert_to_chw(float** data_in, int channel, int height, int width,
int num = 1);
// template <typename Dtype>
// void align_element_conv(Dtype** data_in, int height, int cw);
// template <typename T>
// void format_image(T** data_in, int channel, int height, int width);
template <typename Dtype>
void align_element_conv(Dtype** data_in, int height, int cw);
template <typename Dtype>
......
......@@ -97,7 +97,7 @@ float fp16_2_fp32(int16_t fp16_num) {
} else if (se_fp16 < 63) {
e_fp32 = 0x80000000 + ((se_fp16 - 32) << 23);
offset = 1024;
} else { // se_fp16 == 63
} else {
e_fp32 = 0xC7800000;
offset = 1024;
}
......
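The hunk above only drops a trailing comment; fp16_2_fp32 itself keeps its offset-table scheme. For reference, a generic IEEE 754 binary16 to binary32 conversion is sketched below; it is an independent illustration, not the implementation used in this file:

#include <cstdint>
#include <cstring>

// Generic half -> float conversion (sign, exponent, mantissa), reference only.
float fp16_to_fp32_ref(uint16_t h) {
  const uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exp = (h >> 10) & 0x1Fu;
  uint32_t mant = h & 0x3FFu;
  uint32_t bits;
  if (exp == 0) {
    if (mant == 0) {
      bits = sign;  // signed zero
    } else {
      // Subnormal half: normalize the mantissa, adjusting the exponent.
      uint32_t shift = 0;
      do { mant <<= 1; ++shift; } while (!(mant & 0x400u));
      mant &= 0x3FFu;
      bits = sign | ((113u - shift) << 23) | (mant << 13);
    }
  } else if (exp == 0x1Fu) {
    bits = sign | 0x7F800000u | (mant << 13);  // inf or NaN
  } else {
    bits = sign | ((exp + 112u) << 23) | (mant << 13);  // normal number
  }
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}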
......@@ -209,7 +209,6 @@ struct PoolingArgs {
};
struct EWAddArgs {
// bool relu_enabled;
uint32_t const0; // output0 = const0 x input0 + const1 x input1;
uint32_t const1;
struct ImageInputArgs image0;
......
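The comment kept on EWAddArgs documents the hardware elementwise add as output0 = const0 x input0 + const1 x input1 (the relu_enabled flag is dropped in this commit). How the int8 scales are folded into const0/const1 is hardware-specific and not shown here; the sketch below only restates the documented semantics on already dequantized float data:

// CPU reference of the semantics documented on EWAddArgs.
void ew_add_ref(const float* in0, const float* in1, float c0, float c1,
                float* out, int n) {
  for (int i = 0; i < n; ++i) {
    out[i] = c0 * in0[i] + c1 * in1[i];
  }
}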
......@@ -35,7 +35,7 @@ bool ElementwiseMulKernel<FPGA, float>::Init(ElementwiseMulParam<FPGA> *param) {
fpga::format_fp32_ofm(&(param->float_out));
auto *out = param->Out();
fpga::format_fp16_ofm(out);
fpga::format_ofm(out);
return true;
}
......
......@@ -23,14 +23,14 @@ bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
int col = param->Col();
DLOG << "col = " << col;
auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
input->init(type_id<float>().hash_code());
input->Resize(output->dims());
if (output->dims().size() != 4) {
input->init(type_id<float>().hash_code());
return true;
}
fpga::format_fp16_ofm(output);
input->init(type_id<int8_t>().hash_code());
input->Resize(output->dims());
fpga::format_ofm(output);
return true;
}
......@@ -39,15 +39,6 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
auto output = param.Out();
int col = param.Col();
auto input = const_cast<LoDTensor *>(&param.InputX()->at(col));
kTypeId_t input_type = input->type();
if (input_type == type_id<float>()) {
input->init(type_id<float>().hash_code());
} else {
input->init(type_id<int8_t>().hash_code());
}
input->Resize(output->dims());
if (output->dims().size() != 4) {
size_t size = output->numel() * sizeof(float);
auto output_ptr = output->data<float>();
......@@ -58,49 +49,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
input->external_data = nullptr;
return;
}
fpga::format_image(input);
auto output_ptr = output->data<half>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
if (input_type == type_id<float>()) {
auto input_ptr = input->data<float>();
auto external_ptr = reinterpret_cast<float *>(input->external_data);
float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = p_data;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
} else {
auto input_ptr = input->data<int8_t>();
auto external_ptr = reinterpret_cast<int8_t *>(input->external_data);
int8_t *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
args.input_data_type = fpga::DATA_TYPE_INT8;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = p_data;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
}
output->ShareDataWith(*input);
}
template class FeedKernel<FPGA, float>;
......
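After this change the feed kernel no longer performs an FP32-to-FP16 bypass: for 4-D inputs it re-initializes the tensor as int8 and shares it directly with the output, so the caller is expected to hand over already quantized data plus its scale. The exact quantizer used by the v2 static-quantization flow is not part of this diff; a hedged sketch of a symmetric per-tensor quantizer a caller might use:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Assumed host-side quantization before feeding int8 data (real_value = q * scale).
// Illustrative only; not taken from this commit.
float quantize_to_int8(const float* src, int8_t* dst, int n) {
  float max_abs = 1e-6f;
  for (int i = 0; i < n; ++i) max_abs = std::max(max_abs, std::fabs(src[i]));
  const float scale = max_abs / 127.0f;
  for (int i = 0; i < n; ++i) {
    int q = static_cast<int>(std::round(src[i] / scale));
    dst[i] = static_cast<int8_t>(std::min(127, std::max(-127, q)));
  }
  return scale;  // would be stored as the tensor's scale[0]
}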
......@@ -21,18 +21,16 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
int col = param->Col();
DLOG << "col = " << col;
auto output = &(param->Out()->at(col));
output->init(type_id<float>().hash_code());
output->Resize(input->dims());
if (input->type() == type_id<float>()) {
return true;
}
output->init(type_id<float>().hash_code());
output->Resize(input->dims());
fpga::format_fp32_ofm(output);
auto aligned_output = param->aligned_out;
int outC = 1;
int outH = 1;
int outW = 1;
if (output->dims().size() == 4) {
outC = output->dims()[1];
outH = output->dims()[2];
outW = output->dims()[3];
} else { // 2
outC = output->dims()[1];
......@@ -40,27 +38,10 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
if (alignedCW != unalignedCW) {
param->aligned_out.Resize(input->dims());
param->aligned_out.mutable_data<float>(input->dims());
fpga::fpga_flush(param->aligned_out.data<float>(),
outH * unalignedCW * sizeof(float));
aligned_output.init(type_id<float>().hash_code());
aligned_output.Resize(input->dims());
fpga::format_fp32_ofm(&aligned_output);
}
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = input->data<half>();
args.image.channels = (uint32_t)(input->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output->data<float>();
args.output.scale_address = output->scale;
param->fpga_bypass_args = args;
return true;
}
void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
......@@ -83,22 +64,22 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
return;
}
fpga::BypassArgs args = param.fpga_bypass_args;
auto input_address = (input->data<half>());
args.image.address = static_cast<void *>(input_address);
float *outdata_ptr =
reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
auto input_address = input->data<int8_t>();
float Si = input->scale[0];
auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
auto outdata_ptr = output->data<float>();
const int num_th = 32;
if (output->fpga_data_num < num_th) {
fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half));
fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(int8_t));
if (input->fpga_data_num < num_th) {
for (int idx = 0; idx < product(input->dims()); ++idx) {
outdata_ptr[idx] = fpga::fp16_2_fp32(input_address[idx]);
outdata_ptr[idx] = input_address[idx] * Si;
}
fpga::fpga_flush(outdata_ptr, product(input->dims()) * sizeof(float));
return;
}
fpga::PerformBypass(args);
for (int idx = 0; idx < input->fpga_data_num; ++idx) {
aligned_ptr[idx] = input_address[idx] * Si;
}
int outC = 1;
int outH = 1;
int outW = 1;
......@@ -110,16 +91,15 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
outC = output->dims()[1];
}
fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
output->fpga_data_num * sizeof(float));
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
if (unalignedCW != alignedCW) {
auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
dealign(outdata_ptr, aligned_ptr, outC, outH, outW);
memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
dealign(aligned_ptr, outdata_ptr, outC, outH, outW);
fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
return;
}
memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
}
template class FetchKernel<FPGA, float>;
......
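FetchKernel::Compute now dequantizes the int8 FPGA output on the CPU (q * Si) into aligned_out and then strips the CW alignment padding with dealign before flushing the final float buffer. dealign's body is not included in this hunk, so the following is an assumption about its behaviour inferred from the call site and the alignedCW/unalignedCW computation:

// Assumed behaviour of dealign(src, dst, c, h, w): copy each row of c*w valid
// floats out of a buffer whose rows are padded to align_to_x(c*w, alignment).
// Hypothetical reference, not the real body.
void dealign_ref(const float* src, float* dst, int c, int h, int w,
                 int alignment) {
  const int unaligned_cw = c * w;
  const int aligned_cw = (unaligned_cw + alignment - 1) / alignment * alignment;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < unaligned_cw; ++j) {
      dst[i * unaligned_cw + j] = src[i * aligned_cw + j];
    }
  }
}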
......@@ -32,9 +32,6 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
param->rpn_rois_->mutable_data<float>({total, 4});
param->rpn_probs_->mutable_data<float>({total, 1});
// DLOG << *param->rpn_rois_;
// DLOG << *param->rpn_probs_;
param->float_bbox = std::make_shared<Tensor>();
param->float_bbox->Resize(param->bbox_deltas_->dims());
param->float_bbox->init(type_id<float>().hash_code());
......@@ -44,29 +41,7 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
param->float_score->init(type_id<float>().hash_code());
fpga::format_fp32_ofm(param->float_score.get());
auto input = param->bbox_deltas_;
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_HWC;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_bbox->mutable_data<float>();
args.output.scale_address = param->float_bbox->scale;
param->bbox_arg = args;
input = param->scores_;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_score->mutable_data<float>();
args.output.scale_address = param->float_score->scale;
param->score_arg = args;
auto input = param->scores_;
param->score_index_ = std::make_shared<Tensor>();
param->score_index_->mutable_data<int32_t>({input->numel()});
auto score_index = param->score_index_->data<int32_t>();
......@@ -136,51 +111,17 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
T bbox_center_x = 0, bbox_center_y = 0;
T bbox_width = 0, bbox_height = 0;
/*
if (variances) {
bbox_center_x =
variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width
+ anchor_center_x; bbox_center_y = variances_data[i * len + 1] *
bbox_deltas_data[i * len + 1] * anchor_height +
anchor_center_y;
bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2],
kBBoxClipDefault)) *
anchor_width;
bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3],
kBBoxClipDefault)) *
anchor_height;
} else {
*/
bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
bbox_center_y =
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
/*
bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
kBBoxClipDefault)) *
anchor_width;
bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
kBBoxClipDefault)) *
anchor_height;
*/
bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
// }
proposals_data[i * len] = bbox_center_x - bbox_width / 2;
proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
/*
//wong
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
//wong
*/
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
}
// return proposals;
}
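With the variance branch and the kBBoxClipDefault clamping left commented out above, the decode applied to each anchor (center x_a, y_a, size w_a, h_a) and delta (d_x, d_y, d_w, d_h) reduces to:

$$x_c = d_x w_a + x_a,\qquad y_c = d_y h_a + y_a,\qquad w = w_a e^{d_w},\qquad h = h_a e^{d_h}$$
$$\text{proposal} = \left[\,x_c - \tfrac{w}{2},\; y_c - \tfrac{h}{2},\; x_c + \tfrac{w}{2},\; y_c + \tfrac{h}{2}\,\right]$$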
template <class T>
......@@ -252,8 +193,6 @@ static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
template <class T>
static inline T BBoxArea(const T *box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are is invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
......@@ -351,9 +290,6 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
Tensor index_t;
index_t.Resize({scores_slice.numel()});
int *index = index_t.mutable_data<int>();
/*for (int i = 0; i < scores_slice.numel(); ++i) {
index[i] = i;
}*/
std::memcpy(index, score_index.data<int32_t>(),
scores_slice.numel() * sizeof(int));
......@@ -397,7 +333,6 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
return std::make_pair(bbox_sel, scores_filter);
}
// Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);
Tensor keep_nms =
NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta, post_nms_top_n);
......@@ -408,8 +343,6 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
proposals.mutable_data<T>({keep_nms.numel(), 4}); // original
scores_sel.mutable_data<T>({keep_nms.numel(), 1}); // original
// proposals.mutable_data<T>({post_nms_top_n, 4}); // wong
// scores_sel.mutable_data<T>({post_nms_top_n, 1}); // wong
CPUGather<T>(bbox_sel, keep_nms, &proposals);
CPUGather<T>(scores_filter, keep_nms, &scores_sel);
return std::make_pair(proposals, scores_sel);
......@@ -418,13 +351,11 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
template <>
void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
auto input_score = param.scores_;
auto input_score_data = input_score->data<half>();
auto input_score_data_tmp = input_score->data<half>();
auto input_score_data = input_score->data<int8_t>();
uint32_t score_n, score_height, score_width, score_channels;
auto input_bbox = param.bbox_deltas_;
auto input_bbox_data = input_bbox->data<half>();
auto input_bbox_data_tmp = input_bbox->data<half>();
auto input_bbox_data = input_bbox->data<int8_t>();
uint32_t bbox_n, bbox_height, bbox_width, bbox_channels;
score_n = (uint32_t)(input_score->dims()[0]);
......@@ -439,61 +370,48 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
std::shared_ptr<Tensor> score_tmp = std::make_shared<Tensor>();
score_tmp->Resize(param.scores_->dims());
score_tmp->mutable_data<half>();
score_tmp->mutable_data<int8_t>();
std::shared_ptr<Tensor> bbox_tmp = std::make_shared<Tensor>();
bbox_tmp->Resize(param.bbox_deltas_->dims());
bbox_tmp->mutable_data<half>();
bbox_tmp->mutable_data<int8_t>();
auto score_tmp_data = score_tmp->data<half>();
auto bbox_tmp_data = bbox_tmp->data<half>();
auto score_tmp_data = score_tmp->data<int8_t>();
auto bbox_tmp_data = bbox_tmp->data<int8_t>();
int64_t amount_per_side = score_width * score_height;
int idx = 0;
fpga::fpga_invalidate(
input_score_data_tmp,
score_height * score_width * score_channels * sizeof(half));
fpga::fpga_invalidate(input_score_data, score_height * score_width *
score_channels * sizeof(int8_t));
for (int h = 0; h < score_height; h++) {
for (int w = 0; w < score_width; w++) {
for (int c = 0; c < score_channels; c++) {
idx++;
// DLOG << "wong input_score: "<<
// paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
*(score_tmp_data + c * amount_per_side + score_width * h + w) =
(*(input_score_data_tmp++));
(*(input_score_data++));
}
}
}
amount_per_side = bbox_width * bbox_height;
fpga::fpga_invalidate(input_bbox_data_tmp, bbox_height * bbox_width *
bbox_channels * sizeof(half));
fpga::fpga_invalidate(input_bbox_data, bbox_height * bbox_width *
bbox_channels * sizeof(int8_t));
for (int h = 0; h < bbox_height; h++) {
for (int w = 0; w < bbox_width; w++) {
for (int c = 0; c < bbox_channels; c++) {
idx++;
// DLOG << "wong input_score: "<<
// paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
(*(input_bbox_data_tmp++));
(*(input_bbox_data++));
}
}
}
struct paddle_mobile::fpga::BypassArgs temp_score_arg;
struct paddle_mobile::fpga::BypassArgs temp_bbox_arg;
temp_score_arg = param.score_arg;
temp_score_arg.image.address = score_tmp->data<half>();
temp_bbox_arg = param.bbox_arg;
temp_bbox_arg.image.address = bbox_tmp->data<half>();
auto score_tensor = param.float_score.get();
fpga::PerformBypass(param.score_arg);
fpga::fpga_invalidate(score_tensor->data<float>(),
score_tensor->numel() * sizeof(float));
for (int i = 0; i < score_height * score_width * score_channels; i++) {
score_tensor->data<float>()[i] = score_tmp_data[i] * input_score->scale[0];
}
auto bbox_tensor = param.float_bbox.get();
fpga::PerformBypass(param.bbox_arg);
fpga::fpga_invalidate(bbox_tensor->data<float>(),
bbox_tensor->numel() * sizeof(float));
for (int i = 0; i < bbox_height * bbox_width * bbox_channels; i++) {
bbox_tensor->data<float>()[i] = bbox_tmp_data[i] * input_bbox->scale[0];
}
auto *scores = param.float_score.get();
auto *bbox_deltas = param.float_bbox.get();
auto *im_info = param.im_info_;
......@@ -507,7 +425,6 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
int pre_nms_top_n = param.pre_nms_topn_;
int post_nms_top_n = param.post_nms_topn_;
// DLOG << " param.post_nms_topn_ : " << param.post_nms_topn_;
float nms_thresh = param.nms_thresh_ / 2.0f;
float min_size = param.min_size_;
......
......@@ -31,21 +31,6 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
param->float_input = std::make_shared<Tensor>();
param->float_input->mutable_data<float>(param->input_x_->dims());
// param->float_output = std::make_shared<Tensor>();
auto input = param->input_x_;
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_HWC;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_input->mutable_data<float>();
args.output.scale_address = param->float_input->scale;
param->input_arg = args;
auto* rois = param->input_rois_;
int rois_num = rois->dims()[0];
......@@ -53,81 +38,11 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
{rois_num, param->output_->dims()[1], param->output_->dims()[2],
param->output_->dims()[3]});
param->output_->Resize(dims_out_new);
// fpga::format_fp16_ofm(param->output_);
param->output_->mutable_data<float>(dims_out_new);
// auto output = param->float_output.get();
// param->output_ = output;
/* args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.image.address = output->data<float>();
args.image.height = (uint32_t)output->dims()[2];
args.image.width = (uint32_t)output->dims()[3];
args.image.channels = (uint32_t)output->dims()[1] ;
args.output.address = param->output_->mutable_data<half>();
args.output.scale_address = param->output_->scale;
param->output_arg = args;*/
return true;
}
/*
template <typename Dtype>
void PSROIPoolingForward(
const Dtype* bottom_data,
const int height, const int width, const int input_channel,
Dtype* top_data,
const int pooled_height, const int pooled_width, const int output_channel,
const Dtype* bottom_rois,
const Dtype Bin_size_h, const Dtype Bin_size_w, const Dtype roi_start_h,
const Dtype roi_start_w, const int pw, const int ph, const int roi_batch_ind)
{
int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
int wstart = floor(static_cast<Dtype>(pw)* Bin_size_w + roi_start_w);
int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
hstart = std::min(std::max(hstart, 0), height);
hend = std::min(std::max(hend, 0), height);
wstart = std::min(std::max(wstart, 0), width);
wend = std::min(std::max(wend, 0), width);
bool is_empty = (hend <= hstart) || (wend <= wstart);
float32x4_t sum_pixels_low_c= vdupq_n_f32(0);
float32x4_t sum_pixels_high_c= vdupq_n_f32(0);
if(!is_empty){
Dtype bin_area = (hend - hstart) * (wend - wstart);
float rev_bin_area = 1 / bin_area;
float32x4_t q_bin_area = vdupq_n_f32(rev_bin_area);
//static_cast<float>(bin_area) float pixels_c[output_channel];
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int pixel_offset = (h * width + w) * input_channel;
for(int output_c = 0; output_c < output_channel; output_c++){
int input_channel_offset = output_c * pooled_height *
pooled_width; int input_bias = pixel_offset + input_channel_offset + ph *
pooled_width + pw; pixels_c[output_c] = bottom_data[input_bias];
}
float32x4_t pixel_low_c = vld1q_f32(pixels_c);
float32x4_t pixel_high_c = vld1q_f32(pixels_c + 4);
sum_pixels_low_c = vaddq_f32(sum_pixels_low_c, pixel_low_c);
sum_pixels_high_c = vaddq_f32(sum_pixels_high_c, pixel_high_c);
}
}
sum_pixels_low_c = vmulq_f32(sum_pixels_low_c, q_bin_area);
sum_pixels_high_c = vmulq_f32(sum_pixels_high_c, q_bin_area);
}
int output_index_base = (ph * pooled_width + pw) * output_channel;
top_data += output_index_base;
vst1q_f32(top_data, sum_pixels_low_c);
top_data += 4;
vst1q_f32(top_data, sum_pixels_high_c);
}*/
template <typename Dtype>
void PSROIPoolingForward(const Dtype* bottom_data, const int height,
const int width, const int input_channel,
......@@ -182,14 +97,18 @@ void PSROIPoolingForward(const Dtype* bottom_data, const int height,
template <>
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto input_tensor = param.float_input.get();
fpga::PerformBypass(param.input_arg);
fpga::fpga_invalidate(input_tensor->data<float>(),
input_tensor->numel() * sizeof(float));
auto input_tensor = param.input_x_;
auto input_data = input_tensor->data<int8_t>();
auto Si = input_tensor->scale[0];
auto float_input_tensor = param.float_input.get();
auto float_input_data = float_input_tensor->data<float>();
for (int i = 0; i < float_input_tensor->numel(); i++) {
float_input_data[i] = input_data[i] * Si;
}
auto* in = input_tensor;
auto* in = float_input_tensor;
auto* rois = param.input_rois_;
auto* out = param.output_; // param.float_output.get();
auto* out = param.output_;
auto pooled_height = param.pooled_height_;
auto pooled_width = param.pooled_width_;
......@@ -205,14 +124,13 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto data_nhwc = in->mutable_data<float>();
// fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
framework::DDim dims_out_new = framework::make_ddim(
{rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
(param.output_)->dims()[3]});
(param.output_)->Resize(dims_out_new);
const float* input_data = data_nhwc; // in->data<float>();
const float* input_data_tmp = data_nhwc; // in->data<float>();
framework::Tensor rois_batch_id_list;
rois_batch_id_list.Resize({rois_num});
auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
......@@ -268,11 +186,11 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
PSROIPoolingForward<float>(input_data, height, width, input_channels,
offset_output_data, pooled_height,
pooled_width, output_channels, input_rois,
bin_size_h, bin_size_w, roi_start_h,
roi_start_w, pw, ph, roi_batch_ind);
PSROIPoolingForward<float>(
input_data_tmp, height, width, input_channels, offset_output_data,
pooled_height, pooled_width, output_channels, input_rois,
bin_size_h, bin_size_w, roi_start_h, roi_start_w, pw, ph,
roi_batch_ind);
}
}
}
......
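The PS-ROI-pooling kernel now dequantizes the int8 feature map on the CPU (q * Si) and runs the CPU PSROIPoolingForward on the float data. Following the bin bounds in the removed NEON reference above, a plain scalar sketch of one output bin (illustrative only, not the vectorized implementation) is:

#include <algorithm>
#include <cmath>

// Scalar sketch of one PS-ROI-pooling bin: average the HWC input over
// [hstart, hend) x [wstart, wend) for one output channel, using the same
// indexing as the removed reference (pixel_offset + input_channel_offset + ...).
float psroi_pool_bin(const float* bottom_data, int height, int width,
                     int input_channel, int pooled_height, int pooled_width,
                     float bin_size_h, float bin_size_w, float roi_start_h,
                     float roi_start_w, int ph, int pw, int output_c) {
  int hstart = static_cast<int>(std::floor(ph * bin_size_h + roi_start_h));
  int wstart = static_cast<int>(std::floor(pw * bin_size_w + roi_start_w));
  int hend = static_cast<int>(std::ceil((ph + 1) * bin_size_h + roi_start_h));
  int wend = static_cast<int>(std::ceil((pw + 1) * bin_size_w + roi_start_w));
  hstart = std::min(std::max(hstart, 0), height);
  hend = std::min(std::max(hend, 0), height);
  wstart = std::min(std::max(wstart, 0), width);
  wend = std::min(std::max(wend, 0), width);
  if (hend <= hstart || wend <= wstart) return 0.0f;
  const int input_channel_offset = output_c * pooled_height * pooled_width;
  float sum = 0.0f;
  for (int h = hstart; h < hend; ++h) {
    for (int w = wstart; w < wend; ++w) {
      const int pixel_offset = (h * width + w) * input_channel;
      sum += bottom_data[pixel_offset + input_channel_offset +
                         ph * pooled_width + pw];
    }
  }
  return sum / ((hend - hstart) * (wend - wstart));
}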
......@@ -25,18 +25,18 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
paddle_mobile::fpga::SIGMOID;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->InputX());
auto input_ptr = input->data<half>();
auto input_ptr = input->data<int8_t>();
auto out = param->Out();
fpga::format_fp16_ofm(out);
fpga::format_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP16;
fpga::BypassArgs args = {fpga::DATA_TYPE_INT8};
args.input_data_type = fpga::DATA_TYPE_INT8;
args.output_data_type = fpga::DATA_TYPE_INT8;
args.image.address = input_ptr;
args.image.height = 1;
args.image.width = 1;
args.image.channels = input->fpga_data_num;
args.output.address = out->data<half>();
args.output.address = out->data<int8_t>();
args.output.scale_address = out->scale;
args.output.activation.activation_type = activation_enable;
args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
......
......@@ -24,13 +24,13 @@ template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX());
auto dims = framework::vectorize(input->dims());
half *input_ptr;
int8_t *input_ptr;
auto out = param->Out();
if (input->type() == type_id<float>()) {
out->Resize(framework::make_ddim(dims));
out->mutable_data<float>(framework::make_ddim(dims));
} else {
input_ptr = input->data<half>();
input_ptr = input->data<int8_t>();
}
auto float_input = new LoDTensor;
......@@ -52,8 +52,6 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
out->mutable_data<float>(framework::make_ddim(dims));
float_input->init(type_id<float>().hash_code());
float_input->mutable_data<float>(framework::make_ddim(dims));
// fpga::format_fp32_ofm(float_input);
// fpga::format_fp32_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
......@@ -69,7 +67,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
} else { // Use FPGA
fpga::format_fp16_ofm(out);
fpga::format_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
......@@ -91,7 +89,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
template <>
void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
auto *in_x = (param.InputX());
if (in_x->type() == type_id<half>()) {
if (in_x->type() == type_id<int8_t>()) {
fpga::PerformBypass(param.FpgaArgs());
if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
Tensor *out = param.Out();
......
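For int8 inputs the softmax kernel formats a float intermediate and an FP32 output; the CPU softmax that presumably runs on the dequantized tensor is not fully shown in this diff. A generic, numerically stable softmax over one row, for reference only:

#include <algorithm>
#include <cmath>

// Reference softmax over the last dimension (subtract the max for stability).
// Illustrative only; not the paddle-mobile implementation.
void softmax_row(const float* in, float* out, int n) {
  const float max_v = *std::max_element(in, in + n);
  float sum = 0.0f;
  for (int i = 0; i < n; ++i) {
    out[i] = std::exp(in[i] - max_v);
    sum += out[i];
  }
  for (int i = 0; i < n; ++i) out[i] /= sum;
}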
......@@ -36,7 +36,7 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
fpga::fpga_malloc(image_num * sizeof(uint32_t)));
DLOG << "input: " << in;
for (int i = 0; i < image_num; i++) {
fpga::format_fp16_ofm(outs[i]);
fpga::format_ofm(outs[i]);
DLOG << "output: " << outs[i];
images_out[i] = outs[i]->mutable_data<int8_t>();
scales_out[i] = outs[i]->scale;
......
......@@ -1270,7 +1270,9 @@ class FetchParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
public:
#ifdef PADDLE_MOBILE_FPGA_V1
fpga::BypassArgs fpga_bypass_args;
#endif
Tensor aligned_out;
#endif
};
......