Merge pull request #1571 from qnqinan/develop

Update some files related to static quantization in the FPGA V2 track (fixes #1570)

Merge pull request #1571 from qnqinan/develop
Update some files related to static quantization in the FPGA V2 track (fixes #1570)
769c8083 · jameswu2014 · GitHub · 3f1e8f7d · c72044e0 · 769c8083
19 changed files
--- a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
@@ -33,6 +33,9 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
  auto bn_mean_ptr = param->InputMean()->data<float>();
  auto bn_var_ptr = param->InputVariance()->data<float>();
@@ -56,8 +59,10 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
    new_bias_ptr[i] =
        bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
-    bs_ptr[i + channel] = new_scale_ptr[i];
+    //    bs_ptr[i + channel] = new_scale_ptr[i];
-    bs_ptr[i] = new_bias_ptr[i];
+    //    bs_ptr[i] = new_bias_ptr[i];
+    bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
+    bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
  }
  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());

--- a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
@@ -32,7 +32,10 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
  auto bias_ptr = bias->data<float>();
  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
+  const int groups = param->Groups();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
  vector<int> paddings = param->Paddings();
  vector<int> strides = param->Strides();
  auto bn_mean_ptr = param->InputMean()->data<float>();
@@ -57,11 +60,16 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
    new_bias_ptr[i] =
        bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
-    bs_ptr[i + channel] = new_scale_ptr[i];
+    //    bs_ptr[i + channel] = new_scale_ptr[i];
-    bs_ptr[i] = new_bias_ptr[i];
+    //    bs_ptr[i] = new_bias_ptr[i];
+    bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
+    bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
+    if (groups == channel) {
+      new_scale_ptr[i] = new_scale_ptr[i] * Si / So;
+      new_bias_ptr[i] = new_bias_ptr[i] * 127.0f / So;
+    }
  }
-  const int groups = param->Groups();
  if (groups == channel) {
    fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
    fpga::DWconvArgs dwconv_arg = {0};

--- a/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
@@ -30,6 +30,9 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
  auto bias_ptr = bias->data<float>();
  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                        "Output channel should be equal to bias number");
@@ -37,8 +40,10 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
  auto bs_ptr =
      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
  for (int i = 0; i < channel; i++) {
-    bs_ptr[i + channel] = 1;
+    //    bs_ptr[i + channel] = 1;
-    bs_ptr[i] = bias_ptr[i];
+    //    bs_ptr[i] = bias_ptr[i];
+    bs_ptr[i + channel] = Si / So * Sf / 127.0;
+    bs_ptr[i] = bias_ptr[i] * 127.0 / So;
  }
  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());

--- a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
@@ -30,6 +30,9 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
  auto bias_ptr = bias->data<float>();
  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                        "Output channel should be equal to bias number");
@@ -37,8 +40,10 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
  auto bs_ptr =
      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
  for (int i = 0; i < channel; i++) {
-    bs_ptr[i + channel] = 1;
+    //    bs_ptr[i + channel] = 1;
-    bs_ptr[i] = bias_ptr[i];
+    //    bs_ptr[i] = bias_ptr[i];
+    bs_ptr[i + channel] = Si / So * Sf / 127.0;
+    bs_ptr[i] = bias_ptr[i] * 127.0 / So;
  }
  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());

--- a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
@@ -32,6 +32,9 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
  // auto bias_ptr = bias->data<float>();
  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
  // PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
  //                      "Output channel should be equal to bias number");
@@ -53,6 +56,10 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
  if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So;
+      bs_ptr[i] = 0;  // bias_ptr[i % (channel)];
+    }
    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
                               sub_conv_n);
    fpga::DWDeconvArgs DWDeconv_arg = {0};
@@ -62,6 +69,10 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
    param->SetFpgaArgs(DWDeconv_arg);
  } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f;
+      bs_ptr[i] = 0;  // bias_ptr[i % (channel)];
+    }
    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
    fpga::DeconvArgs deconv_arg = {0};
    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,

--- a/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp
@@ -32,7 +32,9 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
  auto bias_ptr = bias->data<float>();
  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                        "Output channel should be equal to bias number");
  int channel = out->dims()[1];
@@ -53,6 +55,10 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
  if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
                               sub_conv_n);
    fpga::DWDeconvArgs DWDeconv_arg = {0};
@@ -62,6 +68,10 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
    param->SetFpgaArgs(DWDeconv_arg);
  } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
    fpga::DeconvArgs deconv_arg = {0};
    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,

--- a/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp
@@ -33,7 +33,9 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
  auto bias_ptr = bias->data<float>();
  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                        "Output channel should be equal to bias number");
  int channel = out->dims()[1];
@@ -54,6 +56,10 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
  if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
                               sub_conv_n);
    fpga::DWDeconvArgs DWDeconv_arg = {0};
@@ -63,6 +69,10 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
    param->SetFpgaArgs(DWDeconv_arg);
  } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
    fpga::DeconvArgs deconv_arg = {0};
    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,

--- a/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp
@@ -32,7 +32,9 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
  auto bias_ptr = bias->data<float>();
  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                        "Output channel should be equal to bias number");
  int channel = out->dims()[1];
@@ -53,6 +55,10 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
  if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
                               sub_conv_n);
    fpga::DWDeconvArgs DWDeconv_arg = {0};
@@ -62,6 +68,10 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
    param->SetFpgaArgs(DWDeconv_arg);
  } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
    fpga::DeconvArgs deconv_arg = {0};
    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,

--- a/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp
@@ -33,7 +33,9 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
  auto bias_ptr = bias->data<float>();
  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                        "Output channel should be equal to bias number");
  int channel = out->dims()[1];
@@ -54,6 +56,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
  if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
                               sub_conv_n);
    fpga::DWDeconvArgs DWDeconv_arg = {0};
@@ -63,6 +69,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
    param->SetFpgaArgs(DWDeconv_arg);
  } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
    fpga::DeconvArgs deconv_arg = {0};
    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,

--- a/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp
@@ -34,6 +34,9 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
  auto bias_ptr = bias->data<float>();
  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
  auto bn_mean_ptr = param->InputMean()->data<float>();
  auto bn_var_ptr = param->InputVariance()->data<float>();
  auto bn_scale_ptr = param->InputScale()->data<float>();
@@ -56,12 +59,22 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
  int sub_conv_n = param->Strides()[0];
  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *  // NOLINT
                                           sizeof(float));             // NOLINT
+  //  for (int i = 0; i < channel * sub_conv_n; i++) {
-  for (int i = 0; i < channel * sub_conv_n; i++) {
+  //    bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel];
-    bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel];
+  //    bs_ptr[i] = new_bias_ptr[i % (channel)];
-    bs_ptr[i] = new_bias_ptr[i % (channel)];
+  //  }
+  if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel] * Si / So;
+      bs_ptr[i] = new_bias_ptr[i % (channel)] * 127.0f / So;
+    }
+  } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] =
+          new_scale_ptr[i % channel] * Si / So * Sf / 127.0f;
+      bs_ptr[i] = new_bias_ptr[i % (channel)] * 127.0f / So;
+    }
  }
  PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
                        "stride_width should be equal to stride_height ");
  PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],

--- a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/kernel/elementwise_add_kernel.h"
 #include <string>
-#include "fpga/V1/api.h"
+#include "fpga/V2/api.h"
 namespace paddle_mobile {
 namespace operators {
@@ -34,7 +34,11 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
    auto input_y_ptr = input_y->data<half>();
    fpga::format_fp16_ofm(out);
    auto out_ptr = out->mutable_data<half>();
+    float Si_1 = input_x->scale[0];
+    float Si_2 = input_y->scale[0];
+    float So = out->scale[0];
+    float C1 = Si_1 / So;
+    float C2 = Si_2 / So;
    fpga::EWAddArgs ewaddArgs = {0};
    // ewaddArgs.relu_enabled = relu_enabled;
    ewaddArgs.output.activation.activation_type = activation_enable;

--- a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
@@ -32,7 +32,11 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
  auto input_y_ptr = input_y->data<half>();
  fpga::format_fp16_ofm(out);
  auto out_ptr = out->mutable_data<half>();
+  float Si_1 = input_x->scale[0];
+  float Si_2 = input_y->scale[0];
+  float So = out->scale[0];
+  float C1 = Si_1 / So;
+  float C2 = Si_2 / So;
  fpga::EWAddArgs ewaddArgs = {0};
  // ewaddArgs.relu_enabled = relu_enabled;
  ewaddArgs.output.activation.activation_type = activation_enable;

--- a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
@@ -29,6 +29,9 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
  const Tensor *input_z = param->InputZ();
  auto input_z_ptr = input_z->data<float>();
  auto out = param->Out();
+  float Si = input_x->scale[0];
+  float Sf = filter->scale[0];
+  float So = out->scale[0];
  // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
  //                     "Image channel should be equal to weight number");
@@ -36,8 +39,10 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
  auto bs_ptr =
      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
  for (int i = 0; i < channel; i++) {
-    bs_ptr[i + channel] = 1;
+    //    bs_ptr[i + channel] = 1;
-    bs_ptr[i] = input_z_ptr[i];
+    //    bs_ptr[i] = input_z_ptr[i];
+    bs_ptr[i + channel] = Si / So * Sf / 127.0f;
+    bs_ptr[i] = input_z_ptr[i] * 127.0f / So;
  }
  int num = (uint32_t)filter->dims()[1];
  int chw = (uint32_t)filter->dims()[0];

--- a/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp
@@ -29,6 +29,9 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
  const Tensor *input_z = param->InputZ();
  auto input_z_ptr = input_z->data<float>();
  auto out = param->Out();
+  float Si = input_x->scale[0];
+  float Sf = filter->scale[0];
+  float So = out->scale[0];
  // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
  //                      "Image channel should be equal to weight number");
@@ -36,8 +39,10 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
  auto bs_ptr =
      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
  for (int i = 0; i < channel; i++) {
-    bs_ptr[i + channel] = 1;
+    //    bs_ptr[i + channel] = 1;
-    bs_ptr[i] = input_z_ptr[i];
+    //    bs_ptr[i] = input_z_ptr[i];
+    bs_ptr[i + channel] = Si / So * Sf / 127.0f;
+    bs_ptr[i] = input_z_ptr[i] * 127.0f / So;
  }
  int num = (uint32_t)filter->dims()[1];
  int chw = (uint32_t)filter->dims()[0];

--- a/src/operators/kernel/fpga/V2/pad2d_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/pad2d_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef PAD2D_OP
-#include "operators/kernel/pad2d_kernel.h"
-namespace paddle_mobile {
-namespace operators {
-template <>
-bool Pad2DKernel<FPGA, float>::Init(Pad2DParam<FPGA> *param) {
-  Tensor *output = param->output_;
-  fpga::format_fp16_ofm(output);
-  return true;
-}
-void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) {
-  auto input_data = (input->data<half>());
-  auto output_data = (output->data<half>());
-  auto input_c = input->dims()[1];
-  auto input_h = input->dims()[2];
-  auto input_w = input->dims()[3];
-  auto output_c = output->dims()[1];
-  auto output_w = output->dims()[3];
-  auto copysize = input_c * input_w;
-  for (int h = 0; h < input_h; ++h) {
-    auto input_offset = h * input_c * input_w;
-    auto output_offset = h * paddle_mobile::fpga::align_to_x(
-                                 output_c * output_w, IMAGE_ALIGNMENT);
-    memcpy((output_data + output_offset), (input_data + input_offset),
-           copysize * sizeof(half));
-  }
-}
-template <>
-void Pad2DKernel<FPGA, float>::Compute(const Pad2DParam<FPGA> &param) {
-  auto in_x = param.input_;
-  auto out = param.output_;
-  fpga::fpga_invalidate((void *)in_x->data<half>(),  // NOLINT
-                        in_x->numel() * sizeof(half));
-  pad2dFunc(in_x, out);
-  (out->scale)[0] = (in_x->scale)[0];
-  (out->scale)[1] = (in_x->scale)[1];
-  DLOG << (out->scale)[0];
-  DLOG << (out->scale)[1];
-  size_t outputSize =
-      out->dims()[2] *
-      paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]),
-                                      IMAGE_ALIGNMENT) *
-      sizeof(half);
-  fpga::fpga_flush(out->data<half>(), outputSize);
-}
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif  // PAD2D_OP
--- a/src/operators/kernel/fpga/V2/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/pool_kernel.cpp
@@ -44,11 +44,13 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
  auto input_ptr = input->data<half>();
  fpga::format_fp16_ofm(output);
  auto output_ptr = output->mutable_data<half>();
+  float Si = input->scale[0];
+  float So = output->scale[0];
  fpga::PoolingArgs poolArgs = {0};
  poolArgs.mode = pooling_type == "max" ? 0 : 1;  // max:0, avg:1
-  poolArgs.kernel_reciprocal =
+  poolArgs.kernel_reciprocal = fpga::fp32_2_fp16(
-      fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1])));  // NOLINT
+      float(1.0 / (ksize[0] * ksize[1]) * Si / So));  // NOLINT
  poolArgs.image.address = input_ptr;
  poolArgs.image.channels = (uint32_t)input->dims()[1];
  poolArgs.image.height = (uint32_t)input->dims()[2];

--- a/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
@@ -18,8 +18,8 @@ limitations under the License. */
 #include <vector>
 #include "operators/kernel/detection_kernel.h"
-#include "fpga/V1/api.h"
+#include "fpga/V2/api.h"
-#include "fpga/V1/image.h"
+#include "fpga/V2/image.h"
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp
@@ -18,8 +18,8 @@ limitations under the License. */
 #include <vector>
 #include "operators/kernel/detection_kernel.h"
-#include "fpga/V1/api.h"
+#include "fpga/V2/api.h"
-#include "fpga/V1/image.h"
+#include "fpga/V2/image.h"
 namespace paddle_mobile {
 namespace operators {

--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -163,6 +163,26 @@ if (CON GREATER -1)
  set(SPLIT_OP ON)
  set(FUSION_DECONVADD_OP ON)
  set(FUSION_DECONVADDRELU_OP ON)
+  set(RESHAPE_OP ON)
+  set(FUSION_CONVADDBNRELU_OP ON)
+  set(FUSION_CONVADDBN_OP ON)
+  set(RESHAPE2_OP ON)
+  set(PSROI_POOL_OP ON)
+  set(ROIALIGN_POOL_OP ON)
+  set(PROPOSAL_OP ON)
+  set(ANCHOR_GENERATOR_OP ON)
+  set(SLICE_OP ON)
+  set(SIGMOID_OP ON)
+  set(CONCAT_OP ON)
+  set(CONV_TRANSPOSE_OP ON)
+  set(FUSION_DECONVADDBNRELU_OP ON)
+  set(FUSION_DECONVADDBN_OP ON)
+  set(FUSION_DECONVBNRELU_OP ON)
+  set(CONV_OP ON)
+  set(ELEMENTWISEMUL_OP ON)
+  set(FUSION_FCRELU_OP ON)
+  set(RELU_OP ON)
  set(FOUND_MATCH ON)
 endif()