From 4286caa26cf862e20e50ffe537a00c085f472278 Mon Sep 17 00:00:00 2001
From: qnqinan <qnqinan@163.com>
Date: Mon, 22 Apr 2019 20:37:21 +0800
Subject: [PATCH] update some files related to static quantization in FPGA V2
 track

---
 .../fpga/V2/conv_add_bn_relu_kernel.cpp       |  6 +-
 .../kernel/fpga/V2/conv_transpose_kernel.cpp  | 11 ++++
 .../kernel/fpga/V2/deconv_add_bn_kernel.cpp   | 12 +++-
 .../fpga/V2/deconv_add_bn_relu_kernel.cpp     | 12 +++-
 .../kernel/fpga/V2/deconv_add_kernel.cpp      | 12 +++-
 .../kernel/fpga/V2/deconv_add_relu_kernel.cpp | 12 +++-
 .../kernel/fpga/V2/deconv_bn_relu_kernel.cpp  | 23 +++++--
 .../kernel/fpga/V2/elementwise_add_kernel.cpp |  6 +-
 .../fpga/V2/elementwise_add_relu_kernel.cpp   |  6 +-
 .../kernel/fpga/V2/fusion_fc_kernel.cpp       |  9 ++-
 .../kernel/fpga/V2/fusion_fc_relu_kernel.cpp  |  9 ++-
 src/operators/kernel/fpga/V2/pad2d_kernel.cpp | 61 -------------------
 src/operators/kernel/fpga/V2/pool_kernel.cpp  |  6 +-
 tools/op.cmake                                | 20 ++++++
 14 files changed, 126 insertions(+), 79 deletions(-)
 delete mode 100644 src/operators/kernel/fpga/V2/pad2d_kernel.cpp
diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
index ded6654081..d16ec56d70 100644
--- a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
@@ -32,6 +32,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
+  const int groups = param->Groups();
   float Si = input->scale[0];
   float So = out->scale[0];
   float Sf = fpga::filter_find_max(filter);
@@ -63,9 +64,12 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
     //    bs_ptr[i] = new_bias_ptr[i];
     bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
     bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
+    if (groups == channel) {
+      new_scale_ptr[i] = new_scale_ptr[i] * Si / So;
+      new_bias_ptr[i] = new_bias_ptr[i] * 127.0f / So;
+    }
   }
 
-  const int groups = param->Groups();
   if (groups == channel) {
     fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
     fpga::DWconvArgs dwconv_arg = {0};
diff --git a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
index 1597885e43..76889b0dd9 100644
--- a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
@@ -32,6 +32,9 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
   // auto bias_ptr = bias->data<float>();
   auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
 
   // PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
   //                      "Output channel should be equal to bias number");
@@ -53,6 +56,10 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
   PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                         "filter axis should be the multiple of stride axis ");
   if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So;
+      bs_ptr[i] = 0;  // bias_ptr[i % (channel)];
+    }
     fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
                                sub_conv_n);
     fpga::DWDeconvArgs DWDeconv_arg = {0};
@@ -62,6 +69,10 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
                             param->Paddings()[0], param->Paddings()[1], bs_ptr);
     param->SetFpgaArgs(DWDeconv_arg);
   } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f;
+      bs_ptr[i] = 0;  // bias_ptr[i % (channel)];
+    }
     fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
     fpga::DeconvArgs deconv_arg = {0};
     fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
diff --git a/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp
index a8205df3c9..5e3417f8c6 100644
--- a/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp
@@ -32,7 +32,9 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
-
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                         "Output channel should be equal to bias number");
   int channel = out->dims()[1];
@@ -53,6 +55,10 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
   PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                         "filter axis should be the multiple of stride axis ");
   if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
     fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
                                sub_conv_n);
     fpga::DWDeconvArgs DWDeconv_arg = {0};
@@ -62,6 +68,10 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
                             param->Paddings()[0], param->Paddings()[1], bs_ptr);
     param->SetFpgaArgs(DWDeconv_arg);
   } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
     fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
     fpga::DeconvArgs deconv_arg = {0};
     fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
diff --git a/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp
index b27f5cf870..2913a628dd 100644
--- a/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp
@@ -33,7 +33,9 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
-
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                         "Output channel should be equal to bias number");
   int channel = out->dims()[1];
@@ -54,6 +56,10 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
   PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                         "filter axis should be the multiple of stride axis ");
   if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
     fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
                                sub_conv_n);
     fpga::DWDeconvArgs DWDeconv_arg = {0};
@@ -63,6 +69,10 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
                             param->Paddings()[0], param->Paddings()[1], bs_ptr);
     param->SetFpgaArgs(DWDeconv_arg);
   } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
     fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
     fpga::DeconvArgs deconv_arg = {0};
     fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
diff --git a/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp
index 41844d008b..dcafcbea9c 100644
--- a/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp
@@ -32,7 +32,9 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
-
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                         "Output channel should be equal to bias number");
   int channel = out->dims()[1];
@@ -53,6 +55,10 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
   PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                         "filter axis should be the multiple of stride axis ");
   if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
     fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
                                sub_conv_n);
     fpga::DWDeconvArgs DWDeconv_arg = {0};
@@ -62,6 +68,10 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
                             param->Paddings()[0], param->Paddings()[1], bs_ptr);
     param->SetFpgaArgs(DWDeconv_arg);
   } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
     fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
     fpga::DeconvArgs deconv_arg = {0};
     fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
diff --git a/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp
index c6fc9d1955..1364b4b5aa 100644
--- a/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp
@@ -33,7 +33,9 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
-
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                         "Output channel should be equal to bias number");
   int channel = out->dims()[1];
@@ -54,6 +56,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
   PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                         "filter axis should be the multiple of stride axis ");
   if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
     fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
                                sub_conv_n);
     fpga::DWDeconvArgs DWDeconv_arg = {0};
@@ -63,6 +69,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
                             param->Paddings()[0], param->Paddings()[1], bs_ptr);
     param->SetFpgaArgs(DWDeconv_arg);
   } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f;
+      bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So;
+    }
     fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
     fpga::DeconvArgs deconv_arg = {0};
     fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
diff --git a/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp
index 75597f0ecd..6aae1ea729 100644
--- a/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp
@@ -34,6 +34,9 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
+  float Si = input->scale[0];
+  float So = out->scale[0];
+  float Sf = fpga::filter_find_max(filter);
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
   auto bn_scale_ptr = param->InputScale()->data<float>();
@@ -56,12 +59,22 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
   int sub_conv_n = param->Strides()[0];
   auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *  // NOLINT
                                            sizeof(float));             // NOLINT
-
-  for (int i = 0; i < channel * sub_conv_n; i++) {
-    bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel];
-    bs_ptr[i] = new_bias_ptr[i % (channel)];
+  //  for (int i = 0; i < channel * sub_conv_n; i++) {
+  //    bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel];
+  //    bs_ptr[i] = new_bias_ptr[i % (channel)];
+  //  }
+  if (param->Groups() == channel) {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel] * Si / So;
+      bs_ptr[i] = new_bias_ptr[i % (channel)] * 127.0f / So;
+    }
+  } else {
+    for (int i = 0; i < channel * sub_conv_n; i++) {
+      bs_ptr[i + sub_conv_n * channel] =
+          new_scale_ptr[i % channel] * Si / So * Sf / 127.0f;
+      bs_ptr[i] = new_bias_ptr[i % (channel)] * 127.0f / So;
+    }
   }
-
   PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
                         "stride_width should be equal to stride_height ");
   PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
diff --git a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
index eec058edc4..145d7851f0 100644
--- a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp
@@ -34,7 +34,11 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
     auto input_y_ptr = input_y->data<half>();
     fpga::format_fp16_ofm(out);
     auto out_ptr = out->mutable_data<half>();
-
+    float Si_1 = input_x->scale[0];
+    float Si_2 = input_y->scale[0];
+    float So = out->scale[0];
+    float C1 = Si_1 / So;
+    float C2 = Si_2 / So;
     fpga::EWAddArgs ewaddArgs = {0};
     // ewaddArgs.relu_enabled = relu_enabled;
     ewaddArgs.output.activation.activation_type = activation_enable;
diff --git a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
index f36206a8a1..44266049a2 100644
--- a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp
@@ -32,7 +32,11 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
   auto input_y_ptr = input_y->data<half>();
   fpga::format_fp16_ofm(out);
   auto out_ptr = out->mutable_data<half>();
-
+  float Si_1 = input_x->scale[0];
+  float Si_2 = input_y->scale[0];
+  float So = out->scale[0];
+  float C1 = Si_1 / So;
+  float C2 = Si_2 / So;
   fpga::EWAddArgs ewaddArgs = {0};
   // ewaddArgs.relu_enabled = relu_enabled;
   ewaddArgs.output.activation.activation_type = activation_enable;
diff --git a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
index 3a29104d0f..1f85beb532 100644
--- a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp
@@ -29,6 +29,9 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
   auto out = param->Out();
+  float Si = input_x->scale[0];
+  float Sf = filter->scale[0];
+  float So = out->scale[0];
 
   // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
   //                     "Image channel should be equal to weight number");
@@ -36,8 +39,10 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
   auto bs_ptr =
       (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
   for (int i = 0; i < channel; i++) {
-    bs_ptr[i + channel] = 1;
-    bs_ptr[i] = input_z_ptr[i];
+    //    bs_ptr[i + channel] = 1;
+    //    bs_ptr[i] = input_z_ptr[i];
+    bs_ptr[i + channel] = Si / So * Sf / 127.0f;
+    bs_ptr[i] = input_z_ptr[i] * 127.0f / So;
   }
   int num = (uint32_t)filter->dims()[1];
   int chw = (uint32_t)filter->dims()[0];
diff --git a/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp b/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp
index fef370515e..0ccec45195 100644
--- a/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp
@@ -29,6 +29,9 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
   auto out = param->Out();
+  float Si = input_x->scale[0];
+  float Sf = filter->scale[0];
+  float So = out->scale[0];
 
   // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
   //                      "Image channel should be equal to weight number");
@@ -36,8 +39,10 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   auto bs_ptr =
       (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
   for (int i = 0; i < channel; i++) {
-    bs_ptr[i + channel] = 1;
-    bs_ptr[i] = input_z_ptr[i];
+    //    bs_ptr[i + channel] = 1;
+    //    bs_ptr[i] = input_z_ptr[i];
+    bs_ptr[i + channel] = Si / So * Sf / 127.0f;
+    bs_ptr[i] = input_z_ptr[i] * 127.0f / So;
   }
   int num = (uint32_t)filter->dims()[1];
   int chw = (uint32_t)filter->dims()[0];
diff --git a/src/operators/kernel/fpga/V2/pad2d_kernel.cpp b/src/operators/kernel/fpga/V2/pad2d_kernel.cpp
deleted file mode 100644
index e5328dc319..0000000000
--- a/src/operators/kernel/fpga/V2/pad2d_kernel.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef PAD2D_OP
-#include "operators/kernel/pad2d_kernel.h"
-namespace paddle_mobile {
-namespace operators {
-template <>
-bool Pad2DKernel<FPGA, float>::Init(Pad2DParam<FPGA> *param) {
-  Tensor *output = param->output_;
-  fpga::format_fp16_ofm(output);
-  return true;
-}
-void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) {
-  auto input_data = (input->data<half>());
-  auto output_data = (output->data<half>());
-  auto input_c = input->dims()[1];
-  auto input_h = input->dims()[2];
-  auto input_w = input->dims()[3];
-  auto output_c = output->dims()[1];
-  auto output_w = output->dims()[3];
-  auto copysize = input_c * input_w;
-  for (int h = 0; h < input_h; ++h) {
-    auto input_offset = h * input_c * input_w;
-    auto output_offset = h * paddle_mobile::fpga::align_to_x(
-                                 output_c * output_w, IMAGE_ALIGNMENT);
-    memcpy((output_data + output_offset), (input_data + input_offset),
-           copysize * sizeof(half));
-  }
-}
-template <>
-void Pad2DKernel<FPGA, float>::Compute(const Pad2DParam<FPGA> &param) {
-  auto in_x = param.input_;
-  auto out = param.output_;
-  fpga::fpga_invalidate((void *)in_x->data<half>(),  // NOLINT
-                        in_x->numel() * sizeof(half));
-  pad2dFunc(in_x, out);
-  (out->scale)[0] = (in_x->scale)[0];
-  (out->scale)[1] = (in_x->scale)[1];
-  DLOG << (out->scale)[0];
-  DLOG << (out->scale)[1];
-  size_t outputSize =
-      out->dims()[2] *
-      paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]),
-                                      IMAGE_ALIGNMENT) *
-      sizeof(half);
-  fpga::fpga_flush(out->data<half>(), outputSize);
-}
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif  // PAD2D_OP
diff --git a/src/operators/kernel/fpga/V2/pool_kernel.cpp b/src/operators/kernel/fpga/V2/pool_kernel.cpp
index 7c8dba1696..60bd3786aa 100644
--- a/src/operators/kernel/fpga/V2/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/pool_kernel.cpp
@@ -44,11 +44,13 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
   auto input_ptr = input->data<half>();
   fpga::format_fp16_ofm(output);
   auto output_ptr = output->mutable_data<half>();
+  float Si = input->scale[0];
+  float So = output->scale[0];
 
   fpga::PoolingArgs poolArgs = {0};
   poolArgs.mode = pooling_type == "max" ? 0 : 1;  // max:0, avg:1
-  poolArgs.kernel_reciprocal =
-      fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1])));  // NOLINT
+  poolArgs.kernel_reciprocal = fpga::fp32_2_fp16(
+      float(1.0 / (ksize[0] * ksize[1]) * Si / So));  // NOLINT
   poolArgs.image.address = input_ptr;
   poolArgs.image.channels = (uint32_t)input->dims()[1];
   poolArgs.image.height = (uint32_t)input->dims()[2];
diff --git a/tools/op.cmake b/tools/op.cmake
index 5847c60e94..eb6501de22 100755
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -163,6 +163,26 @@ if (CON GREATER -1)
   set(SPLIT_OP ON)
   set(FUSION_DECONVADD_OP ON)
   set(FUSION_DECONVADDRELU_OP ON)
+
+  set(RESHAPE_OP ON)
+  set(FUSION_CONVADDBNRELU_OP ON)
+  set(FUSION_CONVADDBN_OP ON)
+  set(RESHAPE2_OP ON)
+  set(PSROI_POOL_OP ON)
+  set(ROIALIGN_POOL_OP ON)
+  set(PROPOSAL_OP ON)
+  set(ANCHOR_GENERATOR_OP ON)
+  set(SLICE_OP ON)
+  set(SIGMOID_OP ON)
+  set(CONCAT_OP ON)
+  set(CONV_TRANSPOSE_OP ON)
+  set(FUSION_DECONVADDBNRELU_OP ON)
+  set(FUSION_DECONVADDBN_OP ON)
+  set(FUSION_DECONVBNRELU_OP ON)
+  set(CONV_OP ON)
+  set(ELEMENTWISEMUL_OP ON)
+  set(FUSION_FCRELU_OP ON)
+  set(RELU_OP ON)
   set(FOUND_MATCH ON)
 endif()
 
-- 
GitLab