Merge remote-tracking branch 'origin/develop' into develop

380c55f2 · qnqinan · dc318170 · 4fdc1eeb · 380c55f2 · 380c55f2
18 changed files
--- a/src/fpga/api/fpga_api.cpp
+++ b/src/fpga/api/fpga_api.cpp
@@ -35,7 +35,7 @@ namespace fpga {
 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";

-static inline int do_ioctl(int req, void *arg) {
-  return ioctl(req, (unsigned int64_t)arg);
+// Issues request `req` on the file-scope device descriptor `fd`, handing the
+// driver a pointer to the argument struct. Returns whatever ioctl() returns.
+//
+// The previous body omitted the descriptor (ioctl's first parameter) and cast
+// the pointer to `unsigned int64_t`, which is not a valid type; the pointer is
+// now passed through unchanged.
+static inline int do_ioctl(int req, const void *arg) {
+  return ioctl(fd, req, arg);
 }

@@ -58,12 +58,17 @@ void fpga_copy(void *dest, const void *src, size_t num) {
  memcpy(dest, src, num);
 }

-int ComputeFpgaConv(const struct ConvArgs &args) { return do_ioctl(21, &args); }
+int ComputeFpgaConv(const struct ConvArgs &args) {
+  return do_ioctl(IOCTL_CONFIG_CONV, &args);  // request code now comes from the _IOW macro in fpga_api.h; do_ioctl's result is returned as-is
+}
 int ComputeFpgaPool(const struct PoolingArgs &args) {
-  return do_ioctl(22, &args);
+  return do_ioctl(IOCTL_CONFIG_POOLING, &args);  // replaces the bare literal 22
 }
 int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
-  return do_ioctl(23, &args);
+  return do_ioctl(IOCTL_CONFIG_EW, &args);  // replaces the bare literal 23
+}
+int PerformBypass(const struct BypassArgs &args) {
+  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);  // new wrapper: forwards BypassArgs with the bypass request code
 }

 }  // namespace fpga

--- a/src/fpga/api/fpga_api.h
+++ b/src/fpga/api/fpga_api.h
@@ -86,12 +86,12 @@ struct ImageOutputArgs {

 struct ConvArgs {
  bool relu_enabled;
-  void* bias_address;
+  void* sb_address;  // scale and bias are interlaced;
  void* filter_address;
+  float* filter_scale_address;
  uint32_t filter_num;
  uint32_t group_num;

-  void* sb_address;  // scale and bias are interlaced;
  struct KernelArgs kernel;
  struct ImageInputArgs image;  // input image;
  struct ImageOutputArgs output;
@@ -116,6 +116,7 @@ struct EWAddArgs {

 struct BypassArgs {
  enum DataConvertType convert_type;
+  enum LayoutConvertType layout_type;  // new field; NOTE(review): this changes sizeof(BypassArgs), which IOCTL_CONFIG_BYPASS takes as its _IOW type argument - confirm the driver side matches
  struct ImageInputArgs image;
  struct ImageOutputArgs output;
 };
@@ -125,11 +126,6 @@ struct FpgaRegWriteArgs {
  uint64_t value;
 };

+// Argument block for IOCTL_FPGA_REG_READ. Kept in the header because that
+// request macro still passes `struct FpgaRegReadArgs` to _IOW(); deleting the
+// struct would break the first use of the macro.
 struct FpgaRegReadArgs {
  uint64_t address;
  uint64_t value;
 };

 #define IOCTL_FPGA_MAGIC 'FPGA'

 #define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
@@ -143,6 +139,7 @@ struct FpgaRegReadArgs {
 #define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
 #define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
 #define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
+#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
 #define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
 #define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)

@@ -172,6 +169,7 @@ enum FPGA_ERR_TYPE {

 //============================== API =============================

+int PerformBypass(const struct BypassArgs& args);
 int ComputeFpgaConv(const struct ConvArgs& args);
 int ComputeFpgaPool(const struct PoolingArgs& args);
 int ComputeFpgaEWAdd(const struct EWAddArgs& args);

--- a/src/fpga/fpga_quantilization.h
+++ b/src/fpga/fpga_quantilization.h
@@ -13,55 +13,40 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once

-#include <string>
 #include "common/types.h"
 #include "framework/lod_tensor.h"
-#include "framework/operator.h"
-#include "framework/scope.h"
 #include "framework/tensor.h"

 namespace paddle_mobile {

-bool is_conv(std::string type) {
-  if (type.compare(G_OP_TYPE_CONV) == 0) {
-    return true;
-  }
-  if (type.compare(G_OP_TYPE_FUSION_CONV_ADD) == 0) {
-    return true;
-  }
-  if (type.compare(G_OP_TYPE_FUSION_CONV_ADD_RELU) == 0) {
-    return true;
-  }
-  if (type.compare(G_OP_TYPE_FUSION_CONV_BN_RELU) == 0) {
-    return true;
-  }
-  if (type.compare(G_OP_TYPE_FUSION_CONV_ADD_BN) == 0) {
-    return true;
-  }
-  return false;
-}
-
 template <typename Dtype>
-void quantilize_op(std::shared_ptr<framework::OperatorBase<Dtype>> op,
-                   std::shared_ptr<framework::Scope> scope) {
-  if (!is_conv(op.get()->Type())) {
-    return;
-  }
-  framework::Tensor* filter = nullptr;
-  auto var_vec = op.get()->Inputs().at("Filter");
-  if (!var_vec.empty()) {
-    auto var = scope.get()->FindVar(var_vec[0]);
-    filter = var->template GetMutable<framework::LoDTensor>();
-  }
+framework::Tensor* quantilize_filter(framework::Tensor* filter) {
  float scale = 0;
-
  // 32bit filter -> 8bit filter;
+  float min = 0f;
+  float max = 0f;
  if (filter->type() == typeid(float)) {
+    float* floatData = originalFilter->data<float>();
+    for (int i = 0; i < filter->numel(); ++i) {
+      min = std::min(min, floatData[i]);
+      max = std::max(max, floatData[i]);
+    }
+
+    float fix_range = (float)((1 << (8 - 1)) - 1);
+    float float_range = max;
+    scale = (float_range / fix_range);
+
    framework::Tensor* originalFilter = filter;
    framework::Tensor* quantFilter = new framework::Tensor();
-    float* floatData = originalFilter->data<float>();
    int8_t* intData = quantFilter->mutable_data<int8_t>();
-  }
+    for (int i = 0; i < filter->numel(); ++i) {
+      intData[i] = (int8_t)floatData[i] * scale;
+    }
+    quantFilter.scale = scale;
+    // NCHW -> NHWC;
+    return quantFilter;
+  }
+  return filter;
 }

 }  // namespace paddle_mobile
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -257,10 +257,10 @@ class Tensor {
  struct FPGAArgs {
    float scale;

-    inline float *scale_pointer() const { return &scale; }
+    inline const float *scale_pointer() const { return &scale; }  // address of the member itself, so it is only valid while this FPGAArgs object is alive
  };

+  // Returns a reference, never a copy: the FPGA kernels store the address
+  // obtained from fpga_args().scale_pointer(), and that address would dangle
+  // immediately if it pointed into a temporary FPGAArgs.
  const struct FPGAArgs &fpga_args() const { return fpgaArgs_; }
 #endif

 private:

--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -32,10 +32,6 @@ limitations under the License. */
 #include "common/threadpool.h"
 #endif

-#ifdef PADDLE_MOBILE_FPGA
-#include "fpga/fpga_quantilization.h"
-#endif
-
 namespace paddle_mobile {
 using framework::Variable;

@@ -100,11 +96,6 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
  for (const auto &op : ops) {
    op->Init();
  }
-#ifdef PADDLE_MOBILE_FPGA
-  for (const auto &op : ops) {
-    quantilize_op(op, program_.scope);
-  }
-#endif
 }

 template <typename Dtype, Precision P>

--- a/src/operators/fusion_conv_add_bn_op.cpp
+++ b/src/operators/fusion_conv_add_bn_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDBN_OP
+
+#include "operators/fusion_conv_add_bn_op.h"
+#include "operators/math/conv_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Derives the output dims from the input dims, the filter dims and the conv
+// attributes (strides, paddings, dilations), then resizes the output tensor.
+// Output dims are {batch, filter count, one conv extent per spatial axis}.
+template <typename Dtype, typename T>
+void FusionConvAddBNOp<Dtype, T>::InferShape() const {
+  const auto &param = this->param_;
+  auto input_dims = param.Input()->dims();
+  auto kernel_dims = param.Filter()->dims();
+  const std::vector<int> &stride_list = param.Strides();
+  std::vector<int> padding_list = param.Paddings();
+  int group_count = param.Groups();
+  std::vector<int> dilation_list = param.Dilations();
+
+  PADDLE_MOBILE_ENFORCE((input_dims.size() == kernel_dims.size() &&
+                         dilation_list.size() == padding_list.size() &&
+                         padding_list.size() == stride_list.size()),
+                        "ConvParam is not suitable");
+
+  std::vector<int64_t> out_extents({input_dims[0], kernel_dims[0]});
+  for (size_t axis = 0; axis < stride_list.size(); ++axis) {
+    // Spatial axes start after the batch and channel dims, hence the +2.
+    const auto extent = math::ConvOutputSize(
+        input_dims[axis + 2], kernel_dims[axis + 2], dilation_list[axis],
+        padding_list[axis], stride_list[axis]);
+    out_extents.push_back(extent);
+  }
+
+  param.Output()->Resize(framework::make_ddim(out_extents));
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(fusion_conv_add_bn, ops::FusionConvAddBNOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_conv_add_bn, ops::FusionConvAddBNOp);
+#endif
+
+#endif
--- a/src/operators/fusion_conv_add_bn_op.h
+++ b/src/operators/fusion_conv_add_bn_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDBN_OP
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "op_param.h"
+#include "operators/kernel/conv_add_bn_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusionConvAddBNMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionConvAddBNMatcher() {  // describes the op sequence this matcher looks for
+    node_ = framework::Node(G_OP_TYPE_CONV);  // the pattern is rooted at a conv node
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
+        std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);  // chained as conv > elementwise_add > batchnorm
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),  // passes the pattern depth and the fused op type name
+                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},  // per-op name pairs handed to Folder; presumably old-name -> fused-name, confirm against Node::Folder
+                  {G_OP_TYPE_BATCHNORM,
+                   {{"Scale", "Scale"},
+                    {"Mean", "Mean"},
+                    {"Bias", "Bias"},
+                    {"Variance", "Variance"}}}},
+                 removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_BN; }  // type name passed to Folder in FolderNodes
+};
+
+template <typename DeviceType, typename T>
+class FusionConvAddBNOp : public framework::OperatorWithKernel<
+                              DeviceType, FusionConvAddBNParam,
+                              operators::ConvAddBNKernel<DeviceType, T>> {  // base is parameterized on the param type and the kernel type
+ public:
+  FusionConvAddBNOp(const string &type, const VariableNameMap &inputs,
+                    const VariableNameMap &outputs,
+                    const framework::AttributeMap &attrs,
+                    std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionConvAddBNParam,
+            operators::ConvAddBNKernel<DeviceType, T>>(type, inputs, outputs,
+                                                       attrs, scope) {}  // forwards every argument to the base constructor; no extra state
+
+  void InferShape() const override;  // defined in fusion_conv_add_bn_op.cpp
+
+ protected:  // no protected members are declared
+};
+
+// Defines the conv + add + bn fusion registrar once when the header is built
+// for any of the CPU, Mali GPU or FPGA backends. FUSION_CONV_ADD_BN_REGISTER
+// keeps the definition from being emitted twice.
+#if defined(PADDLE_MOBILE_CPU) || defined(PADDLE_MOBILE_MALI_GPU) || \
+    defined(PADDLE_MOBILE_FPGA)
+
+#ifndef FUSION_CONV_ADD_BN_REGISTER
+#define FUSION_CONV_ADD_BN_REGISTER
+static framework::FusionOpRegistrar fusion_conv_add_bn_registrar(
+    new FusionConvAddBNMatcher());
+#endif
+
+#endif
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(fusion_conv_add_bn);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(fusion_conv_add_bn);
+#endif
+
+#endif
--- a/src/operators/fusion_conv_add_bn_relu_op.cpp
+++ b/src/operators/fusion_conv_add_bn_relu_op.cpp
@@ -55,6 +55,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
 #ifdef PADDLE_MOBILE_MALI_GPU
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
 #endif

 #endif
--- a/src/operators/fusion_conv_add_bn_relu_op.h
+++ b/src/operators/fusion_conv_add_bn_relu_op.h
@@ -96,6 +96,13 @@ static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
 #endif

 #ifdef PADDLE_MOBILE_FPGA
+
+#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
+    new FusionConvAddBNReluMatcher());
+#define FUSION_CONV_ADD_BN_RELU_REGISTER
+#endif
+
 #endif

 }  // namespace operators
@@ -107,6 +114,7 @@ USE_OP_CPU(fusion_conv_add_bn_relu);
 #ifdef PADDLE_MOBILE_MALI_GPU
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(fusion_conv_add_bn_relu);
 #endif

 #endif
--- a/src/operators/fusion_conv_add_relu_op.cpp
+++ b/src/operators/fusion_conv_add_relu_op.cpp
@@ -54,6 +54,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
 #ifdef PADDLE_MOBILE_MALI_GPU
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp);
 #endif

 #endif
--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -75,6 +75,13 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
 #ifdef PADDLE_MOBILE_MALI_GPU
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+
+#ifndef CONV_ADD_RELU_REGISTER
+#define CONV_ADD_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
+    new FusionConvAddReluOpMatcher());
+#endif
+
 #endif

 }  // namespace operators
@@ -86,6 +93,7 @@ USE_OP_CPU(fusion_conv_add_relu);
 #ifdef PADDLE_MOBILE_MALI_GPU
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(fusion_conv_add_relu);
 #endif

 #endif
--- a/src/operators/kernel/central-arm-func/mul_arm_func.h
+++ b/src/operators/kernel/central-arm-func/mul_arm_func.h
@@ -19,6 +19,40 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {

+// 1. If both x and y are 2-D,
+// x = [[1,2],   y = [[5,6],
+//      [3,4]]        [7,8]]
+// the result is an ordinary matrix product: out =
+//  [[1*5+2*7,1*6+2*8],[3*5+4*7, 3*6+4*8]]
+//
+// 2. If x or y has more than 2 dims, e.g. x has dims (2,3,4), y has (4,1,2):
+// x = [[[1,2,3,4],
+//       [2,3,4,5],
+//       [3,4,5,6]],
+//      [[1,2,3,4],
+//       [2,3,4,5],
+//       [3,4,5,6]]]
+// y = [[[1,2]],
+//      [[3,4]],
+//      [[5,6]],
+//      [[7,8]]]
+// x_num_col_dims and y_num_col_dims are used to flatten x and y to 2-D.
+// The model supplies x_num_col_dims = 2 and y_num_col_dims = 1; the index
+// ranges below are half-open (left-closed, right-open).
+// (1) For x = (2,3,4), multiply the dims at index [0,x_num_col_dims), i.e.
+//     2*3 = 6, and the dims at index [x_num_col_dims,xdim.size()), i.e. 4,
+//     then rewrite the dims of Tensor x as (6,4).
+// (2) For y = (4,1,2), multiply the dims at index [0,y_num_col_dims), i.e. 4,
+//     and the dims at index [y_num_col_dims,ydim.size()), i.e. 1*2 = 2,
+//     then rewrite the dims of Tensor y as (4,2).
+// This does not change how x and y are laid out in memory.
+// x = [[1,2,3,4],             y = [[1,2],
+//      [2,3,4,5],                  [3,4],
+//      [3,4,5,6],   matmul         [5,6],
+//      [1,2,3,4],                  [7,8]]
+//      [2,3,4,5],
+//      [3,4,5,6]]
+// Finally x (6 rows, 4 cols) times y (4 rows, 2 cols) is computed as in
+// case 1, giving out (6 rows, 2 cols).
+
 template <typename P>
 void MulCompute(const MulParam &param) {
  const Tensor *input_x = param.InputX();

--- a/src/operators/kernel/fpga/conv_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_kernel.cpp
@@ -12,21 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef CONV_OP
+#pragma once

-#include "operators/kernel/conv_kernel.h"
+#ifdef FUSION_CONVADDBN_OP
+
+#include <vector>
+#include "framework/ddim.h"
+#include "framework/operator.h"
+#include "operators/math/conv_func.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"

 namespace paddle_mobile {
 namespace operators {

-template <>
-bool ConvKernel<FPGA, float>::Init(ConvParam *param) {
-  return true;
-}
+using framework::DDim;
+using framework::OpKernelBase;

-template <>
-void ConvKernel<FPGA, float>::Compute(const ConvParam &param) const {}
-template class ConvKernel<FPGA, float>;
+template <typename DeviceType, typename T>
+class ConvAddBNKernel : public OpKernelBase<DeviceType, FusionConvAddBNParam> {
+ public:
+  void Compute(const FusionConvAddBNParam &param) const;  // declared only; the FPGA/float specialization is in kernel/fpga/conv_add_bn_kernel.cpp
+  bool Init(FusionConvAddBNParam *param);  // declared only; takes a mutable param
+};

 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDBN_OP
+
+#include "operators/kernel/conv_add_bn_kernel.h"
+#include "fpga/api/fpga_api.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Prepares the FPGA convolution arguments for the fused conv + add +
+// batch-norm op and stores them on `param`.
+//
+// The elementwise-add bias and the batch-norm statistics are folded into one
+// (scale, bias) pair per output channel:
+//   new_scale = bn_scale / sqrt(bn_var + epsilon)
+//   new_bias  = bn_bias + (bias - bn_mean) * new_scale
+// The pairs are also written interleaved (scale, bias, scale, bias, ...) into
+// a buffer obtained from fpga::fpga_malloc. Always returns true.
+template <>
+bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
+  bool relu_enabled = false;
+  const Tensor *input = param->Input();
+  auto input_ptr = input->data<float>();
+  const Tensor *bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  const Tensor *filter = param->Filter();
+  auto filter_ptr = filter->data<float>();
+  Tensor *out = param->Output();
+  auto out_ptr = out->mutable_data<float>();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+
+  // Bias and batch-norm parameters are applied to the conv output, so there is
+  // one of each per filter. Size and validate against the filter count, not
+  // the input image's channel count (the two differ for most layers).
+  const int channel = filter->dims()[0];
+  PADDLE_MOBILE_ENFORCE(channel == bias->dims()[0] &&
+                            bias->dims()[0] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+
+  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  Tensor *new_scale = new Tensor();
+  Tensor *new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias->mutable_data<float>({channel});
+
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] /
+                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
+    new_bias_ptr[i] =
+        bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
+    bs_ptr[i * 2] = new_scale_ptr[i];
+    bs_ptr[i * 2 + 1] = new_bias_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+
+  fpga::ConvArgs convArgs;
+  convArgs.relu_enabled = relu_enabled;
+  convArgs.filter_address = (void *)filter_ptr;
+  convArgs.filter_num = filter->dims()[0];
+  convArgs.group_num = param->Groups();
+  convArgs.sb_address = (void *)bs_ptr;
+  convArgs.kernel.stride_h = param->Strides()[0];
+  convArgs.kernel.stride_w = param->Strides()[1];
+  convArgs.kernel.height = filter->dims()[2];
+  convArgs.kernel.width = filter->dims()[3];
+  convArgs.image.address = (void *)input_ptr;
+  convArgs.image.channels = input->dims()[1];
+  convArgs.image.height = input->dims()[2];
+  convArgs.image.width = input->dims()[3];
+  convArgs.image.pad_height = param->Paddings()[0];
+  convArgs.image.pad_width = param->Paddings()[1];
+  convArgs.image.scale_address = input->fpga_args().scale_pointer();
+  convArgs.output.address = (void *)out_ptr;
+  convArgs.output.scale_address = out->fpga_args().scale_pointer();
+  param->SetFpgaArgs(convArgs);
+  return true;
+}
+
+// Hands the ConvArgs stored on `param` (Init() stores them via SetFpgaArgs)
+// to the FPGA convolution call. The call's return value is not inspected.
+template <>
+void ConvAddBNKernel<FPGA, float>::Compute(
+    const FusionConvAddBNParam &param) const {
+  const auto &conv_args = param.FpgaArgs();
+  fpga::ComputeFpgaConv(conv_args);
+}
+template class ConvAddBNKernel<FPGA, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDBNRELU_OP
+
+#include "operators/kernel/conv_add_bn_relu_kernel.h"
+#include "memory/t_malloc.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Prepares the FPGA convolution arguments for the fused conv + add +
+// batch-norm + relu op and stores them on `param`.
+//
+// The elementwise-add bias and the batch-norm statistics are folded into one
+// (scale, bias) pair per output channel:
+//   new_scale = bn_scale / sqrt(bn_var + epsilon)
+//   new_bias  = bn_bias + (bias - bn_mean) * new_scale
+// The pairs are also written interleaved (scale, bias, scale, bias, ...) into
+// a buffer obtained from fpga::fpga_malloc. relu_enabled is set in the
+// arguments. Always returns true.
+template <>
+bool ConvAddBNReluKernel<FPGA, float>::Init(FusionConvAddBNReluParam *param) {
+  bool relu_enabled = true;
+  const Tensor *input = param->Input();
+  auto input_ptr = input->data<float>();
+  const Tensor *bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  const Tensor *filter = param->Filter();
+  auto filter_ptr = filter->data<float>();
+  Tensor *out = param->Output();
+  auto out_ptr = out->mutable_data<float>();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+
+  // Bias and batch-norm parameters are applied to the conv output, so there is
+  // one of each per filter. Size and validate against the filter count, not
+  // the input image's channel count (the two differ for most layers).
+  const int channel = filter->dims()[0];
+  PADDLE_MOBILE_ENFORCE(channel == bias->dims()[0] &&
+                            bias->dims()[0] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+
+  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  Tensor *new_scale = new Tensor();
+  Tensor *new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias->mutable_data<float>({channel});
+
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] /
+                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
+    new_bias_ptr[i] =
+        bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
+    bs_ptr[i * 2] = new_scale_ptr[i];
+    bs_ptr[i * 2 + 1] = new_bias_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+
+  fpga::ConvArgs convArgs;
+  convArgs.relu_enabled = relu_enabled;
+  convArgs.filter_address = (void *)filter_ptr;
+  convArgs.filter_num = filter->dims()[0];
+  convArgs.group_num = param->Groups();
+  convArgs.sb_address = (void *)bs_ptr;
+  convArgs.kernel.stride_h = param->Strides()[0];
+  convArgs.kernel.stride_w = param->Strides()[1];
+  convArgs.kernel.height = filter->dims()[2];
+  convArgs.kernel.width = filter->dims()[3];
+  convArgs.image.address = (void *)input_ptr;
+  convArgs.image.channels = input->dims()[1];
+  convArgs.image.height = input->dims()[2];
+  convArgs.image.width = input->dims()[3];
+  convArgs.image.pad_height = param->Paddings()[0];
+  convArgs.image.pad_width = param->Paddings()[1];
+  convArgs.image.scale_address = input->fpga_args().scale_pointer();
+  convArgs.output.address = (void *)out_ptr;
+  convArgs.output.scale_address = out->fpga_args().scale_pointer();
+  param->SetFpgaArgs(convArgs);
+  return true;
+}
+
+// Hands the ConvArgs stored on `param` (Init() stores them via SetFpgaArgs)
+// to the FPGA convolution call. The call's return value is not inspected.
+template <>
+void ConvAddBNReluKernel<FPGA, float>::Compute(
+    const FusionConvAddBNReluParam &param) const {
+  const auto &conv_args = param.FpgaArgs();
+  fpga::ComputeFpgaConv(conv_args);
+}
+template class ConvAddBNReluKernel<FPGA, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDRELU_OP
+
+#include "operators/kernel/conv_add_relu_kernel.h"
+#include "common/enforce.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Prepares the FPGA convolution arguments for the fused conv + add + relu op
+// and stores them on `param`.
+//
+// There is no batch-norm here, so each output channel gets scale 1 and the
+// elementwise-add bias, written interleaved (scale, bias, scale, bias, ...)
+// into a buffer obtained from fpga::fpga_malloc. relu_enabled is set in the
+// arguments. Always returns true.
+template <>
+bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam *param) {
+  bool relu_enabled = true;
+  const Tensor *input = param->Input();
+  auto input_ptr = input->data<float>();
+  const Tensor *bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  const Tensor *filter = param->Filter();
+  auto filter_ptr = filter->data<float>();
+  Tensor *out = param->Output();
+  auto out_ptr = out->mutable_data<float>();
+
+  // The bias is added to the conv output, so there is one entry per filter.
+  // Size and validate against the filter count, not the input image's channel
+  // count (the two differ for most layers).
+  int channel = filter->dims()[0];
+  PADDLE_MOBILE_ENFORCE(channel == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i * 2] = 1;
+    bs_ptr[i * 2 + 1] = bias_ptr[i];
+  }
+
+  fpga::ConvArgs convArgs;
+  convArgs.relu_enabled = relu_enabled;
+  convArgs.filter_address = (void *)filter_ptr;
+  convArgs.filter_num = filter->dims()[0];
+  convArgs.group_num = param->Groups();
+  convArgs.sb_address = (void *)bs_ptr;
+  convArgs.kernel.stride_h = param->Strides()[0];
+  convArgs.kernel.stride_w = param->Strides()[1];
+  convArgs.kernel.height = filter->dims()[2];
+  convArgs.kernel.width = filter->dims()[3];
+  convArgs.image.address = (void *)input_ptr;
+  convArgs.image.channels = input->dims()[1];
+  convArgs.image.height = input->dims()[2];
+  convArgs.image.width = input->dims()[3];
+
+  convArgs.image.pad_height = param->Paddings()[0];
+  convArgs.image.pad_width = param->Paddings()[1];
+  convArgs.image.scale_address = input->fpga_args().scale_pointer();
+  convArgs.output.address = (void *)out_ptr;
+  convArgs.output.scale_address = out->fpga_args().scale_pointer();
+  param->SetFpgaArgs(convArgs);
+  return true;
+}
+
+// Hands the ConvArgs stored on `param` (Init() stores them via SetFpgaArgs)
+// to the FPGA convolution call. The call's return value is not inspected.
+template <>
+void ConvAddReluKernel<FPGA, float>::Compute(
+    const FusionConvAddReluParam &param) const {
+  const auto &conv_args = param.FpgaArgs();
+  fpga::ComputeFpgaConv(conv_args);
+}
+template class ConvAddReluKernel<FPGA, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1136,7 +1136,7 @@ class FusionConvAddBNParam : public OpParam {

  const Tensor *Filter() const { return filter_; }

-  Tensor *OutputY() const { return output_y_; }
+  Tensor *Output() const { return output_y_; }  // renamed from OutputY(); still returns output_y_. NOTE(review): confirm no remaining caller uses the old name

  const vector<int> &Strides() const { return strides_; }


--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -21,6 +21,7 @@ elseif("resnet" IN_LIST NET)
    # gen test
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)
+elseif("FPGAnets" IN_LIST NET)
 else ()

    # gen test