Merge pull request #1464 from qnqinan/develop

add deconv bn relu op and update fetch op in FPGA track, fixed#1463

Merge pull request #1464 from qnqinan/develop
add deconv bn relu op and update fetch op in FPGA track, fixed#1463
03c0f0a3 · zhangyang0701 · GitHub · fa139e6b · 6b71cb59 · 03c0f0a3
9 changed files
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -113,6 +113,7 @@ const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform";
 const char *G_OP_TYPE_PAD2D = "pad2d";
 const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu";
 const char *G_OP_TYPE_FUSION_DECONV_ADD_BN = "fusion_deconv_add_bn";
+const char *G_OP_TYPE_FUSION_DECONV_BN_RELU = "fusion_deconv_bn_relu";
 std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
@@ -215,5 +216,6 @@ std::unordered_map<
        {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}},
        {G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_DECONV_BN_RELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}}};
 }  // namespace paddle_mobile
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -202,6 +202,7 @@ extern const char *G_OP_TYPE_ROI_PERSPECTIVE;
 extern const char *G_OP_TYPE_PAD2D;
 extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU;
 extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN;
+extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU;
 extern std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>

--- a/src/operators/fusion_deconv_bn_relu_op.cpp
+++ b/src/operators/fusion_deconv_bn_relu_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+// Everything in this translation unit is compiled only when
+// FUSION_DECONVBNRELU_OP is defined.
+#ifdef FUSION_DECONVBNRELU_OP
+#include "operators/fusion_deconv_bn_relu_op.h"
+namespace paddle_mobile {
+namespace operators {}
+}  // namespace paddle_mobile
+namespace ops = paddle_mobile::operators;
+// Registers FusionDeconvBNReluMatcher under the name fusion_deconv_bn_relu.
+REGISTER_FUSION_MATCHER(fusion_deconv_bn_relu, ops::FusionDeconvBNReluMatcher);
+// Nothing is registered in this file for the CPU or Mali GPU builds.
+#ifdef PADDLE_MOBILE_CPU
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+// FPGA is the only backend this file registers the operator for.
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_deconv_bn_relu, ops::FusionDeconvBNReluOp);
+#endif
+#endif  // FUSION_DECONVBNRELU_OP
--- a/src/operators/fusion_deconv_bn_relu_op.h
+++ b/src/operators/fusion_deconv_bn_relu_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVBNRELU_OP
+#pragma once
+#include <string>
+#include <vector>
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/deconv_bn_relu_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+// Fusion matcher for the operator pattern
+// G_OP_TYPE_CONV_TRANSPOSE -> G_OP_TYPE_BATCHNORM -> G_OP_TYPE_RELU.
+// Matched nodes are folded into an operator of type
+// G_OP_TYPE_FUSION_DECONV_BN_RELU (see Type()).
+class FusionDeconvBNReluMatcher : public framework::FusionOpMatcher {
+ public:
+  // Builds the pattern: node_ is the conv-transpose root, followed through
+  // operator> by a batch-norm node and then a relu node.
+  // NOTE(review): assumes operator> appends its right-hand node as the
+  // successor and returns it so the chain can continue - confirm in
+  // framework::Node.
+  FusionDeconvBNReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+  // Folds the matched sub-graph rooted at `node`.
+  //   node          - root of the matched sub-graph.
+  //   removed_nodes - forwarded to Node::Folder; presumably receives the
+  //                   nodes dropped by the fold.
+  // The map passed to Folder lists, for the batch-norm node, the input names
+  // Scale/Mean/Bias/Variance, each mapped to the same name.
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),
+                 {{G_OP_TYPE_BATCHNORM,
+                   {{"Scale", "Scale"},
+                    {"Mean", "Mean"},
+                    {"Bias", "Bias"},
+                    {"Variance", "Variance"}}}},
+                 removed_nodes);
+  }
+  // Operator type name given to the folded node.
+  std::string Type() { return G_OP_TYPE_FUSION_DECONV_BN_RELU; }
+};
+// Operator for the fused conv-transpose + batch-norm + relu pattern.
+// The computation itself lives in DeconvBNReluKernel; this class only wires
+// the parameter/kernel types together and infers the output shape.
+template <typename DeviceType, typename T>
+class FusionDeconvBNReluOp
+    : public framework::OperatorWithKernel<
+          DeviceType, FusionDeconvBNReluParam<DeviceType>,
+          operators::DeconvBNReluKernel<DeviceType, T>> {
+ public:
+  // Forwards all arguments unchanged to OperatorWithKernel.
+  FusionDeconvBNReluOp(const string &type, const VariableNameMap &inputs,
+                       const VariableNameMap &outputs,
+                       const framework::AttributeMap &attrs,
+                       std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionDeconvBNReluParam<DeviceType>,
+            operators::DeconvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
+                                                          attrs, scope) {}
+  // Validates the input/filter/attribute ranks and resizes the output tensor.
+  // The output shape is {in_dims[0], filter_dims[1] * groups, spatial...},
+  // where each spatial extent i is
+  //   (in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i]
+  //       + dilations[i] * (filter_dims[i + 2] - 1) + 1.
+  void InferShape() const {
+    auto input = this->param_.Input();
+    auto in_dims = input->dims();
+    auto filter = this->param_.Filter();
+    auto filter_dims = filter->dims();
+    std::vector<int> strides = this->param_.Strides();
+    std::vector<int> paddings = this->param_.Paddings();
+    std::vector<int> dilations = this->param_.Dilations();
+    int groups = this->param_.Groups();
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == 4 || in_dims.size() == 5,
+        "ConvTransposeOp input should be 4-D or 5-D tensor.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == filter_dims.size(),
+        "ConvTransposeOp input dimension and filter dimension "
+        "should be the same.");
+    // Two leading dimensions are not spatial, so the stride count must be
+    // rank - 2.
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() - strides.size() == 2U,
+        "ConvTransposeOp input dimension and strides dimension should "
+        "be consistent.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
+                          "ConvTransposeOp paddings dimension and strides "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
+                          "ConvTransposeOp paddings dimension and dilations "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims[1] == filter_dims[0],
+        "In ConvTransposeOp, The number of input channels should "
+        "be equal to the number of filter's channels.");
+    std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
+    for (size_t i = 0; i < strides.size(); ++i) {
+      // Extent covered by the dilated filter along spatial axis i.
+      auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
+      output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
+                             2 * paddings[i] + filter_extent);
+    }
+    this->param_.Output()->Resize(framework::make_ddim(output_shape));
+  }
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif  // FUSION_DECONVBNRELU_OP
--- a/src/operators/kernel/deconv_bn_relu_kernel.h
+++ b/src/operators/kernel/deconv_bn_relu_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVBNRELU_OP
+#pragma once
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+using framework::OpKernelBase;
+// Kernel interface of the fused deconv + batch-norm + relu operator.
+// Only the declarations live here; Init and Compute are defined per device
+// through explicit specializations.
+template <typename DeviceType, typename T>
+class DeconvBNReluKernel
+    : public framework::OpKernelBase<DeviceType,
+                                     FusionDeconvBNReluParam<DeviceType>> {
+ public:
+  // Prepares the kernel from `param`; returns a success flag.
+  bool Init(FusionDeconvBNReluParam<DeviceType> *param);
+  // Executes the kernel using the state held in `param`.
+  void Compute(const FusionDeconvBNReluParam<DeviceType> &param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V1/conv_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef CONV_OP
+#include "operators/kernel/conv_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Prepares the FPGA arguments of a plain convolution (activation NONE) and
+// stores them in `param` through SetFpgaArgs. Always returns true.
+template <>
+bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) {
+  auto in_tensor = const_cast<Tensor *>(param->Input());
+  auto weights = const_cast<Tensor *>(param->Filter());
+  auto out_tensor = param->Output();
+  int out_channels = out_tensor->dims()[1];
+  // 2 * out_channels floats: the first half is filled with 0 and the second
+  // half with 1 (presumably per-channel bias = 0 and scale = 1 - confirm
+  // against fpga::format_conv_data).
+  auto bias_scale =
+      (float *)fpga::fpga_malloc(2 * out_channels * sizeof(float));  // NOLINT
+  for (int c = 0; c < out_channels; ++c) {
+    bias_scale[c] = 0;
+    bias_scale[out_channels + c] = 1;
+  }
+  fpga::format_conv_data(weights, out_tensor, &bias_scale, param->Groups());
+  paddle_mobile::fpga::ActivationType activation =
+      paddle_mobile::fpga::NONE;
+  int16_t negative_slope = 0;
+  fpga::SplitConvArgs split_args = {0};
+  fpga::fill_split_arg(&split_args, in_tensor, out_tensor, weights,
+                       activation, negative_slope, param->Groups(),
+                       param->Strides()[0], param->Strides()[1],
+                       param->Paddings()[0], param->Paddings()[1], bias_scale);
+  param->SetFpgaArgs(split_args);
+  return true;
+}
+// Runs the convolution on the FPGA using the arguments that Init stored in
+// `param` via SetFpgaArgs.
+template <>
+void ConvKernel<FPGA, float>::Compute(const ConvParam<FPGA> &param) {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVBNRELU_OP
+#include "operators/kernel/deconv_bn_relu_kernel.h"
+#include <cmath>
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+// Prepares the FPGA arguments of the fused deconv + batch-norm + relu op.
+//
+// The batch-norm statistics are folded into one scale and one bias value per
+// output channel:
+//   new_scale = bn_scale / sqrt(bn_var + epsilon)
+//   new_bias  = bn_bias - bn_mean * new_scale
+// These are replicated Strides()[0] times into a buffer laid out as
+// [biases | scales]. The buffer is handed to the FPGA formatting/fill
+// helpers, and the resulting argument struct is stored in `param` through
+// SetFpgaArgs. When Groups() equals the output channel count the
+// DWDeconv argument path is used, otherwise the regular deconv path.
+// Always returns true.
+template <>
+bool DeconvBNReluKernel<FPGA, float>::Init(
+    FusionDeconvBNReluParam<FPGA> *param) {
+  // Relu is requested from the hardware as LEAKYRELU with a slope of 0.
+  paddle_mobile::fpga::ActivationType activation_enable =
+      paddle_mobile::fpga::LEAKYRELU;
+  int16_t leaky_relu_negative_slope = 0;
+  auto input = const_cast<Tensor *>(param->Input());
+  const Tensor *bias = param->InputBias();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = bias->data<float>();
+  const float epsilon = param->Epsilon();
+  // Validate everything up front so no buffer is allocated when a
+  // precondition fails.
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
+                        "stride_width should be equal to stride_height ");
+  PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
+                        "filter width should be equal to filter height ");
+  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
+                        "filter axis should be the multiple of stride axis ");
+  int channel = out->dims()[1];
+  // Scratch tensors with automatic storage: released on every exit path.
+  Tensor new_scale;
+  Tensor new_bias;
+  auto new_scale_ptr = new_scale.mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias.mutable_data<float>({channel});
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] / std::sqrt(bn_var_ptr[i] + epsilon);
+    new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
+  }
+  int sub_conv_n = param->Strides()[0];
+  // First channel * sub_conv_n entries hold biases, the rest hold scales.
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *  // NOLINT
+                                           sizeof(float));             // NOLINT
+  for (int i = 0; i < channel * sub_conv_n; i++) {
+    bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel];
+    bs_ptr[i] = new_bias_ptr[i % (channel)];
+  }
+  if (param->Groups() == channel) {
+    fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
+                               sub_conv_n);
+    fpga::DWDeconvArgs DWDeconv_arg = {0};
+    fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
+                            activation_enable, leaky_relu_negative_slope,
+                            param->Strides()[0], param->Strides()[1],
+                            param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(DWDeconv_arg);
+  } else {
+    fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
+    fpga::DeconvArgs deconv_arg = {0};
+    fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
+                          leaky_relu_negative_slope, param->Groups(),
+                          param->Strides()[0], param->Strides()[1],
+                          param->Paddings()[0], param->Paddings()[1], bs_ptr);
+    param->SetFpgaArgs(deconv_arg);
+  }
+  return true;
+}
+// Dispatches to the FPGA routine matching the argument struct Init stored:
+// the DWDeconv routine when Groups() equals the output channel count,
+// the regular deconv routine otherwise.
+template <>
+void DeconvBNReluKernel<FPGA, float>::Compute(
+    const FusionDeconvBNReluParam<FPGA> &param) {
+  const bool use_dw_path = (param.Groups() == param.Output()->dims()[1]);
+  if (!use_dw_path) {
+    fpga::ComputeFpgaDeconv(param.FpgaArgs());
+    return;
+  }
+  fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
@@ -46,24 +46,39 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
  return true;
 }
+// Copies input_h rows of input_c * input_w floats from `src` to `dst`,
+// removing the per-row padding.
+//   src - rows spaced align_to_x(input_c * input_w, 16) floats apart
+//         (presumably rounded up to a multiple of 16 - confirm in
+//         fpga::align_to_x).
+//   dst - receives the rows back to back; must not overlap `src`, since
+//         memcpy is used.
+void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
+  const int packed_row = input_c * input_w;
+  const int padded_row = paddle_mobile::fpga::align_to_x(packed_row, 16);
+  for (int row = 0; row < input_h; ++row) {
+    memcpy(dst + row * packed_row, src + row * padded_row,
+           packed_row * sizeof(float));
+  }
+}
 template <>
 void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
-  auto input = const_cast<Tensor *>(param.InputX());
+  auto input = param.InputX();
  if (input->type() == typeid(float)) {
    auto output = param.Out();
    output->ShareDataWith(*input);
    return;
  }
-  fpga::BypassArgs args = param.fpga_bypass_args;
+  fpga::PerformBypass(param.fpga_bypass_args);
-  auto input_address = (input->data<half>());
+  auto outC = param.Out()->dims()[1];
-  args.image.address = static_cast<void *>(input_address);
+  auto outH = param.Out()->dims()[2];
+  auto outW = param.Out()->dims()[3];
-  fpga::PerformBypass(args);
  fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
-                        param.fpga_bypass_args.image.channels * sizeof(float));
+                        outH *
+                            (paddle_mobile::fpga::align_to_x(outC * outW, 16)) *
+                            sizeof(float));
-  // TODO(zhangyang): DEalign: get rid of extra 0
+  float *outdata_ptr =
+      reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
+  float *data_tmp =
+      reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
+  dealign(outdata_ptr, data_tmp, outC, outH, outW);
+  memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float));
 }
 template class FetchKernel<FPGA, float>;

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -2535,6 +2535,62 @@ class FusionDeconvAddBNParam : public ConvTransposeParam<Dtype> {
  RType *new_scale_;
 };
 #endif
+#ifdef FUSION_DECONVBNRELU_OP
+// Parameters of the fused deconv + batch-norm + relu operator.
+// Extends ConvTransposeParam with the batch-norm inputs (Bias, Mean, Scale,
+// Variance) and the "epsilon" / "momentum" attributes, plus optional
+// pre-computed scale/bias tensors set through SetNewScale / SetNewBias.
+template <typename Dtype>
+class FusionDeconvBNReluParam : public ConvTransposeParam<Dtype> {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+ public:
+  // Resolves the output, the four batch-norm inputs and the epsilon/momentum
+  // attributes; everything else is handled by ConvTransposeParam.
+  FusionDeconvBNReluParam(const VariableNameMap &inputs,
+                          const VariableNameMap &outputs,
+                          const AttributeMap &attrs, const Scope &scope)
+      : ConvTransposeParam<Dtype>(inputs, outputs, attrs, scope) {
+    output_ = OpParam::OutFrom<GType>(outputs, scope);
+    input_bias_ = OpParam::InputBiasFrom<GType>(inputs, scope);
+    input_mean_ = OpParam::InputMeanFrom<GType>(inputs, scope);
+    input_scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
+    input_variance_ = OpParam::InputVarianceFrom<GType>(inputs, scope);
+    epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
+    momentum_ = OpParam::GetAttr<float>("momentum", attrs);
+  }
+  RType *Output() const { return output_; }
+  const RType *InputBias() const { return input_bias_; }
+  const RType *InputMean() const { return input_mean_; }
+  const RType *InputScale() const { return input_scale_; }
+  const RType *InputVariance() const { return input_variance_; }
+  const float &Epsilon() const { return epsilon_; }
+  const float &Momentum() const { return momentum_; }
+  const bool &IsTest() const { return is_test_; }
+  // The tensors are stored as raw pointers; this class never deletes them.
+  void SetNewScale(RType *new_scale) { new_scale_ = new_scale; }
+  void SetNewBias(RType *new_bias) { new_bias_ = new_bias; }
+  // Return nullptr until the matching setter has been called.
+  const RType *NewScale() const { return new_scale_; }
+  const RType *NewBias() const { return new_bias_; }
+ protected:
+  RType *output_;
+  RType *input_bias_;
+  RType *input_mean_;
+  RType *input_scale_;
+  RType *input_variance_;
+  float epsilon_;
+  float momentum_;
+  // Not read from the attributes by the constructor; given a defined value
+  // so IsTest() never reads an indeterminate bool.
+  // NOTE(review): confirm that false is the intended default.
+  bool is_test_ = false;
+  RType *new_bias_ = nullptr;
+  RType *new_scale_ = nullptr;
+};
+#endif
 #ifdef FUSION_DECONVADDBNRELU_OP
 template <typename Dtype>
 class FusionDeconvAddBNReluParam : public ConvTransposeParam<Dtype> {