Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
977a21b1
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
977a21b1
编写于
8月 24, 2018
作者:
Y
yangfei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
imp fusion_conv_add_prelu and fusion_conv_add_add_prelu op
上级
c81fdf55
变更
23
隐藏空白更改
内联
并排
Showing
23 changed file
with
1663 addition
and
15 deletion
+1663
-15
src/common/types.cpp
src/common/types.cpp
+4
-0
src/common/types.h
src/common/types.h
+2
-0
src/framework/operator.h
src/framework/operator.h
+2
-0
src/framework/program/program-optimize/node.cpp
src/framework/program/program-optimize/node.cpp
+107
-4
src/framework/program/program-optimize/node.h
src/framework/program/program-optimize/node.h
+8
-0
src/framework/program/program-optimize/program_optimize.cpp
src/framework/program/program-optimize/program_optimize.cpp
+46
-6
src/operators/fusion_conv_add_add_prelu.cpp
src/operators/fusion_conv_add_add_prelu.cpp
+60
-0
src/operators/fusion_conv_add_add_prelu_op.h
src/operators/fusion_conv_add_add_prelu_op.h
+110
-0
src/operators/fusion_conv_add_prelu_op.cpp
src/operators/fusion_conv_add_prelu_op.cpp
+60
-0
src/operators/fusion_conv_add_prelu_op.h
src/operators/fusion_conv_add_prelu_op.h
+107
-0
src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp
src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp
+38
-0
src/operators/kernel/arm/conv_add_prelu_kernel.cpp
src/operators/kernel/arm/conv_add_prelu_kernel.cpp
+38
-0
src/operators/kernel/arm/prelu_kernel.cpp
src/operators/kernel/arm/prelu_kernel.cpp
+57
-5
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
...ors/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
+137
-0
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
...erators/kernel/central-arm-func/conv_add_prelu_arm_func.h
+127
-0
src/operators/kernel/conv_add_add_prelu_kernel.h
src/operators/kernel/conv_add_add_prelu_kernel.h
+45
-0
src/operators/kernel/conv_add_prelu_kernel.h
src/operators/kernel/conv_add_prelu_kernel.h
+45
-0
src/operators/math/gemm.cpp
src/operators/math/gemm.cpp
+423
-0
src/operators/math/gemm.h
src/operators/math/gemm.h
+16
-0
src/operators/math/math_function.cpp
src/operators/math/math_function.cpp
+32
-0
src/operators/math/math_function.h
src/operators/math/math_function.h
+5
-0
src/operators/op_param.h
src/operators/op_param.h
+186
-0
tools/op.cmake
tools/op.cmake
+8
-0
未找到文件。
src/common/types.cpp
浏览文件 @
977a21b1
...
...
@@ -23,6 +23,8 @@ const char *G_OP_TYPE_BOX_CODER = "box_coder";
const
char
*
G_OP_TYPE_CONCAT
=
"concat"
;
const
char
*
G_OP_TYPE_ELEMENTWISE_ADD
=
"elementwise_add"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_RELU
=
"fusion_conv_add_relu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_PRELU
=
"fusion_conv_add_prelu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
=
"fusion_conv_add_add_prelu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_BN_RELU
=
"fusion_conv_add_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_DWCONV_BN_RELU
=
"fusion_dwconv_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_BN_RELU
=
"fusion_conv_bn_relu"
;
...
...
@@ -83,6 +85,8 @@ std::unordered_map<
{
G_OP_TYPE_RESHAPE
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_DEPTHWISE_CONV
,
{{
"Input"
},
{
"Output"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_RELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_PRELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_IM2SEQUENCE
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_DROPOUT
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_BN
,
{{
"Input"
},
{
"Y"
}}},
...
...
src/common/types.h
浏览文件 @
977a21b1
...
...
@@ -85,6 +85,8 @@ extern const char *G_OP_TYPE_BOX_CODER;
extern
const
char
*
G_OP_TYPE_CONCAT
;
extern
const
char
*
G_OP_TYPE_ELEMENTWISE_ADD
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_RELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_PRELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
;
extern
const
char
*
G_OP_TYPE_FC
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_BN_RELU
;
...
...
src/framework/operator.h
浏览文件 @
977a21b1
...
...
@@ -183,6 +183,8 @@ class FusionOpMatcher {
std
::
string
BeginType
()
{
return
node_
.
Type
();
}
virtual
std
::
vector
<
std
::
pair
<
int
,
std
::
string
>>
NeedCheck
()
{
return
{};
}
// virtual bool Fusion();
protected:
Node
node_
;
...
...
src/framework/program/program-optimize/node.cpp
浏览文件 @
977a21b1
...
...
@@ -14,15 +14,33 @@ limitations under the License. */
#include "framework/program/program-optimize/node.h"
#include <algorithm>
#include <map>
#include <memory>
#include "framework/operator.h"
namespace
paddle_mobile
{
namespace
framework
{
std
::
vector
<
Node
*>
Node
::
operator
[](
int
index
)
{
std
::
vector
<
Node
*>
nodes
;
GetNodesWithLocation
(
index
,
0
,
&
nodes
);
return
nodes
;
}
void
Node
::
GetNodesWithLocation
(
int
index
,
int
now_index
,
std
::
vector
<
Node
*>
*
nodes
)
{
if
(
index
==
now_index
)
{
nodes
->
push_back
(
this
);
}
for
(
int
i
=
0
;
i
<
this
->
outputs_
.
size
();
++
i
)
{
this
->
outputs_
[
i
]
->
GetNodesWithLocation
(
index
,
now_index
+
1
,
nodes
);
}
}
Node
&
Node
::
operator
>
(
std
::
shared_ptr
<
Node
>
node
)
{
outputs_
.
push_back
(
node
);
std
::
shared_ptr
<
Node
>
this_node
;
node
->
inputs_
.
push_back
(
this
);
return
*
node
;
}
...
...
@@ -31,19 +49,70 @@ bool Node::operator==(const Node &in) {
if
(
in
.
type_
==
this
->
type_
)
{
if
(
this
->
outputs_
.
size
()
==
in
.
outputs_
.
size
())
{
for
(
int
i
=
0
;
i
<
outputs_
.
size
();
++
i
)
{
if
(
!
(
*
outputs_
[
i
]
==
*
in
.
outputs_
[
i
]))
{
if
(
!
(
this
->
outputs_
[
i
]
->
MedianEqual
(
*
in
.
outputs_
[
i
])))
{
return
false
;
}
}
}
else
{
return
false
;
}
}
else
{
return
false
;
}
return
true
;
}
bool
Node
::
MedianEqual
(
const
Node
&
in
)
{
if
(
in
.
type_
==
this
->
type_
)
{
if
(
this
->
outputs_
.
size
()
==
in
.
outputs_
.
size
())
{
// if (this->inputs_.size() != in.inputs_.size()) {
// DLOG << " == - this input size: " << this->inputs_.size();
// DLOG << " == - ptr of this " << this;
// DLOG << " == - in input size: " << in.inputs_.size();
// DLOG << " == - input size not equal ";
// return false;
// } else {
// for (int i = 0; i < this->inputs_.size(); ++i) {
// if (this->inputs_[i]->type_ != in.inputs_[i]->type_) {
// DLOG << " == - input type not equal ";
// return false;
// }
// }
// }
for
(
int
i
=
0
;
i
<
outputs_
.
size
();
++
i
)
{
if
(
!
((
*
outputs_
[
i
]).
MedianEqual
(
*
in
.
outputs_
[
i
])))
{
return
false
;
}
}
}
else
{
// DLOG << " == - output size not equal ";
return
false
;
}
}
else
{
// DLOG << " == - median type is not equal ";
return
false
;
}
return
true
;
}
std
::
map
<
std
::
string
,
Node
*>
Node
::
Relationship
()
{
std
::
map
<
std
::
string
,
Node
*>
map
;
RelationshipPrivate
(
&
map
);
return
map
;
}
void
Node
::
RelationshipPrivate
(
std
::
map
<
std
::
string
,
Node
*>
*
map
)
{
for
(
auto
output
:
op_desc_
->
outputs_
)
{
for
(
auto
output_key
:
output
.
second
)
{
(
*
map
)[
output_key
]
=
this
;
}
}
for
(
auto
output
:
this
->
outputs_
)
{
output
->
RelationshipPrivate
(
map
);
}
}
std
::
shared_ptr
<
Node
>
Node
::
To
(
int
size
)
{
std
::
shared_ptr
<
Node
>
node
=
std
::
make_shared
<
Node
>
();
this
->
To
(
size
-
1
,
node
);
...
...
@@ -51,7 +120,9 @@ std::shared_ptr<Node> Node::To(int size) {
}
void
Node
::
To
(
int
index
,
std
::
shared_ptr
<
Node
>
node
)
{
node
->
op_desc_
=
this
->
op_desc_
;
node
->
type_
=
this
->
type_
;
node
->
inputs_
=
this
->
inputs_
;
if
(
index
!=
0
)
{
}
else
{
return
;
...
...
@@ -100,8 +171,19 @@ void Node::Folder(
if
(
change
->
find
(
this
->
type_
)
!=
change
->
end
())
{
auto
change_pairs
=
(
*
change
)[
this
->
type_
];
for
(
const
auto
&
change_pair
:
change_pairs
)
{
op_desc
->
GetInputs
()[
change_pair
.
second
]
=
this
->
op_desc_
->
GetInputs
()[
change_pair
.
first
];
std
::
map
<
std
::
string
,
int
>
f
;
if
(
this
->
op_desc_
->
GetInputs
().
find
(
change_pair
.
first
)
!=
this
->
op_desc_
->
GetInputs
().
end
())
{
if
(
op_desc
->
GetInputs
().
find
(
change_pair
.
second
)
!=
op_desc
->
GetInputs
().
end
())
{
for
(
auto
value
:
this
->
op_desc_
->
GetInputs
()[
change_pair
.
first
])
{
op_desc
->
GetInputs
()[
change_pair
.
second
].
push_back
(
value
);
}
}
else
{
op_desc
->
GetInputs
()[
change_pair
.
second
]
=
this
->
op_desc_
->
GetInputs
()[
change_pair
.
first
];
}
}
}
}
...
...
@@ -110,7 +192,28 @@ void Node::Folder(
}
if
(
index
>
0
)
{
--
index
;
for
(
auto
output
:
outputs_
)
{
if
(
change
->
find
(
this
->
type_
)
!=
change
->
end
())
{
auto
change_pairs
=
(
*
change
)[
this
->
type_
];
for
(
const
auto
&
change_pair
:
change_pairs
)
{
std
::
map
<
std
::
string
,
int
>
f
;
if
(
this
->
op_desc_
->
GetOutputs
().
find
(
change_pair
.
first
)
!=
this
->
op_desc_
->
GetOutputs
().
end
())
{
if
(
op_desc
->
GetInputs
().
find
(
change_pair
.
second
)
!=
op_desc
->
GetInputs
().
end
())
{
for
(
auto
value
:
this
->
op_desc_
->
GetOutputs
()[
change_pair
.
first
])
{
op_desc
->
GetInputs
()[
change_pair
.
second
].
push_back
(
value
);
}
}
else
{
op_desc
->
GetInputs
()[
change_pair
.
second
]
=
this
->
op_desc_
->
GetOutputs
()[
change_pair
.
first
];
}
}
}
}
removed_nodes
->
push_back
(
output
);
output
->
Folder
(
op_desc
,
outputs
,
index
,
change
,
begin_node
,
removed_nodes
);
...
...
src/framework/program/program-optimize/node.h
浏览文件 @
977a21b1
...
...
@@ -35,6 +35,7 @@ class Node {
:
op_desc_
(
op_desc
),
type_
(
op_desc
->
Type
())
{}
Node
&
operator
>
(
std
::
shared_ptr
<
Node
>
node
);
bool
operator
==
(
const
Node
&
in
);
bool
MedianEqual
(
const
Node
&
in
);
#ifdef PADDLE_MOBILE_DEBUG
std
::
string
ToString
()
const
;
...
...
@@ -50,7 +51,14 @@ class Node {
std
::
shared_ptr
<
framework
::
OpDesc
>
OpDescOfNode
()
{
return
op_desc_
;
}
std
::
string
Type
()
{
return
type_
;
}
std
::
vector
<
Node
*>
operator
[](
int
index
);
std
::
map
<
std
::
string
,
Node
*>
Relationship
();
private:
void
RelationshipPrivate
(
std
::
map
<
std
::
string
,
Node
*>
*
map
);
void
GetNodesWithLocation
(
int
index
,
int
now_index
,
std
::
vector
<
Node
*>
*
nodes
);
void
To
(
int
index
,
std
::
shared_ptr
<
Node
>
);
void
Folder
(
std
::
shared_ptr
<
framework
::
OpDesc
>
op_desc
,
...
...
src/framework/program/program-optimize/program_optimize.cpp
浏览文件 @
977a21b1
...
...
@@ -29,9 +29,15 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
for
(
int
i
=
0
;
i
<
optimize_program
->
Blocks
().
size
();
++
i
)
{
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Node
>>
output_nodes
;
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
std
::
shared_ptr
<
Node
>>>
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
std
::
pair
<
std
::
shared_ptr
<
Node
>
,
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Node
>>>>>
type_map
;
std
::
unordered_map
<
std
::
string
,
bool
>
output_has
;
std
::
vector
<
std
::
shared_ptr
<
Node
>>
nodes
;
std
::
shared_ptr
<
Node
>
begin_node
;
...
...
@@ -50,7 +56,7 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
nodes
.
push_back
(
node
);
//
type_map
[
op
->
Type
()].
push_back
(
node
);
type_map
[
op
->
Type
()].
push_back
(
{
node
,
output_nodes
}
);
if
(
j
==
0
)
{
begin_node
=
node
;
...
...
@@ -69,6 +75,7 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
}
auto
output_keys
=
op_input_output_key
.
at
(
op_type
).
second
;
for
(
auto
output_key
:
output_keys
)
{
auto
op_outputs
=
op
->
Output
(
output_key
);
for
(
int
k
=
0
;
k
<
op_outputs
.
size
();
++
k
)
{
...
...
@@ -83,11 +90,42 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
auto
match_vector
=
type_map
[
matcher
->
BeginType
()];
for
(
auto
&
match_node
:
match_vector
)
{
for
(
auto
&
match_node_pair
:
match_vector
)
{
auto
match_node
=
match_node_pair
.
first
;
auto
node_has
=
match_node_pair
.
second
;
auto
depth
=
matcher
->
BeginNode
().
Depth
();
auto
sub_node
=
match_node
->
To
(
depth
);
// DLOG << " sub node: " << *sub_node;
if
(
*
sub_node
==
matcher
->
BeginNode
())
{
bool
can_folder
=
true
;
auto
relationship_map
=
sub_node
->
Relationship
();
for
(
auto
to_check
:
matcher
->
NeedCheck
())
{
// if (node_has)
auto
nodes
=
(
*
sub_node
)[
to_check
.
first
];
for
(
auto
node
:
nodes
)
{
auto
inputs_to_check
=
node
->
OpDescOfNode
()
->
Input
(
to_check
.
second
);
for
(
auto
input_to_check
:
inputs_to_check
)
{
if
(
node_has
.
find
(
input_to_check
)
==
node_has
.
end
())
{
if
(
relationship_map
.
find
(
input_to_check
)
==
relationship_map
.
end
())
{
can_folder
=
false
;
}
else
{
}
}
}
}
}
if
(
!
can_folder
)
{
continue
;
}
// DLOG << " match success " << " fusion node: \n" <<
// matcher->BeginNode() << "\nsub node: \n" << *sub_node;
// DLOG << "match node\n"<< *match_node;
...
...
@@ -95,11 +133,13 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
std
::
vector
<
std
::
shared_ptr
<
Node
>>
removed_nodes
;
matcher
->
FolderNodes
(
match_node
.
get
(),
&
removed_nodes
);
for
(
int
j
=
0
;
j
<
removed_nodes
.
size
();
++
j
)
{
auto
removed_node
=
removed_nodes
[
j
];
for
(
int
k
=
removed_nodes
.
size
()
-
1
;
k
>=
0
;
--
k
)
{
auto
removed_node
=
removed_nodes
[
k
];
auto
removed_ite
=
std
::
find
(
nodes
.
begin
(),
nodes
.
end
(),
removed_node
);
nodes
.
erase
(
removed_ite
);
if
(
removed_ite
!=
nodes
.
end
())
{
nodes
.
erase
(
removed_ite
);
}
}
}
}
...
...
src/operators/fusion_conv_add_add_prelu.cpp
0 → 100644
浏览文件 @
977a21b1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#include "fusion_conv_add_add_prelu_op.h"
#include "operators/math/conv_func.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
Dtype
,
typename
T
>
void
FusionConvAddAddPReluOp
<
Dtype
,
T
>::
InferShape
()
const
{
auto
in_dims
=
this
->
param_
.
Input
()
->
dims
();
auto
filter_dims
=
this
->
param_
.
Filter
()
->
dims
();
const
std
::
vector
<
int
>
&
strides
=
this
->
param_
.
Strides
();
std
::
vector
<
int
>
paddings
=
this
->
param_
.
Paddings
();
int
groups
=
this
->
param_
.
Groups
();
std
::
vector
<
int
>
dilations
=
this
->
param_
.
Dilations
();
PADDLE_MOBILE_ENFORCE
((
in_dims
.
size
()
==
filter_dims
.
size
()
&&
dilations
.
size
()
==
paddings
.
size
()
&&
paddings
.
size
()
==
strides
.
size
()),
"ConvParam is not suitable"
);
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilations
[
i
],
paddings
[
i
],
strides
[
i
]));
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
this
->
param_
.
Output
()
->
Resize
(
ddim
);
}
}
// namespace operators
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU
(
fusion_conv_add_add_prelu
,
ops
::
FusionConvAddAddPReluOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA
(
fusion_conv_add_add_prelu
,
ops
::
FusionConvAddAddPReluOp
);
#endif
#endif
src/operators/fusion_conv_add_add_prelu_op.h
0 → 100644
浏览文件 @
977a21b1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_add_prelu_kernel.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
class
FusionConvAddAddPReluOpMatcher
:
public
framework
::
FusionOpMatcher
{
public:
FusionConvAddAddPReluOpMatcher
()
{
node_
=
framework
::
Node
(
G_OP_TYPE_CONV
);
node_
>
std
::
make_shared
<
framework
::
Node
>
(
G_OP_TYPE_ELEMENTWISE_ADD
)
>
std
::
make_shared
<
framework
::
Node
>
(
G_OP_TYPE_ELEMENTWISE_ADD
)
>
std
::
make_shared
<
framework
::
Node
>
(
G_OP_TYPE_PRELU
);
}
void
FolderNodes
(
framework
::
Node
*
node
,
std
::
vector
<
std
::
shared_ptr
<
framework
::
Node
>>
*
removed_nodes
)
{
node
->
Folder
(
node_
.
Depth
(),
Type
(),
{{
G_OP_TYPE_ELEMENTWISE_ADD
,
{{
"Y"
,
"Y"
},
{
"Out"
,
"addOut"
},{
"X"
,
"addX"
}}},
{
G_OP_TYPE_PRELU
,
{{
"Alpha"
,
"Alpha"
}}}
},
removed_nodes
);
}
std
::
string
Type
()
{
return
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
;
}
std
::
vector
<
std
::
pair
<
int
,
std
::
string
>>
NeedCheck
()
{
DLOG
<<
" conv add add prelu check add X "
;
return
{{
2
,
"Y"
},
{
2
,
"X"
}};
}
};
template
<
typename
DeviceType
,
typename
T
>
class
FusionConvAddAddPReluOp
:
public
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddAddPReluParam
<
DeviceType
>
,
operators
::
ConvAddAddPReluKernel
<
DeviceType
,
T
>>
{
public:
FusionConvAddAddPReluOp
(
const
string
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
,
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
:
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddAddPReluParam
<
DeviceType
>
,
operators
::
ConvAddAddPReluKernel
<
DeviceType
,
T
>>
(
type
,
inputs
,
outputs
,
attrs
,
scope
)
{}
using
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddAddPReluParam
<
DeviceType
>
,
operators
::
ConvAddAddPReluKernel
<
DeviceType
,
T
>>::
OperatorWithKernel
;
void
InferShape
()
const
override
;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_ADD_PRELU_REGISTER
#define CONV_ADD_ADD_PRELU_REGISTER
static
framework
::
FusionOpRegistrar
fusion_conv_add_add_prelu_registrar
(
new
FusionConvAddAddPReluOpMatcher
());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef CONV_ADD_ADD_PRELU_REGISTER
#define CONV_ADD_ADD_PRELU_REGISTER
static
framework
::
FusionOpRegistrar
fusion_conv_add_add_prelu_registrar
(
new
FusionConvAddAddPReluOpMatcher
());
#endif
#endif
}
// namespace operators
}
// namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
fusion_conv_add_add_prelu
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA
(
fusion_conv_add_add_prelu
);
#endif
#endif
src/operators/fusion_conv_add_prelu_op.cpp
0 → 100644
浏览文件 @
977a21b1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#include "fusion_conv_add_prelu_op.h"
#include "operators/math/conv_func.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
Dtype
,
typename
T
>
void
FusionConvAddPReluOp
<
Dtype
,
T
>::
InferShape
()
const
{
auto
in_dims
=
this
->
param_
.
Input
()
->
dims
();
auto
filter_dims
=
this
->
param_
.
Filter
()
->
dims
();
const
std
::
vector
<
int
>
&
strides
=
this
->
param_
.
Strides
();
std
::
vector
<
int
>
paddings
=
this
->
param_
.
Paddings
();
int
groups
=
this
->
param_
.
Groups
();
std
::
vector
<
int
>
dilations
=
this
->
param_
.
Dilations
();
PADDLE_MOBILE_ENFORCE
((
in_dims
.
size
()
==
filter_dims
.
size
()
&&
dilations
.
size
()
==
paddings
.
size
()
&&
paddings
.
size
()
==
strides
.
size
()),
"ConvParam is not suitable"
);
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilations
[
i
],
paddings
[
i
],
strides
[
i
]));
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
this
->
param_
.
Output
()
->
Resize
(
ddim
);
}
}
// namespace operators
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU
(
fusion_conv_add_prelu
,
ops
::
FusionConvAddPReluOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA
(
fusion_conv_add_prelu
,
ops
::
FusionConvAddPReluOp
);
#endif
#endif
src/operators/fusion_conv_add_prelu_op.h
0 → 100644
浏览文件 @
977a21b1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_prelu_kernel.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
class
FusionConvAddPReluOpMatcher
:
public
framework
::
FusionOpMatcher
{
public:
FusionConvAddPReluOpMatcher
()
{
node_
=
framework
::
Node
(
G_OP_TYPE_CONV
);
node_
>
std
::
make_shared
<
framework
::
Node
>
(
G_OP_TYPE_ELEMENTWISE_ADD
)
>
std
::
make_shared
<
framework
::
Node
>
(
G_OP_TYPE_PRELU
);
}
void
FolderNodes
(
framework
::
Node
*
node
,
std
::
vector
<
std
::
shared_ptr
<
framework
::
Node
>>
*
removed_nodes
)
{
node
->
Folder
(
node_
.
Depth
(),
Type
(),
{{
G_OP_TYPE_ELEMENTWISE_ADD
,
{{
"Y"
,
"Y"
}}},
{
G_OP_TYPE_PRELU
,
{{
"Alpha"
,
"Alpha"
}}}
},
removed_nodes
);
}
std
::
string
Type
()
{
return
G_OP_TYPE_FUSION_CONV_ADD_PRELU
;
}
};
template
<
typename
DeviceType
,
typename
T
>
class
FusionConvAddPReluOp
:
public
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddPReluParam
<
DeviceType
>
,
operators
::
ConvAddPReluKernel
<
DeviceType
,
T
>>
{
public:
FusionConvAddPReluOp
(
const
string
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
,
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
:
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddPReluParam
<
DeviceType
>
,
operators
::
ConvAddPReluKernel
<
DeviceType
,
T
>>
(
type
,
inputs
,
outputs
,
attrs
,
scope
)
{}
using
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddPReluParam
<
DeviceType
>
,
operators
::
ConvAddPReluKernel
<
DeviceType
,
T
>>::
OperatorWithKernel
;
void
InferShape
()
const
override
;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_PRELU_REGISTER
#define CONV_ADD_PRELU_REGISTER
static
framework
::
FusionOpRegistrar
fusion_conv_add_prelu_registrar
(
new
FusionConvAddPReluOpMatcher
());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef CONV_ADD_PRELU_REGISTER
#define CONV_ADD_PRELU_REGISTER
static
framework
::
FusionOpRegistrar
fusion_conv_add_prelu_registrar
(
new
FusionConvAddPReluOpMatcher
());
#endif
#endif
}
// namespace operators
}
// namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
fusion_conv_add_prelu
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA
(
fusion_conv_add_prelu
);
#endif
#endif
src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp
0 → 100644
浏览文件 @
977a21b1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#include "operators/kernel/conv_add_add_prelu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ConvAddAddPReluKernel
<
CPU
,
float
>::
Init
(
FusionConvAddAddPReluParam
<
CPU
>
*
param
)
{
return
true
;
}
template
<
>
void
ConvAddAddPReluKernel
<
CPU
,
float
>::
Compute
(
const
FusionConvAddAddPReluParam
<
CPU
>
&
param
)
const
{
ConvAddAddPReluCompute
<
float
>
(
param
);
}
template
class
ConvAddAddPReluKernel
<
CPU
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/arm/conv_add_prelu_kernel.cpp
0 → 100644
浏览文件 @
977a21b1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#include "operators/kernel/conv_add_prelu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_prelu_arm_func.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ConvAddPReluKernel
<
CPU
,
float
>::
Init
(
FusionConvAddPReluParam
<
CPU
>
*
param
)
{
return
true
;
}
template
<
>
void
ConvAddPReluKernel
<
CPU
,
float
>::
Compute
(
const
FusionConvAddPReluParam
<
CPU
>
&
param
)
const
{
ConvAddPReluCompute
<
float
>
(
param
);
}
template
class
ConvAddPReluKernel
<
CPU
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/arm/prelu_kernel.cpp
浏览文件 @
977a21b1
...
...
@@ -16,6 +16,9 @@ limitations under the License. */
#include "operators/kernel/prelu_kernel.h"
#include <operators/math/transform.h>
#if __ARM_NEON
#include <arm_neon.h>
#endif
namespace
paddle_mobile
{
namespace
operators
{
...
...
@@ -37,32 +40,81 @@ void PReluKernel<CPU, float>::Compute(const PReluParam<CPU> ¶m) const {
auto
*
alpha
=
param
.
InputAlpha
();
auto
*
out
=
param
.
Out
();
std
::
string
mode
=
param
.
Mode
();
const
auto
*
x_ptr
=
x
->
data
<
float
>
();
auto
*
x_ptr
=
x
->
data
<
float
>
();
auto
*
o_ptr
=
out
->
mutable_data
<
float
>
();
const
auto
*
alpha_ptr
=
alpha
->
data
<
float
>
();
auto
*
alpha_ptr
=
alpha
->
data
<
float
>
();
int
numel
=
x
->
numel
();
auto
dim
=
x
->
dims
();
int
k
=
dim
[
0
]
*
dim
[
1
];
int
n
=
dim
[
2
]
*
dim
[
3
];
int
index
=
0
;
int
i
=
0
;
int
temp
=
0
;
#if __ARM_NEON
#pragma omp parallel for
for
(
int
i
=
0
;
i
<
k
;
i
++
)
{
float32x4_t
zero
=
vdupq_n_f32
(
0.0
);
float32x4_t
cv
;
float32x4_t
cv1
;
float32x4_t
cv2
;
float32x4_t
pv
;
for
(
int
j
=
0
;
(
j
+
3
)
<
n
;
j
+=
4
)
{
const
float
*
in
=
x_ptr
+
i
*
n
+
j
;
float
*
out
=
o_ptr
+
i
*
n
+
j
;
cv
=
vld1q_f32
(
in
);
cv1
=
vmaxq_f32
(
cv
,
zero
);
cv2
=
vminq_f32
(
cv
,
zero
);
if
(
mode
==
"channel"
)
{
cv2
=
vmulq_n_f32
(
cv2
,
alpha_ptr
[
i
]);
}
else
if
(
mode
==
"element"
)
{
pv
=
vld1q_f32
(
alpha_ptr
+
i
*
n
+
j
);
cv2
=
vmulq_f32
(
cv2
,
pv
);
}
else
{
cv2
=
vmulq_n_f32
(
cv2
,
alpha_ptr
[
0
]);
}
cv
=
vaddq_f32
(
cv1
,
cv2
);
vst1q_f32
(
out
,
cv
);
}
int
j
;
for
(
j
=
0
;
(
j
+
3
)
<
n
;
j
+=
4
)
{
}
for
(
int
m
=
j
;
m
<
n
;
m
++
)
{
if
(
mode
==
"channel"
)
{
o_ptr
[
i
*
n
+
m
]
=
x_ptr
[
i
*
n
+
m
]
>
0
?
x_ptr
[
i
*
n
+
m
]
:
alpha_ptr
[
i
]
*
x_ptr
[
i
*
n
+
m
];
}
else
if
(
mode
==
"element"
)
{
o_ptr
[
i
*
n
+
m
]
=
x_ptr
[
i
*
n
+
m
]
>
0
?
x_ptr
[
i
*
n
+
m
]
:
alpha_ptr
[
i
*
n
+
m
]
*
x_ptr
[
i
*
n
+
m
];
}
else
{
o_ptr
[
i
*
n
+
m
]
=
x_ptr
[
i
*
n
+
m
]
>
0
?
x_ptr
[
i
*
n
+
m
]
:
alpha_ptr
[
0
]
*
x_ptr
[
i
*
n
+
m
];
}
}
}
#else
if
(
mode
==
"channel"
)
{
temp
=
numel
/
(
dim
[
0
]
*
dim
[
1
]);
#pragma omp parallel for
#pragma omp parallel for
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
index
=
(
i
/
temp
)
%
dim
[
1
];
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
index
]
*
x_ptr
[
i
];
}
}
else
if
(
mode
==
"element"
)
{
#pragma omp parallel for
#pragma omp parallel for
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
i
]
*
x_ptr
[
i
];
}
}
else
{
#pragma omp parallel for
#pragma omp parallel for
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
0
]
*
x_ptr
[
i
];
}
}
#endif
}
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
0 → 100644
浏览文件 @
977a21b1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
// Fused conv + elementwise-add (bias) + elementwise-add (bias1) + PReLU.
// Computes output = prelu(conv(input, filter) + bias + bias1) via the classic
// im2col(/vol2col) + GEMM lowering; the two adds and the PReLU are folded into
// the GEMM epilogue (math::matmulWithPRelu).
//
// `p`/`mode` are the PReLU alpha data and mode ("channel"/"element"/other);
// `bias` is broadcast per output channel, `bias1` has the full output shape
// (it is sliced per batch and per group below).
// NOTE(review): `axis` is read but never used in this function — presumably
// kept for interface symmetry with the other fused conv params; confirm.
template <typename P>
void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
  Tensor bias1 = *param.Bias1();
  int axis = param.Axis();
  Tensor *output = param.Output();
  float *biase_data = bias.data<float>();

  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();

  Tensor aa = *param.InputAlpha();
  float *p = aa.data<float>();
  std::string mode = param.Mode();

  const int batch_size = static_cast<int>(input->dims()[0]);

  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));

  // data_dim is 2 for Conv2D, 3 for Conv3D (filter dims are [out_c, in_c/g, k...]).
  size_t data_dim = filter_shape_vec.size() - 2;
  // col buffer shape: [in_c/g, k_h, k_w, out_h, out_w] (2-D case).
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, data_dim + 1);

  // When the conv is effectively 1x1/stride-1/no-padding, im2col is the
  // identity and the input slice can be used directly (no extra buffer).
  bool is_expand =
      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
  Tensor col;
  // col_matrix shares the same memory as col; it is just a 2-D view for GEMM.
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }

  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));

  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

  // convolution operator: im2col(or vol2col) + gemm
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;

  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
    // bias1 has the same layout as the output, so slice it identically.
    Tensor bias1_batch = bias1.Slice(i, i + 1).Resize(output_matrix_shape);
    for (int g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);

      if (!is_expand) {
        col.ShareDataWith(in_slice);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int>{paddings[0], paddings[1], paddings[0],
                                paddings[1]},
               &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }

      // gemm with fused bias/bias1 add and PReLU epilogue
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step);
      float *biase_data1 = bias1_slice.data<float>();
      math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
                            p, mode, biase_data, biase_data1);
    }
  }
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
0 → 100644
浏览文件 @
977a21b1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
// Fused conv + elementwise-add (bias) + PReLU.
// Computes output = prelu(conv(input, filter) + bias) via im2col(/vol2col) +
// GEMM; the bias add and PReLU are folded into the GEMM epilogue
// (math::matmulWithPRelu, called with bias1 == nullptr).
//
// Mirrors ConvAddAddPReluCompute minus the second addend; see that function
// for the layout of the col buffer and the grouping logic.
// NOTE(review): `axis` is read but never used — presumably kept for interface
// symmetry with FusionConvAddParam; confirm.
template <typename P>
void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
  int axis = param.Axis();
  Tensor *output = param.Output();
  float *biase_data = bias.data<float>();

  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();

  Tensor aa = *param.InputAlpha();
  float *p = aa.data<float>();
  std::string mode = param.Mode();

  const int batch_size = static_cast<int>(input->dims()[0]);

  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));

  // data_dim is 2 for Conv2D, 3 for Conv3D.
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, data_dim + 1);

  // 1x1/stride-1/no-padding convs skip the im2col copy entirely.
  bool is_expand =
      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
  Tensor col;
  // 2-D view over the same storage as col, consumed by the GEMM.
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }

  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));

  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

  // convolution operator: im2col(or vol2col) + gemm
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;

  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
    for (int g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);

      if (!is_expand) {
        col.ShareDataWith(in_slice);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int>{paddings[0], paddings[1], paddings[0],
                                paddings[1]},
               &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }

      // gemm with fused bias add + PReLU (no second addend: bias1 = nullptr)
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
                            p, mode, biase_data, nullptr);
    }
  }
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/conv_add_add_prelu_kernel.h
0 → 100644
浏览文件 @
977a21b1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVADDADDPRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
using
framework
::
DDim
;
using
framework
::
OpKernelBase
;
// Kernel for the FUSION_CONVADDADDPRELU op: conv + add + add + prelu fused.
// Init() prepares the parameter once at load time; Compute() runs the fused
// forward pass for one invocation (CPU implementation lives in
// central-arm-func/conv_add_add_prelu_arm_func.h).
template <typename DeviceType, typename T>
class ConvAddAddPReluKernel
    : public OpKernelBase<DeviceType, FusionConvAddAddPReluParam<DeviceType>> {
 public:
  // Executes the fused conv+add+add+prelu forward pass described by `param`.
  void Compute(const FusionConvAddAddPReluParam<DeviceType> &param) const;
  // One-time setup; returns true on success.
  bool Init(FusionConvAddAddPReluParam<DeviceType> *param);
};
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/conv_add_prelu_kernel.h
0 → 100644
浏览文件 @
977a21b1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVADDPRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
using
framework
::
DDim
;
using
framework
::
OpKernelBase
;
// Kernel for the FUSION_CONVADDPRELU op: conv + add + prelu fused.
// Init() prepares the parameter once at load time; Compute() runs the fused
// forward pass (CPU implementation lives in
// central-arm-func/conv_add_prelu_arm_func.h).
template <typename DeviceType, typename T>
class ConvAddPReluKernel
    : public OpKernelBase<DeviceType, FusionConvAddPReluParam<DeviceType>> {
 public:
  // Executes the fused conv+add+prelu forward pass described by `param`.
  void Compute(const FusionConvAddPReluParam<DeviceType> &param) const;
  // One-time setup; returns true on success.
  bool Init(FusionConvAddPReluParam<DeviceType> *param);
};
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/math/gemm.cpp
浏览文件 @
977a21b1
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/gemm.h"
#include <string>
#include "common/log.h"
#include "memory/t_malloc.h"
#if __ARM_NEON
...
...
@@ -707,6 +708,25 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
}
}
// GEMM inner kernel for one (mc x nc) block with a fused PReLU epilogue.
// `a`/`b` are the packed A/B panels, `c` is the packed accumulation buffer
// (row pitch NC), `C` is the final output (row pitch ldc). The micro-kernel
// accumulates into `c`; WriteWithAddPRelu then adds `bias` (per row) and
// optionally `bias1`, applies PReLU with slopes `p` per `mode`, and writes
// the result into `C`.
// Relies on file-scope blocking globals KC/NC/MR/NR set by the Sgemm driver.
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
                          float *c, float *C, int ldc, float *p,
                          std::string mode, float *bias, float *bias1) {
#pragma omp parallel for
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
#if __aarch64__
      // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#else
      // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif
    }
  }

  // Epilogue: C = c + bias (+ bias1), then PReLU, written to C.
  WriteWithAddPRelu(mc, nc, c, C, ldc, p, mode, bias, bias1);
}
#if __ARM_NEON
#if __aarch64__
...
...
@@ -988,6 +1008,82 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
}
}
// C = A * B + C,prelu(C)
// C = A * B + C, prelu(C)
// Epilogue for the PRelu GEMM path (NEON-intrinsic variant): for each element
//   x = c + bias[i] (+ bias1[i]); C = max(x, 0) + slope * min(x, 0)
// where slope is p[i] for "channel" mode, a per-element vector for "element"
// mode, and p[0] otherwise. `c` has row pitch NC (packed buffer), `C` has
// row pitch ldc.
//
// BUGFIX: the original computed cv = vmaxq_f32(cv, zero) *before*
// cv1 = vminq_f32(cv, zero), so cv1 was always zero and the PReLU degenerated
// into a plain ReLU (negative slopes were silently dropped). The min of the
// pre-clamp value must be taken first, as the arm32 assembly variant does
// (vmin from the original register). Fixed in both the vector loop and the
// remainder (_nc1) tail.
//
// NOTE(review): in "element" mode the tail still advances `ptr` by 4 even
// when only _nc1 (<4) elements are stored, and `ptr` is not reset per row —
// the per-element alpha indexing across blocked calls looks suspect; confirm
// against the callers before relying on "element" mode here.
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
                       std::string mode, float *bias, float *bias1) {
  int nc1 = nc / 4;
  int _nc1 = nc % 4;

  float *c_ptr, *C_ptr;
  float32x4_t cv;
  float32x4_t cv1;
  float32x4_t biasv;
  float32x4_t biasv1;
  float32x4_t zero = vdupq_n_f32(0.0);
  float32x4_t pv;
  float *ptr = p;
  for (int i = 0; i < mc; ++i) {
    c_ptr = c + i * NC;
    C_ptr = C + i * ldc;
    biasv = vld1q_dup_f32(bias + i);
    if (bias1 == nullptr) {
      biasv1 = zero;
    } else {
      biasv1 = vld1q_dup_f32(bias1 + i);
    }
    for (int j = 0; j < nc1; ++j) {
      cv = vld1q_f32(c_ptr);
      cv = vaddq_f32(cv, biasv);
      cv = vaddq_f32(cv, biasv1);
      // Take the negative part BEFORE clamping, then the positive part.
      cv1 = vminq_f32(cv, zero);
      cv = vmaxq_f32(cv, zero);
      if (mode == "channel") {
        cv1 = vmulq_n_f32(cv1, ptr[i]);
      } else if (mode == "element") {
        pv = vld1q_f32(ptr);
        cv1 = vmulq_f32(cv1, pv);
        ptr = ptr + 4;
      } else {
        cv1 = vmulq_n_f32(cv1, ptr[0]);
      }
      cv = vaddq_f32(cv, cv1);
      vst1q_f32(C_ptr, cv);
      c_ptr += 4;
      C_ptr += 4;
    }
    if (_nc1 != 0) {
      cv = vld1q_f32(c_ptr);
      cv = vaddq_f32(cv, biasv);
      cv = vaddq_f32(cv, biasv1);
      // Same fix in the remainder path: min first, then max.
      cv1 = vminq_f32(cv, zero);
      cv = vmaxq_f32(cv, zero);
      if (mode == "channel") {
        cv1 = vmulq_n_f32(cv1, ptr[i]);
      } else if (mode == "element") {
        pv = vld1q_f32(ptr);
        cv1 = vmulq_f32(cv1, pv);
        ptr = ptr + 4;
      } else {
        cv1 = vmulq_n_f32(cv1, ptr[0]);
      }
      cv = vaddq_f32(cv, cv1);
      if (_nc1 >= 1) {
        vst1q_lane_f32(C_ptr, cv, 0);
        C_ptr++;
      }
      if (_nc1 >= 2) {
        vst1q_lane_f32(C_ptr, cv, 1);
        C_ptr++;
      }
      if (_nc1 >= 3) {
        vst1q_lane_f32(C_ptr, cv, 2);
        C_ptr++;
      }
    }
  }
}
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
)
{
...
...
@@ -1971,6 +2067,145 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
}
}
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
if
(
nc
<
4
)
{
if
(
bias1
==
nullptr
)
{
for
(
int
i
=
0
;
i
<
mc
;
++
i
)
{
for
(
int
j
=
0
;
j
<
nc
;
++
j
)
{
float
r
=
*
c
+
*
bias
;
if
(
r
<
0
)
{
r
=
*
p
;
}
c
++
;
}
bias
++
;
p
++
;
}
}
else
{
for
(
int
i
=
0
;
i
<
mc
;
++
i
)
{
for
(
int
j
=
0
;
j
<
nc
;
++
j
)
{
float
r
=
*
c
+
*
bias
;
r
+=
*
bias1
;
if
(
r
<
0
)
{
r
*=
*
p
;
}
c
++
;
bias1
++
;
}
bias
++
;
p
++
;
}
}
return
;
}
int
nc1
=
nc
/
8
;
int
step
=
4
*
(
ldc
-
nc
);
int
step1
=
4
*
(
NC
-
nc
);
if
(
bias1
==
nullptr
)
{
asm
volatile
(
"vmov.f32 q14, #0.0
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"loop_mc_%=:
\n\t
"
"mov r5, %[nc1]
\n\t
"
"vld1.32 {d0}, [%[bias]]
\n\t
"
"vld1.32 {d1}, [%[p]]
\n\t
"
"vdup.32 q1, d0[0]
\n\t
"
"vdup.32 q2, d1[0]
\n\t
"
"subs r5, r5, #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"pld [%[c], #32]
\n\t
"
"vld1.32 {q3, q4}, [%[c]]!
\n\t
"
"vadd.f32 q3, q3, q1
\n\t
"
"vadd.f32 q4, q4, q1
\n\t
"
"vmax.f32 q5, q3, q14
\n\t
"
"vmin.f32 q7, q3, q14
\n\t
"
"vmax.f32 q6, q4, q14
\n\t
"
"vmin.f32 q8, q4, q14
\n\t
"
"vmla.f32 q5, q7, q2
\n\t
"
"vmla.f32 q6, q8, q2
\n\t
"
"vst1.32 {q5, q6}, [%[C]]!
\n\t
"
"subs r5, r5, #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"add %[p], %[p], #4
\n\t
"
"add %[bias], %[bias], #4
\n\t
"
"add %[c], %[c], %[step1]
\n\t
"
"add %[C], %[C], %[step]
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"bge loop_mc_%=
\n\t
"
"end_mc_%=:
\n\t
"
:
:
[
C
]
"r"
(
C
),
[
c
]
"r"
(
c
),
[
mc
]
"r"
(
mc
),
[
nc1
]
"r"
(
nc1
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
),
[
p
]
"r"
(
p
),
[
bias
]
"r"
(
bias
),
[
bias1
]
"r"
(
bias1
)
:
"memory"
,
"r5"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
);
}
else
{
asm
volatile
(
"vmov.f32 q14, #0.0
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"loop_mc_%=:
\n\t
"
"mov r5, %[nc1]
\n\t
"
"vld1.32 {d0}, [%[bias]]
\n\t
"
"vld1.32 {d1}, [%[p]]
\n\t
"
"vdup.32 q1, d0[0]
\n\t
"
"vdup.32 q2, d1[0]
\n\t
"
"subs r5, r5, #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"pld [%[c], #32]
\n\t
"
"pld [%[bias1], #32]
\n\t
"
"vld1.32 {q3, q4}, [%[c]]!
\n\t
"
"vld1.32 {q9, q10}, [%[bias1]]!
\n\t
"
"vadd.f32 q3, q3, q1
\n\t
"
"vadd.f32 q4, q4, q1
\n\t
"
"vadd.f32 q3, q3, q9
\n\t
"
"vadd.f32 q4, q4, q10
\n\t
"
"vmax.f32 q5, q3, q14
\n\t
"
"vmin.f32 q7, q3, q14
\n\t
"
"vmax.f32 q6, q4, q14
\n\t
"
"vmin.f32 q8, q4, q14
\n\t
"
"vmla.f32 q5, q7, q2
\n\t
"
"vmla.f32 q6, q8, q2
\n\t
"
"vst1.32 {q5, q6}, [%[C]]!
\n\t
"
"subs r5, r5, #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"add %[p], %[p], #4
\n\t
"
"add %[bias], %[bias], #4
\n\t
"
"add %[c], %[c], %[step1]
\n\t
"
"add %[C], %[C], %[step]
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"bge loop_mc_%=
\n\t
"
"end_mc_%=:
\n\t
"
:
:
[
C
]
"r"
(
C
),
[
c
]
"r"
(
c
),
[
mc
]
"r"
(
mc
),
[
nc1
]
"r"
(
nc1
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
),
[
p
]
"r"
(
p
),
[
bias
]
"r"
(
bias
),
[
bias1
]
"r"
(
bias1
)
:
"memory"
,
"r5"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
);
}
}
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
scale
,
float
*
bias
)
{
...
...
@@ -2513,6 +2748,9 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {}
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
)
{}
// Stub for builds without NEON support: intentionally a no-op, matching the
// neighbouring WriteWithAdd*/WriteWithBn stubs in this #else branch. The real
// implementations live in the __ARM_NEON sections above.
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
                       std::string mode, float *bias, float *bias1) {}
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
)
{}
...
...
@@ -2642,6 +2880,74 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile
::
memory
::
Free
(
zero
);
}
// Single-threaded SGEMM with a fused bias-add + PReLU epilogue:
//   C = prelu(A * B + bias (+ bias1))
// A is m x k (leading dim lda), B is k x n (ldb), C is m x n (ldc).
// `p`/`mode` are the PReLU slopes and mode; `bias` is per-row; `bias1`, when
// non-null, is a full m x n addend with leading dimension ldc.
// Sets the file-scope blocking globals (KC/MC/NC) and pack buffers
// (packedA/packedB/packedC/zero), so this function is NOT reentrant.
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
                    const float *B, int ldb, float *C, int ldc, float *p,
                    std::string mode, float *bias, float *bias1) {
  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
  int L1 = 32 * 1024;
  int L2 = 0.5 * 1024 * 1024;

  KC = k;
  MC = L1 / (KC * sizeof(float));
  NC = L2 / (KC * sizeof(float));

  // make sure MC is multiple of MR, and NC is multiple of NR
  int mblock_num = (m + MC - 1) / MC;
  MC = (m + mblock_num - 1) / mblock_num;
  MC = (MC + MR - 1) / MR * MR;
  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";

  int nblock_num = (n + NC - 1) / NC;
  NC = (n + nblock_num - 1) / nblock_num;
  NC = (NC + NR - 1) / NR * NR;
  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";

  packedA = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
  packedB = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
  packedC = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
  zero =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));

  for (int l = 0; l < KC; ++l) {
    zero[l] = 0;
  }

  int mc, nc;
  for (int j = 0; j < n; j += NC) {
    nc = s_min(n - j, NC);
#if __aarch64__
    // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
    PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#else
    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#endif
    for (int i = 0; i < m; i += MC) {
      mc = s_min(m - i, MC);
#if __aarch64__
      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
      // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#else
      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#endif
      // bias/p are offset per row block; bias1 additionally per column block.
      if (bias1 == nullptr) {
        InnerKernelWithPRelu(mc, nc, packedA, packedB, packedC, &C(i, j), ldc,
                             p + i, mode, bias + i, nullptr);
      } else {
        InnerKernelWithPRelu(mc, nc, packedA, packedB, packedC, &C(i, j), ldc,
                             p + i, mode, bias + i, bias1 + i * ldc + j);
      }
    }
  }

  paddle_mobile::memory::Free(packedA);
  paddle_mobile::memory::Free(packedB);
  paddle_mobile::memory::Free(packedC);
  paddle_mobile::memory::Free(zero);
}
// 32位 float 矩阵乘法
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
...
...
@@ -2857,6 +3163,123 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile
::
memory
::
Free
(
zero
);
}
// OpenMP-parallel SGEMM with a fused bias-add + PReLU epilogue:
//   C = prelu(A * B + bias (+ bias1))
// Chooses the parallelization axis by shape: when m > n, A is blocked and
// packed per thread while B is packed once up front; otherwise B is blocked
// per thread and A is packed once. Each thread gets its own slice of the
// packedA-or-B and packedC scratch buffers, indexed by omp_get_thread_num().
// Mutates the file-scope blocking globals and proc* function pointers, so
// this function is NOT reentrant.
void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
                        const float *B, int ldb, float *C, int ldc, float *p,
                        std::string mode, float *bias, float *bias1) {
#ifdef _OPENMP
  int max_threads = omp_get_max_threads();
#else
  int max_threads = 1;
#endif

  // Per-thread L1 budget used to size the blocked dimension.
  int L1 = 16 / max_threads * 1024;
  KC = k;
  if (m > n) {
    // Block matrix A (parallelize over row blocks).
    MC = L1 / (KC * sizeof(float));
    int mblock_num = (m + MC - 1) / MC;
    MC = (m + mblock_num - 1) / mblock_num;
    MC = (MC + MR - 1) / MR * MR;

    // Pad B up to a multiple of NR.
    NC = (n + NR - 1) / NR * NR;

#if __aarch64__
    procPackA = PackMatrixA_6r;
    procPackB = PackMatrixB_omp_16c;
    procAddDot = AddDot6x16;
#else
    procPackA = PackMatrixA_6r;
    procPackB = PackMatrixB_omp_8c;
    procAddDot = AddDot6x8;
#endif

    // B is packed once (shared); A gets one pack buffer per thread.
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
    procPackB(KC, NC, NC % NR, B, ldb, packedB);
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
  } else {
    // Block matrix B (parallelize over column blocks).
    NC = L1 / (KC * sizeof(float));
    int nblock_num = (n + NC - 1) / NC;
    NC = (n + nblock_num - 1) / nblock_num;
    NC = (NC + NR - 1) / NR * NR;

    // Pad A up to a multiple of MR.
    MC = (m + MR - 1) / MR * MR;

#if __aarch64__
    procPackA = PackMatrixA_omp_6r;
    procPackB = PackMatrixB_16c;
    procAddDot = AddDot6x16;
#else
    procPackA = PackMatrixA_omp_6r;
    procPackB = PackMatrixB_8c;
    procAddDot = AddDot6x8;
#endif

    // A is packed once (shared); B gets one pack buffer per thread.
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
    procPackA(MC, KC, MC % MR, A, lda, packedA);
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
  }

  zero =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
  packedC = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));

  if (m > n) {
#pragma omp parallel for
    for (int i = 0; i < m; i += MC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif

      int mc;
      mc = s_min(m - i, MC);
      // Per-thread scratch slices.
      float *local_A = packedA + MC * KC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
      if (bias1 == nullptr) {
        InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
                             p + i, mode, bias + i, nullptr);
      } else {
        InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
                             p + i, mode, bias + i, bias1 + i * ldc);
      }
    }
  } else {
#pragma omp parallel for
    for (int j = 0; j < n; j += NC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif

      int nc;
      nc = s_min(n - j, NC);
      // Per-thread scratch slices.
      float *local_B = packedB + KC * NC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
      if (bias1 == nullptr) {
        InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc,
                             p, mode, bias, nullptr);
      } else {
        InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc,
                             p, mode, bias, bias1 + j);
      }
    }
  }

  paddle_mobile::memory::Free(packedA);
  paddle_mobile::memory::Free(packedB);
  paddle_mobile::memory::Free(packedC);
  paddle_mobile::memory::Free(zero);
}
void
AddDot6x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
#if __ARM_NEON
#if __aarch64__
...
...
src/operators/math/gemm.h
浏览文件 @
977a21b1
...
...
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "common/log.h"
// 矩阵取值运算宏,假设矩阵按行存储
#define A(i, j) A[(i)*lda + (j)]
...
...
@@ -79,6 +81,9 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
void
InnerKernelWithBn
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
void
InnerKernelWithPRelu
(
int
mc
,
int
nc
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
/*
// 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
...
...
@@ -108,6 +113,9 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
void
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
// C = A * B + C, relu(C)
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C,prelu(C)
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// C = A * B + bias ,relu(C)
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
...
...
@@ -146,6 +154,10 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
void
SgemmWithPRelu
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// 32位 float 矩阵乘法(openmp 多线程版本)
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
...
...
@@ -156,6 +168,10 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
void
SgemmWithPRelu_omp
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
src/operators/math/math_function.cpp
浏览文件 @
977a21b1
...
...
@@ -87,6 +87,38 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
new_bias
->
data
<
float
>
()
+
group
);
#endif
}
// Matrix multiply with a fused bias-add + PReLU epilogue:
//   matrix_out = prelu(matrix_a * matrix_b + bias (+ bias1))
// Dispatches to the OpenMP SGEMM when built with _OPENMP, otherwise to the
// single-threaded one. M/N are taken from the output dims; K from matrix_a.
//
// BUGFIX: the _OPENMP branch called `xsSgemmWithPRelu_omp`, which does not
// exist — the function declared in gemm.h is `SgemmWithPRelu_omp`. Fixed so
// the OpenMP build links.
//
// NOTE(review): trans_a/trans_b are accepted but, as in the sibling matmul
// helpers here, only used to pick K — the Sgemm paths assume non-transposed
// inputs; confirm callers always pass false/false.
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
                     const framework::Tensor &matrix_b, bool trans_b,
                     framework::Tensor *matrix_out, float *p, std::string mode,
                     float *bias, float *bias1) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
  // PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
  //                dim_out.size() == 2,
  //                "The input and output of matmul be matrix");

  int M = dim_out[0];
  int N = dim_out[1];
  int K = (!trans_a) ? dim_a[1] : dim_a[0];

#ifdef _OPENMP
  SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K,
                     matrix_b.data<float>(), N, matrix_out->data<float>(), N,
                     p, mode, bias, bias1);
#else
  SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(),
                 N, matrix_out->data<float>(), N, p, mode, bias, bias1);
#endif
}
}
// namespace math
}
// namespace operators
...
...
src/operators/math/math_function.h
浏览文件 @
977a21b1
...
...
@@ -33,6 +33,11 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
framework
::
Tensor
*
matrix_out
,
T
beta
,
bool
relu
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
,
int
group
);
void
matmulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
framework
::
Tensor
*
matrix_out
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
src/operators/op_param.h
浏览文件 @
977a21b1
...
...
@@ -87,12 +87,21 @@ class OpParam {
static
T
*
InputXFrom
(
const
VariableNameMap
&
inputs
,
const
Scope
&
scope
)
{
return
GetVarValue
<
T
>
(
"X"
,
inputs
,
scope
);
}
// Fetches the SECOND variable registered under input key "addX"
// (via GetVarValue1, which reads var_vec[1]); returns nullptr when no
// second name exists. Used by the conv+add+add+prelu fusion to reach the
// extra addend input.
template <typename T>
static T *InputXFrom1(const VariableNameMap &inputs, const Scope &scope) {
  return GetVarValue1<T>("addX", inputs, scope);
}
// Fetches the (first) variable registered under input key "Y" from `scope`.
template <typename T>
static T *InputYFrom(const VariableNameMap &inputs, const Scope &scope) {
  return GetVarValue<T>("Y", inputs, scope);
}
// Fetches the SECOND variable registered under input key "Y"
// (via GetVarValue1, which reads var_vec[1]); returns nullptr when the op
// has only one "Y" input.
template <typename T>
static T *InputYFrom1(const VariableNameMap &inputs, const Scope &scope) {
  return GetVarValue1<T>("Y", inputs, scope);
}
template
<
typename
T
>
static
T
*
InputZFrom
(
const
VariableNameMap
&
inputs
,
const
Scope
&
scope
)
{
return
GetVarValue
<
T
>
(
"Z"
,
inputs
,
scope
);
...
...
@@ -218,6 +227,26 @@ class OpParam {
}
}
// Returns the `index`-th variable name registered under `key`.
// Throws std::out_of_range (via map::at) if `key` is absent; `index` is not
// bounds-checked, mirroring the callers' expectations.
static std::string getkey(const string &key, const VariableNameMap &var_map,
                          int index) {
  return var_map.at(key)[index];
}
// Looks up the SECOND variable name registered under `key` in `var_map` and
// returns its mutable value from `scope`; returns nullptr when no second
// name is registered. Companion to GetVarValue (which reads the first name).
//
// BUGFIX: the original guarded the var_vec[1] access with only
// !var_vec.empty(), which is out-of-bounds when exactly one name is mapped;
// the guard now requires size() > 1.
template <typename T>
static T *GetVarValue1(const string &key, const VariableNameMap &var_map,
                       const Scope &scope) {
  PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0,
                        "%s is not contained in var_map", key.c_str())
  auto var_vec = var_map.at(key);
  if (var_vec.size() > 1) {
    auto var = scope.FindVar(var_vec[1]);
    return var->GetMutable<T>();
  } else {
    return nullptr;
  }
}
template
<
typename
T
>
static
vector
<
T
*>
GetMultiVarValue
(
const
string
&
key
,
const
VariableNameMap
&
var_map
,
...
...
@@ -1174,6 +1203,163 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
};
#endif
#ifdef FUSION_CONVADDPRELU_OP
// Parameter bundle for the FUSION_CONVADDPRELU op: gathers the conv inputs
// (Input/Filter), the bias addend (Y), the PReLU alpha tensor and mode, and
// the conv attributes (strides/paddings/dilations/groups/axis) from the op's
// variable maps at construction time. Accessors are plain getters.
//
// Cleanup: removed the unused local `framework::DDim dims = alpha_->dims();`
// from the constructor (its value was never read).
template <typename DeviceType>
class FusionConvAddPReluParam : public OpParam {
  typedef typename DtypeTensorTrait<DeviceType>::gtype GType;
  typedef typename DtypeTensorTrait<DeviceType>::rtype RType;

 public:
  FusionConvAddPReluParam(const VariableNameMap &inputs,
                          const VariableNameMap &outputs,
                          const AttributeMap &attrs, const Scope &scope) {
    alpha_ = InputAlphaFrom<GType>(inputs, scope);
    mode_ = GetAttr<std::string>("mode", attrs);
    bias_ = InputYFrom<GType>(inputs, scope);
    axis_ = GetAttr<int>("axis", attrs);
    filter_ = FilterFrom<GType>(inputs, scope);
    input_ = InputFrom<GType>(inputs, scope);
    output_ = OutFrom<GType>(outputs, scope);
    strides_ = GetAttr<vector<int>>("strides", attrs);
    paddings_ = GetAttr<vector<int>>("paddings", attrs);
    dilations_ = GetAttr<vector<int>>("dilations", attrs);
    groups = GetAttr<int>("groups", attrs);
  }

  // PReLU slope tensor.
  const RType *InputAlpha() const { return alpha_; }
  // PReLU mode attribute: "channel", "element", or other (single slope).
  const std::string &Mode() const { return mode_; }
  RType *Bias() const { return bias_; }
  const int &Axis() const { return axis_; }
  const RType *Input() const { return input_; }

#ifdef PADDLE_MOBILE_FPGA
  // FPGA path mutates the filter in place, hence the non-const accessor.
  RType *Filter() const { return filter_; }
#else
  const RType *Filter() const { return filter_; }
#endif

  RType *Output() const { return output_; }
  const vector<int> &Strides() const { return strides_; }
  const vector<int> &Paddings() const { return paddings_; }
  const vector<int> &Dilations() const { return dilations_; }
  const int &Groups() const { return groups; }

 protected:
  RType *bias_;
  int axis_;
  RType *input_;
  RType *output_;
  RType *filter_;
  vector<int> strides_;
  vector<int> paddings_;
  vector<int> dilations_;
  // NOTE(review): no trailing underscore, unlike the other members —
  // kept as-is because Groups() returns a reference to it.
  int groups;
  RType *alpha_;
  std::string mode_;

#ifdef PADDLE_MOBILE_FPGA

 private:
  fpga::ConvArgs fpga_conv_args;

 public:
  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
#ifdef FUSION_CONVADDADDPRELU_OP
// Parameter pack for the fused conv2d + add + add + PRelu operator
// (a conv whose output feeds two chained elementwise adds before PRelu).
// Besides the usual conv/add/PRelu operands it resolves which of the
// second add's two inputs is the residual tensor (bias1_) by matching
// variable-name keys against the first add's output.
template <typename DeviceType>
class FusionConvAddAddPReluParam : public OpParam {
  typedef typename DtypeTensorTrait<DeviceType>::gtype GType;
  typedef typename DtypeTensorTrait<DeviceType>::rtype RType;

 public:
  FusionConvAddAddPReluParam(const VariableNameMap &inputs,
                             const VariableNameMap &outputs,
                             const AttributeMap &attrs, const Scope &scope) {
    // Default residual operand; may be replaced below once the key
    // matching decides which slot actually carries the residual tensor.
    bias1_ = InputYFrom1<GType>(inputs, scope);
    // PRelu slope tensor and mode.
    alpha_ = InputAlphaFrom<GType>(inputs, scope);
    mode_ = GetAttr<std::string>("mode", attrs);
    // NOTE: the original also computed alpha_->dims() into an unused local
    // `dims`; that dead statement has been removed.
    // First elementwise-add operand and its broadcast axis.
    bias_ = InputYFrom<GType>(inputs, scope);
    axis_ = GetAttr<int>("axis", attrs);
    // Convolution operands and attributes.
    filter_ = FilterFrom<GType>(inputs, scope);
    input_ = InputFrom<GType>(inputs, scope);
    output_ = OutFrom<GType>(outputs, scope);
    strides_ = GetAttr<vector<int>>("strides", attrs);
    paddings_ = GetAttr<vector<int>>("paddings", attrs);
    dilations_ = GetAttr<vector<int>>("dilations", attrs);
    groups = GetAttr<int>("groups", attrs);
    // Determine which input of the second add is the first add's output:
    // the OTHER input is the residual we must keep as bias1_.
    keyOutput_ = getkey("addOut", inputs, 0);
    keyX1_ = getkey("addX", inputs, 1);
    keyY1_ = getkey("Y", inputs, 1);
    if (keyX1_ == keyOutput_) {
      // Redundant with the default assignment above, but kept so the
      // branch priority (X match wins over Y match) stays explicit.
      bias1_ = InputYFrom1<GType>(inputs, scope);
    } else if (keyY1_ == keyOutput_) {
      bias1_ = InputXFrom1<GType>(inputs, scope);
    }
  }

  const RType *InputAlpha() const { return alpha_; }

  const std::string &Mode() const { return mode_; }

  const RType *Bias1() const { return bias1_; }

  RType *Bias() const { return bias_; }

  const int &Axis() const { return axis_; }

  const RType *Input() const { return input_; }

#ifdef PADDLE_MOBILE_FPGA
  // FPGA kernels mutate the filter in place, hence the non-const accessor.
  RType *Filter() const { return filter_; }
#else
  const RType *Filter() const { return filter_; }
#endif

  RType *Output() const { return output_; }

  const vector<int> &Strides() const { return strides_; }

  const vector<int> &Paddings() const { return paddings_; }

  const vector<int> &Dilations() const { return dilations_; }

  const int &Groups() const { return groups; }

 protected:
  RType *bias_;
  int axis_;
  RType *input_;
  RType *output_;
  RType *filter_;
  vector<int> strides_;
  vector<int> paddings_;
  vector<int> dilations_;
  int groups;  // NOTE(review): missing trailing underscore; kept for ABI/source compatibility.
  RType *alpha_;
  std::string mode_;
  RType *bias1_;
  std::string keyOutput_;
  std::string keyX1_;
  std::string keyY1_;
#ifdef PADDLE_MOBILE_FPGA

 private:
  fpga::ConvArgs fpga_conv_args;

 public:
  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
#ifdef FUSION_CONVADDBNRELU_OP
template
<
typename
Dtype
>
class
FusionConvAddBNReluParam
:
public
OpParam
{
...
...
tools/op.cmake
浏览文件 @
977a21b1
...
...
@@ -124,6 +124,7 @@ if(NOT FOUND_MATCH)
set
(
DEPTHWISECONV_OP ON
)
set
(
ELEMENTWISEADD_OP ON
)
set
(
FUSION_CONVADD_OP ON
)
set
(
FUSION_CONVADDPRELU_OP ON
)
set
(
FUSION_CONVADDRELU_OP ON
)
set
(
FUSION_FC_OP ON
)
set
(
LRN_OP ON
)
...
...
@@ -137,6 +138,7 @@ if(NOT FOUND_MATCH)
set
(
SOFTMAX_OP ON
)
set
(
TRANSPOSE_OP ON
)
set
(
FUSION_CONVADDBNRELU_OP ON
)
set
(
FUSION_CONVADDADDPRELU_OP ON
)
set
(
FUSION_DWCONVBNRELU_OP ON
)
set
(
FUSION_CONVBNRELU_OP ON
)
set
(
PRELU_OP ON
)
...
...
@@ -192,6 +194,12 @@ endif()
if
(
FUSION_CONVADDRELU_OP
)
add_definitions
(
-DFUSION_CONVADDRELU_OP
)
endif
()
if
(
FUSION_CONVADDPRELU_OP
)
add_definitions
(
-DFUSION_CONVADDPRELU_OP
)
endif
()
if
(
FUSION_CONVADDADDPRELU_OP
)
add_definitions
(
-DFUSION_CONVADDADDPRELU_OP
)
endif
()
if
(
FUSION_FC_OP
)
add_definitions
(
-DFUSION_FC_OP
)
endif
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录