未验证 提交 42e62a74 编写于 作者: S sunsetlh 提交者: GitHub

[core] [XPU] add xpu conv2d fuse, vis fuse and many ops for wangpan clarity feature (#4084)

上级 64398557
......@@ -67,3 +67,7 @@ USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
USE_MIR_PASS(__xpu__fc_fuse_pass);
USE_MIR_PASS(__xpu__mmdnn_fuse_pass);
USE_MIR_PASS(__xpu__conv2d_fuse_pass);
USE_MIR_PASS(__xpu__conv2d_link_previous_out_max_pass);
USE_MIR_PASS(__xpu__sfa_head_meanstd_fuse_pass);
USE_MIR_PASS(__xpu__sfa_head_moment_fuse_pass);
......@@ -30,6 +30,10 @@ lite_cc_library(mir_passes
fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
fusion/__xpu__fc_fuse_pass.cc
fusion/__xpu__mmdnn_fuse_pass.cc
fusion/__xpu__conv2d_fuse_pass.cc
fusion/__xpu__conv2d_link_previous_out_max_pass.cc
fusion/__xpu__sfa_head_meanstd_fuse_pass.cc
fusion/__xpu__sfa_head_moment_fuse_pass.cc
fusion/match_matrix_activation_fuse_pass.cc
fusion/scales_fuse_pass.cc
fusion/sequence_reverse_embedding_fuse_pass.cc
......
此差异已折叠。
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
/* link the previous __xpu__conv2d's OutputMax to */
/* next __xpu__conv2d as InputMax */
/* For example: */
/* graph[1]: sub block */
/* in_Input */
/* in_Filter | in_FilterMax */
/* \ | / */
/* \ | / */
/* in_Bias ------- __xpu__conv2d */
/* | \ */
/* | \ */
/* out_Output out_OutputMax */
/* | */
/* | */
/* __xpu__conv2d */
/* | */
/* | */
/* out_Output */
/* */
/* After the pass is applied: */
/* in_Input */
/* in_Filter | in_FilterMax */
/* \ | / */
/* \ | / */
/* in_Bias ------- __xpu__conv2d */
/* | \ */
/* | \ */
/* out_Output out_OutputMax */
/* | / */
/* | / */
/* __xpu__conv2d */
/* | */
/* | */
/* out_Output */
class XPUConv2dLinkFuser : public FuseBase {
public:
explicit XPUConv2dLinkFuser(bool with_branch) : _with_branch(with_branch) {}
void BuildPattern() override {
auto* input = VarNode("input")
->assert_is_op_input("__xpu__conv2d", "Input")
->AsInput();
auto* filter = VarNode("filter")
->assert_is_op_input("__xpu__conv2d", "Filter")
->AsInput();
auto* filter_max = VarNode("filter_max")
->assert_is_op_input("__xpu__conv2d", "FilterMax")
->AsInput();
auto* bias =
VarNode("bias")->assert_is_op_input("__xpu__conv2d", "Bias")->AsInput();
auto* xpu_conv = OpNode("xpu_conv", "__xpu__conv2d");
auto* xpu_conv_out = VarNode("xpu_conv_out")
->assert_is_op_output("__xpu__conv2d", "Output")
->AsOutput();
auto* xpu_conv_out_max =
VarNode("xpu_conv_out_max")
->assert_is_op_output("__xpu__conv2d", "OutputMax")
->AsOutput();
*input >> *xpu_conv >> *xpu_conv_out;
*filter >> *xpu_conv;
*filter_max >> *xpu_conv;
*bias >> *xpu_conv;
*xpu_conv >> *xpu_conv_out_max;
if (_with_branch) {
auto* branch = VarNode("branch")
->assert_is_op_input("__xpu__conv2d", "Branch")
->AsInput();
*branch >> *xpu_conv;
}
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
auto conv_instruct = matched.at("xpu_conv")->stmt();
auto op_desc = *conv_instruct->mutable_op_info();
auto conv_old = conv_instruct->op();
// try to find input_max
std::string max_input_name = matched.at("input")->arg()->name + "_max";
auto* max_input_node = graph->RetrieveArgument(max_input_name);
if (max_input_node != nullptr &&
(!op_desc.HasAttr("has_input_max") ||
!op_desc.GetAttr<bool>("has_input_max"))) {
op_desc.SetInput("InputMax", {max_input_name});
op_desc.SetAttr("has_input_max", true);
conv_instruct->ResetOp(op_desc, conv_old->valid_places());
DirectedLink(max_input_node, matched.at("xpu_conv"));
}
}
private:
bool _with_branch;
};
} // namespace fusion
class XPUConv2dLinkPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
fusion::XPUConv2dLinkFuser fuser1(true);
fuser1(graph.get());
// TODO(sunsetlh): need fix bug in no branch case
fusion::XPUConv2dLinkFuser fuser2(false);
fuser2(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Registers the linker pass for XPU targets; it rewrites "__xpu__conv2d" ops.
REGISTER_MIR_PASS(__xpu__conv2d_link_previous_out_max_pass,
                  paddle::lite::mir::XPUConv2dLinkPass)
    .BindTargets({TARGET(kXPU)})
    .BindKernel("__xpu__conv2d");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Special fuse pass for the subgraph block in vis clarity model
// block desc:
// [["reduce_mean",
// ["concat"],
// ["elementwise_sub",
// ["square", ["reduce_sum", ["scale", ["sqrt"]]]]]]]
// Fuses the mean/std head subgraph
//   reshape2 -> reduce_mean -> elementwise_sub -> square -> reduce_sum
//            -> elementwise_div(fill_constant) -> sqrt -> concat(mean, std)
// into a single "__xpu__sfa_head" op with op_type == "meanstd".
class XPUSfaHeadMeanstdFuser : public FuseBase {
 public:
  void BuildPattern() override {
    // Input: the reshape2 output that feeds reduce_mean (and the sub below).
    auto* reduce_mean_input = VarNode("reduce_mean_input")
                                  ->assert_is_op_output("reshape2", "Out")
                                  ->assert_is_op_input("reduce_mean", "X")
                                  ->AsInput();
    auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate();
    // The mean is both concat input #0 and the subtrahend of the sub op.
    auto* reduce_mean_out = VarNode("reduce_mean_out")
                                ->assert_is_op_output("reduce_mean", "Out")
                                ->assert_is_op_nth_input("concat", "X", 0)
                                ->assert_is_op_input("elementwise_sub", "Y")
                                ->AsIntermediate();
    auto* elementwise_sub =
        OpNode("elementwise_sub", "elementwise_sub")->AsIntermediate();
    auto* elementwise_sub_out =
        VarNode("elementwise_sub_out")
            ->assert_is_op_output("elementwise_sub", "Out")
            ->assert_is_op_input("square", "X")
            ->AsIntermediate();
    // Variance chain: square -> reduce_sum -> divide by a constant -> sqrt.
    auto* square = OpNode("square", "square")->AsIntermediate();
    auto* square_out = VarNode("square_out")
                           ->assert_is_op_output("square", "Out")
                           ->assert_is_op_input("reduce_sum", "X")
                           ->AsIntermediate();
    auto* reduce_sum = OpNode("reduce_sum", "reduce_sum")->AsIntermediate();
    auto* reduce_sum_out = VarNode("reduce_sum_out")
                               ->assert_is_op_output("reduce_sum", "Out")
                               ->assert_is_op_input("elementwise_div", "X")
                               ->AsIntermediate();
    // fill_constant supplies the divisor (presumably the element count —
    // TODO(review): confirm against the exported model).
    auto* fill_constant =
        OpNode("fill_constant", "fill_constant")->AsIntermediate();
    auto* fill_constant_out = VarNode("fill_constant_out")
                                  ->assert_is_op_output("fill_constant", "Out")
                                  ->AsIntermediate();
    auto* elementwise_div =
        OpNode("elementwise_div", "elementwise_div")->AsIntermediate();
    auto* elementwise_div_out =
        VarNode("elementwise_div_out")
            ->assert_is_op_output("elementwise_div", "Out")
            ->assert_is_op_input("sqrt", "X")
            ->AsIntermediate();
    auto* sqrt = OpNode("sqrt", "sqrt")->AsIntermediate();
    // The std result is concat input #1.
    auto* sqrt_out = VarNode("sqrt_out")
                         ->assert_is_op_output("sqrt", "Out")
                         ->assert_is_op_nth_input("concat", "X", 1)
                         ->AsIntermediate();
    auto* concat = OpNode("concat", "concat")->AsIntermediate();
    auto* out =
        VarNode("out")->assert_is_op_output("concat", "Out")->AsOutput();
    // Multi-input ops take their inputs as a vector of pattern nodes.
    std::vector<PMNode*> elementwise_sub_inputs{reduce_mean_out,
                                                reduce_mean_input};
    std::vector<PMNode*> elementwise_div_inputs{reduce_sum_out,
                                                fill_constant_out};
    std::vector<PMNode*> concat_inputs{reduce_mean_out, sqrt_out};
    // Wire the pattern edges.
    *reduce_mean_input >> *reduce_mean >> *reduce_mean_out;
    elementwise_sub_inputs >> *elementwise_sub >> *elementwise_sub_out;
    *elementwise_sub_out >> *square >> *square_out;
    *square_out >> *reduce_sum >> *reduce_sum_out;
    *fill_constant >> *fill_constant_out;
    elementwise_div_inputs >> *elementwise_div >> *elementwise_div_out;
    *elementwise_div_out >> *sqrt >> *sqrt_out;
    concat_inputs >> *concat >> *out;
  }
  // Replaces the matched subgraph with one __xpu__sfa_head node connected
  // to the original input and output vars.
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    auto reduce_mean = matched.at("reduce_mean")->stmt()->op();
    auto* scope = reduce_mean->scope();
    auto op_desc = GenOpDesc(matched);
    auto vis_op = LiteOpRegistry::Global().Create("__xpu__sfa_head");
    auto& valid_places = reduce_mean->valid_places();
    vis_op->Attach(op_desc, scope);
    auto* new_op_node = graph->GraphCreateInstructNode(vis_op, valid_places);
    IR_NODE_LINK_TO(matched.at("reduce_mean_input"), new_op_node);
    IR_NODE_LINK_TO(new_op_node, matched.at("out"));
  }

 private:
  // Builds the fused op desc, reusing reduce_mean's desc as the template
  // (keeps its attrs) but replacing type, inputs and outputs.
  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
    cpp::OpDesc op_desc = *matched.at("reduce_mean")->stmt()->op_info();
    op_desc.mutable_inputs()->clear();
    op_desc.mutable_outputs()->clear();
    op_desc.SetType("__xpu__sfa_head");
    op_desc.SetInput("Input", {matched.at("reduce_mean_input")->arg()->name});
    op_desc.SetOutput("Output", {matched.at("out")->arg()->name});
    // The kernel dispatches on this attribute ("meanstd" vs "moment").
    op_desc.SetAttr("op_type", std::string("meanstd"));
    return op_desc;
  }
};
} // namespace fusion
class XPUSfaHeadMeanstdFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) {
return;
}
fusion::XPUSfaHeadMeanstdFuser fuser;
fuser(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Registers the mean/std head fuse pass for XPU targets.
REGISTER_MIR_PASS(__xpu__sfa_head_meanstd_fuse_pass,
                  paddle::lite::mir::XPUSfaHeadMeanstdFusePass)
    .BindTargets({TARGET(kXPU)})
    .BindKernel("reduce_mean");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Special fuse pass for the subgraph block in vis clarity model
// block desc:
// [["reduce_mean",
// ["concat"],
// ["elementwise_sub",
// ["square", ["reduce_mean", ["sqrt"]]],
// ["abs", ["pow", ["elementwise_mul", ["reduce_mean", ["abs",
// ["pow"]]]]]],
// ["sign"],
// ["abs", ["pow", ["reduce_mean", ["abs", ["pow"]]]]]]]]
// Fuses the statistical-moment head subgraph of the vis clarity model
// (mean, std, and two higher-order moment branches, concatenated) into a
// single "__xpu__sfa_head" op with op_type == "moment".
class XPUSfaHeadMomentFuser : public FuseBase {
 public:
  void BuildPattern() override {
    // Shared input: reshape2 output feeding reduce_mean and elementwise_sub.
    auto* reduce_mean_input = VarNode("reduce_mean_input")
                                  ->assert_is_op_output("reshape2", "Out")
                                  ->assert_is_op_input("reduce_mean", "X")
                                  ->assert_is_op_input("elementwise_sub", "X")
                                  ->AsInput();
    auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate();
    // Mean: concat input #0 and subtrahend of the centering sub.
    auto* reduce_mean_out = VarNode("reduce_mean_out")
                                ->assert_is_op_output("reduce_mean", "Out")
                                ->assert_is_op_nth_input("concat", "X", 0)
                                ->assert_is_op_input("elementwise_sub", "Y")
                                ->AsIntermediate();
    auto* elementwise_sub =
        OpNode("elementwise_sub", "elementwise_sub")->AsIntermediate();
    // The centered value fans out to square, abs, and sign branches.
    auto* elementwise_sub_out =
        VarNode("elementwise_sub_out")
            ->assert_is_op_output("elementwise_sub", "Out")
            ->assert_is_op_input("square", "X")
            ->assert_is_op_input("abs", "X")
            ->assert_is_op_input("sign", "X")
            ->AsIntermediate();
    // Branch 1 (std): square -> reduce_mean -> sqrt -> concat input #1.
    auto* square = OpNode("square", "square")->AsIntermediate();
    auto* square_out = VarNode("square_out")
                           ->assert_is_op_output("square", "Out")
                           ->assert_is_op_input("reduce_mean", "X")
                           ->AsIntermediate();
    auto* reduce_mean_es =
        OpNode("es_reduce_mean", "reduce_mean")->AsIntermediate();
    auto* reduce_mean_out_es = VarNode("reduce_mean_out_es")
                                   ->assert_is_op_output("reduce_mean", "Out")
                                   ->assert_is_op_input("sqrt", "X")
                                   ->AsIntermediate();
    auto* sqrt = OpNode("sqrt", "sqrt")->AsIntermediate();
    auto* sqrt_out = VarNode("sqrt_out")
                         ->assert_is_op_output("sqrt", "Out")
                         ->assert_is_op_nth_input("concat", "X", 1)
                         ->AsIntermediate();
    auto* concat = OpNode("concat", "concat")->AsIntermediate();
    auto* out =
        VarNode("out")->assert_is_op_output("concat", "Out")->AsOutput();
    // Branch 2 (signed moment): abs -> pow, multiplied by sign, averaged,
    // then abs -> pow again multiplied by sign -> concat input #2.
    auto* abs_e2 = OpNode("e2_abs", "abs")->AsIntermediate();
    auto* abs_e2_out = VarNode("abs_e2_out")
                           ->assert_is_op_input("pow", "X")
                           ->assert_is_op_output("abs", "Out")
                           ->AsIntermediate();
    auto* pow_e2 = OpNode("e2_pow", "pow")->AsIntermediate();
    auto* pow_e2_out = VarNode("pow_e2_out")
                           ->assert_is_op_input("elementwise_mul", "X")
                           ->assert_is_op_output("pow", "Out")
                           ->AsIntermediate();
    auto* sign_e3 = OpNode("e3_sign", "sign")->AsIntermediate();
    auto* sign_e3_out = VarNode("sign_e3_out")
                            ->assert_is_op_input("elementwise_mul", "Y")
                            ->assert_is_op_output("sign", "Out")
                            ->AsIntermediate();
    auto* elementwise_mul_top =
        OpNode("elementwise_mul_top", "elementwise_mul")->AsIntermediate();
    auto* elementwise_mul_top_out =
        VarNode("elementwise_mul_top_out")
            ->assert_is_op_input("reduce_mean", "X")
            ->assert_is_op_output("elementwise_mul", "Out")
            ->AsIntermediate();
    auto* reduce_mean_e2 =
        OpNode("reduce_mean_e2", "reduce_mean")->AsIntermediate();
    auto* reduce_mean_e2_out = VarNode("reduce_mean_e2_out")
                                   ->assert_is_op_input("abs", "X")
                                   ->assert_is_op_input("sign", "X")
                                   ->assert_is_op_output("reduce_mean", "Out")
                                   ->AsIntermediate();
    auto* abs_e2_2 = OpNode("abs_e2_2", "abs")->AsIntermediate();
    auto* abs_e2_2_out = VarNode("abs_e2_2_out")
                             ->assert_is_op_input("pow", "X")
                             ->assert_is_op_output("abs", "Out")
                             ->AsIntermediate();
    auto* pow_e2_2 = OpNode("pow_e2_2", "pow")->AsIntermediate();
    auto* pow_e2_2_out = VarNode("pow_e2_2_out")
                             ->assert_is_op_nth_input("elementwise_mul", "X", 0)
                             ->assert_is_op_output("pow", "Out")
                             ->AsIntermediate();
    auto* sign_e3_2 = OpNode("sign_e3_2", "sign")->AsIntermediate();
    auto* sign_e3_2_out = VarNode("sign_e3_2_out")
                              ->assert_is_op_input("elementwise_mul", "Y")
                              ->assert_is_op_output("sign", "Out")
                              ->AsIntermediate();
    auto* elementwise_mul_bottom =
        OpNode("elementwise_mul_bottom", "elementwise_mul")->AsIntermediate();
    auto* elementwise_mul_bottom_out =
        VarNode("elementwise_mul_bottom_out")
            ->assert_is_op_output("elementwise_mul", "Out")
            ->assert_is_op_nth_input("concat", "X", 2)
            ->AsIntermediate();
    // Branch 3 (e4): abs -> pow -> reduce_mean -> abs -> pow -> concat #3.
    auto* abs_e_4 = OpNode("abs_e_4", "abs")->AsIntermediate();
    auto* abs_e_4_out = VarNode("abs_e_4_out")
                            ->assert_is_op_output("abs", "Out")
                            ->assert_is_op_input("pow", "X")
                            ->AsIntermediate();
    auto* pow_e_4 = OpNode("pow_e_4", "pow")->AsIntermediate();
    auto* pow_e_4_out = VarNode("pow_e_4_out")
                            ->assert_is_op_output("pow", "Out")
                            ->assert_is_op_input("reduce_mean", "X")
                            ->AsIntermediate();
    // NOTE(review): unlike every other OpNode here, no op-type argument is
    // given, so this node is not constrained to "reduce_mean" — confirm
    // whether OpNode("reduce_mean_4", "reduce_mean") was intended.
    auto* reduce_mean_4 = OpNode("reduce_mean_4")->AsIntermediate();
    auto* reduce_mean_4_out = VarNode("reduce_mean_4_out")
                                  ->assert_is_op_output("reduce_mean", "Out")
                                  ->assert_is_op_input("abs", "X")
                                  ->AsIntermediate();
    auto* abs_e_4_2 = OpNode("abs_e_4_2", "abs")->AsIntermediate();
    auto* abs_e_4_2_out = VarNode("abs_e_4_2_out")
                              ->assert_is_op_output("abs", "Out")
                              ->assert_is_op_input("pow", "X")
                              ->AsIntermediate();
    auto* pow_e_4_2 = OpNode("pow_e_4_2", "pow")->AsIntermediate();
    auto* pow_e_4_2_out = VarNode("pow_e_4_2_out")
                              ->assert_is_op_output("pow", "Out")
                              ->assert_is_op_nth_input("concat", "X", 3)
                              ->AsIntermediate();
    std::vector<PMNode*> elementwise_sub_inputs{reduce_mean_input,
                                                reduce_mean_out};
    // Wire the pattern edges for all four branches.
    *reduce_mean_input >> *reduce_mean >> *reduce_mean_out;
    elementwise_sub_inputs >> *elementwise_sub >> *elementwise_sub_out;
    *elementwise_sub_out >> *square >> *square_out;
    *square_out >> *reduce_mean_es >> *reduce_mean_out_es;
    *reduce_mean_out_es >> *sqrt >> *sqrt_out;
    *elementwise_sub_out >> *sign_e3 >> *sign_e3_out;
    std::vector<PMNode*> elementwise_mul_top_inputs{pow_e2_out, sign_e3_out};
    *elementwise_sub_out >> *abs_e2 >> *abs_e2_out;
    *abs_e2_out >> *pow_e2 >> *pow_e2_out;
    elementwise_mul_top_inputs >> *elementwise_mul_top >>
        *elementwise_mul_top_out;
    *elementwise_mul_top_out >> *reduce_mean_e2 >> *reduce_mean_e2_out;
    *reduce_mean_e2_out >> *abs_e2_2 >> *abs_e2_2_out;
    *abs_e2_2_out >> *pow_e2_2 >> *pow_e2_2_out;
    *reduce_mean_e2_out >> *sign_e3_2 >> *sign_e3_2_out;
    std::vector<PMNode*> elementwise_mul_bottom_inputs{pow_e2_2_out,
                                                       sign_e3_2_out};
    elementwise_mul_bottom_inputs >> *elementwise_mul_bottom >>
        *elementwise_mul_bottom_out;
    *elementwise_sub_out >> *abs_e_4 >> *abs_e_4_out;
    *abs_e_4_out >> *pow_e_4 >> *pow_e_4_out;
    *pow_e_4_out >> *reduce_mean_4 >> *reduce_mean_4_out;
    *reduce_mean_4_out >> *abs_e_4_2 >> *abs_e_4_2_out;
    *abs_e_4_2_out >> *pow_e_4_2 >> *pow_e_4_2_out;
    std::vector<PMNode*> concat_inputs{
        reduce_mean_out, sqrt_out, elementwise_mul_bottom_out, pow_e_4_2_out};
    concat_inputs >> *concat >> *out;
  }
  // Replaces the matched subgraph with one __xpu__sfa_head node connected
  // to the original input and output vars.
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    auto reduce_mean = matched.at("reduce_mean")->stmt()->op();
    auto* scope = reduce_mean->scope();
    auto op_desc = GenOpDesc(matched);
    auto vis_op = LiteOpRegistry::Global().Create("__xpu__sfa_head");
    auto& valid_places = reduce_mean->valid_places();
    vis_op->Attach(op_desc, scope);
    auto* new_op_node = graph->GraphCreateInstructNode(vis_op, valid_places);
    IR_NODE_LINK_TO(matched.at("reduce_mean_input"), new_op_node);
    IR_NODE_LINK_TO(new_op_node, matched.at("out"));
  }

 private:
  // Builds the fused op desc, reusing reduce_mean's desc as the template
  // (keeps its attrs) but replacing type, inputs and outputs.
  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
    cpp::OpDesc op_desc = *matched.at("reduce_mean")->stmt()->op_info();
    op_desc.mutable_inputs()->clear();
    op_desc.mutable_outputs()->clear();
    op_desc.SetType("__xpu__sfa_head");
    op_desc.SetInput("Input", {matched.at("reduce_mean_input")->arg()->name});
    op_desc.SetOutput("Output", {matched.at("out")->arg()->name});
    // The kernel dispatches on this attribute ("meanstd" vs "moment").
    op_desc.SetAttr("op_type", std::string("moment"));
    return op_desc;
  }
};
} // namespace fusion
class XPUSfaHeadMomentFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) {
return;
}
fusion::XPUSfaHeadMomentFuser fuser;
fuser(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Registers the moment head fuse pass for XPU targets.
REGISTER_MIR_PASS(__xpu__sfa_head_moment_fuse_pass,
                  paddle::lite::mir::XPUSfaHeadMomentFusePass)
    .BindTargets({TARGET(kXPU)})
    .BindKernel("reduce_mean");
......@@ -122,7 +122,15 @@ std::string Visualize(mir::SSAGraph* graph) {
dot.AddNode(var_name, {});
exists_var_names.insert(var_name);
}
dot.AddEdge(var_name, op_name, {});
std::vector<Dot::Attr> attrs;
std::string arg_name;
if (op_info->GetInputArgname(var_name, &arg_name)) {
attrs.emplace_back("label", arg_name);
} else {
VLOG(5) << "Can not find the input argument for var " << var_name
<< " in " << op_type;
}
dot.AddEdge(var_name, op_name, attrs);
}
for (auto& x : node->outlinks) {
std::string var_name;
......@@ -136,7 +144,15 @@ std::string Visualize(mir::SSAGraph* graph) {
dot.AddNode(var_name, {});
exists_var_names.insert(var_name);
}
dot.AddEdge(op_name, var_name, {});
std::vector<Dot::Attr> attrs;
std::string arg_name;
if (op_info->GetOutputArgname(var_name, &arg_name)) {
attrs.emplace_back("label", arg_name);
} else {
VLOG(5) << "Can not find the output argument for var " << var_name
<< " in " << op_type;
}
dot.AddEdge(op_name, var_name, attrs);
}
// Output its all of attributes(name and values)
os << "* " << op_name << "\n";
......
......@@ -109,6 +109,10 @@ class Optimizer {
"identity_dropout_eliminate_pass",
"__xpu__resnet_fuse_pass",
"__xpu__resnet_cbam_fuse_pass",
"__xpu__conv2d_fuse_pass",
"__xpu__conv2d_link_previous_out_max_pass",
"__xpu__sfa_head_meanstd_fuse_pass",
"__xpu__sfa_head_moment_fuse_pass",
"__xpu__mmdnn_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
......
......@@ -90,8 +90,6 @@ add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.
add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
......@@ -8,6 +8,8 @@ add_kernel(unsqueeze_compute_host Host basic SRCS unsqueeze_compute.cc DEPS ${li
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(expand_compute_host Host basic SRCS expand_compute.cc DEPS ${lite_kernel_deps})
add_kernel(expand_as_compute_host Host basic SRCS expand_as_compute.cc DEPS ${lite_kernel_deps})
add_kernel(fill_constant_compute_host Host basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps})
add_kernel(fill_constant_batch_size_like_compute_host Host basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps})
add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps})
add_kernel(is_empty_compute_host Host extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps})
add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
......
......@@ -12,16 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/fill_constant_batch_size_like_compute.h"
#include "lite/kernels/host/fill_constant_batch_size_like_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
void FillConstantBatchSizeLikeCompute::Run() {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.out->template mutable_data<float>();
......@@ -50,18 +49,18 @@ void FillConstantBatchSizeLikeCompute::Run() {
}
}
} // namespace arm
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
fill_constant_batch_size_like,
kARM,
kHost,
kAny,
kNCHW,
paddle::lite::kernels::arm::FillConstantBatchSizeLikeCompute,
paddle::lite::kernels::host::FillConstantBatchSizeLikeCompute,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
.Finalize();
......@@ -19,10 +19,10 @@
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
class FillConstantBatchSizeLikeCompute
: public KernelLite<TARGET(kARM), PRECISION(kAny)> {
: public KernelLite<TARGET(kHost), PRECISION(kAny)> {
public:
using param_t = operators::FillConstantBatchSizeLikeParam;
......@@ -31,7 +31,7 @@ class FillConstantBatchSizeLikeCompute
~FillConstantBatchSizeLikeCompute() {}
};
} // namespace arm
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -12,16 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/fill_constant_compute.h"
#include "lite/kernels/host/fill_constant_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
void FillConstantCompute::Run() {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.out->template mutable_data<float>();
......@@ -50,21 +49,21 @@ void FillConstantCompute::Run() {
}
}
} // namespace arm
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
// float
REGISTER_LITE_KERNEL(fill_constant,
kARM,
kHost,
kAny,
kNCHW,
paddle::lite::kernels::arm::FillConstantCompute,
paddle::lite::kernels::host::FillConstantCompute,
def)
.BindInput("ShapeTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("ShapeTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
.Finalize();
......@@ -19,9 +19,9 @@
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
class FillConstantCompute : public KernelLite<TARGET(kHost), PRECISION(kAny)> {
public:
using param_t = operators::FillConstantParam;
......@@ -30,7 +30,7 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
~FillConstantCompute() {}
};
} // namespace arm
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -24,6 +24,9 @@ else()
add_kernel(sequence_topk_avg_pooling_compute_xpu XPU basic SRCS sequence_topk_avg_pooling_compute.cc DEPS ${lite_kernel_deps})
add_kernel(concat_compute_xpu XPU basic SRCS concat_compute.cc DEPS ${lite_kernel_deps})
add_kernel(search_fc_compute_xpu XPU basic SRCS search_fc_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_xpu XPU basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reduce_mean_compute_xpu XPU basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reduce_sum_compute_xpu XPU basic SRCS reduce_sum_compute.cc DEPS ${lite_kernel_deps})
# extra
add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
......@@ -44,4 +47,6 @@ else()
add_kernel(__xpu__fc_compute_xpu XPU extra SRCS __xpu__fc_compute.cc DEPS ${lite_kernel_deps})
add_kernel(__xpu__search_attention_compute_xpu XPU extra SRCS __xpu__search_attention_compute.cc DEPS ${lite_kernel_deps})
add_kernel(__xpu__mmdnn_compute_xpu XPU extra SRCS __xpu__mmdnn_compute.cc DEPS ${lite_kernel_deps})
add_kernel(__xpu__conv2d_compute_xpu XPU extra SRCS __xpu__conv2d_compute.cc DEPS ${lite_kernel_deps})
add_kernel(__xpu__sfa_head_compute_xpu XPU extra SRCS __xpu__sfa_head_compute.cc DEPS ${lite_kernel_deps})
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/__xpu__conv2d_compute.h"
#include <string>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
// Executes the fused conv2d (+ bias / branch / activation) on the XPU via
// xdnn's int16 conv kernel. Currently restricted (see CHECKs below) to
// RELU activation, group == 1, and int16-quantized filters.
void XPUConv2dCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();
  auto& input_dims = param.Input->dims();
  auto& filter_dims = param.Filter->dims();
  // Dims are indexed as NCHW, as the variable names below indicate.
  int batch = static_cast<int>(input_dims[0]);
  int img_c = static_cast<int>(input_dims[1]);
  int img_h = static_cast<int>(input_dims[2]);
  int img_w = static_cast<int>(input_dims[3]);
  int filter_num = static_cast<int>(filter_dims[0]);
  int win_h = static_cast<int>(filter_dims[2]);
  int win_w = static_cast<int>(filter_dims[3]);
  auto paddings = *param.paddings;
  auto dilations = *param.dilations;
  int stride_h = param.strides[0];
  int stride_w = param.strides[1];
  // Assumes paddings/dilations hold symmetric {h, w} pairs; a 4-element
  // {top, bottom, left, right} padding would be misread — TODO confirm the
  // op normalizes paddings before this point.
  int paddings_h = paddings[0];
  int paddings_w = paddings[1];
  int dilations_h = dilations[0];
  int dilations_w = dilations[1];
  std::string filter_type = param.filter_type;
  int groups = param.groups;
  int act_type = (param.act_type == -1) ? xdnn::Activation_t::RELU
                                        : param.act_type;  // -1 means not init
  // Bias / Branch / InputMax are optional inputs; pass nullptr when absent.
  const auto* bias = param.Bias ? param.Bias->data<float>() : nullptr;
  const auto* branch = param.Branch ? param.Branch->data<float>() : nullptr;
  const float* input_max =
      param.InputMax ? param.InputMax->data<float>() : nullptr;
  float* output_max = param.OutputMax
                          ? param.OutputMax->mutable_data<float>(TARGET(kXPU))
                          : nullptr;
  float* output = param.Output->mutable_data<float>(TARGET(kXPU));
  // TODO(luohang): now support for resnet50 first
  CHECK_EQ(act_type, xdnn::Activation_t::RELU);
  CHECK_EQ(groups, 1);
  CHECK_EQ(filter_type, "int16");
  xdnn::Activation_t act((xdnn::Activation_t::act_enum)act_type);
  // Positional xdnn call; the trailing comments document each argument.
  int r = xdnn::conv2d_forward_int16<float, int16_t, float, float>(
      ctx.GetRawContext(),        /* context */
      batch,                      /* batch */
      img_c,                      /* input_c */
      img_h,                      /* input_h */
      img_w,                      /* input_w */
      filter_num,                 /* num_filter */
      win_h,                      /* kernel_h */
      win_w,                      /* kernel_w */
      stride_h,                   /* stride_h */
      stride_w,                   /* stride_w */
      paddings_h,                 /* pad_h */
      paddings_w,                 /* pad_w */
      dilations_h,                /* dilation_h */
      dilations_w,                /* dilation_w */
      groups,                     /* group */
      param.Input->data<float>(), /* input bottom */
      param.Filter->data<int16_t>(), /* filter weight */
      output,                     /* output top */
      bias,                       /* bias */
      branch,                     /* branch */
      act,                        /* act type */
      input_max,                  /* max_image_ptr */
      param.FilterMax->data<float>(), /* max_filter_ptr */
      output_max /* max_result_ptr */);
  CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Registers the fused conv2d kernel; all tensors live on the XPU device.
REGISTER_LITE_KERNEL(__xpu__conv2d,
                     kXPU,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::xpu::XPUConv2dCompute,
                     def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("FilterMax", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("Branch", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
// Kernel for the fused "__xpu__conv2d" op (conv + bias/branch/activation).
class XPUConv2dCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  using param_t = operators::XPUConv2dParam;
  // Runs the fused convolution on the XPU device.
  virtual void Run();
  virtual ~XPUConv2dCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/__xpu__sfa_head_compute.h"
#include <string>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
// Dispatches the vis feature head to the matching xdnn primitive.
// Input is expected to be 3-D (batch, m, n) -- enforced by
// XPUSfaHeadOp::CheckShape() for the supported op types.
void XPUSfaHeadCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();

  const std::string& vis_type = param.op_type;
  auto input = param.input;
  const int batch = static_cast<int>(input->dims()[0]);
  const int m = static_cast<int>(input->dims()[1]);
  const int n = static_cast<int>(input->dims()[2]);

  if (vis_type == "meanstd") {
    int r = xdnn::vis_meanstd(ctx.GetRawContext(),
                              param.input->data<float>(),
                              param.output->mutable_data<float>(TARGET(kXPU)),
                              batch,
                              m,
                              n);
    CHECK_EQ(r, 0) << "XPU kernel error";
    // Fixed: removed a stray `(void)param.output->mutable_data<float>();`
    // here. Without TARGET(kXPU) it re-requested the buffer with the default
    // (host) target after the XPU kernel had already written the result.
  } else if (vis_type == "moment") {
    int r = xdnn::vis_moment(ctx.GetRawContext(),
                             param.input->data<float>(),
                             param.output->mutable_data<float>(TARGET(kXPU)),
                             batch,
                             m,
                             n);
    CHECK_EQ(r, 0) << "XPU kernel error";
  } else {
    LOG(FATAL) << "vis xpu op not supported type " << vis_type;
  }
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the vis feature-head kernel on XPU (float precision, NCHW layout).
REGISTER_LITE_KERNEL(__xpu__sfa_head,
                     kXPU,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::xpu::XPUSfaHeadCompute,
                     def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
// Kernel for the __xpu__sfa_head op: computes per-channel statistics
// ("meanstd" or "moment", selected via param.op_type) on the XPU target.
class XPUSfaHeadCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  using param_t = operators::XPUSfaHeadParam;
  // Dispatches to xdnn::vis_meanstd / xdnn::vis_moment; defined in the .cc.
  virtual void Run();
  virtual ~XPUSfaHeadCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -60,6 +60,71 @@ void SigmoidCompute::Run() {
CHECK_EQ(r, 0);
}
// Element-wise absolute value on the XPU: Out = |X|.
void AbsCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();

  const float* x_data = param.X->data<float>();
  float* y_data = param.Out->mutable_data<float>(TARGET(kXPU));

  int ret = xdnn::activation_forward(ctx.GetRawContext(),
                                     xdnn::Activation_t::ABS,
                                     param.X->numel(),
                                     x_data,
                                     y_data);
  CHECK_EQ(ret, 0);
}
// Element-wise square on the XPU: Out = X * X.
void SquareCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();

  const float* x_data = param.X->data<float>();
  float* y_data = param.Out->mutable_data<float>(TARGET(kXPU));

  int ret = xdnn::activation_forward(ctx.GetRawContext(),
                                     xdnn::Activation_t::SQUARE,
                                     param.X->numel(),
                                     x_data,
                                     y_data);
  CHECK_EQ(ret, 0);
}
// Element-wise square root on the XPU: Out = sqrt(X).
void SqrtCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();

  const float* x_data = param.X->data<float>();
  float* y_data = param.Out->mutable_data<float>(TARGET(kXPU));

  int ret = xdnn::activation_forward(ctx.GetRawContext(),
                                     xdnn::Activation_t::SQRT,
                                     param.X->numel(),
                                     x_data,
                                     y_data);
  CHECK_EQ(ret, 0);
}
// Element-wise power on the XPU via the ACT_POW activation.
// NOTE(review): param.factor (the "factor" attribute parsed by PowOp) is not
// forwarded to xdnn here -- presumably Activation_t::ACT_POW uses a default
// exponent. Confirm against the xdnn API; otherwise only the default factor
// is actually computed.
void PowCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();

  int r = xdnn::activation_forward(
      ctx.GetRawContext(), /* context */
      xdnn::Activation_t::ACT_POW, /* type */
      param.X->numel(), /* len */
      param.X->data<float>(), /* x */
      param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
  CHECK_EQ(r, 0);
}
// Element-wise sign on the XPU: Out = sign(X).
void SignCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();

  const float* x_data = param.X->data<float>();
  float* y_data = param.Out->mutable_data<float>(TARGET(kXPU));

  int ret = xdnn::activation_forward(ctx.GetRawContext(),
                                     xdnn::Activation_t::SIGN,
                                     param.X->numel(),
                                     x_data,
                                     y_data);
  CHECK_EQ(ret, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
......@@ -86,3 +151,33 @@ REGISTER_LITE_KERNEL(sigmoid,
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Register the element-wise activation kernels on XPU (float, NCHW).
REGISTER_LITE_KERNEL(
    abs, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::AbsCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
REGISTER_LITE_KERNEL(
    square, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SquareCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
REGISTER_LITE_KERNEL(
    sqrt, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SqrtCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
REGISTER_LITE_KERNEL(
    pow, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::PowCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
REGISTER_LITE_KERNEL(
    sign, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SignCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
......@@ -48,6 +48,51 @@ class SigmoidCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~SigmoidCompute() = default;
};
// Element-wise |x| on XPU.
class AbsCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  using param_t = operators::ActivationParam;
  virtual void Run();
  virtual ~AbsCompute() = default;
};
// Element-wise x^2 on XPU.
class SquareCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  using param_t = operators::ActivationParam;
  virtual void Run();
  virtual ~SquareCompute() = default;
};
// Element-wise sqrt(x) on XPU.
class SqrtCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  using param_t = operators::ActivationParam;
  virtual void Run();
  virtual ~SqrtCompute() = default;
};
// Element-wise power on XPU (see PowCompute::Run for factor handling).
class PowCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  using param_t = operators::ActivationParam;
  virtual void Run();
  virtual ~PowCompute() = default;
};
// Element-wise sign(x) on XPU.
class SignCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  using param_t = operators::ActivationParam;
  virtual void Run();
  virtual ~SignCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
......
......@@ -76,6 +76,59 @@ void ElementwiseSubCompute::Run() {
}
}
// Element-wise division with broadcast: X is viewed as `iter` consecutive
// slices of Y->numel() elements, and each slice is divided by Y.
// When axis == -1 it is normalized to rank(X) - rank(Y), i.e. Y matches the
// trailing dimensions of X.
void ElementwiseDivCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();
  auto& x_dims = param.X->dims().data();
  auto& y_dims = param.Y->dims();
  int axis = param.axis;
  if (param.axis == -1) {
    axis = x_dims.size() - y_dims.size();
  }
  int iter = std::accumulate(
      x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
  int stride = param.Y->numel();
  // Hoist the loop-invariant base pointers: the original re-fetched
  // data()/mutable_data() on every iteration.
  const float* x_base = param.X->data<float>();
  const float* y_ptr = param.Y->data<float>();
  float* o_base = param.Out->mutable_data<float>(TARGET(kXPU));
  for (int i = 0; i < iter; ++i) {
    int r = xdnn::elementwise_div(ctx.GetRawContext(), /* context */
                                  x_base + i * stride, /* x */
                                  y_ptr,               /* y */
                                  o_base + i * stride, /* z */
                                  stride /* len */);
    CHECK_EQ(r, 0);
  }
}
// Element-wise multiplication with broadcast: X is viewed as `iter`
// consecutive slices of Y->numel() elements, each multiplied by Y.
// When axis == -1 it is normalized to rank(X) - rank(Y), i.e. Y matches the
// trailing dimensions of X.
void ElementwiseMulCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();
  auto& x_dims = param.X->dims().data();
  auto& y_dims = param.Y->dims();
  int axis = param.axis;
  if (param.axis == -1) {
    axis = x_dims.size() - y_dims.size();
  }
  int iter = std::accumulate(
      x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
  int stride = param.Y->numel();
  // Hoist the loop-invariant base pointers: the original re-fetched
  // data()/mutable_data() on every iteration.
  const float* x_base = param.X->data<float>();
  const float* y_ptr = param.Y->data<float>();
  float* o_base = param.Out->mutable_data<float>(TARGET(kXPU));
  for (int i = 0; i < iter; ++i) {
    int r = xdnn::elementwise_mul(ctx.GetRawContext(), /* context */
                                  x_base + i * stride, /* x */
                                  y_ptr,               /* y */
                                  o_base + i * stride, /* z */
                                  stride /* len */);
    CHECK_EQ(r, 0);
  }
}
} // namespace xpu
} // namespace kernels
} // namespace lite
......@@ -102,3 +155,25 @@ REGISTER_LITE_KERNEL(elementwise_sub,
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Register the broadcasting element-wise div/mul kernels on XPU (float, NCHW).
REGISTER_LITE_KERNEL(elementwise_div,
                     kXPU,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::xpu::ElementwiseDivCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
REGISTER_LITE_KERNEL(elementwise_mul,
                     kXPU,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::xpu::ElementwiseMulCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
......@@ -41,6 +41,26 @@ class ElementwiseSubCompute
virtual ~ElementwiseSubCompute() = default;
};
// Broadcasting element-wise division on XPU (Out = X / Y).
class ElementwiseDivCompute
    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  using param_t = operators::ElementwiseParam;
  virtual void Run();
  virtual ~ElementwiseDivCompute() = default;
};
// Broadcasting element-wise multiplication on XPU (Out = X * Y).
class ElementwiseMulCompute
    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  using param_t = operators::ElementwiseParam;
  virtual void Run();
  virtual ~ElementwiseMulCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/reduce_mean_compute.h"
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
// Mean-reduction over the axes listed in param.dim, executed by xdnn.
void ReduceMeanCompute::Run() {
  auto& param = Param<operators::ReduceMeanParam>();
  auto& ctx = this->ctx_->As<XPUContext>();

  const float* in_ptr = param.X->data<float>();
  float* out_ptr = param.Out->mutable_data<float>(TARGET(kXPU));

  // xdnn wants the shape as a plain int array.
  const auto& in_shape = param.X->dims();
  const int rank = in_shape.size();
  std::vector<int> shape_vec(rank);
  for (int i = 0; i < rank; ++i) {
    shape_vec[i] = in_shape[i];
  }

  auto reduce_axes = param.dim;
  int ret = xdnn::reduce(ctx.GetRawContext(),
                         in_ptr,
                         out_ptr,
                         shape_vec.data(),
                         rank,
                         reduce_axes.data(),
                         reduce_axes.size(),
                         xdnn::ReduceOp::REDUCE_MEAN);
  CHECK_EQ(ret, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the reduce_mean kernel on XPU (float, NCHW).
REGISTER_LITE_KERNEL(reduce_mean,
                     kXPU,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::xpu::ReduceMeanCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
// Mean-reduction over selected axes on XPU (see the .cc for the xdnn call).
class ReduceMeanCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  void Run() override;
  virtual ~ReduceMeanCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/reduce_sum_compute.h"
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
// Sum-reduction: either over the whole tensor (reduce_all) or over the axes
// listed in param.dim.
void ReduceSumCompute::Run() {
  auto& param = Param<operators::ReduceParam>();
  auto& ctx = this->ctx_->As<XPUContext>();

  const float* in_ptr = param.x->data<float>();
  float* out_ptr = param.output->mutable_data<float>(TARGET(kXPU));

  if (param.reduce_all) {
    // Collapse every axis in a single full-tensor summation.
    int ret = xdnn::sum(ctx.GetRawContext(), in_ptr, out_ptr, param.x->numel());
    CHECK_EQ(ret, 0);
    return;
  }

  // Partial reduction: hand xdnn the shape as a plain int array plus the
  // axes to reduce.
  const auto& in_shape = param.x->dims();
  const int rank = in_shape.size();
  std::vector<int> shape_vec(rank);
  for (int i = 0; i < rank; ++i) {
    shape_vec[i] = in_shape[i];
  }
  auto reduce_axes = param.dim;
  int ret = xdnn::reduce(ctx.GetRawContext(),
                         in_ptr,
                         out_ptr,
                         shape_vec.data(),
                         rank,
                         reduce_axes.data(),
                         reduce_axes.size(),
                         xdnn::ReduceOp::REDUCE_SUM);
  CHECK_EQ(ret, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the reduce_sum kernel on XPU (float, NCHW).
REGISTER_LITE_KERNEL(reduce_sum,
                     kXPU,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::xpu::ReduceSumCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
// Sum-reduction (full-tensor or per-axis) on XPU.
class ReduceSumCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  void Run() override;
  virtual ~ReduceSumCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/reshape_compute.h"
#include "lite/core/op_registry.h"
// Register the zero-copy reshape2 kernel on XPU (float, NCHW).
REGISTER_LITE_KERNEL(reshape2,
                     kXPU,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::xpu::Reshape2Compute<float>,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
// Zero-copy reshape2 on XPU: Out shares X's device buffer; only the dims
// metadata differs. XShape records X's original dims as int64 values.
template <typename T>
class Reshape2Compute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
 public:
  using param_t = operators::ReshapeParam;
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    auto x = param.x;
    auto output = param.output;
    auto xshape = param.xshape;
    auto x_dims = x->dims();
    auto x_dims_data = x_dims.Vectorize();
    // Snapshot the already-inferred output dims, then restore them after
    // ShareDataWith (which presumably also copies X's dims -- hence the
    // Resize immediately after; confirm against Tensor::ShareDataWith).
    auto out_dims = output->dims();
    output->ShareDataWith(*x);
    output->Resize(out_dims);
    // Copy X's original shape into the XShape tensor on the device.
    // NOTE(review): assumes xshape was already resized to hold
    // x_dims.size() int64 values by the op's InferShape -- confirm.
    auto* xshape_data = xshape->mutable_data<int64_t>(TARGET(kXPU));
    TargetWrapperXPU::MemcpySync(xshape_data,
                                 x_dims_data.data(),
                                 x_dims.size() * sizeof(int64_t),
                                 IoDirection::HtoD);
  }
  virtual ~Reshape2Compute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -53,6 +53,8 @@ add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS})
add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS})
add_operator(grid_sampler_op basic SRCS grid_sampler_op.cc DEPS ${op_DEPS})
add_operator(flatten_op basic SRCS flatten_op.cc DEPS ${op_DEPS})
add_operator(pow_op extra SRCS pow_op.cc DEPS ${op_DEPS})
add_operator(sign_op extra SRCS sign_op.cc DEPS ${op_DEPS})
# 2.basic ops not used in basic models
add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS})
......@@ -177,6 +179,9 @@ add_operator(__xpu__fc_op extra SRCS __xpu__fc_op.cc DEPS ${op_DEPS})
add_operator(__xpu__resnet_cbam_op extra SRCS __xpu__resnet_cbam_op.cc DEPS ${op_DEPS})
add_operator(__xpu__search_attention_op extra SRCS __xpu__search_attention_op.cc DEPS ${op_DEPS})
add_operator(__xpu__mmdnn_op extra SRCS __xpu__mmdnn_op.cc DEPS ${op_DEPS})
add_operator(__xpu__conv2d_op extra SRCS __xpu__conv2d_op.cc DEPS ${op_DEPS})
add_operator(__xpu__sfa_head_op extra SRCS __xpu__sfa_head_op.cc DEPS ${op_DEPS})
if (NOT LITE_WITH_X86)
lite_cc_test(test_one_hot_op SRCS one_hot_op_test.cc DEPS one_hot_op memory scope ${op_deps} one_hot_compute_host)
lite_cc_test(test_fc_op SRCS fc_op_test.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/__xpu__conv2d_op.h"
#include <memory>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/operators/conv_op.h"
namespace paddle {
namespace lite {
namespace operators {
std::string padding_algorithm_ = ""; // NOLINT
// Validates the presence and ranks of the fused conv's tensors.
// Bias, Branch and InputMax are optional and not checked here.
bool XPUConv2dOp::CheckShape() const {
  // Fixed: the three CHECK messages were shuffled (Output reported as
  // "Input(Filter)", Filter as "Output(Output)"); each now names the tensor
  // it actually checks.
  CHECK(param_.Input) << "Input(Input) of ConvXPUOp should not be null.";
  CHECK(param_.Filter) << "Input(Filter) of ConvXPUOp should not be null.";
  CHECK(param_.Output) << "Output(Output) of ConvXPUOp should not be null.";

  const auto in_dims = param_.Input->dims();
  const auto filter_dims = param_.Filter->dims();
  int groups = param_.groups;

  CHECK_EQ(in_dims.size(), 4UL) << "Conv input should be 4-D tensor.";
  CHECK_EQ(in_dims.size(), filter_dims.size())
      << "Conv input dimension and filter dimension should be the same.";
  CHECK_EQ(in_dims.size() - param_.strides.size(), 2U)
      << "Conv input dimension and strides dimension should be consistent.";
  CHECK_EQ(filter_dims.size(), 4UL) << "Conv filter should be 4-D tensor.";
  CHECK_EQ(in_dims[1], filter_dims[1] * groups)
      << "The number of input channels should be equal to filter channels * "
         "groups.";
  CHECK_EQ(filter_dims[0] % groups, 0)
      << "The number of output channels should be divided by groups.";
  return true;
}
// copy from conv_op.cc
// Spatial output size of a convolution along one dimension:
//   out = (in + pad_left + pad_right - effective_kernel) / stride + 1
// where effective_kernel accounts for dilation gaps between filter taps.
inline int ConvOutputSize(int input_size,
                          int filter_size,
                          int dilation,
                          int pad_left,
                          int pad_right,
                          int stride) {
  const int effective_kernel = dilation * (filter_size - 1) + 1;
  const int padded_input = input_size + pad_left + pad_right;
  return (padded_input - effective_kernel) / stride + 1;
}
// copy from conv_op.cc
// Infers Output's NCHW shape from Input/Filter dims, strides, (possibly
// algorithm-adjusted) paddings and dilations; also sizes OutputMax.
bool XPUConv2dOp::InferShapeImpl() const {
  const auto in_dims = param_.Input->dims();
  const auto filter_dims = param_.Filter->dims();
  // NOTE(review): padding_algorithm_ is a file-scope global written by the
  // last AttachImpl() call on ANY XPUConv2dOp instance -- if two fused convs
  // use different padding algorithms the wrong one may be applied here.
  operators::UpdatePaddingAndDilation(param_.paddings.get(),
                                      param_.dilations.get(),
                                      param_.strides,
                                      padding_algorithm_,
                                      in_dims,
                                      filter_dims);
  // Output shape starts as (batch, out_channels); spatial dims follow.
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  auto paddings = *param_.paddings;
  auto dilations = *param_.dilations;
  for (size_t i = 0; i < param_.strides.size(); ++i) {
    output_shape.push_back(ConvOutputSize(in_dims[i + 2],
                                          filter_dims[i + 2],
                                          dilations[i],
                                          paddings[i * 2],
                                          paddings[i * 2 + 1],
                                          param_.strides[i]));
  }
  // Set output and output max dims
  // (OutputMax is fixed at 4 floats -- presumably the XPU per-tensor max
  // buffer layout; confirm against the xdnn conv API.)
  param_.Output->Resize(lite::DDim(output_shape));
  param_.OutputMax->Resize({4});
  // share LoD
  param_.Output->set_lod(param_.Input->lod());
  return true;
}
// Binds tensors from the scope and parses attributes into param_.
// Required: Input, Filter, FilterMax, Output, OutputMax.
// Optional: Bias, Branch, InputMax (gated by has_input_max).
bool XPUConv2dOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
  AttachParam(&param_);
  CHECK(scope->FindVar(op_desc.Input("Input").front()));
  CHECK(scope->FindVar(op_desc.Input("Filter").front()));
  CHECK(scope->FindVar(op_desc.Input("FilterMax").front()));
  CHECK(scope->FindVar(op_desc.Output("Output").front()));
  CHECK(scope->FindVar(op_desc.Output("OutputMax").front()));

  param_.Input =
      scope->FindVar(op_desc.Input("Input").front())->GetMutable<Tensor>();
  param_.Filter =
      scope->FindVar(op_desc.Input("Filter").front())->GetMutable<Tensor>();
  param_.FilterMax =
      scope->FindVar(op_desc.Input("FilterMax").front())->GetMutable<Tensor>();

  // Optional inputs. Fixed: the original called
  // op_desc.Input("Bias").front() unconditionally, which is undefined
  // behavior when the fused op carries no bias; guard it like Branch.
  std::vector<std::string> input_arg_names = op_desc.InputArgumentNames();
  if (std::find(input_arg_names.begin(), input_arg_names.end(), "Bias") !=
      input_arg_names.end()) {
    auto bias_args = op_desc.Input("Bias");
    if (!bias_args.empty()) {
      auto* bias = scope->FindVar(bias_args.front());
      if (bias != nullptr) {
        param_.Bias = bias->GetMutable<Tensor>();
      }
    }
  }
  if (std::find(input_arg_names.begin(), input_arg_names.end(), "Branch") !=
      input_arg_names.end()) {
    auto arguments = op_desc.Input("Branch");
    if (arguments.size() > 0) {
      auto arg_var = scope->FindVar(arguments.front());
      if (arg_var != nullptr) {
        param_.Branch =
            const_cast<lite::Tensor*>(&(arg_var->Get<lite::Tensor>()));
      }
    }
  }

  param_.Output =
      scope->FindVar(op_desc.Output("Output").front())->GetMutable<Tensor>();
  param_.OutputMax =
      scope->FindVar(op_desc.Output("OutputMax").front())->GetMutable<Tensor>();

  param_.strides = op_desc.GetAttr<std::vector<int>>("strides");
  auto paddings = op_desc.GetAttr<std::vector<int>>("paddings");
  auto dilations = op_desc.GetAttr<std::vector<int>>("dilations");
  param_.dilations = std::make_shared<std::vector<int>>(dilations);
  param_.groups = op_desc.GetAttr<int>("groups");
  if (op_desc.HasAttr("act_type")) {
    param_.act_type = op_desc.GetAttr<int>("act_type");
  }
  if (op_desc.HasAttr("filter_type")) {
    param_.filter_type = op_desc.GetAttr<std::string>("filter_type");
  } else {
    param_.filter_type = "int16";  // default filter quantization
  }
  if (op_desc.HasAttr("has_input_max") &&
      op_desc.GetAttr<bool>("has_input_max")) {
    CHECK(scope->FindVar(op_desc.Input("InputMax").front()));
    param_.InputMax =
        scope->FindVar(op_desc.Input("InputMax").front())->GetMutable<Tensor>();
  }
  // NOTE(review): padding_algorithm_ is file-scope state shared by every
  // XPUConv2dOp instance; InferShapeImpl() later reads whatever the last
  // AttachImpl stored. It should be a member of XPUConv2dParam.
  if (op_desc.HasAttr("padding_algorithm")) {
    padding_algorithm_ = op_desc.GetAttr<std::string>("padding_algorithm");
  }
  // Expand 2-value paddings [pad_h, pad_w] to the 4-value form
  // [top, bottom, left, right] expected downstream.
  if (paddings.size() == 2L) {
    for (size_t i = 0; i < param_.strides.size(); ++i) {
      int copy_pad = *(paddings.begin() + 2 * i);
      paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
    }
  } else {
    if (paddings.size() != 4L) {
      LOG(FATAL)
          << "Paddings size should be the same or twice as the input size.";
    }
  }
  param_.paddings = std::make_shared<std::vector<int>>(paddings);
  return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(__xpu__conv2d, paddle::lite::operators::XPUConv2dOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
// Fused conv2d op for XPU: conv2d with optional bias, residual branch and
// activation, plus the int16-quantized filter and max-value side tensors.
class XPUConv2dOp : public OpLite {
 public:
  XPUConv2dOp() {}
  explicit XPUConv2dOp(const std::string &op_type) : OpLite(op_type) {}
  bool CheckShape() const override;
  bool InferShapeImpl() const override;
  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
  std::string DebugString() const override { return "XPUConv2d"; }
 private:
  mutable XPUConv2dParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/__xpu__sfa_head_op.h"
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
// Verifies tensors are bound, op_type is set, and -- for the supported
// statistics heads -- that the input is a 3-D tensor.
bool XPUSfaHeadOp::CheckShape() const {
  CHECK_OR_FALSE(param_.input);
  CHECK_OR_FALSE(param_.output);
  CHECK_OR_FALSE(param_.op_type != "");

  const auto input_dims = param_.input->dims();
  const bool is_known_head =
      (param_.op_type == "meanstd") || (param_.op_type == "moment");
  if (is_known_head) {
    CHECK_EQ_OR_FALSE(input_dims.size(), 3UL);
  }
  return true;
}
bool XPUSfaHeadOp::InferShapeImpl() const {
const auto& input_dims = param_.input->dims();
auto op_type = param_.op_type;
// Set output dims
std::vector<DDim::value_type> output_dims(2);
output_dims[0] = input_dims[0];
if (op_type == "meanstd") {
output_dims[1] = 2 * input_dims[1];
} else if (op_type == "moment") {
output_dims[1] = 4 * input_dims[1];
} else {
LOG(FATAL) << "not supported vis op --> " << op_type;
}
param_.output->Resize(output_dims);
// share LoD
param_.output->set_lod(param_.input->lod());
return true;
}
// Binds the Input/Output tensors from the scope and reads the op_type attr.
bool XPUSfaHeadOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
  const auto& in_name = op_desc.Input("Input").front();
  const auto& out_name = op_desc.Output("Output").front();
  auto* in_var = scope->FindVar(in_name);
  auto* out_var = scope->FindVar(out_name);
  CHECK(in_var);
  CHECK(out_var);
  param_.input = in_var->GetMutable<lite::Tensor>();
  param_.output = out_var->GetMutable<lite::Tensor>();
  param_.op_type = op_desc.GetAttr<std::string>("op_type");
  return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(__xpu__sfa_head, paddle::lite::operators::XPUSfaHeadOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
// Op definition for the __xpu__sfa_head statistics head ("meanstd"/"moment").
class XPUSfaHeadOp : public OpLite {
 public:
  XPUSfaHeadOp() {}
  explicit XPUSfaHeadOp(const std::string &op_type) : OpLite(op_type) {}
  bool CheckShape() const override;
  bool InferShapeImpl() const override;
  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
  // Fixed: previously reported "XPUFc" (copy-pasted from the fc op).
  std::string DebugString() const override { return "XPUSfaHead"; }
 private:
  mutable XPUSfaHeadParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -232,6 +232,20 @@ struct PowerParam : ParamBase {
float power{};
};
// Parameters for the pow op: Out = X ^ factor, element-wise.
struct PowParam : ParamBase {
  const lite::Tensor* X{};  // input tensor
  lite::Tensor* Out{};      // output tensor, same shape as X
  float factor{1.};         // exponent attribute; defaults to 1
};
// Parameters for the sign op: Out = sign(X), element-wise.
struct SignParam : ParamBase {
  const lite::Tensor* X{};  // input tensor
  lite::Tensor* Out{};      // output tensor, same shape as X
};
struct ShuffleChannelParam : ParamBase {
const lite::Tensor* X{};
lite::Tensor* Out{};
......@@ -1810,6 +1824,31 @@ struct XPUMmdnnMergeAllParam : ParamBase {
lite::Tensor* out{};
};
// Parameters for the fused __xpu__conv2d op.
struct XPUConv2dParam : ParamBase {
  lite::Tensor* Input{nullptr};      // conv input, NCHW
  lite::Tensor* Filter{nullptr};     // quantized filter (see filter_type)
  lite::Tensor* InputMax{nullptr};   // optional input quantization max
  lite::Tensor* FilterMax{nullptr};  // filter quantization max
  lite::Tensor* Bias{nullptr};       // optional fused bias
  lite::Tensor* Branch{nullptr};     // optional fused residual branch
  lite::Tensor* Output{nullptr};     // conv output
  lite::Tensor* OutputMax{nullptr};  // output max, linkable to the next conv

  int groups{1};
  // Fused activation id; -1 presumably means "no activation" -- confirm
  // against the xdnn conv API.
  int act_type{-1};
  std::string filter_type{""};  // e.g. "int16" (default set in AttachImpl)
  std::vector<int> strides;
  // 4-element paddings [top, bottom, left, right] after AttachImpl expansion.
  std::shared_ptr<std::vector<int>> paddings;
  std::shared_ptr<std::vector<int>> dilations;
};

// Parameters for the __xpu__sfa_head statistics-head op.
struct XPUSfaHeadParam : ParamBase {
  lite::Tensor* input{nullptr};
  lite::Tensor* output{nullptr};
  std::string op_type{""};  // "meanstd" or "moment"
};
// For DeformableConvolution op
struct DeformableConvParam : ParamBase {
lite::Tensor* x{};
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/pow_op.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace operators {
// Validate that both the input and output tensors are attached; returns
// false (via CHECK_OR_FALSE) instead of aborting when one is missing.
bool PowOp::CheckShape() const {
  CHECK_OR_FALSE(param_.X);
  CHECK_OR_FALSE(param_.Out);
  return true;
}
// pow is element-wise, so the output shape simply mirrors the input shape.
bool PowOp::InferShapeImpl() const {
  param_.Out->Resize(param_.X->dims());
  return true;
}
// Resolve the variables named by the op description, wire their tensors
// into param_, and read the "factor" attribute (the exponent).
bool PowOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
  const auto x_name = op_desc.Input("X").front();
  const auto out_name = op_desc.Output("Out").front();
  auto *x_var = scope->FindVar(x_name);
  auto *out_var = scope->FindVar(out_name);
  CHECK(x_var);
  CHECK(out_var);
  param_.factor = op_desc.GetAttr<float>("factor");
  param_.X = x_var->GetMutable<lite::Tensor>();
  param_.Out = out_var->GetMutable<lite::Tensor>();
  CHECK(param_.X);
  CHECK(param_.Out);
  return true;
}
} /* namespace operators */
} /* namespace lite */
} /* namespace paddle */
// Register the operator under the type string "pow" so graph loading can
// instantiate it.
REGISTER_LITE_OP(pow, paddle::lite::operators::PowOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
// Operator wrapper for element-wise pow: Out = X ^ factor.
class PowOp : public OpLite {
 public:
  PowOp() {}

  explicit PowOp(const std::string &op_type) : OpLite(op_type) {}

  // Checks that X and Out are both attached.
  bool CheckShape() const override;

  // Out takes the same shape as X (element-wise op).
  bool InferShapeImpl() const override;

  // Binds X/Out tensors and the "factor" attribute into param_.
  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;

  // Forwards the populated param struct to the selected kernel.
  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }

  std::string DebugString() const override { return "pow"; }

#ifdef LITE_WITH_PROFILE
  // Profiler hook: record tensor shapes and an op count of one per output
  // element.
  void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
    ch->input_shape = ch->DimToStr(param_.X->dims());
    ch->output_shape = ch->DimToStr(param_.Out->dims());
    ch->macs = param_.Out->numel();
  }
#endif

 private:
  mutable PowParam param_;
};
} /* namespace operators */
} /* namespace lite */
} /* namespace paddle */
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/sign_op.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace operators {
// Validate that both the input and output tensors are attached; returns
// false (via CHECK_OR_FALSE) instead of aborting when one is missing.
bool SignOp::CheckShape() const {
  CHECK_OR_FALSE(param_.X);
  CHECK_OR_FALSE(param_.Out);
  return true;
}
// sign is element-wise, so the output shape simply mirrors the input shape.
bool SignOp::InferShapeImpl() const {
  param_.Out->Resize(param_.X->dims());
  return true;
}
// Resolve the variables named by the op description and wire their tensors
// into param_; sign has no attributes to read.
bool SignOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
  const auto x_name = op_desc.Input("X").front();
  const auto out_name = op_desc.Output("Out").front();
  auto *x_var = scope->FindVar(x_name);
  auto *out_var = scope->FindVar(out_name);
  CHECK(x_var);
  CHECK(out_var);
  param_.X = x_var->GetMutable<lite::Tensor>();
  param_.Out = out_var->GetMutable<lite::Tensor>();
  CHECK(param_.X);
  CHECK(param_.Out);
  return true;
}
} /* namespace operators */
} /* namespace lite */
} /* namespace paddle */
// Register the operator under the type string "sign" so graph loading can
// instantiate it.
REGISTER_LITE_OP(sign, paddle::lite::operators::SignOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
// Operator wrapper for element-wise sign of the input tensor.
class SignOp : public OpLite {
 public:
  SignOp() {}

  explicit SignOp(const std::string &op_type) : OpLite(op_type) {}

  // Checks that X and Out are both attached.
  bool CheckShape() const override;

  // Out takes the same shape as X (element-wise op).
  bool InferShapeImpl() const override;

  // Binds the X/Out tensors into param_; sign has no attributes.
  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;

  // Forwards the populated param struct to the selected kernel.
  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }

  std::string DebugString() const override { return "sign"; }

#ifdef LITE_WITH_PROFILE
  // Profiler hook: record tensor shapes and an op count of one per output
  // element.
  void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
    ch->input_shape = ch->DimToStr(param_.X->dims());
    ch->output_shape = ch->DimToStr(param_.Out->dims());
    ch->macs = param_.Out->numel();
  }
#endif

 private:
  mutable SignParam param_;
};
} /* namespace lite */
} /* namespace paddle */
......@@ -135,8 +135,8 @@ TEST(fill_constant_batch_size_like, precision) {
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2; // use fp16 in npu
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#elif defined(LITE_WITH_ARM) || defined(LITE_WITH_X86)
place = TARGET(kHost);
#else
return;
#endif
......
......@@ -174,8 +174,8 @@ TEST(fill_constant, precision) {
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2; // use fp16 in npu
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#elif defined(LITE_WITH_ARM) || defined(LITE_WITH_X86)
place = TARGET(kHost);
#else
return;
#endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册