diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h
index 2a04db1519431cb2608c8f39997581dc3bc63973..cea2a45c5db15891a4de679265a9c2cd2779d0fb 100644
--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -67,3 +67,7 @@ USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
 USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
 USE_MIR_PASS(__xpu__fc_fuse_pass);
 USE_MIR_PASS(__xpu__mmdnn_fuse_pass);
+USE_MIR_PASS(__xpu__conv2d_fuse_pass);
+USE_MIR_PASS(__xpu__conv2d_link_previous_out_max_pass);
+USE_MIR_PASS(__xpu__sfa_head_meanstd_fuse_pass);
+USE_MIR_PASS(__xpu__sfa_head_moment_fuse_pass);
diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt
index dbf9b69d42e5b6abf0640a113d80a74dbb71dff6..0fe572e1f91919d739199163b7ff5c989e6cd519 100644
--- a/lite/core/mir/CMakeLists.txt
+++ b/lite/core/mir/CMakeLists.txt
@@ -30,6 +30,10 @@ lite_cc_library(mir_passes
       fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
       fusion/__xpu__fc_fuse_pass.cc
       fusion/__xpu__mmdnn_fuse_pass.cc
+      fusion/__xpu__conv2d_fuse_pass.cc
+      fusion/__xpu__conv2d_link_previous_out_max_pass.cc
+      fusion/__xpu__sfa_head_meanstd_fuse_pass.cc
+      fusion/__xpu__sfa_head_moment_fuse_pass.cc
       fusion/match_matrix_activation_fuse_pass.cc
       fusion/scales_fuse_pass.cc
       fusion/sequence_reverse_embedding_fuse_pass.cc
diff --git a/lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc b/lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d8e9d9db4664cd717dbc949134e5ef52f52c9b61
--- /dev/null
+++ b/lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc
@@ -0,0 +1,475 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include "lite/backends/xpu/math.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+/* fuse conv2d block in resnet50-like model to xpu_conv2d op    */
+/* For example:                                                 */
+/* graph[1]: sub block                                          */
+/*                     in_Input                                 */
+/*                       |                                      */
+/*                       |                                      */
+/*                     conv2d----in_Filter                      */
+/*                       |                                      */
+/*                       |                                      */
+/*                  batch_norm ------in_Bias                    */
+/*                       |                                      */
+/*                       |                                      */
+/*                     relu                                     */
+/*                       |                                      */
+/*                       |                                      */
+/*                     out_Out                                  */
+/*                                                              */
+/* After the pass is applied:                                   */
+/*                     in_Input                                 */
+/*        in_Filter      |     in_FilterMax                     */
+/*                  \    |    /                                 */
+/*                   \   |   /                                  */
+/*     in_Bias ------- __xpu__conv2d                            */
+/*                       |    \                                 */
+/*                       |     \                                */
+/*                       |      out_OutputMax                   */
+/*                 out_Output                                   */
+/*                                                              */
+/* ------------------------------------------------------       */
+/* graph[2]: sub block                                          */
+/*                     in_Input                                 */
+/*                       |                                      */
+/*                       |                                      */
+/*                     conv2d----in_Filter                      */
+/*                       |                                      */
+/*                       |                                      */
+/*                  batch_norm ------in_Bias                    */
+/*                       |                                      */
+/*                       |                                      */
+/*                     out_Out                                  */
+/*                                                              */
+/* After the pass is applied:                                   */
+/*                     in_Input                                 */
+/*        in_Filter      |     in_FilterMax                     */
+/*                  \    |    /                                 */
+/*                   \   |   /                                  */
+/*     in_Bias ------- __xpu__conv2d                            */
+/*                       |    \                                 */
+/*                       |     \                                */
+/*                       |      out_OutputMax                   */
+/*                     out_Output                               */
+/*                                                              */
+/* ------------------------------------------------------       */
+/* graph[3]: sub block                                          */
+/*                     in_Input                                 */
+/*                       |                                      */
+/*                       |                                      */
+/*                     conv2d----in_Filter                      */
+/*                       |                                      */
+/*                       |                                      */
+/*        in_X       batch_norm ------in_Bias                   */
+/*             \         |                                      */
+/*               \       |                                      */
+/*                elementwise_add                               */
+/*                       |                                      */
+/*                       |                                      */
+/*                     relu                                     */
+/*                       |                                      */
+/*                       |                                      */
+/*                     out_Out                                  */
+/*                                                              */
+/* After the pass is applied:                                   */
+/*                     in_Input                                 */
+/*        in_Filter      |     in_FilterMax                     */
+/*                  \    |    /                                 */
+/*                   \   |   /                                  */
+/*  in_Branch ------- __xpu__conv2d ------ in_Bias              */
+/*                       |    \                                 */
+/*                       |     \                                */
+/*                       |      out_OutputMax                   */
+/*                    out_Output                                */
+
+class XPUConv2dBlock0Fuser : public FuseBase {
+ public:
+  explicit XPUConv2dBlock0Fuser(bool with_relu) : _with_relu(with_relu) {}
+
+  void BuildPattern() override {
+    auto* input =
+        VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput();
+
+    auto* conv_filter = VarNode("conv_filter")
+                            ->assert_is_op_input("conv2d", "Filter")
+                            ->AsInput();
+    auto* conv = OpNode("conv", "conv2d")->AsIntermediate();
+    auto* conv_out = VarNode("conv_out")
+                         ->assert_is_op_output("conv2d", "Output")
+                         ->assert_is_op_input("batch_norm", "X")
+                         ->AsIntermediate();
+    auto* bn_bias =
+        VarNode("bn_bias")->assert_is_op_input("batch_norm", "Bias")->AsInput();
+    auto* bn_mean = VarNode("bn_mean")
+                        ->assert_is_op_input("batch_norm", "Mean")
+                        ->AsIntermediate();
+    auto* bn_scale = VarNode("bn_scale")
+                         ->assert_is_op_input("batch_norm", "Scale")
+                         ->AsIntermediate();
+    auto* bn_var = VarNode("bn_variance")
+                       ->assert_is_op_input("batch_norm", "Variance")
+                       ->AsIntermediate();
+    auto* bn = OpNode("bn", "batch_norm")->AsIntermediate();
+    auto* bn_out = VarNode("bn_out")->assert_is_op_output("batch_norm", "Y");
+    auto* bn_mean_out = VarNode("bn_mean_out")
+                            ->assert_is_op_output("batch_norm", "MeanOut")
+                            ->AsIntermediate();
+    auto* bn_saved_mean = VarNode("bn_saved_mean")
+                              ->assert_is_op_output("batch_norm", "SavedMean")
+                              ->AsIntermediate();
+    auto* bn_var_out = VarNode("bn_var_out")
+                           ->assert_is_op_output("batch_norm", "VarianceOut")
+                           ->AsIntermediate();
+    auto* bn_saved_var =
+        VarNode("bn_saved_var")
+            ->assert_is_op_output("batch_norm", "SavedVariance")
+            ->AsIntermediate();
+
+    *input >> *conv >> *conv_out >> *bn >> *bn_out;
+
+    *conv_filter >> *conv;
+    *bn_bias >> *bn;
+    *bn_mean >> *bn;
+    *bn_scale >> *bn;
+    *bn_var >> *bn;
+    *bn >> *bn_mean_out;
+    *bn >> *bn_saved_mean;
+    *bn >> *bn_saved_var;
+    *bn >> *bn_var_out;
+
+    if (_with_relu) {
+      bn_out->assert_is_op_input("relu", "X")->AsIntermediate();
+      auto* relu = OpNode("relu", "relu")->AsIntermediate();
+      auto* relu_out =
+          VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput();
+
+      *bn_out >> *relu >> *relu_out;
+    } else {
+      bn_out->AsOutput();
+    }
+  }
+
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto op_desc = *matched.at("conv")->stmt()->op_info();
+    auto conv_old = matched.at("conv")->stmt()->op();
+    auto* scope = conv_old->scope();
+    op_desc.mutable_inputs()->clear();
+    op_desc.mutable_outputs()->clear();
+    op_desc.SetType("__xpu__conv2d");
+    std::string input_name = matched.at("input")->arg()->name;
+    op_desc.SetInput("Input", {input_name});
+
+    auto filter_name = matched.at("conv_filter")->arg()->name;
+    auto scale_name = matched.at("bn_scale")->arg()->name;
+    auto bias_name = matched.at("bn_bias")->arg()->name;
+    auto mean_name = matched.at("bn_mean")->arg()->name;
+    auto var_name = matched.at("bn_variance")->arg()->name;
+
+    auto* filter_t = scope->FindMutableTensor(filter_name);
+    auto* scale_t = scope->FindMutableTensor(scale_name);
+    auto* bias_t = scope->FindMutableTensor(bias_name);
+    auto* mean_t = scope->FindMutableTensor(mean_name);
+    auto* var_t = scope->FindMutableTensor(var_name);
+
+    int mean_len = mean_t->numel();
+    int filter_len = filter_t->numel();
+    int filter_stride = filter_len / mean_len;
+
+    float* filter_on_host = filter_t->mutable_data<float>();
+    float* scale_on_host = scale_t->mutable_data<float>();
+    float* bias_on_host = bias_t->mutable_data<float>();
+    float* mean_on_host = mean_t->mutable_data<float>();
+    float* var_on_host = var_t->mutable_data<float>();
+
+    // Perform preprocess
+    for (int i = 0; i < mean_len; ++i) {
+      scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f);
+    }
+    for (int i = 0; i < mean_len; ++i) {
+      for (int j = 0; j < filter_stride; ++j) {
+        filter_on_host[i * filter_stride + j] *= scale_on_host[i];
+      }
+    }
+    for (int i = 0; i < mean_len; ++i) {
+      bias_on_host[i] += -mean_on_host[i] * scale_on_host[i];
+    }
+
+    float max_f =
+        paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len);
+    std::unique_ptr<int16_t[]> filter_int16(new int16_t[filter_len]);
+    paddle::lite::xpu::math::ConvertFP32ToInt16(
+        filter_on_host, filter_int16.get(), max_f, filter_len);
+    memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t));
+
+    // create new arg in graph and scope
+    std::string max_filter_name = filter_name + "_max";
+    auto* max_filter_node = graph->NewArgumentNode(max_filter_name);
+    max_filter_node->arg()->is_weight = true;
+    max_filter_node->arg()->type = LiteType::GetTensorTy(
+        TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+
+    auto* max_filter_t = scope->NewTensor(max_filter_name);
+    max_filter_t->Resize({4});
+    float* max_ptr = max_filter_t->mutable_data<float>();
+    max_ptr[0] = max_f;
+    max_ptr[1] = max_f;
+    max_ptr[2] = max_f;
+    max_ptr[3] = max_f;
+
+    op_desc.SetInput("Filter", {filter_name});
+    op_desc.SetInput("Bias", {bias_name});
+    op_desc.SetInput("FilterMax", {max_filter_name});
+
+    std::string output_name = "";
+    if (_with_relu) {
+      output_name = matched.at("relu_out")->arg()->name;
+    } else {
+      output_name = matched.at("bn_out")->arg()->name;
+    }
+    op_desc.SetOutput("Output", {output_name});
+
+    // add new arg output_max
+    std::string max_output_name = output_name + "_max";
+    auto* max_output_node = graph->NewArgumentNode(max_output_name);
+    max_output_node->arg()->type = LiteType::GetTensorTy(
+        TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
+    scope->NewTensor(max_output_name);
+    op_desc.SetOutput("OutputMax", {max_output_name});
+
+    auto conv_op = LiteOpRegistry::Global().Create("__xpu__conv2d");
+    auto& valid_places = conv_old->valid_places();
+    conv_op->Attach(op_desc, scope);
+    auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places);
+    DirectedLink(matched.at("input"), new_op_node);
+    DirectedLink(matched.at("conv_filter"), new_op_node);
+    DirectedLink(matched.at("bn_bias"), new_op_node);
+    DirectedLink(max_filter_node, new_op_node);
+    DirectedLink(new_op_node, max_output_node);
+    if (_with_relu) {
+      DirectedLink(new_op_node, matched.at("relu_out"));
+    } else {
+      DirectedLink(new_op_node, matched.at("bn_out"));
+    }
+  }
+
+ private:
+  bool _with_relu;
+};
+
+// block with branch
+class XPUConv2dBlock1Fuser : public FuseBase {
+ public:
+  XPUConv2dBlock1Fuser() {}
+
+  void BuildPattern() override {
+    auto* input =
+        VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput();
+
+    auto* conv_filter = VarNode("conv_filter")
+                            ->assert_is_op_input("conv2d", "Filter")
+                            ->AsInput();
+    auto* conv = OpNode("conv", "conv2d")->AsIntermediate();
+    auto* conv_out = VarNode("conv_out")
+                         ->assert_is_op_output("conv2d", "Output")
+                         ->assert_is_op_input("batch_norm", "X")
+                         ->AsIntermediate();
+    auto* bn_bias =
+        VarNode("bn_bias")->assert_is_op_input("batch_norm", "Bias")->AsInput();
+    auto* bn_mean = VarNode("bn_mean")
+                        ->assert_is_op_input("batch_norm", "Mean")
+                        ->AsIntermediate();
+    auto* bn_scale = VarNode("bn_scale")
+                         ->assert_is_op_input("batch_norm", "Scale")
+                         ->AsIntermediate();
+    auto* bn_var = VarNode("bn_variance")
+                       ->assert_is_op_input("batch_norm", "Variance")
+                       ->AsIntermediate();
+    auto* bn = OpNode("bn", "batch_norm")->AsIntermediate();
+    auto* bn_out = VarNode("bn_out")
+                       ->assert_is_op_output("batch_norm", "Y")
+                       ->assert_is_op_input("elementwise_add", "Y")
+                       ->AsIntermediate();
+    auto* bn_mean_out = VarNode("bn_mean_out")
+                            ->assert_is_op_output("batch_norm", "MeanOut")
+                            ->AsIntermediate();
+    auto* bn_saved_mean = VarNode("bn_saved_mean")
+                              ->assert_is_op_output("batch_norm", "SavedMean")
+                              ->AsIntermediate();
+    auto* bn_var_out = VarNode("bn_var_out")
+                           ->assert_is_op_output("batch_norm", "VarianceOut")
+                           ->AsIntermediate();
+    auto* bn_saved_var =
+        VarNode("bn_saved_var")
+            ->assert_is_op_output("batch_norm", "SavedVariance")
+            ->AsIntermediate();
+    auto* ew_x =
+        VarNode("ew_x")->assert_is_op_input("elementwise_add", "X")->AsInput();
+    auto* ew_add = OpNode("ew_add", "elementwise_add")->AsIntermediate();
+    auto* ew_out = VarNode("ew_out")
+                       ->assert_is_op_output("elementwise_add", "Out")
+                       ->assert_is_op_input("relu", "X")
+                       ->AsIntermediate();
+    auto* relu = OpNode("relu", "relu")->AsIntermediate();
+    auto* relu_out =
+        VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput();
+
+    *input >> *conv >> *conv_out >> *bn >> *bn_out >> *ew_add >> *ew_out >>
+        *relu >> *relu_out;
+
+    *conv_filter >> *conv;
+    *bn_bias >> *bn;
+    *bn_mean >> *bn;
+    *bn_scale >> *bn;
+    *bn_var >> *bn;
+    *bn >> *bn_mean_out;
+    *bn >> *bn_saved_mean;
+    *bn >> *bn_saved_var;
+    *bn >> *bn_var_out;
+
+    *ew_x >> *ew_add;
+  }
+
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto op_desc = *matched.at("conv")->stmt()->op_info();
+    auto conv_old = matched.at("conv")->stmt()->op();
+    auto* scope = conv_old->scope();
+    op_desc.mutable_inputs()->clear();
+    op_desc.mutable_outputs()->clear();
+    op_desc.SetType("__xpu__conv2d");
+    std::string input_name = matched.at("input")->arg()->name;
+    op_desc.SetInput("Input", {input_name});
+
+    auto filter_name = matched.at("conv_filter")->arg()->name;
+    auto scale_name = matched.at("bn_scale")->arg()->name;
+    auto bias_name = matched.at("bn_bias")->arg()->name;
+    auto mean_name = matched.at("bn_mean")->arg()->name;
+    auto var_name = matched.at("bn_variance")->arg()->name;
+
+    auto* filter_t = scope->FindMutableTensor(filter_name);
+    auto* scale_t = scope->FindMutableTensor(scale_name);
+    auto* bias_t = scope->FindMutableTensor(bias_name);
+    auto* mean_t = scope->FindMutableTensor(mean_name);
+    auto* var_t = scope->FindMutableTensor(var_name);
+
+    int mean_len = mean_t->numel();
+    int filter_len = filter_t->numel();
+    int filter_stride = filter_len / mean_len;
+
+    float* filter_on_host = filter_t->mutable_data<float>();
+    float* scale_on_host = scale_t->mutable_data<float>();
+    float* bias_on_host = bias_t->mutable_data<float>();
+    float* mean_on_host = mean_t->mutable_data<float>();
+    float* var_on_host = var_t->mutable_data<float>();
+
+    // Perform preprocess
+    for (int i = 0; i < mean_len; ++i) {
+      scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f);
+    }
+    for (int i = 0; i < mean_len; ++i) {
+      for (int j = 0; j < filter_stride; ++j) {
+        filter_on_host[i * filter_stride + j] *= scale_on_host[i];
+      }
+    }
+    for (int i = 0; i < mean_len; ++i) {
+      bias_on_host[i] += -mean_on_host[i] * scale_on_host[i];
+    }
+
+    float max_f =
+        paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len);
+    std::unique_ptr<int16_t[]> filter_int16(new int16_t[filter_len]);
+    paddle::lite::xpu::math::ConvertFP32ToInt16(
+        filter_on_host, filter_int16.get(), max_f, filter_len);
+    memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t));
+
+    // create new arg in graph and scope
+    std::string max_filter_name = filter_name + "_max";
+    auto* max_filter_node = graph->NewArgumentNode(max_filter_name);
+    max_filter_node->arg()->is_weight = true;
+    max_filter_node->arg()->type = LiteType::GetTensorTy(
+        TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+
+    auto* max_filter_t = scope->NewTensor(max_filter_name);
+    max_filter_t->Resize({4});
+    float* max_ptr = max_filter_t->mutable_data<float>();
+    max_ptr[0] = max_f;
+    max_ptr[1] = max_f;
+    max_ptr[2] = max_f;
+    max_ptr[3] = max_f;
+
+    op_desc.SetInput("Filter", {filter_name});
+    op_desc.SetInput("Bias", {bias_name});
+    op_desc.SetInput("FilterMax", {max_filter_name});
+    op_desc.SetInput("Branch", {matched.at("ew_x")->arg()->name});
+
+    std::string output_name = matched.at("relu_out")->arg()->name;
+    op_desc.SetOutput("Output", {output_name});
+
+    // add new arg output_max
+    std::string max_output_name = output_name + "_max";
+    auto* max_output_node = graph->NewArgumentNode(max_output_name);
+    max_output_node->arg()->type = LiteType::GetTensorTy(
+        TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
+    scope->NewTensor(max_output_name);
+    op_desc.SetOutput("OutputMax", {max_output_name});
+
+    auto conv_op = LiteOpRegistry::Global().Create("__xpu__conv2d");
+    auto& valid_places = conv_old->valid_places();
+    conv_op->Attach(op_desc, scope);
+    auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places);
+    DirectedLink(matched.at("input"), new_op_node);
+    DirectedLink(matched.at("conv_filter"), new_op_node);
+    DirectedLink(matched.at("bn_bias"), new_op_node);
+    DirectedLink(matched.at("ew_x"), new_op_node);
+    DirectedLink(max_filter_node, new_op_node);
+    DirectedLink(new_op_node, matched.at("relu_out"));
+    DirectedLink(new_op_node, max_output_node);
+  }
+};
+
+}  // namespace fusion
+
+class XPUConv2dFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
+
+    fusion::XPUConv2dBlock1Fuser fuser; /* branch fuse */
+    fuser(graph.get());
+
+    fusion::XPUConv2dBlock0Fuser fuser1(true /* with_relu */);
+    fuser1(graph.get());
+
+    fusion::XPUConv2dBlock0Fuser fuser2(false /* with_relu */);
+    fuser2(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__conv2d_fuse_pass, paddle::lite::mir::XPUConv2dFusePass)
+    .BindTargets({TARGET(kXPU)})
+    .BindKernel("conv2d");
diff --git a/lite/core/mir/fusion/__xpu__conv2d_link_previous_out_max_pass.cc b/lite/core/mir/fusion/__xpu__conv2d_link_previous_out_max_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d294f1f2f7cb440bad79035353989711a59f89d2
--- /dev/null
+++ b/lite/core/mir/fusion/__xpu__conv2d_link_previous_out_max_pass.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include "lite/backends/xpu/math.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+/* link the previous __xpu__conv2d's OutputMax to   */
+/* next __xpu__conv2d as InputMax                   */
+/* For example:                                     */
+/* graph[1]: sub block                              */
+/*                     in_Input                     */
+/*        in_Filter      |     in_FilterMax         */
+/*                  \    |    /                     */
+/*                   \   |   /                      */
+/*     in_Bias ------- __xpu__conv2d                */
+/*                       |      \                   */
+/*                       |       \                  */
+/*                out_Output      out_OutputMax     */
+/*                       |                          */
+/*                       |                          */
+/*                    __xpu__conv2d                 */
+/*                       |                          */
+/*                       |                          */
+/*                     out_Output                   */
+/*                                                  */
+/* After the pass is applied:                       */
+/*                     in_Input                     */
+/*        in_Filter      |     in_FilterMax         */
+/*                  \    |    /                     */
+/*                   \   |   /                      */
+/*     in_Bias ------- __xpu__conv2d                */
+/*                       |      \                   */
+/*                       |       \                  */
+/*                out_Output      out_OutputMax     */
+/*                       |       /                  */
+/*                       |      /                   */
+/*                    __xpu__conv2d                 */
+/*                       |                          */
+/*                       |                          */
+/*                     out_Output                   */
+
+class XPUConv2dLinkFuser : public FuseBase {
+ public:
+  explicit XPUConv2dLinkFuser(bool with_branch) : _with_branch(with_branch) {}
+
+  void BuildPattern() override {
+    auto* input = VarNode("input")
+                      ->assert_is_op_input("__xpu__conv2d", "Input")
+                      ->AsInput();
+    auto* filter = VarNode("filter")
+                       ->assert_is_op_input("__xpu__conv2d", "Filter")
+                       ->AsInput();
+    auto* filter_max = VarNode("filter_max")
+                           ->assert_is_op_input("__xpu__conv2d", "FilterMax")
+                           ->AsInput();
+    auto* bias =
+        VarNode("bias")->assert_is_op_input("__xpu__conv2d", "Bias")->AsInput();
+    auto* xpu_conv = OpNode("xpu_conv", "__xpu__conv2d");
+    auto* xpu_conv_out = VarNode("xpu_conv_out")
+                             ->assert_is_op_output("__xpu__conv2d", "Output")
+                             ->AsOutput();
+    auto* xpu_conv_out_max =
+        VarNode("xpu_conv_out_max")
+            ->assert_is_op_output("__xpu__conv2d", "OutputMax")
+            ->AsOutput();
+
+    *input >> *xpu_conv >> *xpu_conv_out;
+    *filter >> *xpu_conv;
+    *filter_max >> *xpu_conv;
+    *bias >> *xpu_conv;
+    *xpu_conv >> *xpu_conv_out_max;
+
+    if (_with_branch) {
+      auto* branch = VarNode("branch")
+                         ->assert_is_op_input("__xpu__conv2d", "Branch")
+                         ->AsInput();
+      *branch >> *xpu_conv;
+    }
+  }
+
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto conv_instruct = matched.at("xpu_conv")->stmt();
+    auto op_desc = *conv_instruct->mutable_op_info();
+    auto conv_old = conv_instruct->op();
+
+    // try to find input_max
+    std::string max_input_name = matched.at("input")->arg()->name + "_max";
+    auto* max_input_node = graph->RetrieveArgument(max_input_name);
+    if (max_input_node != nullptr &&
+        (!op_desc.HasAttr("has_input_max") ||
+         !op_desc.GetAttr<bool>("has_input_max"))) {
+      op_desc.SetInput("InputMax", {max_input_name});
+      op_desc.SetAttr("has_input_max", true);
+      conv_instruct->ResetOp(op_desc, conv_old->valid_places());
+      DirectedLink(max_input_node, matched.at("xpu_conv"));
+    }
+  }
+
+ private:
+  bool _with_branch;
+};
+
+}  // namespace fusion
+
+class XPUConv2dLinkPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
+
+    fusion::XPUConv2dLinkFuser fuser1(true);
+    fuser1(graph.get());
+
+    // TODO(sunsetlh): need fix bug in no branch case
+    fusion::XPUConv2dLinkFuser fuser2(false);
+    fuser2(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__conv2d_link_previous_out_max_pass,
+                  paddle::lite::mir::XPUConv2dLinkPass)
+    .BindTargets({TARGET(kXPU)})
+    .BindKernel("__xpu__conv2d");
diff --git a/lite/core/mir/fusion/__xpu__sfa_head_meanstd_fuse_pass.cc b/lite/core/mir/fusion/__xpu__sfa_head_meanstd_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0af60e7fec36e8e21c0a59e30f562821c04f8978
--- /dev/null
+++ b/lite/core/mir/fusion/__xpu__sfa_head_meanstd_fuse_pass.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include "lite/backends/xpu/math.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+// Special fuse pass for the subgraph block in vis clarity model
+// block desc:
+//  [["reduce_mean",
+//      ["concat"],
+//      ["elementwise_sub",
+//          ["square", ["reduce_sum", ["scale", ["sqrt"]]]]]]]
+
+class XPUSfaHeadMeanstdFuser : public FuseBase {
+ public:
+  void BuildPattern() override {
+    auto* reduce_mean_input = VarNode("reduce_mean_input")
+                                  ->assert_is_op_output("reshape2", "Out")
+                                  ->assert_is_op_input("reduce_mean", "X")
+                                  ->AsInput();
+    auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate();
+    auto* reduce_mean_out = VarNode("reduce_mean_out")
+                                ->assert_is_op_output("reduce_mean", "Out")
+                                ->assert_is_op_nth_input("concat", "X", 0)
+                                ->assert_is_op_input("elementwise_sub", "Y")
+                                ->AsIntermediate();
+    auto* elementwise_sub =
+        OpNode("elementwise_sub", "elementwise_sub")->AsIntermediate();
+    auto* elementwise_sub_out =
+        VarNode("elementwise_sub_out")
+            ->assert_is_op_output("elementwise_sub", "Out")
+            ->assert_is_op_input("square", "X")
+            ->AsIntermediate();
+    auto* square = OpNode("square", "square")->AsIntermediate();
+    auto* square_out = VarNode("square_out")
+                           ->assert_is_op_output("square", "Out")
+                           ->assert_is_op_input("reduce_sum", "X")
+                           ->AsIntermediate();
+    auto* reduce_sum = OpNode("reduce_sum", "reduce_sum")->AsIntermediate();
+    auto* reduce_sum_out = VarNode("reduce_sum_out")
+                               ->assert_is_op_output("reduce_sum", "Out")
+                               ->assert_is_op_input("elementwise_div", "X")
+                               ->AsIntermediate();
+    auto* fill_constant =
+        OpNode("fill_constant", "fill_constant")->AsIntermediate();
+    auto* fill_constant_out = VarNode("fill_constant_out")
+                                  ->assert_is_op_output("fill_constant", "Out")
+                                  ->AsIntermediate();
+    auto* elementwise_div =
+        OpNode("elementwise_div", "elementwise_div")->AsIntermediate();
+    auto* elementwise_div_out =
+        VarNode("elementwise_div_out")
+            ->assert_is_op_output("elementwise_div", "Out")
+            ->assert_is_op_input("sqrt", "X")
+            ->AsIntermediate();
+    auto* sqrt = OpNode("sqrt", "sqrt")->AsIntermediate();
+    auto* sqrt_out = VarNode("sqrt_out")
+                         ->assert_is_op_output("sqrt", "Out")
+                         ->assert_is_op_nth_input("concat", "X", 1)
+                         ->AsIntermediate();
+    auto* concat = OpNode("concat", "concat")->AsIntermediate();
+    auto* out =
+        VarNode("out")->assert_is_op_output("concat", "Out")->AsOutput();
+
+    std::vector<PMNode*> elementwise_sub_inputs{reduce_mean_out,
+                                                reduce_mean_input};
+    std::vector<PMNode*> elementwise_div_inputs{reduce_sum_out,
+                                                fill_constant_out};
+    std::vector<PMNode*> concat_inputs{reduce_mean_out, sqrt_out};
+    *reduce_mean_input >> *reduce_mean >> *reduce_mean_out;
+    elementwise_sub_inputs >> *elementwise_sub >> *elementwise_sub_out;
+    *elementwise_sub_out >> *square >> *square_out;
+    *square_out >> *reduce_sum >> *reduce_sum_out;
+    *fill_constant >> *fill_constant_out;
+    elementwise_div_inputs >> *elementwise_div >> *elementwise_div_out;
+    *elementwise_div_out >> *sqrt >> *sqrt_out;
+    concat_inputs >> *concat >> *out;
+  }
+
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto reduce_mean = matched.at("reduce_mean")->stmt()->op();
+    auto* scope = reduce_mean->scope();
+    auto op_desc = GenOpDesc(matched);
+    auto vis_op = LiteOpRegistry::Global().Create("__xpu__sfa_head");
+    auto& valid_places = reduce_mean->valid_places();
+    vis_op->Attach(op_desc, scope);
+    auto* new_op_node = graph->GraphCreateInstructNode(vis_op, valid_places);
+
+    IR_NODE_LINK_TO(matched.at("reduce_mean_input"), new_op_node);
+    IR_NODE_LINK_TO(new_op_node, matched.at("out"));
+  }
+
+ private:
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
+    cpp::OpDesc op_desc = *matched.at("reduce_mean")->stmt()->op_info();
+    op_desc.mutable_inputs()->clear();
+    op_desc.mutable_outputs()->clear();
+    op_desc.SetType("__xpu__sfa_head");
+    op_desc.SetInput("Input", {matched.at("reduce_mean_input")->arg()->name});
+    op_desc.SetOutput("Output", {matched.at("out")->arg()->name});
+    op_desc.SetAttr("op_type", std::string("meanstd"));
+    return op_desc;
+  }
+};
+
+}  // namespace fusion
+
+class XPUSfaHeadMeanstdFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) {
+      return;
+    }
+
+    fusion::XPUSfaHeadMeanstdFuser fuser;
+    fuser(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__sfa_head_meanstd_fuse_pass,
+                  paddle::lite::mir::XPUSfaHeadMeanstdFusePass)
+    .BindTargets({TARGET(kXPU)})
+    .BindKernel("reduce_mean");
diff --git a/lite/core/mir/fusion/__xpu__sfa_head_moment_fuse_pass.cc b/lite/core/mir/fusion/__xpu__sfa_head_moment_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6daf660500e36ef81640a77776573a3fb93ab5c9
--- /dev/null
+++ b/lite/core/mir/fusion/__xpu__sfa_head_moment_fuse_pass.cc
@@ -0,0 +1,253 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include "lite/backends/xpu/math.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+// Special fuse pass for the subgraph block in vis clarity model
+// block desc:
+//  [["reduce_mean",
+//  ["concat"],
+//  ["elementwise_sub",
+//      ["square", ["reduce_mean", ["sqrt"]]],
+//      ["abs", ["pow", ["elementwise_mul", ["reduce_mean", ["abs",
+//      ["pow"]]]]]],
+//      ["sign"],
+//      ["abs", ["pow", ["reduce_mean", ["abs", ["pow"]]]]]]]]
+
+class XPUSfaHeadMomentFuser : public FuseBase {
+ public:
+  void BuildPattern() override {
+    auto* reduce_mean_input = VarNode("reduce_mean_input")
+                                  ->assert_is_op_output("reshape2", "Out")
+                                  ->assert_is_op_input("reduce_mean", "X")
+                                  ->assert_is_op_input("elementwise_sub", "X")
+                                  ->AsInput();
+    auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate();
+
+    auto* reduce_mean_out = VarNode("reduce_mean_out")
+                                ->assert_is_op_output("reduce_mean", "Out")
+                                ->assert_is_op_nth_input("concat", "X", 0)
+                                ->assert_is_op_input("elementwise_sub", "Y")
+                                ->AsIntermediate();
+
+    auto* elementwise_sub =
+        OpNode("elementwise_sub", "elementwise_sub")->AsIntermediate();
+    auto* elementwise_sub_out =
+        VarNode("elementwise_sub_out")
+            ->assert_is_op_output("elementwise_sub", "Out")
+            ->assert_is_op_input("square", "X")
+            ->assert_is_op_input("abs", "X")
+            ->assert_is_op_input("sign", "X")
+            ->AsIntermediate();
+
+    auto* square = OpNode("square", "square")->AsIntermediate();
+
+    auto* square_out = VarNode("square_out")
+                           ->assert_is_op_output("square", "Out")
+                           ->assert_is_op_input("reduce_mean", "X")
+                           ->AsIntermediate();
+    auto* reduce_mean_es =
+        OpNode("es_reduce_mean", "reduce_mean")->AsIntermediate();
+    auto* reduce_mean_out_es = VarNode("reduce_mean_out_es")
+                                   ->assert_is_op_output("reduce_mean", "Out")
+                                   ->assert_is_op_input("sqrt", "X")
+                                   ->AsIntermediate();
+    auto* sqrt = OpNode("sqrt", "sqrt")->AsIntermediate();
+    auto* sqrt_out = VarNode("sqrt_out")
+                         ->assert_is_op_output("sqrt", "Out")
+                         ->assert_is_op_nth_input("concat", "X", 1)
+                         ->AsIntermediate();
+    auto* concat = OpNode("concat", "concat")->AsIntermediate();
+    auto* out =
+        VarNode("out")->assert_is_op_output("concat", "Out")->AsOutput();
+
+    auto* abs_e2 = OpNode("e2_abs", "abs")->AsIntermediate();
+    auto* abs_e2_out = VarNode("abs_e2_out")
+                           ->assert_is_op_input("pow", "X")
+                           ->assert_is_op_output("abs", "Out")
+                           ->AsIntermediate();
+
+    auto* pow_e2 = OpNode("e2_pow", "pow")->AsIntermediate();
+    auto* pow_e2_out = VarNode("pow_e2_out")
+                           ->assert_is_op_input("elementwise_mul", "X")
+                           ->assert_is_op_output("pow", "Out")
+                           ->AsIntermediate();
+
+    auto* sign_e3 = OpNode("e3_sign", "sign")->AsIntermediate();
+    auto* sign_e3_out = VarNode("sign_e3_out")
+                            ->assert_is_op_input("elementwise_mul", "Y")
+                            ->assert_is_op_output("sign", "Out")
+                            ->AsIntermediate();
+
+    auto* elementwise_mul_top =
+        OpNode("elementwise_mul_top", "elementwise_mul")->AsIntermediate();
+    auto* elementwise_mul_top_out =
+        VarNode("elementwise_mul_top_out")
+            ->assert_is_op_input("reduce_mean", "X")
+            ->assert_is_op_output("elementwise_mul", "Out")
+            ->AsIntermediate();
+    auto* reduce_mean_e2 =
+        OpNode("reduce_mean_e2", "reduce_mean")->AsIntermediate();
+    auto* reduce_mean_e2_out = VarNode("reduce_mean_e2_out")
+                                   ->assert_is_op_input("abs", "X")
+                                   ->assert_is_op_input("sign", "X")
+                                   ->assert_is_op_output("reduce_mean", "Out")
+                                   ->AsIntermediate();
+    auto* abs_e2_2 = OpNode("abs_e2_2", "abs")->AsIntermediate();
+    auto* abs_e2_2_out = VarNode("abs_e2_2_out")
+                             ->assert_is_op_input("pow", "X")
+                             ->assert_is_op_output("abs", "Out")
+                             ->AsIntermediate();
+    auto* pow_e2_2 = OpNode("pow_e2_2", "pow")->AsIntermediate();
+    auto* pow_e2_2_out = VarNode("pow_e2_2_out")
+                             ->assert_is_op_nth_input("elementwise_mul", "X", 0)
+                             ->assert_is_op_output("pow", "Out")
+                             ->AsIntermediate();
+    auto* sign_e3_2 = OpNode("sign_e3_2", "sign")->AsIntermediate();
+    auto* sign_e3_2_out = VarNode("sign_e3_2_out")
+                              ->assert_is_op_input("elementwise_mul", "Y")
+                              ->assert_is_op_output("sign", "Out")
+                              ->AsIntermediate();
+    auto* elementwise_mul_bottom =
+        OpNode("elementwise_mul_bottom", "elementwise_mul")->AsIntermediate();
+    auto* elementwise_mul_bottom_out =
+        VarNode("elementwise_mul_bottom_out")
+            ->assert_is_op_output("elementwise_mul", "Out")
+            ->assert_is_op_nth_input("concat", "X", 2)
+            ->AsIntermediate();
+
+    // e4
+    auto* abs_e_4 = OpNode("abs_e_4", "abs")->AsIntermediate();
+    auto* abs_e_4_out = VarNode("abs_e_4_out")
+                            ->assert_is_op_output("abs", "Out")
+                            ->assert_is_op_input("pow", "X")
+                            ->AsIntermediate();
+    auto* pow_e_4 = OpNode("pow_e_4", "pow")->AsIntermediate();
+    auto* pow_e_4_out = VarNode("pow_e_4_out")
+                            ->assert_is_op_output("pow", "Out")
+                            ->assert_is_op_input("reduce_mean", "X")
+                            ->AsIntermediate();
+    auto* reduce_mean_4 = OpNode("reduce_mean_4")->AsIntermediate();
+    auto* reduce_mean_4_out = VarNode("reduce_mean_4_out")
+                                  ->assert_is_op_output("reduce_mean", "Out")
+                                  ->assert_is_op_input("abs", "X")
+                                  ->AsIntermediate();
+
+    auto* abs_e_4_2 = OpNode("abs_e_4_2", "abs")->AsIntermediate();
+    auto* abs_e_4_2_out = VarNode("abs_e_4_2_out")
+                              ->assert_is_op_output("abs", "Out")
+                              ->assert_is_op_input("pow", "X")
+                              ->AsIntermediate();
+
+    auto* pow_e_4_2 = OpNode("pow_e_4_2", "pow")->AsIntermediate();
+    auto* pow_e_4_2_out = VarNode("pow_e_4_2_out")
+                              ->assert_is_op_output("pow", "Out")
+                              ->assert_is_op_nth_input("concat", "X", 3)
+                              ->AsIntermediate();
+
+    std::vector<PMNode*> elementwise_sub_inputs{reduce_mean_input,
+                                                reduce_mean_out};
+
+    *reduce_mean_input >> *reduce_mean >> *reduce_mean_out;
+    elementwise_sub_inputs >> *elementwise_sub >> *elementwise_sub_out;
+    *elementwise_sub_out >> *square >> *square_out;
+    *square_out >> *reduce_mean_es >> *reduce_mean_out_es;
+    *reduce_mean_out_es >> *sqrt >> *sqrt_out;
+
+    *elementwise_sub_out >> *sign_e3 >> *sign_e3_out;
+
+    std::vector<PMNode*> elementwise_mul_top_inputs{pow_e2_out, sign_e3_out};
+    *elementwise_sub_out >> *abs_e2 >> *abs_e2_out;
+    *abs_e2_out >> *pow_e2 >> *pow_e2_out;
+    elementwise_mul_top_inputs >> *elementwise_mul_top >>
+        *elementwise_mul_top_out;
+
+    *elementwise_mul_top_out >> *reduce_mean_e2 >> *reduce_mean_e2_out;
+    *reduce_mean_e2_out >> *abs_e2_2 >> *abs_e2_2_out;
+    *abs_e2_2_out >> *pow_e2_2 >> *pow_e2_2_out;
+
+    *reduce_mean_e2_out >> *sign_e3_2 >> *sign_e3_2_out;
+
+    std::vector<PMNode*> elementwise_mul_bottom_inputs{pow_e2_2_out,
+                                                       sign_e3_2_out};
+    elementwise_mul_bottom_inputs >> *elementwise_mul_bottom >>
+        *elementwise_mul_bottom_out;
+
+    *elementwise_sub_out >> *abs_e_4 >> *abs_e_4_out;
+    *abs_e_4_out >> *pow_e_4 >> *pow_e_4_out;
+    *pow_e_4_out >> *reduce_mean_4 >> *reduce_mean_4_out;
+    *reduce_mean_4_out >> *abs_e_4_2 >> *abs_e_4_2_out;
+    *abs_e_4_2_out >> *pow_e_4_2 >> *pow_e_4_2_out;
+
+    std::vector<PMNode*> concat_inputs{
+        reduce_mean_out, sqrt_out, elementwise_mul_bottom_out, pow_e_4_2_out};
+    concat_inputs >> *concat >> *out;
+  }
+
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto reduce_mean = matched.at("reduce_mean")->stmt()->op();
+    auto* scope = reduce_mean->scope();
+    auto op_desc = GenOpDesc(matched);
+    auto vis_op = LiteOpRegistry::Global().Create("__xpu__sfa_head");
+    auto& valid_places = reduce_mean->valid_places();
+    vis_op->Attach(op_desc, scope);
+    auto* new_op_node = graph->GraphCreateInstructNode(vis_op, valid_places);
+
+    IR_NODE_LINK_TO(matched.at("reduce_mean_input"), new_op_node);
+    IR_NODE_LINK_TO(new_op_node, matched.at("out"));
+  }
+
+ private:
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
+    cpp::OpDesc op_desc = *matched.at("reduce_mean")->stmt()->op_info();
+    op_desc.mutable_inputs()->clear();
+    op_desc.mutable_outputs()->clear();
+    op_desc.SetType("__xpu__sfa_head");
+    op_desc.SetInput("Input", {matched.at("reduce_mean_input")->arg()->name});
+    op_desc.SetOutput("Output", {matched.at("out")->arg()->name});
+    op_desc.SetAttr("op_type", std::string("moment"));
+    return op_desc;
+  }
+};
+
+}  // namespace fusion
+
+class XPUSfaHeadMomentFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) {
+      return;
+    }
+
+    fusion::XPUSfaHeadMomentFuser fuser;
+    fuser(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__sfa_head_moment_fuse_pass,
+                  paddle::lite::mir::XPUSfaHeadMomentFusePass)
+    .BindTargets({TARGET(kXPU)})
+    .BindKernel("reduce_mean");
diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc
index 98b1597b49b9a7e151c86d11843e45163890191a..c68bd1161457eca2e7f280e895b8e5aee2498fc8 100644
--- a/lite/core/mir/graph_visualize_pass.cc
+++ b/lite/core/mir/graph_visualize_pass.cc
@@ -122,7 +122,15 @@ std::string Visualize(mir::SSAGraph* graph) {
         dot.AddNode(var_name, {});
         exists_var_names.insert(var_name);
       }
-      dot.AddEdge(var_name, op_name, {});
+      std::vector<Dot::Attr> attrs;
+      std::string arg_name;
+      if (op_info->GetInputArgname(var_name, &arg_name)) {
+        attrs.emplace_back("label", arg_name);
+      } else {
+        VLOG(5) << "Can not find the input argument for var " << var_name
+                << " in " << op_type;
+      }
+      dot.AddEdge(var_name, op_name, attrs);
     }
     for (auto& x : node->outlinks) {
       std::string var_name;
@@ -136,7 +144,15 @@ std::string Visualize(mir::SSAGraph* graph) {
         dot.AddNode(var_name, {});
         exists_var_names.insert(var_name);
       }
-      dot.AddEdge(op_name, var_name, {});
+      std::vector<Dot::Attr> attrs;
+      std::string arg_name;
+      if (op_info->GetOutputArgname(var_name, &arg_name)) {
+        attrs.emplace_back("label", arg_name);
+      } else {
+        VLOG(5) << "Can not find the output argument for var " << var_name
+                << " in " << op_type;
+      }
+      dot.AddEdge(op_name, var_name, attrs);
     }
     // Output its all of attributes(name and values)
     os << "* " << op_name << "\n";
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index 6b18e929c077699a723b9dd9db313370d061cbb8..2dfc444a26ffe013ad05c81a003dd073cc133177 100644
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -109,6 +109,10 @@ class Optimizer {
            "identity_dropout_eliminate_pass",
            "__xpu__resnet_fuse_pass",
            "__xpu__resnet_cbam_fuse_pass",
+           "__xpu__conv2d_fuse_pass",
+           "__xpu__conv2d_link_previous_out_max_pass",
+           "__xpu__sfa_head_meanstd_fuse_pass",
+           "__xpu__sfa_head_moment_fuse_pass",
            "__xpu__mmdnn_fuse_pass",
            "__xpu__multi_encoder_fuse_pass",
            "__xpu__embedding_with_eltwise_add_fuse_pass",
diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt
index c3190716d1d7937933b83516b784d7128084227e..864f2938af6aefd57185a61831e067d56908a892 100644
--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -90,8 +90,6 @@ add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.
 add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math_arm)
 
diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt
index 99e4739ca0350e725458cd77721f9312974584b0..0cc7b5b302907abbe2e8d2ebadbb3a358cc998d9 100644
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
@@ -8,6 +8,8 @@ add_kernel(unsqueeze_compute_host Host basic SRCS unsqueeze_compute.cc DEPS ${li
 add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(expand_compute_host Host basic SRCS expand_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(expand_as_compute_host Host basic SRCS expand_as_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(fill_constant_compute_host Host basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(fill_constant_batch_size_like_compute_host Host basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(is_empty_compute_host Host extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
diff --git a/lite/kernels/arm/fill_constant_batch_size_like_compute.cc b/lite/kernels/host/fill_constant_batch_size_like_compute.cc
similarity index 84%
rename from lite/kernels/arm/fill_constant_batch_size_like_compute.cc
rename to lite/kernels/host/fill_constant_batch_size_like_compute.cc
index 3a8a09020f11e9cc84dc4891512b6581372e7085..13725eb707778cd04fc386a2c92f6199cee3860a 100644
--- a/lite/kernels/arm/fill_constant_batch_size_like_compute.cc
+++ b/lite/kernels/host/fill_constant_batch_size_like_compute.cc
@@ -12,16 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/arm/fill_constant_batch_size_like_compute.h"
+#include "lite/kernels/host/fill_constant_batch_size_like_compute.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
 void FillConstantBatchSizeLikeCompute::Run() {
   auto& param = *param_.get_mutable<param_t>();
-  auto& context = ctx_->As<ARMContext>();
 
   if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
     auto data = param.out->template mutable_data<float>();
@@ -50,18 +49,18 @@ void FillConstantBatchSizeLikeCompute::Run() {
   }
 }
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
 
 REGISTER_LITE_KERNEL(
     fill_constant_batch_size_like,
-    kARM,
+    kHost,
     kAny,
     kNCHW,
-    paddle::lite::kernels::arm::FillConstantBatchSizeLikeCompute,
+    paddle::lite::kernels::host::FillConstantBatchSizeLikeCompute,
     def)
-    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
     .Finalize();
diff --git a/lite/kernels/arm/fill_constant_batch_size_like_compute.h b/lite/kernels/host/fill_constant_batch_size_like_compute.h
similarity index 91%
rename from lite/kernels/arm/fill_constant_batch_size_like_compute.h
rename to lite/kernels/host/fill_constant_batch_size_like_compute.h
index 23aa64bb6417ae1ed0b551520096cf6401ec702c..b6f63fc2d6401a7705c04725ed12ad622ed9a728 100644
--- a/lite/kernels/arm/fill_constant_batch_size_like_compute.h
+++ b/lite/kernels/host/fill_constant_batch_size_like_compute.h
@@ -19,10 +19,10 @@
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
 class FillConstantBatchSizeLikeCompute
-    : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
+    : public KernelLite<TARGET(kHost), PRECISION(kAny)> {
  public:
   using param_t = operators::FillConstantBatchSizeLikeParam;
 
@@ -31,7 +31,7 @@ class FillConstantBatchSizeLikeCompute
   ~FillConstantBatchSizeLikeCompute() {}
 };
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/arm/fill_constant_compute.cc b/lite/kernels/host/fill_constant_compute.cc
similarity index 81%
rename from lite/kernels/arm/fill_constant_compute.cc
rename to lite/kernels/host/fill_constant_compute.cc
index 3d8fb9aee83dcaaa39bc94e98e8487c1bf0bf15c..61ef26a5a9c405904b873ccbb72eb01be27a4f3a 100644
--- a/lite/kernels/arm/fill_constant_compute.cc
+++ b/lite/kernels/host/fill_constant_compute.cc
@@ -12,16 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/arm/fill_constant_compute.h"
+#include "lite/kernels/host/fill_constant_compute.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
 void FillConstantCompute::Run() {
   auto& param = *param_.get_mutable<param_t>();
-  auto& context = ctx_->As<ARMContext>();
 
   if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
     auto data = param.out->template mutable_data<float>();
@@ -50,21 +49,21 @@ void FillConstantCompute::Run() {
   }
 }
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
 
 // float
 REGISTER_LITE_KERNEL(fill_constant,
-                     kARM,
+                     kHost,
                      kAny,
                      kNCHW,
-                     paddle::lite::kernels::arm::FillConstantCompute,
+                     paddle::lite::kernels::host::FillConstantCompute,
                      def)
     .BindInput("ShapeTensor",
-               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
     .BindInput("ShapeTensorList",
-               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
     .Finalize();
diff --git a/lite/kernels/arm/fill_constant_compute.h b/lite/kernels/host/fill_constant_compute.h
similarity index 88%
rename from lite/kernels/arm/fill_constant_compute.h
rename to lite/kernels/host/fill_constant_compute.h
index 7717c4c2628cff5358cc2011c01cb4b02ee125dc..7a2450d41751c7ccd9a865ed07f27f72bb60a1de 100644
--- a/lite/kernels/arm/fill_constant_compute.h
+++ b/lite/kernels/host/fill_constant_compute.h
@@ -19,9 +19,9 @@
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
-class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
+class FillConstantCompute : public KernelLite<TARGET(kHost), PRECISION(kAny)> {
  public:
   using param_t = operators::FillConstantParam;
 
@@ -30,7 +30,7 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
   ~FillConstantCompute() {}
 };
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt
index fdb485df02f366f7f4868965b1f20c6861b03d43..29f14c8f3ea10a26f737211e4702103239272853 100644
--- a/lite/kernels/xpu/CMakeLists.txt
+++ b/lite/kernels/xpu/CMakeLists.txt
@@ -24,6 +24,9 @@ else()
   add_kernel(sequence_topk_avg_pooling_compute_xpu XPU basic SRCS sequence_topk_avg_pooling_compute.cc DEPS ${lite_kernel_deps})
   add_kernel(concat_compute_xpu XPU basic SRCS concat_compute.cc DEPS ${lite_kernel_deps})
   add_kernel(search_fc_compute_xpu XPU basic SRCS search_fc_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(reshape_compute_xpu XPU basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(reduce_mean_compute_xpu XPU basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(reduce_sum_compute_xpu XPU basic SRCS reduce_sum_compute.cc DEPS ${lite_kernel_deps})
 
   # extra
   add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
@@ -44,4 +47,6 @@ else()
   add_kernel(__xpu__fc_compute_xpu XPU extra SRCS __xpu__fc_compute.cc DEPS ${lite_kernel_deps})
   add_kernel(__xpu__search_attention_compute_xpu XPU extra SRCS __xpu__search_attention_compute.cc DEPS ${lite_kernel_deps})
   add_kernel(__xpu__mmdnn_compute_xpu XPU extra SRCS __xpu__mmdnn_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(__xpu__conv2d_compute_xpu XPU extra SRCS __xpu__conv2d_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(__xpu__sfa_head_compute_xpu XPU extra SRCS __xpu__sfa_head_compute.cc DEPS ${lite_kernel_deps})
 endif()
diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.cc b/lite/kernels/xpu/__xpu__conv2d_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d73832937cf0e5f83d9e82ca769ddcd86e06cad
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__conv2d_compute.cc
@@ -0,0 +1,116 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/__xpu__conv2d_compute.h"
+#include <string>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void XPUConv2dCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto& input_dims = param.Input->dims();
+  auto& filter_dims = param.Filter->dims();
+  int batch = static_cast<int>(input_dims[0]);
+  int img_c = static_cast<int>(input_dims[1]);
+  int img_h = static_cast<int>(input_dims[2]);
+  int img_w = static_cast<int>(input_dims[3]);
+  int filter_num = static_cast<int>(filter_dims[0]);
+  int win_h = static_cast<int>(filter_dims[2]);
+  int win_w = static_cast<int>(filter_dims[3]);
+
+  auto paddings = *param.paddings;
+  auto dilations = *param.dilations;
+  int stride_h = param.strides[0];
+  int stride_w = param.strides[1];
+  int paddings_h = paddings[0];
+  int paddings_w = paddings[1];
+  int dilations_h = dilations[0];
+  int dilations_w = dilations[1];
+
+  std::string filter_type = param.filter_type;
+  int groups = param.groups;
+
+  int act_type = (param.act_type == -1) ? xdnn::Activation_t::RELU
+                                        : param.act_type;  // -1 means not init
+  const auto* bias = param.Bias ? param.Bias->data<float>() : nullptr;
+  const auto* branch = param.Branch ? param.Branch->data<float>() : nullptr;
+  const float* input_max =
+      param.InputMax ? param.InputMax->data<float>() : nullptr;
+  float* output_max = param.OutputMax
+                          ? param.OutputMax->mutable_data<float>(TARGET(kXPU))
+                          : nullptr;
+  float* output = param.Output->mutable_data<float>(TARGET(kXPU));
+
+  // TODO(luohang): now support for resnet50 first
+  CHECK_EQ(act_type, xdnn::Activation_t::RELU);
+  CHECK_EQ(groups, 1);
+  CHECK_EQ(filter_type, "int16");
+
+  xdnn::Activation_t act((xdnn::Activation_t::act_enum)act_type);
+  int r = xdnn::conv2d_forward_int16<float, int16_t, float, float>(
+      ctx.GetRawContext(),            /* context */
+      batch,                          /* batch */
+      img_c,                          /* input_c */
+      img_h,                          /* input_h */
+      img_w,                          /* input_w */
+      filter_num,                     /* num_filter */
+      win_h,                          /* kernel_h */
+      win_w,                          /* kernel_w */
+      stride_h,                       /* stride_h */
+      stride_w,                       /* stride_w */
+      paddings_h,                     /* pad_h */
+      paddings_w,                     /* pad_w */
+      dilations_h,                    /* dilation_h */
+      dilations_w,                    /* dilation_w */
+      groups,                         /* group */
+      param.Input->data<float>(),     /* input bottom */
+      param.Filter->data<int16_t>(),  /* filter weight */
+      output,                         /* output top */
+      bias,                           /* bias */
+      branch,                         /* branch */
+      act,                            /* act type */
+      input_max,                      /* max_image_ptr */
+      param.FilterMax->data<float>(), /* max_filter_ptr */
+      output_max /* max_result_ptr */);
+
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(__xpu__conv2d,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::XPUConv2dCompute,
+                     def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("FilterMax", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Branch", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.h b/lite/kernels/xpu/__xpu__conv2d_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d3b3217c589948d0515eb410e7ee70e1f2b028c
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__conv2d_compute.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class XPUConv2dCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::XPUConv2dParam;
+
+  virtual void Run();
+
+  virtual ~XPUConv2dCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/__xpu__sfa_head_compute.cc b/lite/kernels/xpu/__xpu__sfa_head_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9721e4e13377eab236a775b0301b7dfac1e15752
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__sfa_head_compute.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/__xpu__sfa_head_compute.h"
+#include <string>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void XPUSfaHeadCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+  std::string vis_type = param.op_type;
+  auto input = param.input;
+
+  const int batch = static_cast<int>(input->dims()[0]);
+  const int m = static_cast<int>(input->dims()[1]);
+  const int n = static_cast<int>(input->dims()[2]);
+  if (vis_type == "meanstd") {
+    int r = xdnn::vis_meanstd(ctx.GetRawContext(),
+                              param.input->data<float>(),
+                              param.output->mutable_data<float>(TARGET(kXPU)),
+                              batch,
+                              m,
+                              n);
+    CHECK_EQ(r, 0) << "XPU kernel error";
+    (void)param.output->mutable_data<float>();
+  } else if (vis_type == "moment") {
+    int r = xdnn::vis_moment(ctx.GetRawContext(),
+                             param.input->data<float>(),
+                             param.output->mutable_data<float>(TARGET(kXPU)),
+                             batch,
+                             m,
+                             n);
+    CHECK_EQ(r, 0) << "XPU kernel error";
+  } else {
+    LOG(FATAL) << "vis xpu op not supported type " << vis_type.c_str();
+  }
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(__xpu__sfa_head,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::XPUSfaHeadCompute,
+                     def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/__xpu__sfa_head_compute.h b/lite/kernels/xpu/__xpu__sfa_head_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..d869ae2ebf3f50bcb70294986d423dade0e78458
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__sfa_head_compute.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class XPUSfaHeadCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::XPUSfaHeadParam;
+
+  virtual void Run();
+
+  virtual ~XPUSfaHeadCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/activation_compute.cc b/lite/kernels/xpu/activation_compute.cc
index a46b33252e40a56299ebc7d0f133520a04b7cb20..fa20cbd60b37a0ebcc1c708daefcfff316465227 100644
--- a/lite/kernels/xpu/activation_compute.cc
+++ b/lite/kernels/xpu/activation_compute.cc
@@ -60,6 +60,71 @@ void SigmoidCompute::Run() {
   CHECK_EQ(r, 0);
 }
 
+void AbsCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int r = xdnn::activation_forward(
+      ctx.GetRawContext(),     /* context */
+      xdnn::Activation_t::ABS, /* type */
+      param.X->numel(),        /* len */
+      param.X->data<float>(),  /* x */
+      param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
+  CHECK_EQ(r, 0);
+}
+
+void SquareCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int r = xdnn::activation_forward(
+      ctx.GetRawContext(),        /* context */
+      xdnn::Activation_t::SQUARE, /* type */
+      param.X->numel(),           /* len */
+      param.X->data<float>(),     /* x */
+      param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
+  CHECK_EQ(r, 0);
+}
+
+void SqrtCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int r = xdnn::activation_forward(
+      ctx.GetRawContext(),      /* context */
+      xdnn::Activation_t::SQRT, /* type */
+      param.X->numel(),         /* len */
+      param.X->data<float>(),   /* x */
+      param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
+  CHECK_EQ(r, 0);
+}
+
+void PowCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int r = xdnn::activation_forward(
+      ctx.GetRawContext(),         /* context */
+      xdnn::Activation_t::ACT_POW, /* type */
+      param.X->numel(),            /* len */
+      param.X->data<float>(),      /* x */
+      param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
+  CHECK_EQ(r, 0);
+}
+
+void SignCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int r = xdnn::activation_forward(
+      ctx.GetRawContext(),      /* context */
+      xdnn::Activation_t::SIGN, /* type */
+      param.X->numel(),         /* len */
+      param.X->data<float>(),   /* x */
+      param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
+  CHECK_EQ(r, 0);
+}
+
 }  // namespace xpu
 }  // namespace kernels
 }  // namespace lite
@@ -86,3 +151,33 @@ REGISTER_LITE_KERNEL(sigmoid,
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(
+    abs, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::AbsCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    square, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SquareCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    sqrt, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SqrtCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    pow, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::PowCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    sign, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SignCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/activation_compute.h b/lite/kernels/xpu/activation_compute.h
index f2ad667886ac33191687b70aa7548050461545e7..df4a5d3f8d9cbebdc3ac63a91602b370b48ee629 100644
--- a/lite/kernels/xpu/activation_compute.h
+++ b/lite/kernels/xpu/activation_compute.h
@@ -48,6 +48,51 @@ class SigmoidCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
   virtual ~SigmoidCompute() = default;
 };
 
+class AbsCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  virtual void Run();
+
+  virtual ~AbsCompute() = default;
+};
+
+class SquareCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  virtual void Run();
+
+  virtual ~SquareCompute() = default;
+};
+
+class SqrtCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  virtual void Run();
+
+  virtual ~SqrtCompute() = default;
+};
+
+class PowCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  virtual void Run();
+
+  virtual ~PowCompute() = default;
+};
+
+class SignCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  virtual void Run();
+
+  virtual ~SignCompute() = default;
+};
+
 }  // namespace xpu
 }  // namespace kernels
 }  // namespace lite
diff --git a/lite/kernels/xpu/elementwise_compute.cc b/lite/kernels/xpu/elementwise_compute.cc
index e37337948bf639832ea936de2b5b929d26f534cc..b7d3588a3ed18589c6ec7601992b7ba468842429 100644
--- a/lite/kernels/xpu/elementwise_compute.cc
+++ b/lite/kernels/xpu/elementwise_compute.cc
@@ -76,6 +76,59 @@ void ElementwiseSubCompute::Run() {
   }
 }
 
+void ElementwiseDivCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto& x_dims = param.X->dims().data();
+  auto& y_dims = param.Y->dims();
+  int axis = param.axis;
+  if (param.axis == -1) {
+    axis = x_dims.size() - y_dims.size();
+  }
+  int iter = std::accumulate(
+      x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
+  int stride = param.Y->numel();
+
+  for (int i = 0; i < iter; ++i) {
+    const float* x_ptr = param.X->data<float>() + i * stride;
+    const float* y_ptr = param.Y->data<float>();
+    float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
+    int r = xdnn::elementwise_div(ctx.GetRawContext(), /* context */
+                                  x_ptr,               /* x */
+                                  y_ptr,               /* y */
+                                  o_ptr,               /* z */
+                                  stride /* len */);
+    CHECK_EQ(r, 0);
+  }
+}
+
+void ElementwiseMulCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto& x_dims = param.X->dims().data();
+  auto& y_dims = param.Y->dims();
+  int axis = param.axis;
+  if (param.axis == -1) {
+    axis = x_dims.size() - y_dims.size();
+  }
+  int iter = std::accumulate(
+      x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
+  int stride = param.Y->numel();
+
+  for (int i = 0; i < iter; ++i) {
+    const float* x_ptr = param.X->data<float>() + i * stride;
+    const float* y_ptr = param.Y->data<float>();
+    float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
+    int r = xdnn::elementwise_mul(ctx.GetRawContext(), /* context */
+                                  x_ptr,               /* x */
+                                  y_ptr,               /* y */
+                                  o_ptr,               /* z */
+                                  stride /* len */);
+    CHECK_EQ(r, 0);
+  }
+}
 }  // namespace xpu
 }  // namespace kernels
 }  // namespace lite
@@ -102,3 +155,25 @@ REGISTER_LITE_KERNEL(elementwise_sub,
     .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_div,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::ElementwiseDivCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_mul,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::ElementwiseMulCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/elementwise_compute.h b/lite/kernels/xpu/elementwise_compute.h
index d910b9293e74428c426d9505245bc5958fc9df3a..6cf75486d791bfc69fe9ba6d4b54e89cbbb56ff5 100644
--- a/lite/kernels/xpu/elementwise_compute.h
+++ b/lite/kernels/xpu/elementwise_compute.h
@@ -41,6 +41,26 @@ class ElementwiseSubCompute
   virtual ~ElementwiseSubCompute() = default;
 };
 
+class ElementwiseDivCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ElementwiseParam;
+
+  virtual void Run();
+
+  virtual ~ElementwiseDivCompute() = default;
+};
+
+class ElementwiseMulCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ElementwiseParam;
+
+  virtual void Run();
+
+  virtual ~ElementwiseMulCompute() = default;
+};
+
 }  // namespace xpu
 }  // namespace kernels
 }  // namespace lite
diff --git a/lite/kernels/xpu/reduce_mean_compute.cc b/lite/kernels/xpu/reduce_mean_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e2127c8a928f4587c093877477e9c155399284f
--- /dev/null
+++ b/lite/kernels/xpu/reduce_mean_compute.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/reduce_mean_compute.h"
+#include <vector>
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void ReduceMeanCompute::Run() {
+  auto& param = Param<operators::ReduceMeanParam>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+  const float* input = param.X->data<float>();
+  auto x_dims = param.X->dims();
+  int x_rank = x_dims.size();
+  float* output = param.Out->mutable_data<float>(TARGET(kXPU));
+  auto reduce_dim = param.dim;
+
+  std::vector<int> idims;
+  for (int i = 0; i < x_rank; i++) {
+    idims.push_back(x_dims[i]);
+  }
+
+  auto type = xdnn::ReduceOp::REDUCE_MEAN;
+  int r = xdnn::reduce(ctx.GetRawContext(),
+                       input,
+                       output,
+                       idims.data(),
+                       x_rank,
+                       reduce_dim.data(),
+                       reduce_dim.size(),
+                       type);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(reduce_mean,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::ReduceMeanCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/reduce_mean_compute.h b/lite/kernels/xpu/reduce_mean_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bd1033122c7f7da325cc938a08cb550bb71eeb3
--- /dev/null
+++ b/lite/kernels/xpu/reduce_mean_compute.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class ReduceMeanCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  void Run() override;
+
+  virtual ~ReduceMeanCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/reduce_sum_compute.cc b/lite/kernels/xpu/reduce_sum_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..27dc18b3fa553fcf4edef697083b02726c9d2e58
--- /dev/null
+++ b/lite/kernels/xpu/reduce_sum_compute.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/reduce_sum_compute.h"
+#include <vector>
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void ReduceSumCompute::Run() {
+  auto& param = Param<operators::ReduceParam>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+  const float* input = param.x->data<float>();
+  float* output = param.output->mutable_data<float>(TARGET(kXPU));
+  bool reduce_all = param.reduce_all;
+
+  if (reduce_all) {
+    int input_len = param.x->numel();
+    int r = xdnn::sum(ctx.GetRawContext(), input, output, input_len);
+    CHECK_EQ(r, 0);
+  } else {
+    auto x_dims = param.x->dims();
+    int x_rank = x_dims.size();
+    auto reduce_dim = param.dim;
+    auto rdim = reduce_dim.size();
+
+    std::vector<int> idims;
+    for (int i = 0; i < x_rank; i++) {
+      idims.push_back(x_dims[i]);
+    }
+
+    auto type = xdnn::ReduceOp::REDUCE_SUM;
+    int r = xdnn::reduce(ctx.GetRawContext(),
+                         input,
+                         output,
+                         idims.data(),
+                         x_rank,
+                         reduce_dim.data(),
+                         rdim,
+                         type);
+    CHECK_EQ(r, 0);
+  }
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(reduce_sum,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::ReduceSumCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/reduce_sum_compute.h b/lite/kernels/xpu/reduce_sum_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..28d9f8eb7b438e390928b82f83e623e9d19d8f47
--- /dev/null
+++ b/lite/kernels/xpu/reduce_sum_compute.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class ReduceSumCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  void Run() override;
+
+  virtual ~ReduceSumCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/reshape_compute.cc b/lite/kernels/xpu/reshape_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c428d5d1dfd408dc184bee32966c293da4f4e99b
--- /dev/null
+++ b/lite/kernels/xpu/reshape_compute.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/reshape_compute.h"
+#include "lite/core/op_registry.h"
+
+REGISTER_LITE_KERNEL(reshape2,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::Reshape2Compute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/reshape_compute.h b/lite/kernels/xpu/reshape_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..289fbf4120eceea012723995accf3eea3ab21268
--- /dev/null
+++ b/lite/kernels/xpu/reshape_compute.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+template <typename T>
+class Reshape2Compute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ReshapeParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    auto x = param.x;
+    auto output = param.output;
+    auto xshape = param.xshape;
+    auto x_dims = x->dims();
+    auto x_dims_data = x_dims.Vectorize();
+    auto out_dims = output->dims();
+    output->ShareDataWith(*x);
+    output->Resize(out_dims);
+    auto* xshape_data = xshape->mutable_data<int64_t>(TARGET(kXPU));
+    TargetWrapperXPU::MemcpySync(xshape_data,
+                                 x_dims_data.data(),
+                                 x_dims.size() * sizeof(int64_t),
+                                 IoDirection::HtoD);
+  }
+
+  virtual ~Reshape2Compute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt
index 2099958960050769c0d9c5c6df2f074919d3d701..02377aad498a47cff50c3a595f6fb1634a56b5ff 100644
--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
@@ -53,6 +53,8 @@ add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS})
 add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS})
 add_operator(grid_sampler_op basic SRCS grid_sampler_op.cc DEPS ${op_DEPS})
 add_operator(flatten_op basic SRCS flatten_op.cc DEPS ${op_DEPS})
+add_operator(pow_op extra SRCS pow_op.cc DEPS ${op_DEPS})
+add_operator(sign_op extra SRCS sign_op.cc DEPS ${op_DEPS})
 
 # 2.basic ops not used in basic models
 add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS})
@@ -177,6 +179,9 @@ add_operator(__xpu__fc_op extra SRCS __xpu__fc_op.cc DEPS ${op_DEPS})
 add_operator(__xpu__resnet_cbam_op extra SRCS __xpu__resnet_cbam_op.cc DEPS ${op_DEPS})
 add_operator(__xpu__search_attention_op extra SRCS __xpu__search_attention_op.cc DEPS ${op_DEPS})
 add_operator(__xpu__mmdnn_op extra SRCS __xpu__mmdnn_op.cc DEPS ${op_DEPS})
+add_operator(__xpu__conv2d_op extra SRCS __xpu__conv2d_op.cc DEPS ${op_DEPS})
+add_operator(__xpu__sfa_head_op extra SRCS __xpu__sfa_head_op.cc DEPS ${op_DEPS})
+
 if (NOT LITE_WITH_X86)
     lite_cc_test(test_one_hot_op SRCS one_hot_op_test.cc DEPS one_hot_op memory scope ${op_deps} one_hot_compute_host)
     lite_cc_test(test_fc_op SRCS fc_op_test.cc
diff --git a/lite/operators/__xpu__conv2d_op.cc b/lite/operators/__xpu__conv2d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dff4d5e6dadf9bce15e76f5b353611f402eee19a
--- /dev/null
+++ b/lite/operators/__xpu__conv2d_op.cc
@@ -0,0 +1,182 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/__xpu__conv2d_op.h"
+#include <memory>
+#include <vector>
+#include "lite/core/op_registry.h"
+#include "lite/operators/conv_op.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+std::string padding_algorithm_ = "";  // NOLINT
+
+bool XPUConv2dOp::CheckShape() const {
+  CHECK(param_.Input) << "Input(Input) of ConvXPUOp should not be null.";
+  CHECK(param_.Output) << "Input(Filter) of ConvXPUOp should not be null.";
+  CHECK(param_.Filter) << "Output(Output) of ConvXPUOp should not be null.";
+  // bias is optional.
+
+  const auto in_dims = param_.Input->dims();
+  const auto filter_dims = param_.Filter->dims();
+  int groups = param_.groups;
+
+  CHECK_EQ(in_dims.size(), 4UL) << "Conv intput should be 4-D tensor.";
+  CHECK_EQ(in_dims.size(), filter_dims.size())
+      << "Conv input dimension and filter dimension should be the same.";
+  CHECK_EQ(in_dims.size() - param_.strides.size(), 2U)
+      << "Conv input dimension and strides dimension should be consistent.";
+  CHECK_EQ(filter_dims.size(), 4UL) << "Conv filter should be 4-D tensor.";
+  CHECK_EQ(in_dims[1], filter_dims[1] * groups)
+      << "The number of input channels should be equal to filter channels * "
+         "groups.";
+  CHECK_EQ(filter_dims[0] % groups, 0)
+      << "The number of output channels should be divided by groups.";
+
+  return true;
+}
+
+// copy from conv_op.cc
+inline int ConvOutputSize(int input_size,
+                          int filter_size,
+                          int dilation,
+                          int pad_left,
+                          int pad_right,
+                          int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;
+  int output_size =
+      (input_size + (pad_left + pad_right) - dkernel) / stride + 1;
+
+  return output_size;
+}
+
+// copy from conv_op.cc
+bool XPUConv2dOp::InferShapeImpl() const {
+  const auto in_dims = param_.Input->dims();
+  const auto filter_dims = param_.Filter->dims();
+
+  operators::UpdatePaddingAndDilation(param_.paddings.get(),
+                                      param_.dilations.get(),
+                                      param_.strides,
+                                      padding_algorithm_,
+                                      in_dims,
+                                      filter_dims);
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  auto paddings = *param_.paddings;
+  auto dilations = *param_.dilations;
+  for (size_t i = 0; i < param_.strides.size(); ++i) {
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2],
+                                          filter_dims[i + 2],
+                                          dilations[i],
+                                          paddings[i * 2],
+                                          paddings[i * 2 + 1],
+                                          param_.strides[i]));
+  }
+
+  // Set output and output max dims
+  param_.Output->Resize(lite::DDim(output_shape));
+  param_.OutputMax->Resize({4});
+  // share LoD
+  param_.Output->set_lod(param_.Input->lod());
+
+  return true;
+}
+
+bool XPUConv2dOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
+  AttachParam(&param_);
+  CHECK(scope->FindVar(op_desc.Input("Input").front()));
+  CHECK(scope->FindVar(op_desc.Input("Filter").front()));
+  CHECK(scope->FindVar(op_desc.Input("FilterMax").front()));
+  CHECK(scope->FindVar(op_desc.Output("Output").front()));
+  CHECK(scope->FindVar(op_desc.Output("OutputMax").front()));
+
+  param_.Input =
+      scope->FindVar(op_desc.Input("Input").front())->GetMutable<Tensor>();
+  param_.Filter =
+      scope->FindVar(op_desc.Input("Filter").front())->GetMutable<Tensor>();
+  param_.FilterMax =
+      scope->FindVar(op_desc.Input("FilterMax").front())->GetMutable<Tensor>();
+  auto bias = scope->FindVar(op_desc.Input("Bias").front());
+  if (bias != nullptr) {
+    param_.Bias = bias->GetMutable<Tensor>();
+  }
+  // optional params
+  std::vector<std::string> input_arg_names = op_desc.InputArgumentNames();
+  if (std::find(input_arg_names.begin(), input_arg_names.end(), "Branch") !=
+      input_arg_names.end()) {
+    auto arguments = op_desc.Input("Branch");
+    if (arguments.size() > 0) {
+      auto arg_var = scope->FindVar(arguments.front());
+      if (arg_var != nullptr) {
+        param_.Branch =
+            const_cast<lite::Tensor*>(&(arg_var->Get<lite::Tensor>()));
+      }
+    }
+  }
+
+  param_.Output =
+      scope->FindVar(op_desc.Output("Output").front())->GetMutable<Tensor>();
+  param_.OutputMax =
+      scope->FindVar(op_desc.Output("OutputMax").front())->GetMutable<Tensor>();
+
+  param_.strides = op_desc.GetAttr<std::vector<int>>("strides");
+  auto paddings = op_desc.GetAttr<std::vector<int>>("paddings");
+  auto dilations = op_desc.GetAttr<std::vector<int>>("dilations");
+  param_.dilations = std::make_shared<std::vector<int>>(dilations);
+  param_.groups = op_desc.GetAttr<int>("groups");
+  if (op_desc.HasAttr("act_type")) {
+    param_.act_type = op_desc.GetAttr<int>("act_type");
+  }
+
+  if (op_desc.HasAttr("filter_type")) {
+    param_.filter_type = op_desc.GetAttr<std::string>("filter_type");
+  } else {
+    param_.filter_type = "int16";
+  }
+
+  if (op_desc.HasAttr("has_input_max") &&
+      op_desc.GetAttr<bool>("has_input_max")) {
+    CHECK(scope->FindVar(op_desc.Input("InputMax").front()));
+    param_.InputMax =
+        scope->FindVar(op_desc.Input("InputMax").front())->GetMutable<Tensor>();
+  }
+
+  if (op_desc.HasAttr("padding_algorithm")) {
+    padding_algorithm_ = op_desc.GetAttr<std::string>("padding_algorithm");
+  }
+
+  // 2-pad to 4-pad
+  if (paddings.size() == 2L) {
+    for (size_t i = 0; i < param_.strides.size(); ++i) {
+      int copy_pad = *(paddings.begin() + 2 * i);
+      paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
+    }
+  } else {
+    if (paddings.size() != 4L) {
+      LOG(FATAL)
+          << "Paddings size should be the same or twice as the input size.";
+    }
+  }
+  param_.paddings = std::make_shared<std::vector<int>>(paddings);
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(__xpu__conv2d, paddle::lite::operators::XPUConv2dOp);
diff --git a/lite/operators/__xpu__conv2d_op.h b/lite/operators/__xpu__conv2d_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3141a594148767a9dbe4c01f496e78f9d3ca5d2
--- /dev/null
+++ b/lite/operators/__xpu__conv2d_op.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class XPUConv2dOp : public OpLite {
+ public:
+  XPUConv2dOp() {}
+
+  explicit XPUConv2dOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "XPUConv2d"; }
+
+ private:
+  mutable XPUConv2dParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/__xpu__sfa_head_op.cc b/lite/operators/__xpu__sfa_head_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f04cfa51392e6fd1099f1c9a57de10775e61507c
--- /dev/null
+++ b/lite/operators/__xpu__sfa_head_op.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/__xpu__sfa_head_op.h"
+#include <vector>
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool XPUSfaHeadOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.input);
+  CHECK_OR_FALSE(param_.output);
+  CHECK_OR_FALSE(param_.op_type != "");
+
+  const auto input_dims = param_.input->dims();
+  if (param_.op_type == "meanstd" || param_.op_type == "moment") {
+    CHECK_EQ_OR_FALSE(input_dims.size(), 3UL);
+  }
+
+  return true;
+}
+
+bool XPUSfaHeadOp::InferShapeImpl() const {
+  const auto& input_dims = param_.input->dims();
+  auto op_type = param_.op_type;
+
+  // Set output dims
+  std::vector<DDim::value_type> output_dims(2);
+  output_dims[0] = input_dims[0];
+  if (op_type == "meanstd") {
+    output_dims[1] = 2 * input_dims[1];
+  } else if (op_type == "moment") {
+    output_dims[1] = 4 * input_dims[1];
+  } else {
+    LOG(FATAL) << "not supported vis op --> " << op_type;
+  }
+  param_.output->Resize(output_dims);
+
+  // share LoD
+  param_.output->set_lod(param_.input->lod());
+  return true;
+}
+
+bool XPUSfaHeadOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
+  auto input = op_desc.Input("Input").front();
+  auto output = op_desc.Output("Output").front();
+  CHECK(scope->FindVar(input));
+  CHECK(scope->FindVar(output));
+
+  param_.input = scope->FindVar(input)->GetMutable<lite::Tensor>();
+  param_.output = scope->FindVar(output)->GetMutable<lite::Tensor>();
+  param_.op_type = op_desc.GetAttr<std::string>("op_type");
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(__xpu__sfa_head, paddle::lite::operators::XPUSfaHeadOp);
diff --git a/lite/operators/__xpu__sfa_head_op.h b/lite/operators/__xpu__sfa_head_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ecfaf3cbaa1fefd2c3f8f3060e9c945ad185692
--- /dev/null
+++ b/lite/operators/__xpu__sfa_head_op.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class XPUSfaHeadOp : public OpLite {
+ public:
+  XPUSfaHeadOp() {}
+
+  explicit XPUSfaHeadOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "XPUFc"; }
+
+ private:
+  mutable XPUSfaHeadParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h
index 3e68bc1631bd41477fdbcbcfbbc6279287e21af1..586d3d1183b4049f8b49ef22f92e84412ed5522f 100644
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -232,6 +232,20 @@ struct PowerParam : ParamBase {
   float power{};
 };
 
+// For Pow Op
+struct PowParam : ParamBase {
+  const lite::Tensor* X{};
+  lite::Tensor* Out{};
+
+  float factor{1.};
+};
+
+// For Sign Op
+struct SignParam : ParamBase {
+  const lite::Tensor* X{};
+  lite::Tensor* Out{};
+};
+
 struct ShuffleChannelParam : ParamBase {
   const lite::Tensor* X{};
   lite::Tensor* Out{};
@@ -1810,6 +1824,31 @@ struct XPUMmdnnMergeAllParam : ParamBase {
   lite::Tensor* out{};
 };
 
+struct XPUConv2dParam : ParamBase {
+  lite::Tensor* Input{nullptr};
+  lite::Tensor* Filter{nullptr};
+  lite::Tensor* InputMax{nullptr};
+  lite::Tensor* FilterMax{nullptr};
+  lite::Tensor* Bias{nullptr};
+  lite::Tensor* Branch{nullptr};
+  lite::Tensor* Output{nullptr};
+  lite::Tensor* OutputMax{nullptr};
+
+  int groups{1};
+  int act_type{-1};
+  std::string filter_type{""};
+  std::vector<int> strides;
+  std::shared_ptr<std::vector<int>> paddings;
+  std::shared_ptr<std::vector<int>> dilations;
+};
+
+struct XPUSfaHeadParam : ParamBase {
+  lite::Tensor* input{nullptr};
+  lite::Tensor* output{nullptr};
+
+  std::string op_type{""};
+};
+
 // For DeformableConvolution op
 struct DeformableConvParam : ParamBase {
   lite::Tensor* x{};
diff --git a/lite/operators/pow_op.cc b/lite/operators/pow_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9c16bd67365760e312931b4c6371ead4c459f05
--- /dev/null
+++ b/lite/operators/pow_op.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/pow_op.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool PowOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Out);
+  return true;
+}
+
+bool PowOp::InferShapeImpl() const {
+  param_.Out->Resize(param_.X->dims());
+  return true;
+}
+
+bool PowOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+  auto X = op_desc.Input("X").front();
+  auto Out = op_desc.Output("Out").front();
+  CHECK(scope->FindVar(X));
+  CHECK(scope->FindVar(Out));
+  param_.X = scope->FindVar(X)->GetMutable<lite::Tensor>();
+  param_.Out = scope->FindVar(Out)->GetMutable<lite::Tensor>();
+  param_.factor = op_desc.GetAttr<float>("factor");
+  CHECK(param_.X);
+  CHECK(param_.Out);
+
+  return true;
+}
+
+} /* namespace operators */
+} /* namespace lite */
+} /* namespace paddle */
+
+REGISTER_LITE_OP(pow, paddle::lite::operators::PowOp);
diff --git a/lite/operators/pow_op.h b/lite/operators/pow_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e8f97fd2b3af99177ad6f2a49b7c382b16443bc
--- /dev/null
+++ b/lite/operators/pow_op.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class PowOp : public OpLite {
+ public:
+  PowOp() {}
+
+  explicit PowOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "pow"; }
+
+#ifdef LITE_WITH_PROFILE
+  void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
+    ch->input_shape = ch->DimToStr(param_.X->dims());
+    ch->output_shape = ch->DimToStr(param_.Out->dims());
+    ch->macs = param_.Out->numel();
+  }
+#endif
+
+ private:
+  mutable PowParam param_;
+};
+
+} /* namespace operators */
+} /* namespace lite */
+} /* namespace paddle */
diff --git a/lite/operators/sign_op.cc b/lite/operators/sign_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c31f58315b44f9e60e3649419e09f142d362c0e
--- /dev/null
+++ b/lite/operators/sign_op.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/sign_op.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool SignOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Out);
+  return true;
+}
+
+bool SignOp::InferShapeImpl() const {
+  param_.Out->Resize(param_.X->dims());
+  return true;
+}
+
+bool SignOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+  auto X = op_desc.Input("X").front();
+  auto Out = op_desc.Output("Out").front();
+  CHECK(scope->FindVar(X));
+  CHECK(scope->FindVar(Out));
+  param_.X = scope->FindVar(X)->GetMutable<lite::Tensor>();
+  param_.Out = scope->FindVar(Out)->GetMutable<lite::Tensor>();
+  CHECK(param_.X);
+  CHECK(param_.Out);
+
+  return true;
+}
+
+} /* namespace operators */
+} /* namespace lite */
+} /* namespace paddle */
+
+REGISTER_LITE_OP(sign, paddle::lite::operators::SignOp);
diff --git a/lite/operators/sign_op.h b/lite/operators/sign_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f54038adc4792761a6edd090a5d9bc1506149be8
--- /dev/null
+++ b/lite/operators/sign_op.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class SignOp : public OpLite {
+ public:
+  SignOp() {}
+
+  explicit SignOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "sign"; }
+
+#ifdef LITE_WITH_PROFILE
+  void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
+    ch->input_shape = ch->DimToStr(param_.X->dims());
+    ch->output_shape = ch->DimToStr(param_.Out->dims());
+    ch->macs = param_.Out->numel();
+  }
+#endif
+
+ private:
+  mutable SignParam param_;
+};
+
+} /* namespace operators */
+} /* namespace lite */
+} /* namespace paddle */
diff --git a/lite/tests/kernels/fill_constant_batch_size_like_compute_test.cc b/lite/tests/kernels/fill_constant_batch_size_like_compute_test.cc
index 2318d53a33866fd8ba61d14c4d6bc6aed283dbdc..e7196fc04c08108e060f03619f303f349886b001 100644
--- a/lite/tests/kernels/fill_constant_batch_size_like_compute_test.cc
+++ b/lite/tests/kernels/fill_constant_batch_size_like_compute_test.cc
@@ -135,8 +135,8 @@ TEST(fill_constant_batch_size_like, precision) {
 #if defined(LITE_WITH_NPU)
   place = TARGET(kNPU);
   abs_error = 1e-2;  // use fp16 in npu
-#elif defined(LITE_WITH_ARM)
-  place = TARGET(kARM);
+#elif defined(LITE_WITH_ARM) || defined(LITE_WITH_X86)
+  place = TARGET(kHost);
 #else
   return;
 #endif
diff --git a/lite/tests/kernels/fill_constant_compute_test.cc b/lite/tests/kernels/fill_constant_compute_test.cc
index bc2cfce7842c935898bd9ecddc6c2d0ac4c39af5..59a4c301ce4d7ea215b73709aaf07908b91c297f 100644
--- a/lite/tests/kernels/fill_constant_compute_test.cc
+++ b/lite/tests/kernels/fill_constant_compute_test.cc
@@ -174,8 +174,8 @@ TEST(fill_constant, precision) {
 #if defined(LITE_WITH_NPU)
   place = TARGET(kNPU);
   abs_error = 1e-2;  // use fp16 in npu
-#elif defined(LITE_WITH_ARM)
-  place = TARGET(kARM);
+#elif defined(LITE_WITH_ARM) || defined(LITE_WITH_X86)
+  place = TARGET(kHost);
 #else
   return;
 #endif