diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
index 268bb0b2933ba3c1c9ba901b9aeeea21c635fb22..dc7fb7f79c04377c894404ad123cfd5b35c61f1f 100644
--- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
@@ -1,8 +1,12 @@
-cc_library(anakin_op_converter SRCS fc.cc conv2d.cc activation.cc pool2d.cc concat.cc split.cc DEPS anakin_engine framework_proto scope op_registry)
-cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} anakin_op_converter mul_op)
-cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
-cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} activation_op anakin_op_converter
- SERIAL)
-cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} anakin_op_converter pool_op pooling)
-cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} anakin_op_converter concat_op concat_and_split)
-cc_test(test_anakin_split SRCS test_split_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} anakin_op_converter split_op concat_and_split)
+cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
+elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc DEPS anakin_engine framework_proto scope op_registry)
+cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
+cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
+cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL)
+cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling)
+cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split)
+cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split)
+cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS
+anakin_op_converter elementwise_add_op)
+
+cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL)
diff --git a/paddle/fluid/inference/anakin/convert/activation.cc b/paddle/fluid/inference/anakin/convert/activation.cc
index 786869e3dbe511c1edd081c822e45330bd36acea..4939d28045bb8c1d71ff8c03048bb0f499d4acd5 100644
--- a/paddle/fluid/inference/anakin/convert/activation.cc
+++ b/paddle/fluid/inference/anakin/convert/activation.cc
@@ -45,15 +45,11 @@ void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output_name = op_desc.Output("Out").front();
   engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
   engine_->AddOpAttr(op_name, "type", anakin_op_type_);
-  if (op_type_ == "relu") {
-    engine_->AddOpAttr(op_name, "alpha", 0);
-  }
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);
 REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter);
 REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/activation.h b/paddle/fluid/inference/anakin/convert/activation.h
index f1db154a12cda71e0281ef0adddcf5c70dbdc120..527b0d3360bb844f10f5a412c9dc7a787c0f11a5 100644
--- a/paddle/fluid/inference/anakin/convert/activation.h
+++ b/paddle/fluid/inference/anakin/convert/activation.h
@@ -34,13 +34,8 @@ class ActivationOpConverter : public AnakinOpConverter {
  private:
   std::string op_type_;
   std::string anakin_op_type_;
-  std::map<std::string, std::string> anakin_ops_type_{
-      {"relu", "Relu"}, {"tanh", "TanH"}, {"sigmoid", "Sigmoid"}};
-};
-
-class ReluOpConverter : public ActivationOpConverter {
- public:
-  ReluOpConverter() : ActivationOpConverter("relu") {}
+  std::map<std::string, std::string> anakin_ops_type_{{"tanh", "TanH"},
+                                                      {"sigmoid", "Sigmoid"}};
 };
 
 class TanhOpConverter : public ActivationOpConverter {
@@ -50,7 +45,7 @@ class TanhOpConverter : public ActivationOpConverter {
 
 class SigmoidOpConverter : public ActivationOpConverter {
  public:
-  SigmoidOpConverter() : ActivationOpConverter("tanh") {}
+  SigmoidOpConverter() : ActivationOpConverter("sigmoid") {}
 };
 }  // namespace anakin
 }  // namespace inference
diff --git a/paddle/fluid/inference/anakin/convert/concat.cc b/paddle/fluid/inference/anakin/convert/concat.cc
index dbd0c6bc0d82c92c94e16ff8e4ba86fa2657f586..4225307137302dd09a7697f0ef2da39f16266819 100644
--- a/paddle/fluid/inference/anakin/convert/concat.cc
+++ b/paddle/fluid/inference/anakin/convert/concat.cc
@@ -32,8 +32,8 @@ void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
-  auto input_names = op_desc.Input("X");
   int axis = boost::get<int>(op_desc.GetAttr("axis"));
+  auto input_names = op_desc.Input("X");
   PADDLE_ENFORCE(axis > 0,
                  "The axis attr of Concat op should be large than 0 for trt");
diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc
index af0a39fdfd95e661cc047a3ad7ddbb204f7b2769..b99c6e71c4dfd2b567d85904f57ebecf0ed9a1cc 100644
--- a/paddle/fluid/inference/anakin/convert/conv2d.cc
+++ b/paddle/fluid/inference/anakin/convert/conv2d.cc
@@ -51,10 +51,11 @@ void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
   PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
 
   // const int n_output = weight_tensor->dims()[0];
-  const int n_input = weight_tensor->dims()[1];
+  // const int n_input = weight_tensor->dims()[1];
   const int filter_h = weight_tensor->dims()[2];
   const int filter_w = weight_tensor->dims()[3];
-  auto filter_num = n_input * filter_h * filter_w;
+  // auto filter_num = n_input * filter_h * filter_w;
+  auto filter_num = weight_tensor->dims()[0];
   engine_->AddOpAttr(op_name, "filter_num", filter_num);
   engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
   auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4d105430dd298076fa8aa4c1925329c3a0e356a1
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
@@ -0,0 +1,113 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
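+//
+// NOTE: conv2d_fusion is the fused conv + bias op. It maps onto a single
+// Anakin "Convolution" op with bias_term set to true: weight_1 carries the
+// OIHW filter and weight_2 carries the bias reshaped to [1, 1, 1, n].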
+
+#include "paddle/fluid/inference/anakin/convert/conv2d_fusion.h"
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::Scope &scope,
+                                         bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL);
+
+  auto input_name = op_desc.Input("Input").front();
+  auto output_name = op_desc.Output("Output").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
+  engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
+
+  auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
+  PADDLE_ENFORCE_NOT_NULL(filter_v);
+  auto *filter_t = filter_v->GetMutable<framework::LoDTensor>();
+
+  auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
+  PADDLE_ENFORCE_NOT_NULL(b_v);
+  auto *b_t = b_v->GetMutable<framework::LoDTensor>();
+
+  std::unique_ptr<framework::LoDTensor> weight_tensor(
+      new framework::LoDTensor());
+  weight_tensor->Resize(filter_t->dims());
+  TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get());
+
+  PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
+
+  // const int n_output = weight_tensor->dims()[0];
+  // const int n_input = weight_tensor->dims()[1];
+  const int filter_h = weight_tensor->dims()[2];
+  const int filter_w = weight_tensor->dims()[3];
+  // auto filter_num = n_input * filter_h * filter_w;
+  auto filter_num = weight_tensor->dims()[0];
+  engine_->AddOpAttr(op_name, "filter_num", filter_num);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
+  auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
+  auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
+  auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
+  const int groups = boost::get<int>(op_desc.GetAttr("groups"));
+  engine_->AddOpAttr(op_name, "group", groups);
+  engine_->AddOpAttr(op_name, "axis", 1);
+  engine_->AddOpAttr(op_name, "bias_term", true);
+
+  auto weight_shape = framework::vectorize2int(filter_t->dims());
+  Shape anakin_shape(weight_shape);
+  auto *weight1 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
+  float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
+  std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
+  weight1->d_tensor().set_shape(anakin_shape);
+  weight1->d_tensor().copy_from(weight1->h_tensor());
+  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+
+  auto bias_shape = framework::vectorize2int(b_t->dims());
+  framework::LoDTensor bias_tensor;
+  bias_tensor.Resize(b_t->dims());
+  TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
+  auto *bias_data = bias_tensor.data<float>();
+  bias_shape.insert(bias_shape.begin(), 1);
+  bias_shape.insert(bias_shape.begin(), 1);
+  bias_shape.insert(bias_shape.begin(), 1);
+  // bias_shape.push_back(1);
+  // bias_shape.push_back(1);
+  Shape anakin_bias_shape(bias_shape);
+
+  auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
+      anakin_bias_shape);
+  float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
+  std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
+  weight2->d_tensor().set_shape(anakin_bias_shape);
+  weight2->d_tensor().copy_from(weight2->h_tensor());
+  engine_->AddOpAttr(op_name, "weight_2", *weight2);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(conv2d_fusion, Conv2dFusionOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h
new file mode 100644
index 0000000000000000000000000000000000000000..07359b9cba05bf7c885eb38d64816bdb718a6aba
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class Conv2dFusionOpConverter : public AnakinOpConverter {
+ public:
+  Conv2dFusionOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~Conv2dFusionOpConverter() {}
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/elementwise.cc b/paddle/fluid/inference/anakin/convert/elementwise.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d400e617d87d0a339ffaa3469ad780d2dffbf666
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/elementwise.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
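+//
+// NOTE: only elementwise_add of two variable inputs is handled here; it is
+// expressed as Anakin's "Eltwise" op of type "Add" with coefficients {1, 1}.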
+
+#include "paddle/fluid/inference/anakin/convert/elementwise.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op,
+                                           const framework::Scope &scope,
+                                           bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+  auto x_name = op_desc.Input("X").front();
+  auto y_name = op_desc.Input("Y").front();
+  auto out_name = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+
+  engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
+  std::string elementwise_type = "Add";
+  engine_->AddOpAttr(op_name, "type", elementwise_type);
+  std::vector<float> coeff = {1.0, 1.0};
+  engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(elementwise_add, ElementwiseAddOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/elementwise.h b/paddle/fluid/inference/anakin/convert/elementwise.h
new file mode 100644
index 0000000000000000000000000000000000000000..d41a593803d2a7d91844dc066f53b3de73063e04
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/elementwise.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class ElementwiseAddOpConverter : public AnakinOpConverter {
+ public:
+  ElementwiseAddOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~ElementwiseAddOpConverter() {}
+
+ private:
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc
index c88e3af33c9bbb18dd1ea58a8a508eecb8274d09..8638c5b5f0563acba5eb4950637cf929aa14e08c 100644
--- a/paddle/fluid/inference/anakin/convert/fc.cc
+++ b/paddle/fluid/inference/anakin/convert/fc.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/inference/anakin/convert/fc.h"
 #include <algorithm>
+#include <string>
+#include <vector>
 
 using anakin::graph::GraphGlobalMem;
 using anakin::AK_FLOAT;
@@ -24,28 +26,39 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-void FcOpConverter::operator()(const framework::proto::OpDesc &op,
-                               const framework::Scope &scope, bool test_mode) {
+void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::Scope &scope,
+                                   bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+  auto input_names = op_desc.InputNames();
+  bool with_bias = input_names.size() == 3;
+
+  std::string w_name = "Y";
+  std::string i_name = "X";
+  if (with_bias) {
+    w_name = "W";
+    i_name = "Input";
+  }
 
-  auto x_name = op_desc.Input("X").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  auto *y_v = scope.FindVar(op_desc.Input("Y").front());
+
+  // get weights
+  auto *y_v = scope.FindVar(op_desc.Input(w_name).front());
   PADDLE_ENFORCE_NOT_NULL(y_v);
   auto *y_t = y_v->GetMutable<framework::LoDTensor>();
 
-  auto input_name = op_desc.Input("X").front();
+  auto input_name = op_desc.Input(i_name).front();
   auto output_name = op_desc.Output("Out").front();
-  auto weight_shape = framework::vectorize2int(y_t->dims());
   engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
-  engine_->AddOpAttr(op_name, "bias_term", false);
+  engine_->AddOpAttr(op_name, "bias_term", with_bias);
   engine_->AddOpAttr(op_name, "axis", 1);
+
+  auto weight_shape = framework::vectorize2int(y_t->dims());
   int out_dim = weight_shape[1];
   engine_->AddOpAttr(op_name, "out_dim", out_dim);
+  const int w_m = weight_shape[0];
+  const int w_k = weight_shape[1];
 
   weight_shape.push_back(1);
   weight_shape.push_back(1);
@@ -54,18 +67,54 @@ void FcOpConverter::operator()(const framework::proto::OpDesc &op,
   framework::LoDTensor weight_tensor;
   weight_tensor.Resize(y_t->dims());
   TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor);
+  auto *weight_data = weight_tensor.data<float>();
+  PADDLE_ENFORCE(w_m * w_k == weight_tensor.numel());
+  std::vector<float> trans_weight_data(weight_tensor.numel());
+  for (int i = 0; i < w_m; i++) {
+    for (int j = 0; j < w_k; j++) {
+      trans_weight_data[i + j * w_m] = weight_data[i * w_k + j];
+    }
+  }
 
   auto *weight1 =
       GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
   float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
-  std::copy_n(weight_tensor.data<float>(), weight_tensor.numel(), cpu_data);
+  std::copy_n(trans_weight_data.data(), weight_tensor.numel(), cpu_data);
   weight1->d_tensor().set_shape(anakin_shape);
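+  // The copy above uploads trans_weight_data, i.e. the [w_m, w_k] fluid
+  // weights transposed into column-major order, which appears to be the
+  // layout Anakin's Dense op expects.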
   weight1->d_tensor().copy_from(weight1->h_tensor());
   engine_->AddOpAttr(op_name, "weight_1", *weight1);
+
+  // get bias
+  if (with_bias) {
+    auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
+    PADDLE_ENFORCE_NOT_NULL(b_v);
+    auto *b_t = b_v->GetMutable<framework::LoDTensor>();
+
+    auto bias_shape = framework::vectorize2int(b_t->dims());
+    framework::LoDTensor bias_tensor;
+    bias_tensor.Resize(b_t->dims());
+    TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
+    auto *bias_data = bias_tensor.data<float>();
+    bias_shape.insert(bias_shape.begin(), 1);
+    bias_shape.insert(bias_shape.begin(), 1);
+    bias_shape.insert(bias_shape.begin(), 1);
+    // bias_shape.push_back(1);
+    // bias_shape.push_back(1);
+    Shape anakin_bias_shape(bias_shape);
+
+    auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
+        anakin_bias_shape);
+    float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
+    std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
+    weight2->d_tensor().set_shape(anakin_bias_shape);
+    weight2->d_tensor().copy_from(weight2->h_tensor());
+    engine_->AddOpAttr(op_name, "weight_2", *weight2);
+  }
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
+REGISTER_ANAKIN_OP_CONVERTER(mul, MulOpConverter);
 REGISTER_ANAKIN_OP_CONVERTER(fc, FcOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/fc.h b/paddle/fluid/inference/anakin/convert/fc.h
index e67c8dedb314771bb665122dec829ac9ba6bbe2d..060c649b19ef335a9e926eb205ec691a2a188fe1 100644
--- a/paddle/fluid/inference/anakin/convert/fc.h
+++ b/paddle/fluid/inference/anakin/convert/fc.h
@@ -20,14 +20,26 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class FcOpConverter : public AnakinOpConverter {
+class FcBaseOpConverter : public AnakinOpConverter {
  public:
-  FcOpConverter() = default;
+  FcBaseOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
                           const framework::Scope &scope,
                           bool test_mode) override;
-  virtual ~FcOpConverter() {}
+  virtual ~FcBaseOpConverter() {}
+};
+
+// with bias
+class FcOpConverter : public FcBaseOpConverter {
+ public:
+  FcOpConverter() = default;
+};
+
+// without bias
+class MulOpConverter : public FcBaseOpConverter {
+ public:
+  MulOpConverter() = default;
 };
 
 }  // namespace anakin
diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
index ee4d7948783511773b9f9137fdc7ff08cfd3d727..0d214d82eb88b0c49870db397734b74f48c12918 100644
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -47,14 +47,6 @@ class AnakinOpConverter {
     std::string op_type = op_desc.Type();
     AnakinOpConverter *it = nullptr;
 
-    if (op_type == "mul") {
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
-      std::string Y = op_desc.Input("Y")[0];
-      if (parameters.count(Y)) {
-        it = Registry<AnakinOpConverter>::Global().Lookup("fc");
-      }
-    }
-
     if (!it) {
       it = Registry<AnakinOpConverter>::Global().Lookup(op_type);
     }
diff --git a/paddle/fluid/inference/anakin/convert/relu.cc b/paddle/fluid/inference/anakin/convert/relu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ce96db1804a3d6d6d1afac79e4e1fc55ed4c35d
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/relu.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/relu.h"
+#include <algorithm>
+#include <map>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
+                                 const framework::Scope &scope,
+                                 bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  auto input_name = op_desc.Input("X").front();
+  auto output_name = op_desc.Output("Out").front();
+
+  engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
+  engine_->AddOpAttr(op_name, "alpha", 0);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/relu.h b/paddle/fluid/inference/anakin/convert/relu.h
new file mode 100644
index 0000000000000000000000000000000000000000..54c4c2316eb32ef70696a2477211008e04892552
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/relu.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
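+//
+// ReLU is split out of ActivationOpConverter so it can target Anakin's
+// dedicated "ReLU" op, which takes an explicit alpha (0 for plain relu).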
+
+#pragma once
+
+#include <map>
+#include <string>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class ReluOpConverter : public AnakinOpConverter {
+ public:
+  ReluOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~ReluOpConverter() {}
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/test_activation_op.cc b/paddle/fluid/inference/anakin/convert/test_activation_op.cc
index 356dfea29f65e40fdcbddc7e6418e70308ee365f..6a81ec54ec4ef0b1b6fc6f0e51ecfb385bde082a 100644
--- a/paddle/fluid/inference/anakin/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc
@@ -41,16 +41,13 @@ static void test_activation_op(const std::string &op_type) {
   validator.Execute(5);
 }
 
-TEST(relu_op, test) { test_activation_op("relu"); }
 TEST(sigm_op, test) { test_activation_op("sigmoid"); }
 TEST(tanh_op, test) { test_activation_op("tanh"); }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(relu);
 USE_OP(sigmoid);
 USE_OP(tanh);
-USE_ANAKIN_CONVERTER(relu);
 USE_ANAKIN_CONVERTER(sigmoid);
 USE_ANAKIN_CONVERTER(tanh);
diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc
index 780e9acee123babd2224faeb883fc0d1d41d6ea3..0944e8dbdc90c35c7be7b1ed83ff673a3c74f3e3 100644
--- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc
@@ -25,10 +25,10 @@ TEST(concat_op, test) {
   std::unordered_set<std::string> parameters({""});
   framework::Scope scope;
   AnakinConvertValidation validator(parameters, scope);
-  validator.DeclInputVar("concat_x1", {1, 10, 3, 1});
-  validator.DeclInputVar("concat_x2", {1, 3, 3, 1});
-  validator.DeclInputVar("concat_x3", {1, 7, 3, 1});
-  validator.DeclOutputVar("concat_out", {1, 20, 3, 1});
+  validator.DeclInputVar("concat_x1", {1, 2, 1, 1});
+  validator.DeclInputVar("concat_x2", {1, 3, 1, 1});
+  validator.DeclInputVar("concat_x3", {1, 1, 1, 1});
+  validator.DeclOutputVar("concat_out", {1, 6, 1, 1});
 
   // Prepare Op description
   framework::OpDesc desc;
diff --git a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
index 3049d3d033460e09c1209f5d79f8d73e6ae67974..76f11c7b958793c5a323df48247b87a9e480d68b 100644
--- a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
@@ -28,9 +28,9 @@ TEST(conv2d_op, test) {
   std::unordered_set<std::string> parameters({"conv2d-Y"});
   framework::Scope scope;
   AnakinConvertValidation validator(parameters, scope);
-  validator.DeclInputVar("conv2d-X", {1, 2, 5, 5});
-  validator.DeclParamVar("conv2d-Y", {3, 2, 3, 3});
-  validator.DeclOutputVar("conv2d-Out", {1, 3, 5, 5});
+  validator.DeclInputVar("conv2d-X", {1, 3, 3, 3});
+  validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1});
+  validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3});
 
   // Prepare Op description
   framework::OpDesc desc;
@@ -40,7 +40,7 @@ TEST(conv2d_op, test) {
   desc.SetOutput("Output", {"conv2d-Out"});
 
   const std::vector<int> strides({1, 1});
-  const std::vector<int> paddings({1, 1});
+  const std::vector<int> paddings({0, 0});
   const std::vector<int> dilations({1, 1});
   const int groups = 1;
 
diff --git a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a4298500c4659002cc5a36c8817d42d041082755
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
@@ -0,0 +1,50 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+TEST(elementwise_op, native) {
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  AnakinConvertValidation validator(parameters, scope);
+  validator.DeclInputVar("elementwise_add_x", {1, 1, 2, 2});
+  validator.DeclInputVar("elementwise_y", {1, 1, 2, 2});
+  validator.DeclOutputVar("elementwise_out", {1, 1, 2, 2});
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("elementwise_add");
+  desc.SetInput("X", {"elementwise_add_x"});
+  desc.SetInput("Y", {"elementwise_y"});
+  desc.SetOutput("Out", {"elementwise_out"});
+
+  int axis = -1;
+  desc.SetAttr("axis", axis);
+
+  validator.SetOp(*desc.Proto());
+  validator.Execute(1);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(elementwise_add);
+USE_ANAKIN_CONVERTER(elementwise_add);
diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc
index b4e20e1e4708313e9f37c84073e6f418eee1522c..c72974cd596cc04cd85479c25f287c7e159cf7c8 100644
--- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc
@@ -27,9 +27,9 @@ TEST(fc_op, test) {
   std::unordered_set<std::string> parameters({"mul_y"});
   framework::Scope scope;
   AnakinConvertValidation validator(parameters, scope);
-  validator.DeclInputVar("mul_x", {1, 1, 1, 1});
-  validator.DeclParamVar("mul_y", {1, 2});
-  validator.DeclOutputVar("mul_out", {1, 1, 1, 2});
+  validator.DeclInputVar("mul_x", {1, 1, 2, 2});
+  validator.DeclParamVar("mul_y", {4, 2});
+  validator.DeclOutputVar("mul_out", {1, 2});
 
   // Prepare Op description
   framework::OpDesc desc;
@@ -37,8 +37,8 @@ TEST(fc_op, test) {
   desc.SetInput("X", {"mul_x"});
   desc.SetInput("Y", {"mul_y"});
   desc.SetOutput("Out", {"mul_out"});
-  int num_flatten_dims = 3;
-  desc.SetAttr("x_num_col_dims", num_flatten_dims);
+  // int num_flatten_dims = 3;
+  // desc.SetAttr("x_num_col_dims", num_flatten_dims);
 
   validator.SetOp(*desc.Proto());
   validator.Execute(10);
diff --git a/paddle/fluid/inference/anakin/convert/test_relu_op.cc b/paddle/fluid/inference/anakin/convert/test_relu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1695c0dbf057d0a22c393b17c5da0278d70abce8
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc
@@ -0,0 +1,50 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+#include "paddle/fluid/inference/anakin/convert/relu.h"
+#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+static void test_activation_op(const std::string &op_type) {
+  auto *converter = Registry<AnakinOpConverter>::Global().Lookup(op_type);
+  PADDLE_ENFORCE(converter != nullptr);
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  AnakinConvertValidation validator(parameters, scope);
+  validator.DeclInputVar("act-X", {10, 6, 1, 1});
+  validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
+  framework::OpDesc desc;
+  desc.SetType(op_type);
+  desc.SetInput("X", {"act-X"});
+  desc.SetOutput("Out", {"act-Out"});
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(5);
+}
+
+TEST(relu_op, test) { test_activation_op("relu"); }
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(relu);
+USE_ANAKIN_CONVERTER(relu);
diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h
index e4fc72a25395f740c58bda1164f658a9a66e0546..621b9c601ffc67f3b2b8e2fd0ddff3318a4392fc 100644
--- a/paddle/fluid/inference/anakin/convert/ut_helper.h
+++ b/paddle/fluid/inference/anakin/convert/ut_helper.h
@@ -161,10 +161,6 @@ class AnakinConvertValidation {
     framework::TensorToVector(*tensor, ctx, &fluid_out);
     fluid_outputs.push_back(fluid_out);
 
-    // size_t fluid_out_size = fluid_out.size();
-    /*for (size_t i = 0; i < fluid_out_size; i++) {
-      std::cout << fluid_out[i] << std::endl;
-    }*/
     outputs.insert({output, tensor});
   }
 
@@ -180,8 +176,7 @@ class AnakinConvertValidation {
     size_t anakin_out_size = anakin_out.size();
     auto fluid_out = fluid_outputs[i_output++];
     for (size_t i = 0; i < anakin_out_size; i++) {
-      LOG(INFO) << "Output[" << i << "]: anakin[" << anakin_out[i] << "], "
-                << "fluid[" << fluid_out[i] << "]";
+      EXPECT_LT(std::abs(fluid_out[i] - anakin_out[i]), 1e-3);
     }
   }
 }
diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc
index 6549991474f4834f0c3ef74c60d294cca6bebc91..fb95bd03e07470973c59ee752d7f6187ad2a19db 100644
--- a/paddle/fluid/inference/anakin/engine.cc
+++ b/paddle/fluid/inference/anakin/engine.cc
@@ -68,29 +68,34 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     auto *tensor = input.second;
     auto *data = tensor->data<float>();
     auto shape = framework::vectorize2int(tensor->dims());
-    ::anakin::saber::Shape anakin_shape(shape);
     auto *anakin_input = net_->get_in(input.first);
+    auto anakin_input_shape = anakin_input->valid_shape();
+    PADDLE_ENFORCE_EQ(tensor->numel(), anakin_input_shape.count(),
+                      "the fluid input size should be equal to the anakin input size");
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       anakin_shape);
-    anakin_input->share_from(tmp_anakin_tensor);
+                                                       anakin_input_shape);
+    anakin_input->copy_from(tmp_anakin_tensor);
   }
 
   for (const auto &output : outputs) {
     auto *tensor = output.second;
    auto *data = tensor->data<float>();
     auto shape = framework::vectorize2int(tensor->dims());
-    ::anakin::saber::Shape anakin_shape(shape);
     auto *anakin_output = net_->get_out(output.first);
+    auto anakin_output_shape = anakin_output->valid_shape();
+    PADDLE_ENFORCE_EQ(tensor->numel(), anakin_output_shape.count(),
+                      "the fluid output size should be equal to the anakin output size");
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       anakin_shape);
+                                                       anakin_output_shape);
     anakin_output->share_from(tmp_anakin_tensor);
   }
   net_->prediction();
+  cudaDeviceSynchronize();
 }
 
 template <typename TargetT, ::anakin::Precision PrecisionType, ::anakin::OpRunType RunType>
 void AnakinEngine<TargetT, PrecisionType, RunType>::Freeze() {
-  PADDLE_ENFORCE(graph_->Freeze(), "Freeze anakin subgraph.");
+  PADDLE_ENFORCE(graph_->Freeze_v3(), "Freeze anakin subgraph.");
 }
 
 template <typename TargetT, ::anakin::Precision PrecisionType, ::anakin::OpRunType RunType>
diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h
index d8f32f57be5aabb91ba720c6457a03f15083db43..408ad3b9f62f08cef0e3faa967aa1e71c2829408 100644
--- a/paddle/fluid/inference/anakin/engine.h
+++ b/paddle/fluid/inference/anakin/engine.h
@@ -46,6 +46,9 @@ namespace anakin {
 
 template <typename TargetT, ::anakin::Precision PrecisionType, ::anakin::OpRunType RunType>
 class AnakinEngine {
+  using NetT = ::anakin::Net<TargetT, PrecisionType, RunType>;
+  using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;
+
  public:
   explicit AnakinEngine(bool need_summary = false);
   ~AnakinEngine();
@@ -61,16 +64,15 @@ class AnakinEngine {
     PADDLE_ENFORCE(graph_->AddOpAttr(op_name, attr_name, attr_value),
                    "Add operation's attribution.");
   }
-
+  NetT *Net() { return net_.get(); }
   std::unique_ptr<AnakinEngine> Clone();
   void Freeze();
   void Optimize();
+  void Save(std::string path) { graph_->save(path); }
   void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
                const std::map<std::string, framework::LoDTensor *> &outputs);
 
 private:
-  using NetT = ::anakin::Net<TargetT, PrecisionType, RunType>;
-  using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;
   std::unique_ptr<GraphT> graph_;
   std::unique_ptr<NetT> net_;
 };
diff --git a/paddle/fluid/inference/anakin/op_teller.cc b/paddle/fluid/inference/anakin/op_teller.cc
index 928661b0a9cef4b6a0edc8212242dde867f5f172..015bcc066e1621a845fca369553e553da031014f 100644
--- a/paddle/fluid/inference/anakin/op_teller.cc
+++ b/paddle/fluid/inference/anakin/op_teller.cc
@@ -20,7 +20,18 @@ namespace anakin {
 
 // Just tell by the op_types.
 struct SimpleOpTypeSetTeller : public Teller {
-  SimpleOpTypeSetTeller() {}
+  SimpleOpTypeSetTeller() {
+    // teller_set.insert("mul");
+    teller_set.insert("fc");
+    teller_set.insert("conv2d_fusion");
+    teller_set.insert("split");
+    teller_set.insert("relu");
+    teller_set.insert("pool2d");
+    teller_set.insert("elementwise_add");
+    teller_set.insert("concat");
+    teller_set.insert("tanh");
+    // teller_set.insert("conv2d");
+  }
 
   bool operator()(const std::string& op_type,
                   const framework::OpDesc& desc) override {
@@ -28,7 +39,7 @@ struct SimpleOpTypeSetTeller : public Teller {
   }
 
  private:
-  std::unordered_set<std::string> teller_set{{"mul"}};
+  std::unordered_set<std::string> teller_set;
 };
 
 bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 8fd86b2cc56c4af50e735be2d660ec3db23e1547..07423a148553d7fd3cc0f0c2568460c4228fc096 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -13,8 +13,11 @@
 // limitations under the License.
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include #include #include +#include +#include #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" @@ -63,7 +66,14 @@ void IRPassManager::CreatePasses(Argument *argument, } else if (pass_name == "cpu_quantize_pass") { pass->Set("quant_var_scales", new VarQuantScale(argument->quant_var_scales())); - } else if (pass_name == "tensorrt_subgraph_pass") { + } + + if (pass_name == "anakin_subgraph_pass") { + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); + } + + if (pass_name == "tensorrt_subgraph_pass") { pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 410a90132aa7657a23b858570763547fe53730a0..bd9f08d84e418359f685a095766f0f3e9b44f645 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -14,3 +14,15 @@ if (WITH_GPU AND TENSORRT_FOUND) file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") endif() + +if (WITH_ANAKIN_SUBGRAPH) + cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller) + + set(analysis_deps ${analysis_deps} + subgraph_detector anakin_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) + file(APPEND ${pass_file} "USE_PASS(anakin_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} anakin_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f57b89baab758293e421318f75aa74f798c4f233 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -0,0 +1,273 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/inference/anakin/op_teller.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h"
+#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+using framework::ir::Node;
+
+std::vector<std::string> ExtractAnakinParameters(
+    const std::unordered_set<Node *> &nodes);
+
+std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
+    std::unique_ptr<framework::ir::Graph> graph) const {
+  framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get());
+
+  auto teller = [](const framework::ir::Node *node) {
+    if (!node->IsOp() || !node->Op()) return false;
+    return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
+  };
+
+  SubGraphFuser fuser(graph.get(), teller, 0);
+  fuser();
+
+  for (auto *node : graph->Nodes()) {
+    if (node->IsOp() && !Agent(node).subgraph()->empty()) {
+      CreateAnakinOp(node, graph.get());
+      std::unordered_set<const Node *> nodes2remove(
+          Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
+      framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+    }
+  }
+
+  std::unordered_set<const Node *> nodes2remove;
+  for (auto *node : graph->Nodes()) {
+    if (node->IsOp() && Agent(node).deleted()) {
+      nodes2remove.insert(node);
+    }
+  }
+  framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+
+  return graph;
+}
+
+std::string GenerateAnakinEngineKey(
+    const std::set<std::string> &engine_inputs,
+    const std::set<std::string> &engine_outputs) {
+  std::string engine_hash_key = "";
+  for (auto name : engine_inputs) {
+    engine_hash_key += name;
+  }
+  for (auto name : engine_outputs) {
+    engine_hash_key += name;
+  }
+  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
+  return engine_key;
+}
+
+void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node,
+                                        Graph *graph) const {
+  auto *op_desc = node->Op();
+  auto &subgraph = *Agent(node).subgraph();
+  PADDLE_ENFORCE(!subgraph.empty());
+
+  framework::ProgramDesc *program_desc =
+      Get<framework::ProgramDesc *>("program");
+  // Add a new block for the AnakinEngineOp.
+  const framework::BlockDesc &main_block =
+      program_desc->Block(framework::kRootBlockIndex);
+  // const framework::BlockDesc& main_block = program_desc->Block(0);
+  framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
+
+  // A fake block desc.
+  framework::proto::BlockDesc block_proto;
+  framework::BlockDesc block_desc(nullptr, &block_proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+  string::PrettyLogDetail("--- detect a sub-graph with %d nodes",
+                          subgraph.size());
+
+  for (auto *node : subgraph) {
+    auto *new_block_op = new_block->AppendOp();
+    auto *op = block_desc.AppendOp();
+    *new_block_op->Proto() = *node->Op()->Proto();
+    *op->Proto() = *node->Op()->Proto();
+  }
+
+  // Then, we will use the input_names_with_id and output_names_with_id to
+  // generate the engine key.
+  // So, we use set instead of unordered_set here to ensure that the engine key
+  // is unique.
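+  // e.g. (hypothetical) inputs {"x0", "y1"} and outputs {"z2"} concatenate
+  // to "x0y1z2", whose std::hash value becomes the engine key used below.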
+  std::set<std::string> input_names;
+  std::set<std::string> input_names_with_id;
+  for (auto *x : node->inputs) {
+    input_names.insert(x->Name());
+    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+  }
+  op_desc->SetInput(
+      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
+
+  std::set<std::string> output_names;
+  std::set<std::string> output_names_with_id;
+  for (auto *x : node->outputs) {
+    output_names.insert(x->Name());
+    output_names_with_id.insert(x->Name() + std::to_string(x->id()));
+  }
+
+  op_desc->SetOutput(
+      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
+  op_desc->SetType("anakin_engine");
+
+  std::unordered_map<std::string, std::string> output_name_map;
+
+  // The following procedure is used to rename all the intermediate
+  // variables and the output variables of the subgraph.
+  // Why do we do this?
+  // During the transition from a fluid OP to an anakin OP, we map
+  // the input and output Tensor (fluid data structure) of the fluid OP
+  // to the corresponding anakin tensor through the Tensor name.
+  // When we set up an anakin tensor for a variable, we must
+  // ensure that it has not been set before.
+  // If a variable in the fluid graph is not only the input of one OP
+  // but also the output of another OP, there will be problems.
+  // So we have to rename the variable in the subgraph to make sure
+  // it is either an OP's input or an OP's output.
+
+  auto &subgraph_nodes = *Agent(node).subgraph();
+  for (size_t index = 0; index < block_desc.OpSize(); ++index) {
+    framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
+    auto correspond_node = subgraph_nodes[index];
+    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
+
+    std::unordered_map<std::string, int> var2id;
+    for (auto *in_var : correspond_node->inputs) {
+      var2id[in_var->Name()] = in_var->id();
+    }
+    // rename for the input variables of op inside subgraph
+    for (int i = 0; i < op->inputs_size(); i++) {
+      // one input
+      auto *in_var = op->mutable_inputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < in_var->arguments_size(); k++) {  // all the arguments
+        std::string arg_value = in_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (input_names_with_id.count(arg_value_with_id)) {
+          replaced_names.push_back(arg_value);
+        } else {
+          replaced_names.push_back(arg_value_with_id);
+        }
+      }
+      in_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        in_var->add_arguments(replaced_names[k]);
+      }
+    }
+    var2id.clear();
+    for (auto out_var : correspond_node->outputs) {
+      var2id[out_var->Name()] = out_var->id();
+    }
+
+    // rename for the output variables of op inside subgraph
+    for (int i = 0; i < op->outputs_size(); i++) {
+      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < out_var->arguments_size(); k++) {
+        std::string arg_value = out_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (output_names_with_id.count(arg_value_with_id)) {
+          output_name_map[arg_value] = arg_value_with_id;
+        }
+        replaced_names.push_back(arg_value_with_id);
+      }
+      out_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        out_var->add_arguments(replaced_names[k]);
+      }
+    }
+  }
+
+  // When the anakin engine runs at the end of the operation,
+  // output_mapping helps us copy the data from the renamed anakin tensor
+  // back to the fluid Tensor.
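+  // e.g. (hypothetical) if subgraph output "fc_out" was renamed to "fc_out5",
+  // output_mapping records "fc_out5" so the engine op can look up the anakin
+  // tensor under the renamed key and write it back to the fluid var "fc_out".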
+  std::vector<std::string> output_mapping;
+  for (auto name : output_names) {
+    PADDLE_ENFORCE(output_name_map.count(name) != 0);
+    output_mapping.push_back(output_name_map[name]);
+  }
+
+  auto *vars = block_desc.Proto()->mutable_vars();
+  for (framework::ir::Node *node : graph->Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      *vars->Add() = *node->Var()->Proto();
+    }
+  }
+
+  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
+                 "the block has no var-desc");
+  PADDLE_ENFORCE(!output_mapping.empty());
+  op_desc->SetBlockAttr("sub_block", new_block);
+  SetAttr(op_desc->Proto(), "subgraph",
+          block_desc.Proto()->SerializeAsString());
+  // Set attrs
+  SetAttr(op_desc->Proto(), "parameters",
+          ExtractAnakinParameters(graph->Nodes()));
+  SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+  auto engine_key =
+      GenerateAnakinEngineKey(input_names_with_id, output_names_with_id);
+
+  SetAttr(op_desc->Proto(), "engine_key", engine_key);
+}
+
+std::vector<std::string> ExtractAnakinParameters(
+    const std::unordered_set<Node *> &nodes) {
+  // We can judge whether a variable is a parameter by
+  // its persistable property, but sometimes the persistable
+  // of the feed op output is true, so we have to identify it.
+  std::vector<std::string> feed_outputs;
+  for (const auto &node : nodes) {
+    if (!node->IsOp()) continue;
+    std::string op_type = node->Op()->Type();
+    if (op_type == "feed") {
+      std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
+      std::copy(output_names.begin(), output_names.end(),
+                std::back_inserter(feed_outputs));
+    }
+  }
+
+  std::vector<std::string> parameters;
+  for (const auto &node : nodes) {
+    if (!node->IsVar()) continue;
+    if (node->Var()->Persistable() &&
+        std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) ==
+            feed_outputs.end()) {
+      parameters.push_back(node->Name());
+    }
+  }
+  return parameters;
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_PASS(anakin_subgraph_pass,
+              paddle::inference::analysis::AnakinSubgraphPass);
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f45ff579d51ca3626a926e9d7e6a38e1d7225e6
--- /dev/null
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
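+//
+// Declares the IR pass that replaces Anakin-supported subgraphs with
+// anakin_engine ops; see anakin_subgraph_pass.cc for the implementation.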
+
+#pragma once
+#include <paddle/fluid/framework/ir/fuse_pass_base.h>
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class AnakinSubgraphPass : public framework::ir::FusePassBase {
+ public:
+  std::unique_ptr<framework::ir::Graph> ApplyImpl(
+      std::unique_ptr<framework::ir::Graph> graph) const override;
+
+ private:
+  void CreateAnakinOp(framework::ir::Node *x,
+                      framework::ir::Graph *graph) const;
+  void CleanIntermediateOutputs(framework::ir::Node *node);
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
index 96befe7f8a5d16402338ac337daa96d714b4d310..76b1671601eec95d64b36effc5727481dcd070e2 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
 #include <string>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -418,7 +420,7 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() {
     // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
     // as deleted. 3. Replace the deleted node with the new Block Node.
     framework::OpDesc empty_desc;
-    empty_desc.SetType("tensorrt_engine");
+    empty_desc.SetType("anakin_engine");
     auto *block_node = graph_->CreateOpNode(&empty_desc);
     Agent(block_node).set_subgraph({});
     auto io = ExtractInputAndOutputOfSubGraph(subgraph);
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index d13ec7608c3e8075c1ef62fd4d47fbeee06e9005..8360963f7366e7cd192e55d307669d4915b065de 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -30,6 +30,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // The parameters are on the cpu, therefore, synchronization is not necessary.
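+  // NOTE: the unconditional return below disables this pass for now; the
+  // Anakin converters copy the weights they need onto the device themselves.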
   if (!argument->use_gpu()) return;
+  return;
 
   auto &graph = argument->main_graph();
   std::vector<std::string> repetitive_params;
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 85755fc471ae3d37ec5d005882668ccf0c35b354..38313754ea98dcf4b65098d3966c131a3fc98518 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -27,6 +27,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
   set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
 endif()
 
+if (WITH_ANAKIN_SUBGRAPH)
+  set(inference_deps ${inference_deps} anakin_op_converter anakin_engine)
+endif()
+
 add_subdirectory(details)
 
 cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index b58c60e96a0bd6695b827e7063fa7a07f42fe586..aee1d951a1e4d7407c0affb1b65aaa1470c0bf1b 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -43,6 +43,8 @@
 #endif
 
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
 DECLARE_bool(profile);
 
 namespace paddle {
@@ -805,3 +807,14 @@ USE_TRT_CONVERTER(prelu);
 USE_TRT_CONVERTER(conv2d_transpose);
 USE_TRT_CONVERTER(leaky_relu);
 #endif
+
+USE_ANAKIN_CONVERTER(fc);
+USE_ANAKIN_CONVERTER(conv2d);
+USE_ANAKIN_CONVERTER(concat);
+USE_ANAKIN_CONVERTER(split);
+USE_ANAKIN_CONVERTER(relu);
+USE_ANAKIN_CONVERTER(sigmoid);
+USE_ANAKIN_CONVERTER(tanh);
+USE_ANAKIN_CONVERTER(pool2d);
+USE_ANAKIN_CONVERTER(conv2d_fusion);
+USE_ANAKIN_CONVERTER(elementwise_add);
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 651c5e6e75834c27313abd79a33bedb62ecd2632..8ccb6d5a1c02f3624507857a77da08e894f8820f 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -34,6 +34,10 @@ if (WITH_GPU AND TENSORRT_FOUND)
   add_subdirectory(tensorrt)
 endif()
 
+if (WITH_ANAKIN_SUBGRAPH)
+  add_subdirectory(anakin)
+endif()
+
 SET(OP_HEADER_DEPS xxhash)
 if (WITH_GPU)
   SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
diff --git a/paddle/fluid/operators/anakin/CMakeLists.txt b/paddle/fluid/operators/anakin/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5eacefc645bab288da7c289a5d7701abbcbef03d
--- /dev/null
+++ b/paddle/fluid/operators/anakin/CMakeLists.txt
@@ -0,0 +1,2 @@
+op_library(anakin_engine_op DEPS anakin_engine anakin_op_converter)
+# file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(anakin_engine);\n")
diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.cc b/paddle/fluid/operators/anakin/anakin_engine_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48b0490d041af5921d7456a3ae2efe7884ac833f
--- /dev/null
+++ b/paddle/fluid/operators/anakin/anakin_engine_op.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifdef PADDLE_WITH_CUDA
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/anakin/anakin_engine_op.h"
+
+namespace paddle {
+
+namespace operators {
+
+class AnakinEngineOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Xs", "A list of inputs.").AsDuplicable();
+    AddOutput("Ys", "A list of outputs").AsDuplicable();
+    AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<std::string>(
+        "engine_key",
+        "The engine_key here is used to distinguish different anakin engines");
+    AddAttr<framework::BlockDesc *>("sub_block", "the anakin block");
+    AddComment("Anakin engine operator.");
+  }
+};
+
+class AnakinEngineInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(anakin_engine, ops::AnakinEngineOp, ops::AnakinEngineOpMaker,
+                  ops::AnakinEngineOpMaker);
+
+#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..93569367ffd08d0d72dbad2dc7561812d7727d95
--- /dev/null
+++ b/paddle/fluid/operators/anakin/anakin_engine_op.h
@@ -0,0 +1,152 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#pragma once
+
+#ifdef PADDLE_WITH_CUDA
+
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+#include "paddle/fluid/inference/anakin/engine.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+
+namespace paddle {
+namespace operators {
+
+using FluidDT = framework::proto::VarType_Type;
+using inference::Singleton;
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+using inference::anakin::AnakinEngine;
+
+class AnakinEngineOp : public framework::OperatorBase {
+  using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
+
+ private:
+  std::vector<std::string> input_names_;
+  std::unordered_set<std::string> param_names_;
+  mutable std::unique_ptr<AnakinNvEngineT> anakin_engine_;
+  std::string engine_key_;
+
+ public:
+  AnakinEngineOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {
+    input_names_ = Inputs("Xs");
+    engine_key_ = Attr<std::string>("engine_key");
+    auto params = Attr<std::vector<std::string>>("parameters");
+    for (const auto &param : params) {
+      param_names_.insert(param);
+    }
+  }
+
+ protected:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    RunAnakin(scope, dev_place);
+  }
+
+  void RunAnakin(const framework::Scope &scope,
+                 const platform::Place &dev_place) const {
+    if (anakin_engine_.get() == nullptr) {
+      anakin_engine_.reset(new AnakinNvEngineT(true));
+      Prepare(scope, dev_place, anakin_engine_.get());
+    }
+
+    auto *engine = anakin_engine_.get();
+    PADDLE_ENFORCE(!input_names_.empty(), "should pass at least one input");
+
+    std::vector<std::string> output_maps =
+        Attr<std::vector<std::string>>("output_name_mapping");
+
+    std::map<std::string, framework::LoDTensor *> inputs;
+    // Convert input tensor from fluid to engine.
+    for (const auto &x : Inputs("Xs")) {
+      if (param_names_.count(x)) continue;
+      auto &t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+      auto t_shape = framework::vectorize(t.dims());
+      inputs.insert({x, &t});
+    }
+
+    std::map<std::string, framework::LoDTensor *> outputs;
+    int output_index = 0;
+    for (const auto &y : Outputs("Ys")) {
+      std::vector<int> ddim =
+          engine->Net()->get_out(output_maps[output_index])->valid_shape();
+      // we need to get the output shape from the anakin net.
+      auto *fluid_v = scope.FindVar(y);
+      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
+      auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
+      fluid_t->Resize(framework::make_ddim(ddim));
+      fluid_t->mutable_data<float>(boost::get<platform::CUDAPlace>(dev_place));
+      outputs.insert({output_maps[output_index], fluid_t});
+      output_index += 1;
+    }
+    engine->Execute(inputs, outputs);
+  }
+
+  void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
+               AnakinNvEngineT *engine) const {
+    LOG(INFO) << "Prepare Anakin engine (Optimize model structure, Select OP "
+                 "kernel etc). This process may cost a lot of time.";
+    framework::proto::BlockDesc block_desc;
+    block_desc.ParseFromString(Attr<std::string>("subgraph"));
+
+    std::vector<std::string> output_maps =
+        Attr<std::vector<std::string>>("output_name_mapping");
+
+    inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
+        .ConvertBlock(block_desc, param_names_, scope, engine);
+    engine->Freeze();
+    engine->Optimize();
+
+    for (const auto &x : Inputs("Xs")) {
+      if (param_names_.count(x)) continue;
+      auto &t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+      auto t_shape = framework::vectorize2int(t.dims());
+      // all input shapes should be 4-dim
+      if (t_shape.size() == 2) {
+        t_shape.push_back(1);
+        t_shape.push_back(1);
+      }
+      engine->SetInputShape(x, t_shape);
+    }
+    engine->InitGraph();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#endif  // PADDLE_WITH_CUDA