diff --git a/CMakeLists.txt b/CMakeLists.txt index f4d515c6e7017d79d39e88cd6142801231a6a99f..a38e32b73d51f142a1a1379541ffbd922662561a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,7 +66,6 @@ option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. option(WITH_ANAKIN "Compile with Anakin library" OFF) -option(WITH_ANAKIN_SUBGRAPH "Compile with Anakin subgraph library" OFF) option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF) option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) @@ -192,6 +191,7 @@ include(configure) # add paddle env configuration if(WITH_GPU) include(cuda) include(tensorrt) + include(anakin_subgraph) endif() if(WITH_MKL OR WITH_MKLML) include(external/anakin) diff --git a/cmake/anakin_subgraph.cmake b/cmake/anakin_subgraph.cmake new file mode 100644 index 0000000000000000000000000000000000000000..4a7d32a63553df31e0928e7b30249ff3e809cba1 --- /dev/null +++ b/cmake/anakin_subgraph.cmake @@ -0,0 +1,32 @@ +if(NOT WITH_GPU) + return() +endif() + +set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT") +find_path(ANAKIN_INCLUDE_DIR anakin_config.h + PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include + $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include + NO_DEFAULT_PATH +) + +find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so + PATHS ${ANAKIN_ROOT} + $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib + NO_DEFAULT_PATH + DOC "Path to ANAKIN library.") + +if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY) + if(WITH_DSO) + set(ANAKIN_FOUND ON) + endif(WITH_DSO) +else() + set(ANAKIN_FOUND OFF) +endif() + +if(ANAKIN_FOUND) + message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ") + include_directories(${ANAKIN_ROOT}/include) + include_directories(${ANAKIN_ROOT}/include/saber) + link_directories(${ANAKIN_ROOT}) + add_definitions(-DPADDLE_WITH_ANAKIN) +endif() diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 891ff222633741f9894c2fdb6c0096a48f8a35e1..3bf12094e4c32e69f908cbe6cefc7871fc9bb568 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -33,5 +33,6 @@ if(TENSORRT_FOUND) message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. 
") include_directories(${TENSORRT_INCLUDE_DIR}) + link_directories(${TENSORRT_LIBRARY}) add_definitions(-DPADDLE_WITH_TENSORRT) endif() diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 5ee12aab961d025ef4715cc05959727d6ff8902a..4cd29486a8e846dc04fbc4e467f2c40782408dfa 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -17,7 +17,7 @@ if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -if (WITH_ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) add_subdirectory(anakin) endif() diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt index 1646c7d16ec3d99fd82d824e9ea7d9819fa62253..e8fb56590563f49f920bfe71d160ec822cb3ca30 100644 --- a/paddle/fluid/inference/anakin/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(anakin_engine SRCS engine.cc) +cc_library(anakin_engine SRCS engine.cc DEPS framework_proto) cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto) target_link_libraries(anakin_engine anakin anakin_saber_common) cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine) diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt index 9cfe6671ff720e37450ff71b9822df78054147c9..1e7f5ac799de0d7a1debec0529d262f021bba790 100644 --- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt @@ -1,19 +1,19 @@ cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry) -cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op) -cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv) -cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter) -cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling) -cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split) -cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split) -cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op) -cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL) -cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax) -cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op) -cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op) -cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op) -cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op) -cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op) -cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col) -cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor) +cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL) 
+cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL) +cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL) +cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling SERIAL) +cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split SERIAL) +cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split SERIAL) +cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op SERIAL) +cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL) +cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax SERIAL) +cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op SERIAL) +cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op SERIAL) +cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL) +cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL) +cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL) +#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col) +cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor SERIAL) diff --git a/paddle/fluid/inference/anakin/convert/test_activation_op.cc b/paddle/fluid/inference/anakin/convert/test_activation_op.cc index 6a81ec54ec4ef0b1b6fc6f0e51ecfb385bde082a..8bedd4a749a645829658291310347eeed1c0ea49 100644 --- a/paddle/fluid/inference/anakin/convert/test_activation_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc @@ -26,7 +26,7 @@ static void test_activation_op(const std::string &op_type) { PADDLE_ENFORCE(converter != nullptr); std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("act-X", {10, 6, 1, 1}); validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); framework::OpDesc desc; diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc index 2e438dd7241e3c865404806d41a01fc9c4c33bc7..2832e1c8d167c646c9049beebc57a82fe416e62c 100644 --- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc @@ -24,7 +24,7 @@ TEST(batch_norm_op, test) { {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean", "batch_norm_variance"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); std::vector param_shape{2}; validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5}); diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc index 8c6949cb2914bdc6cd81155c63d4850d5fd57132..ecf44def5a2429360f0bcb92f00a0423e1d491cd 100644 --- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc @@ -24,7 +24,7 @@ namespace anakin { TEST(concat_op, test) { std::unordered_set parameters({""});
framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("concat_x1", {1, 2, 1, 1}); validator.DeclInputVar("concat_x2", {1, 3, 1, 1}); validator.DeclInputVar("concat_x3", {1, 1, 1, 1}); @@ -47,7 +47,7 @@ TEST(concat_op, test) { TEST(concat_op, test2) { std::unordered_set parameters({""}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("concat_x1", {1, 4}); validator.DeclInputVar("concat_x2", {3, 4}); validator.DeclInputVar("concat_x3", {2, 4}); diff --git a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc index 76f11c7b958793c5a323df48247b87a9e480d68b..6d93e50bc96b08b6ef7dd7c9d836038e335daae3 100644 --- a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc @@ -27,7 +27,7 @@ TEST(conv2d_op, test) { ASSERT_TRUE(conv2d_converter != nullptr); std::unordered_set parameters({"conv2d-Y"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("conv2d-X", {1, 3, 3, 3}); validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1}); validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3}); diff --git a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc index ded279bf4c474523669a499fd40a3e4cac784344..b2de5ae0a6e58eb25a4588571686a25500fe546c 100644 --- a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc @@ -24,7 +24,7 @@ namespace anakin { TEST(dropout_op, native) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("x", {1, 1, 2, 2}); validator.DeclOutputVar("out", {1, 1, 2, 2}); validator.DeclOutputVar("mask", {1, 1, 2, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc index eec8b96f74f1a959d02c7a8a179148bd89120083..3a437f5fdb565609667b7a862c9b2bb13cdbeded 100644 --- a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc @@ -24,7 +24,7 @@ namespace anakin { static void test_elementwise_op(const std::string &op_type) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("x", {1, 1, 2, 2}); validator.DeclInputVar("y", {1, 1, 2, 2}); validator.DeclOutputVar("out", {1, 1, 2, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc index c72974cd596cc04cd85479c25f287c7e159cf7c8..87bce36403fab59fb38697478ed68ff8e68fed7a 100644 --- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc @@ -26,7 +26,7 @@ TEST(fc_op, test) { std::unordered_set parameters({"mul_y"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("mul_x", {1, 1, 2, 2}); validator.DeclParamVar("mul_y", {4, 2}); 
validator.DeclOutputVar("mul_out", {1, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc index 381fab786509a2a89c4f8bf8d5bde2e075f640b7..d13281f11f03fdd75e585bce8b30e8780d81f7d7 100644 --- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc @@ -26,7 +26,7 @@ TEST(flatten_op, test) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("flatten-X", {3, 10, 10, 4}); validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1}); framework::OpDesc desc; diff --git a/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc b/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc index 2f6c486298025798a50b8b4e280a6d34f9f52abb..5e5764633125c867e27b0b52e0e6ef18714653b2 100644 --- a/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc @@ -24,7 +24,7 @@ namespace anakin { TEST(im2sequence_op, native) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); std::vector kernels = {6, 1}; std::vector strides = {1, 1}; diff --git a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc index 95cb4194957894dea1524ba8ed0a44973870f31b..1ac019467721605c539c7ada452d04d5134fa341 100644 --- a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc @@ -27,7 +27,7 @@ void test_pool2d(bool global_pooling, bool ceil_mode, framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); // The ITensor's Dims should not contain the batch size. // So, the ITensor's Dims of input and output should be C * H * W. @@ -72,7 +72,7 @@ void test_pool2d2(bool global_pooling, bool ceil_mode, framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); // The ITensor's Dims should not contain the batch size. // So, the ITensor's Dims of input and output should be C * H * W. 
diff --git a/paddle/fluid/inference/anakin/convert/test_relu_op.cc b/paddle/fluid/inference/anakin/convert/test_relu_op.cc index 1695c0dbf057d0a22c393b17c5da0278d70abce8..04e624518a5a4477bbb41475b575f85be5a120d4 100644 --- a/paddle/fluid/inference/anakin/convert/test_relu_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc @@ -26,7 +26,7 @@ static void test_activation_op(const std::string &op_type) { PADDLE_ENFORCE(converter != nullptr); std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("act-X", {10, 6, 1, 1}); validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); framework::OpDesc desc; diff --git a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc index a60544e6507e679f86926ce5fa9a35252498644b..306ebf510f29a87ca1ffa6df86e08f86b3f8ffbb 100644 --- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc @@ -25,7 +25,7 @@ TEST(reshape, test) { ASSERT_TRUE(converter); framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); // validator.DeclInputVar("reshape-X", {2, 3, 3, 1}); // validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3}); @@ -48,7 +48,7 @@ TEST(reshape, test) { TEST(reshape, test2) { framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("reshape-X", {1, 2, 4}); validator.DeclOutputVar("reshape-Out", {1, 4, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc index 933e3eb2b24682a193916ecef371bea50d8d70e3..8c14fae0a67b9e488cf072535868a34f6195ab71 100644 --- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc @@ -25,10 +25,10 @@ TEST(softmax, test) { ASSERT_TRUE(converter); framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); - validator.DeclInputVar("softmax-X", {1, 10}); - validator.DeclOutputVar("softmax-Out", {1, 10}); + validator.DeclInputVar("softmax-X", {1, 10, 2}); + validator.DeclOutputVar("softmax-Out", {1, 10, 2}); framework::OpDesc desc; desc.SetType("softmax"); diff --git a/paddle/fluid/inference/anakin/convert/test_split_op.cc b/paddle/fluid/inference/anakin/convert/test_split_op.cc index e5a7823298f905d78ee8adca4e5f67bc7f2585e9..aa61c01a511c2337944aadbbc3d47893487de683 100644 --- a/paddle/fluid/inference/anakin/convert/test_split_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_split_op.cc @@ -26,7 +26,7 @@ void AnakinSliceTest(const std::vector &in_shape, const std::vector §ions) { std::unordered_set parameters({""}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("split_input", in_shape); std::vector output_vars; diff --git a/paddle/fluid/inference/anakin/convert/test_sum_op.cc b/paddle/fluid/inference/anakin/convert/test_sum_op.cc index a93539303bdd01f8756d0daa98871db495b480d0..d6a59a0166be9239b480221cc076069239403429 100644 --- 
a/paddle/fluid/inference/anakin/convert/test_sum_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_sum_op.cc @@ -25,7 +25,7 @@ namespace anakin { TEST(sum, native) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("sum_x1", {1, 2, 1, 2}); validator.DeclInputVar("sum_x2", {1, 2, 1, 2}); validator.DeclOutputVar("sum_out", {1, 2, 1, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc index 67c9c23f1fdc9425ad35005d7c4470ebe25f2184..016ed26f02f782fe5032d8368f7767a5c94dfe9f 100644 --- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc @@ -25,7 +25,7 @@ TEST(transpose_op, test) { ASSERT_TRUE(converter != nullptr); std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("transpose-X", {2, 3, 4, 5}); validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3}); @@ -47,7 +47,7 @@ TEST(transpose_op, test) { TEST(transpose_op, test2) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("transpose-X", {3, 4, 5}); validator.DeclOutputVar("transpose-Out", {3, 5, 4}); diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h index d62d11d25bba37821099492c5c292e44fc566052..e0371d95347a521f499dd9454d284907b3048a04 100644 --- a/paddle/fluid/inference/anakin/convert/ut_helper.h +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -84,7 +84,7 @@ class AnakinConvertValidation { AnakinConvertValidation() = delete; AnakinConvertValidation(const std::unordered_set& parameters, - framework::Scope& scope) + framework::Scope* scope) : parameters_(parameters), scope_(scope), place_(0) { PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); engine_.reset(new AnakinEngine(true)); @@ -108,7 +108,7 @@ class AnakinConvertValidation { void DeclVar(const std::string& name, const std::vector dim_vec) { platform::CUDADeviceContext ctx(place_); - auto* x = scope_.Var(name); + auto* x = scope_->Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); RandomizeTensor(x_tensor, place_, ctx); @@ -120,13 +120,13 @@ class AnakinConvertValidation { // should init anakin engine here. 
Singleton::Global().ConvertOp( - desc, parameters_, scope_, engine_.get(), true /*test_mode*/); + desc, parameters_, *scope_, engine_.get(), true /*test_mode*/); engine_->Freeze(); std::map> temp_max_input_shape; for (const auto& input : op_desc_->InputArgumentNames()) { if (parameters_.count(input)) continue; - auto& t = inference::analysis::GetFromScope(scope_, + auto& t = inference::analysis::GetFromScope(*scope_, input); auto t_shape = framework::vectorize2int(t.dims()); while (t_shape.size() < 4) { @@ -147,14 +147,14 @@ class AnakinConvertValidation { std::unordered_set neglected_output = {}) { // Execute Fluid Op platform::CUDADeviceContext ctx(place_); - op_->Run(scope_, place_); + op_->Run(*scope_, place_); // std::vector input_vector; // std::vector output_vector; std::map inputs; for (const auto& input : op_desc_->InputArgumentNames()) { if (parameters_.count(input)) continue; - auto* var = scope_.FindVar(input); + auto* var = scope_->FindVar(input); auto tensor = var->GetMutable(); inputs.insert({input, tensor}); } @@ -164,7 +164,7 @@ class AnakinConvertValidation { for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; std::vector fluid_out; - auto* var = scope_.FindVar(output); + auto* var = scope_->FindVar(output); auto tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &fluid_out); fluid_outputs.push_back(fluid_out); @@ -177,7 +177,7 @@ class AnakinConvertValidation { for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; std::vector anakin_out; - auto* var = scope_.FindVar(output); + auto* var = scope_->FindVar(output); auto tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &anakin_out); @@ -189,15 +189,13 @@ class AnakinConvertValidation { } } - framework::Scope& scope() { return scope_; } - private: std::unique_ptr engine_{nullptr}; cudaStream_t stream_; std::unique_ptr op_; std::unique_ptr op_desc_; const std::unordered_set& parameters_; - framework::Scope& scope_; + framework::Scope* scope_; platform::CUDAPlace place_; }; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index b0e07fdf132f31087c73342e0b239c50ef93abbd..882e1d93b96aebbeddd469ac2ceff29a568f151b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -97,7 +97,11 @@ void IRPassManager::CreatePasses(Argument *argument, bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); - if ((!model_from_memory && use_static_engine)) { + bool int8_valid = !(model_from_memory && enable_int8); + PADDLE_ENFORCE(int8_valid, + "TRT INT8 does not support loading the model from memory yet."); + + if ((!model_from_memory && use_static_engine) || enable_int8) { std::string model_opt_cache_dir = argument->Has("model_dir") ?
argument->model_dir() diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index bd9f08d84e418359f685a095766f0f3e9b44f645..05a3d7ddfdb08c98866cc0a08ec4113866c7567d 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) +cc_library(subgraph_detector SRCS subgraph_detector.cc subgraph_util.cc DEPS proto_desc) if(WITH_TESTING) add_dependencies(subgraph_detector gtest) endif() @@ -15,7 +15,7 @@ if (WITH_GPU AND TENSORRT_FOUND) set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") endif() -if (WITH_ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller) set(analysis_deps ${analysis_deps} diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc index 0cd1d327ef6686f50de20d8b15bbf47ce085b11c..12deed2533bba713701849d58f8c5cf3269b85da 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -35,9 +35,6 @@ namespace analysis { using framework::ir::Node; -std::vector ExtractAnakinParameters( - const std::unordered_set &nodes); - std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get()); @@ -51,11 +48,10 @@ std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( fuser(); std::vector graph_param_names = - ExtractAnakinParameters(graph->Nodes()); + ExtractParameters(graph->Nodes()); // those parameter already exist in anakin, and should not have another copy - // in - // fluid. + // in fluid. std::vector repetitive_params; for (auto *node : graph->Nodes()) { @@ -157,74 +153,13 @@ void AnakinSubgraphPass::CreateAnakinOp( op_desc->SetType("anakin_engine"); std::unordered_map output_name_map; + auto &subgraph_nodes = *Agent(node).subgraph(); // The following procedure is used to rename all the intermediate // variables and the output variables of the subgraph. - // Why we do this? - // During the transition from fluid OP to anakin OP, we map - // the input and output Tensor(fluid data structure) of fluid OP - // to the corresponding ITensor (trt data structure) through the - // Tensor name. When we set up ITensor for an variable, we must - // ensure that it has not been set before. - // If there is variable in the fluid graph, which is not only the - // input of a OP, but also the output of a Op, there will be problems. - // So we have to rename the variable in the subgraph to make sure - // it is either an OP's input or an OP's output. 
- - auto &subgraph_nodes = *Agent(node).subgraph(); - for (size_t index = 0; index < block_desc.OpSize(); ++index) { - framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); - auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); - - std::unordered_map var2id; - for (auto *in_var : correspond_node->inputs) { - var2id[in_var->Name()] = in_var->id(); - } - // rename for the input variables of op inside subgraph - for (int i = 0; i < op->inputs_size(); i++) { - // one input - auto *in_var = op->mutable_inputs(i); - std::vector replaced_names; - for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments - std::string arg_value = in_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (input_names_with_id.count(arg_value_with_id)) { - replaced_names.push_back(arg_value); - } else { - replaced_names.push_back(arg_value_with_id); - } - } - in_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - in_var->add_arguments(replaced_names[k]); - } - } - var2id.clear(); - for (auto out_var : correspond_node->outputs) { - var2id[out_var->Name()] = out_var->id(); - } - - // rename for the output variables of op inside subgraph - for (int i = 0; i < op->outputs_size(); i++) { - framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); - std::vector replaced_names; - for (int k = 0; k < out_var->arguments_size(); k++) { - std::string arg_value = out_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (output_names_with_id.count(arg_value_with_id)) { - output_name_map[arg_value] = arg_value_with_id; - } - replaced_names.push_back(arg_value_with_id); - } - out_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - out_var->add_arguments(replaced_names[k]); - } - } - } + RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, + &output_names_with_id, &output_names, &output_name_map, + false); // When anakin engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor @@ -249,8 +184,7 @@ void AnakinSubgraphPass::CreateAnakinOp( SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); // Set attrs - SetAttr(op_desc->Proto(), "parameters", - ExtractAnakinParameters(graph->Nodes())); + SetAttr(op_desc->Proto(), "parameters", params); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); int predictor_id = Get("predictor_id"); auto engine_key = GenerateAnakinEngineKey( @@ -277,34 +211,6 @@ void AnakinSubgraphPass::CreateAnakinOp( param_set, output_mapping, anakin_engine); } -std::vector ExtractAnakinParameters( - const std::unordered_set &nodes) { - // We can judge whether a variable is a parameter by - // its presistable property, but sometimes the presistable - // of the feed op output is true, so we have to identify it. 
- std::vector feed_outputs; - for (const auto &node : nodes) { - if (!node->IsOp()) continue; - std::string op_type = node->Op()->Type(); - if (op_type == "feed" || op_type == "fetch") { - std::vector output_names = node->Op()->OutputArgumentNames(); - std::copy(output_names.begin(), output_names.end(), - std::back_inserter(feed_outputs)); - } - } - - std::vector parameters; - for (const auto &node : nodes) { - if (!node->IsVar()) continue; - if (node->Var()->Persistable() && - std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == - feed_outputs.end()) { - parameters.push_back(node->Name()); - } - } - return parameters; -} - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h index a732cb55921397c18f65b9c679b6f3213a1a7190..c13b9ecda42336a79187185070104ba9ac4b67bc 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h @@ -19,6 +19,7 @@ #include #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/anakin/engine.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" using anakin::Precision; using anakin::saber::NV; diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..a17ee1b707a7f950cddc62373a9a57c793d5528f --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the utilities used to partition a graph. + */ + +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include +#include + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ir::Node; + +std::vector ExtractParameters( + const std::unordered_set &nodes) { + // We can judge whether a variable is a parameter by + // its persistable property, but sometimes the persistable + // of the feed op output is true, so we have to identify it.
+ std::vector feed_outputs; + for (const auto &node : nodes) { + if (!node->IsOp()) continue; + std::string op_type = node->Op()->Type(); + if (op_type == "feed" || op_type == "fetch") { + std::vector output_names = node->Op()->OutputArgumentNames(); + std::copy(output_names.begin(), output_names.end(), + std::back_inserter(feed_outputs)); + } + } + + std::vector parameters; + for (const auto &node : nodes) { + if (!node->IsVar()) continue; + if (node->Var()->Persistable() && + std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == + feed_outputs.end()) { + parameters.push_back(node->Name()); + } + } + return parameters; +} + +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map, + bool is_trt) { + //// In the normal case, paddle-trt has a bug when running googlenet. + // When there are more than two convolutions of 1 * 1 with the same input, + // paddle-tensorrt will do the merging optimization, which fuses those convs + // into one conv and then triggers the bug. So, we use a strategy to avoid + // this optimization for the time being. This bug will be fixed in the future. + std::unordered_map + same_hierarchy_conv2d_num_map; + + for (size_t index = 0; index < block_desc->OpSize(); ++index) { + framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); + framework::OpDesc op_desc(*op, nullptr); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); + + std::unordered_map var2id; + std::unordered_map in_vars; + for (auto *in_var : correspond_node->inputs) { + var2id[in_var->Name()] = in_var->id(); + in_vars[in_var->Name()] = in_var; + } + // rename for the input variables of op inside subgraph + for (int i = 0; i < op->inputs_size(); i++) { + // one input + auto *in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments + std::string arg_value = in_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { + replaced_names.push_back(arg_value); + } else { + replaced_names.push_back(arg_value_with_id); + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outputs) { + var2id[out_var->Name()] = out_var->id(); + } + + if (op_desc.Type() == "conv2d" && is_trt) { + auto input_var_name = op_desc.Input("Input").front(); + auto filter_var_name = op_desc.Input("Filter").front(); + auto out_var_name = op_desc.Output("Output").front(); + auto filter_shape = in_vars[filter_var_name]->Var()->GetShape(); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + if (same_hierarchy_conv2d_num_map[input_var_name] > 0) { + (*output_names_with_id) + .insert(out_var_name + std::to_string(var2id[out_var_name])); + (*output_names).insert(out_var_name); + } else if (filter_shape[2] == 1 && filter_shape[3] == 1 && + strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 && + paddings[1] == 0) { + same_hierarchy_conv2d_num_map[input_var_name] += 1; + } + } + + // rename for the output variables of op inside subgraph + for (int i = 0; i <
op->outputs_size(); i++) { + framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + std::string arg_value = out_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id->count(arg_value_with_id)) { + (*output_name_map)[arg_value] = arg_value_with_id; + } + replaced_names.push_back(arg_value_with_id); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h new file mode 100644 index 0000000000000000000000000000000000000000..3cf21bf5f426a7142626e6ae1db6ee478418d08a --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the utilities used to partition a graph. + */ + +#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ir::Node; + +std::vector ExtractParameters( + const std::unordered_set &nodes); + +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map, + bool is_trt = true); + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 00490fbb184194454e3f72e99503cbd14b8767c2..59399403276b59c143fc3e06a53643e0a85cf559 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -31,17 +31,6 @@ namespace analysis { using framework::ir::Node; -std::vector ExtractParameters( - const std::unordered_set &nodes); - -void RenameAndGetOutputs( - const std::vector &subgraph_nodes, - framework::BlockDesc *block_desc, - const std::set &input_names_with_id, - std::set *output_names_with_id, - std::set *output_names, - std::unordered_map *output_name_map); - std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); @@ -217,7 +206,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // Get "" when there is no cached calibration table data.
bool load_from_memory = Get("model_from_memory"); std::string calibration_data = ""; - if (!load_from_memory && use_static_engine) { + if (enable_int8) { calibration_data = GetTrtCalibTableData( Get("model_opt_cache_dir"), engine_key, enable_int8); } @@ -226,13 +215,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); std::string trt_engine_serialized_data = ""; - if (load_from_memory) { - std::map engine_opt_info = - Get>("engine_opt_info"); - if (engine_opt_info.count(engine_key)) { - trt_engine_serialized_data = engine_opt_info[engine_key]; - } - } + SetAttr(op_desc->Proto(), "engine_serialized_data", trt_engine_serialized_data); @@ -240,176 +223,62 @@ void TensorRtSubgraphPass::CreateTensorRTOp( if (enable_int8 && calibration_data.size() != 0) { calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); } - // When in int8 mode and calibration_mode, the program just produce the // calibration table data. bool calibration_mode = (enable_int8 && calibration_data.size() == 0); - if (!calibration_mode && use_static_engine && - trt_engine_serialized_data.empty()) { - std::copy(params.begin(), params.end(), - std::back_inserter(*repetitive_params)); - - if (use_static_engine && !load_from_memory) { - trt_engine_serialized_data = GetTrtEngineSerializedData( - Get("model_opt_cache_dir"), engine_key); - } + if (calibration_mode) { + // calibration mode means this process only generates the int8 calibration table data. + return; + } - if (trt_engine_serialized_data.empty()) { - LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " - "kernel etc). This process may cost a lot of time."; - std::unique_ptr trt_engine( - new tensorrt::TensorRTEngine( - Get("max_batch_size"), Get("workspace_size"), - enable_int8, calibrator.get(), Get("gpu_device_id"))); - auto *scope = param_scope(); - framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); - std::unordered_set param_set(params.begin(), params.end()); - inference::Singleton::Global() - .ConvertBlockToTRTEngine( - &block_desc_temp, *scope, - std::vector(input_names.begin(), input_names.end()), - param_set, output_mapping, trt_engine.get()); - nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); - trt_engine_serialized_data = - std::string((const char *)serialized_engine_data->data(), - serialized_engine_data->size()); - - if (use_static_engine && !load_from_memory) { - SaveTrtEngineSerializedDataToFile( - GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), - engine_key), - trt_engine_serialized_data); - } - } else { + std::copy(params.begin(), params.end(), + std::back_inserter(*repetitive_params)); + bool need_serialize = (use_static_engine && !load_from_memory); + + if (need_serialize) { + trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); + // we can load the engine info that was serialized to disk before.
+ if (!trt_engine_serialized_data.empty()) { + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); LOG(INFO) << "Load TRT Optimized Info from " << GetTrtEngineSerializedPath( Get("model_opt_cache_dir"), engine_key); - } - - SetAttr(op_desc->Proto(), "engine_serialized_data", - trt_engine_serialized_data); - } -} - -std::vector ExtractParameters( - const std::unordered_set &nodes) { - // We can judge whether a variable is a parameter by - // its presistable property, but sometimes the presistable - // of the feed op output is true, so we have to identify it. - std::vector feed_outputs; - for (const auto &node : nodes) { - if (!node->IsOp()) continue; - std::string op_type = node->Op()->Type(); - if (op_type == "feed" || op_type == "fetch") { - std::vector output_names = node->Op()->OutputArgumentNames(); - std::copy(output_names.begin(), output_names.end(), - std::back_inserter(feed_outputs)); + return; } } - std::vector parameters; - for (const auto &node : nodes) { - if (!node->IsVar()) continue; - if (node->Var()->Persistable() && - std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == - feed_outputs.end()) { - parameters.push_back(node->Name()); - } - } - return parameters; -} - -void RenameAndGetOutputs( - const std::vector &subgraph_nodes, - framework::BlockDesc *block_desc, - const std::set &input_names_with_id, - std::set *output_names_with_id, - std::set *output_names, - std::unordered_map *output_name_map) { - //// In the normal case, the paddle-trt exists bug when runing the googlenet. - // When there are more than two convolutions of 1 * 1 with the same input, the - // paddle-tensorrt will do the merging optimization, which fuse those conv - // into one conv, and then trigger bug. So, We should use strategy to avoid - // this optimization for the time being. This bug will be fixed in the future. 
- std::unordered_map - same_hierarchy_conv2d_num_map; - - for (size_t index = 0; index < block_desc->OpSize(); ++index) { - framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); - framework::OpDesc op_desc(*op, nullptr); - auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); - - std::unordered_map var2id; - std::unordered_map in_vars; - for (auto *in_var : correspond_node->inputs) { - var2id[in_var->Name()] = in_var->id(); - in_vars[in_var->Name()] = in_var; - } - // rename for the input variables of op inside subgraph - for (int i = 0; i < op->inputs_size(); i++) { - // one input - auto *in_var = op->mutable_inputs(i); - std::vector replaced_names; - for (int k = 0; k < in_var->arguments_size(); k++) { - std::string arg_value = in_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (input_names_with_id.count(arg_value_with_id)) { - replaced_names.push_back(arg_value); - } else { - replaced_names.push_back(arg_value_with_id); - } - } - in_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - in_var->add_arguments(replaced_names[k]); - } - } - var2id.clear(); - for (auto out_var : correspond_node->outputs) { - var2id[out_var->Name()] = out_var->id(); - } - - if (op_desc.Type() == "conv2d") { - auto input_var_name = op_desc.Input("Input").front(); - auto filter_var_name = op_desc.Input("Filter").front(); - auto out_var_name = op_desc.Output("Output").front(); - auto filter_shape = in_vars[filter_var_name]->Var()->GetShape(); - const std::vector strides = - boost::get>(op_desc.GetAttr("strides")); - const std::vector paddings = - boost::get>(op_desc.GetAttr("paddings")); - if (same_hierarchy_conv2d_num_map[input_var_name] > 0) { - (*output_names_with_id) - .insert(out_var_name + std::to_string(var2id[out_var_name])); - (*output_names).insert(out_var_name); - } else if (filter_shape[2] == 1 && filter_shape[3] == 1 && - strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 && - paddings[1] == 0) { - same_hierarchy_conv2d_num_map[input_var_name] += 1; - } - } - - // rename for the output variables of op inside subgraph - for (int i = 0; i < op->outputs_size(); i++) { - framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); - std::vector replaced_names; - for (int k = 0; k < out_var->arguments_size(); k++) { - std::string arg_value = out_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (output_names_with_id->count(arg_value_with_id)) { - (*output_name_map)[arg_value] = arg_value_with_id; - } - replaced_names.push_back(arg_value_with_id); - } - out_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - out_var->add_arguments(replaced_names[k]); - } - } + // the following code will NOT run in the following situations: + // 1. calibration mode (generating the trt int8 calibration table data) + // 2. the serialized trt engine info has already been loaded. + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " "kernel etc).
This process may cost a lot of time."; + std::unique_ptr trt_engine( + new tensorrt::TensorRTEngine( + Get("max_batch_size"), Get("workspace_size"), enable_int8, + calibrator.get(), Get("gpu_device_id"))); + auto *scope = param_scope(); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); + std::unordered_set param_set(params.begin(), params.end()); + inference::Singleton::Global() + .ConvertBlockToTRTEngine( + &block_desc_temp, *scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, trt_engine.get()); + nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); + trt_engine_serialized_data = + std::string((const char *)serialized_engine_data->data(), + serialized_engine_data->size()); + + if (need_serialize) { + SaveTrtEngineSerializedDataToFile( + GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), + engine_key), + trt_engine_serialized_data); } + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index 6689a668fc9313df4105875477424f1426637226..f043670c5af39c1bdf8d4f00c7294fb53a4c9039 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -20,6 +20,7 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 38313754ea98dcf4b65098d3966c131a3fc98518..90f09505c023c656a0b4ffaf9e3ef52152c0f0e7 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -27,7 +27,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() -if (WITH_ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) set(inference_deps ${inference_deps} anakin_op_converter anakin_engine) endif() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4278d579e8c008b9fbf38e955234f9b64f869d13..001e8e66d5560f631dab8dd7c13bbaaef1e6195a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -40,10 +40,11 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" - #endif +#if PADDLE_WITH_ANAKIN #include "paddle/fluid/inference/anakin/convert/op_converter.h" +#endif DECLARE_bool(profile); @@ -817,6 +818,7 @@ USE_TRT_CONVERTER(conv2d_transpose); USE_TRT_CONVERTER(leaky_relu); #endif +#if PADDLE_WITH_ANAKIN USE_ANAKIN_CONVERTER(mul); USE_ANAKIN_CONVERTER(fc); USE_ANAKIN_CONVERTER(conv2d); @@ -838,3 +840,4 @@ USE_ANAKIN_CONVERTER(detection_out); USE_ANAKIN_CONVERTER(density_prior_box); USE_ANAKIN_CONVERTER(dropout); USE_ANAKIN_CONVERTER(sum); +#endif diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 33de430fc8cf3d73bd0e290d9a4b8c1e8a33e80b..d2e2a98a85509be9eb81b1a6e7d36d9d4d583df4 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ 
-81,6 +81,8 @@ PaddleDType ZeroCopyTensor::type() { return PaddleDType::FLOAT32; } else if (type == framework::proto::VarType::INT64) { return PaddleDType::INT64; + } else if (type == framework::proto::VarType::INT32) { + return PaddleDType::INT32; } else { LOG(ERROR) << "unknown type, only support float32 and int64 now."; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 8ccb6d5a1c02f3624507857a77da08e894f8820f..afac8e4d2a39cae37e33b02135bc2c41b5ceb0c3 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -34,7 +34,7 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -if (WITH_ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) add_subdirectory(anakin) endif() diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.cc b/paddle/fluid/operators/anakin/anakin_engine_op.cc index 48b0490d041af5921d7456a3ae2efe7884ac833f..58db16ea0c1347a366a4d5927e414d76864cb6ab 100644 --- a/paddle/fluid/operators/anakin/anakin_engine_op.cc +++ b/paddle/fluid/operators/anakin/anakin_engine_op.cc @@ -39,8 +39,7 @@ class AnakinEngineOpMaker : public framework::OpProtoAndCheckerMaker { class AnakinEngineInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(framework::InferVarTypeContext *ctx) const override {} }; } // namespace operators
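The final hunk switches `AnakinEngineInferVarType::operator()` from the old `(const framework::OpDesc &, framework::BlockDesc *)` pair to a single `framework::InferVarTypeContext *` argument. Bundling the inputs into one context object lets the framework extend the inference interface later without touching every override again. A minimal sketch of that context-object pattern, with illustrative stand-in types rather than Paddle's real `InferVarTypeContext` API:

```cpp
#include <iostream>
#include <string>

// Stand-in context object, analogous in spirit to
// framework::InferVarTypeContext; the names here are illustrative.
class InferContext {
 public:
  explicit InferContext(std::string op_type) : op_type_(std::move(op_type)) {}
  const std::string& Type() const { return op_type_; }
  // New accessors can be added here without changing VarTypeInference
  // or any of its subclasses.

 private:
  std::string op_type_;
};

// Base interface taking a single context pointer, mirroring the change
// to AnakinEngineInferVarType in the diff above.
class VarTypeInference {
 public:
  virtual ~VarTypeInference() = default;
  virtual void operator()(InferContext* ctx) const = 0;
};

// The anakin engine op has nothing to infer, so the override stays empty,
// just like the {} body in the patch.
class EngineInferVarType : public VarTypeInference {
 public:
  void operator()(InferContext* ctx) const override { (void)ctx; }
};

int main() {
  InferContext ctx("anakin_engine");
  EngineInferVarType infer;
  infer(&ctx);  // one pointer argument instead of (OpDesc&, BlockDesc*)
  std::cout << "infer var type ran for " << ctx.Type() << "\n";
  return 0;
}
```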