Unverified commit bce259e5, authored by 石晓伟, committed by GitHub

Update the Anakin interfaces for content-dnn and MLU (#17890)

* update anakin-engine interfaces for content-dnn

test=develop

* support only-gpu mode of Anakin

modify eltwise parse

test=develop

* modifications for thread safety

test=develop

* integrate template instances

test=develop

* increase template parameters

test=develop

* support MLU predictor

test=develop

* update anakin cmake files

test=develop

* update TargetWrapper::set_device

* update the initialization of anakin subgraph

test=develop

* use the default constructor of base class

test=develop
Parent 410907f6
if(NOT WITH_GPU)
return()
endif()
set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT")
find_path(ANAKIN_INCLUDE_DIR anakin_config.h
PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include
......@@ -16,9 +12,7 @@ find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so
DOC "Path to ANAKIN library.")
if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
if(WITH_DSO)
set(ANAKIN_FOUND ON)
endif(WITH_DSO)
else()
set(ANAKIN_FOUND OFF)
endif()
......@@ -31,3 +25,8 @@ if(ANAKIN_FOUND)
link_directories(${ANAKIN_ROOT})
add_definitions(-DPADDLE_WITH_ANAKIN)
endif()
if(ANAKIN_FOUND AND WITH_GPU AND WITH_DSO)
message(STATUS "Compile with anakin subgraph.")
set(ANAKIN_SUBGRAPH ON)
endif()
......@@ -77,7 +77,7 @@ pass_library(fillconstant_elementwisemul_fuse inference)
pass_library(shuffle_channel_detect_pass inference)
pass_library(delete_quant_dequant_op_pass inference)
if(ANAKIN_FOUND)
if(ANAKIN_SUBGRAPH)
pass_library(simplify_anakin_priorbox_detection_out_pass inference)
endif()
......
......@@ -17,7 +17,7 @@ if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
if (ANAKIN_FOUND)
if (ANAKIN_SUBGRAPH)
add_subdirectory(anakin)
endif()
......@@ -43,11 +43,15 @@ if(WITH_MKLDNN)
endif()
set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor)
if (ANAKIN_FOUND)
set(ANAKIN_SHARED_INFERENCE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/api/api_anakin_engine.cc)
endif()
set(SHARED_INFERENCE_SRCS
io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
${mkldnn_quantizer_src}
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
${ANAKIN_SHARED_INFERENCE_SRCS})
if(WIN32)
sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
......
......@@ -60,7 +60,7 @@ void ElementwiseMulOpConverter<TargetT, PrecisionT>::operator()(
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
std::string elementwise_type = "Prod";
std::string elementwise_type = "Mul";
this->engine_->template AddOpAttr<std::string>(op_name, "type",
elementwise_type);
std::vector<float> coeff = {1.0, 1.0};
......
......@@ -153,11 +153,12 @@ template class AnakinOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
template class AnakinOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
#ifdef ANAKIN_X86_PLACE
template class AnakinOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
template class AnakinOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
......@@ -203,16 +204,16 @@ template class AnakinOpConverter<::anakin::saber::X86,
CPU, ::anakin::saber::X86, precision_type__, \
::anakin::Precision::precision_type__)
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE)
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8); \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8)
#else
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8)
#elif defined(PADDLE_WITH_CUDA)
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8)
#endif
#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \
......@@ -221,12 +222,16 @@ template class AnakinOpConverter<::anakin::saber::X86,
__attribute__((unused)) = \
Touch_anakin_##op_type__##_##place_type__##_##precision_type__();
#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE)
#define USE_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32)
#define USE_INT8_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8)
#elif defined(PADDLE_WITH_CUDA)
#define USE_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32)
#define USE_INT8_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8)
#define USE_CPU_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32)
#define USE_CPU_INT8_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8)
#endif
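For orientation, a hedged sketch of how these macros are consumed: a converter source file registers itself once, and client code pulls the registration in with the USE_* macros. The converter class name below is illustrative only; the USE_* lines mirror what the unit tests further down in this diff already do.

// In a converter implementation file (class name is hypothetical):
REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);

// In a translation unit that needs the converter linked in:
USE_ANAKIN_CONVERTER(relu);
USE_INT8_ANAKIN_CONVERTER(relu);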
......@@ -77,32 +77,6 @@ TEST(swish_op, gpu) {
}
#endif
/*
TEST(sigm_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("sigmoid", ctx, false);
}
TEST(tanh_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("tanh", ctx, false);
}
TEST(relu6_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("relu6", ctx, false);
}
TEST(swish_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("swish", ctx, false);
}
*/
} // namespace anakin
} // namespace inference
} // namespace paddle
......@@ -112,13 +86,7 @@ USE_OP(tanh);
USE_OP(relu6);
USE_OP(swish);
USE_CPU_ANAKIN_CONVERTER(sigmoid);
USE_CPU_ANAKIN_CONVERTER(tanh);
USE_CPU_ANAKIN_CONVERTER(relu6);
USE_CPU_ANAKIN_CONVERTER(swish);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(sigmoid);
USE_ANAKIN_CONVERTER(tanh);
USE_ANAKIN_CONVERTER(relu6);
USE_ANAKIN_CONVERTER(swish);
#endif
......@@ -57,19 +57,16 @@ TEST(affine_channel_op, gpu) {
test_affine_channel_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(affine_channel_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_affine_channel_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(affine_channel);
USE_CPU_ANAKIN_CONVERTER(affine_channel);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(affine_channel);
#endif
......@@ -73,19 +73,15 @@ TEST(batch_norm_op, gpu) {
test_batchnorm_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(batch_norm_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_batchnorm_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(batch_norm);
USE_CPU_ANAKIN_CONVERTER(batch_norm);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(batch_norm);
#endif
......@@ -53,19 +53,15 @@ TEST(concat_op, gpu) {
test_concat_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(concat_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_concat_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(concat);
USE_CPU_ANAKIN_CONVERTER(concat);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(concat);
#endif
......@@ -60,20 +60,16 @@ TEST(conv2d_op, gpu) {
test_conv2d_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(conv2d_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_conv2d_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(conv2d);
USE_CPU_ANAKIN_CONVERTER(conv2d);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(conv2d);
#endif
......@@ -54,19 +54,16 @@ TEST(dropout_op, gpu) {
test_dropout_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(dropout_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_dropout_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(dropout);
USE_CPU_ANAKIN_CONVERTER(dropout);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(dropout);
#endif
......@@ -59,29 +59,23 @@ TEST(elementwise_op, native_mul_gpu) {
test_elementwise_op<::anakin::saber::NV>("elementwise_mul", ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(elementwise_op, native_add_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_elementwise_op<::anakin::saber::X86>("elementwise_add", ctx, false);
}
TEST(elementwise_op, native_mul_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_elementwise_op<::anakin::saber::X86>("elementwise_mul", ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(elementwise_add);
USE_OP(elementwise_mul);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(elementwise_add);
USE_ANAKIN_CONVERTER(elementwise_mul);
#endif
USE_CPU_ANAKIN_CONVERTER(elementwise_add);
USE_CPU_ANAKIN_CONVERTER(elementwise_mul);
......@@ -49,19 +49,16 @@ TEST(mul_op, gpu) {
test_mul_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(mul_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_mul_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(mul);
USE_CPU_ANAKIN_CONVERTER(fc);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(fc);
#endif
......@@ -48,20 +48,17 @@ TEST(flatten_op, gpu) {
test_flatten_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(flatten_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_flatten_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(reshape);
USE_OP_ITSELF(flatten);
USE_CPU_ANAKIN_CONVERTER(flatten);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(flatten);
#endif
......@@ -87,7 +87,7 @@ TEST(Pool2dOpConverter, avg_ceil_test) {
test_pool2d<::anakin::saber::NV>(ctx, true, false, true, "avg");
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(Pool2dOpConverter, normal_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
......@@ -110,14 +110,10 @@ TEST(Pool2dOpConverter, avg_ceil_test_cpu) {
platform::CPUDeviceContext ctx(cpu_place);
test_pool2d<::anakin::saber::X86>(ctx, false, false, true, "avg");
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(pool2d);
USE_CPU_ANAKIN_CONVERTER(pool2d);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(pool2d);
#endif
......@@ -66,10 +66,5 @@ TEST(leaky_relu_op, gpu) {
USE_OP(relu);
USE_OP(leaky_relu);
USE_CPU_ANAKIN_CONVERTER(relu);
USE_CPU_ANAKIN_CONVERTER(leaky_relu);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(relu);
USE_ANAKIN_CONVERTER(leaky_relu);
#endif
......@@ -81,7 +81,7 @@ TEST(reshape2_op, gpu) {
test_reshape2_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(reshape1_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
......@@ -93,14 +93,10 @@ TEST(reshape2_op, cpu) {
platform::CPUDeviceContext ctx(cpu_place);
test_reshape2_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(reshape);
USE_CPU_ANAKIN_CONVERTER(reshape);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(reshape);
#endif
......@@ -48,20 +48,16 @@ TEST(softmax_op, gpu) {
test_softmax_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(relu_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_softmax_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(softmax);
USE_CPU_ANAKIN_CONVERTER(softmax);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(softmax);
#endif
......@@ -92,7 +92,7 @@ TEST(split_op, test_different_shape_axis3_batch1) {
platform::CUDADeviceContext ctx(gpu_place);
AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 3}, {2, 1});
}
#ifdef ANAKIN_X86_PLACE
TEST(split_op, test_different_shape_axis1_batch1_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
......@@ -110,13 +110,10 @@ TEST(split_op, test_different_shape_axis3_batch1_cpu) {
platform::CPUDeviceContext ctx(cpu_place);
AnakinSliceTest<::anakin::saber::X86, 3>(ctx, false, {1, 3, 2, 4}, {2, 2});
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(split);
USE_CPU_ANAKIN_CONVERTER(split);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(split);
#endif
......@@ -49,19 +49,16 @@ TEST(sum_op, gpu) {
test_sum_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(sum_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_sum_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(sum);
USE_CPU_ANAKIN_CONVERTER(sum);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(sum);
#endif
......@@ -79,7 +79,7 @@ TEST(transpose2_op, gpu) {
test_transpose2_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(transpose1_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
......@@ -91,13 +91,10 @@ TEST(transpose2_op, cpu) {
platform::CPUDeviceContext ctx(cpu_place);
test_transpose2_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(transpose);
USE_CPU_ANAKIN_CONVERTER(transpose);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(transpose);
#endif
......@@ -33,7 +33,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
using anakin::Precision;
using anakin::saber::X86;
namespace paddle {
namespace inference {
......@@ -215,13 +214,14 @@ class AnakinConvertValidation {
template class AnakinConvertValidation<::anakin::saber::NV,
::anakin::Precision::FP32>;
template class AnakinConvertValidation<::anakin::saber::X86,
::anakin::Precision::FP32>;
template class AnakinConvertValidation<::anakin::saber::NV,
::anakin::Precision::INT8>;
#ifdef ANAKIN_X86_PLACE
template class AnakinConvertValidation<::anakin::saber::X86,
::anakin::Precision::FP32>;
template class AnakinConvertValidation<::anakin::saber::X86,
::anakin::Precision::INT8>;
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
......@@ -46,10 +46,9 @@ AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(
max_input_shape_(max_input_shape),
program_inputs_(program_inputs),
auto_config_layout_(auto_config_layout) {
std::call_once(init_anakin_, [this]() {
::anakin::TargetWrapper<TargetT>::set_device(device_);
::anakin::Env<TargetT>::env_init();
});
::anakin::TargetWrapper<TargetT>::set_device(device_);
std::call_once(init_anakin_,
[this]() { ::anakin::Env<TargetT>::env_init(); });
graph_.reset(new AnakinGraphT<TargetT, PrecisionType>());
net_.reset(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary));
}
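The constructor above now binds the device on every engine construction and keeps only the process-wide environment setup behind std::call_once. A minimal standalone sketch of that pattern, with generic names rather than the Anakin API:

#include <mutex>

class Engine {
 public:
  explicit Engine(int device) : device_(device) {
    BindDevice(device_);  // runs for every engine instance, so each creating thread binds its device
    std::call_once(env_once_, [] { InitEnv(); });  // global runtime init happens exactly once per process
  }

 private:
  static void BindDevice(int /*id*/) { /* e.g. select the accelerator for the calling thread */ }
  static void InitEnv() { /* one-time global setup */ }
  int device_;
  static std::once_flag env_once_;
};
std::once_flag Engine::env_once_;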
......@@ -194,14 +193,14 @@ template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::INT8>;
template class AnakinEngineManager<::anakin::saber::NV,
::anakin::Precision::INT8>;
#endif
#ifdef ANAKIN_X86_PLACE
template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>;
template class AnakinEngineManager<::anakin::saber::X86,
::anakin::Precision::FP32>;
template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::INT8>;
template class AnakinEngineManager<::anakin::saber::X86,
::anakin::Precision::INT8>;
#endif
// template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>;
} // namespace anakin
} // namespace inference
......
......@@ -24,7 +24,9 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"
#ifdef EXIT // NOLINT
#undef EXIT // NOLINT
#endif // NOLINT
#include "framework/core/net/net.h"
#include "framework/core/types.h"
#include "framework/graph/graph.h"
......
......@@ -22,7 +22,6 @@ limitations under the License. */
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
......
......@@ -15,7 +15,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
endif()
if (ANAKIN_FOUND)
if (ANAKIN_SUBGRAPH)
cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller)
set(analysis_deps ${analysis_deps}
......
......@@ -226,7 +226,6 @@ void AnakinSubgraphPass::CreateAnakinEngine(
auto max_batch_size = Get<int>("max_batch_size");
auto max_input_shape =
Get<std::map<std::string, std::vector<int>>>("max_input_shape");
bool auto_config_layout = Get<bool>("auto_config_layout");
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
inference::Singleton<
......@@ -235,11 +234,14 @@ void AnakinSubgraphPass::CreateAnakinEngine(
max_input_shape, program_inputs, false, engine_key);
#endif
} else {
#ifdef ANAKIN_X86_PLACE
bool auto_config_layout = Get<bool>("auto_config_layout");
inference::Singleton<
anakin::AnakinEngineManager<::anakin::saber::X86, PrecisionT>>::Global()
.Create(true, Get<int>("gpu_device_id"), max_batch_size,
max_input_shape, program_inputs, auto_config_layout,
engine_key);
#endif
}
auto *scope = param_scope();
......@@ -258,6 +260,7 @@ void AnakinSubgraphPass::CreateAnakinEngine(
param_set, output_mapping, anakin_engine);
#endif
} else {
#ifdef ANAKIN_X86_PLACE
auto *anakin_engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::X86, PrecisionT>>::Global()
......@@ -268,6 +271,7 @@ void AnakinSubgraphPass::CreateAnakinEngine(
&block_desc_temp, scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, anakin_engine);
#endif
}
}
......
......@@ -27,7 +27,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
endif()
if (ANAKIN_FOUND)
if (ANAKIN_SUBGRAPH)
set(inference_deps ${inference_deps} anakin_op_converter anakin_engine)
endif()
......@@ -38,9 +38,9 @@ endif()
add_subdirectory(details)
if(WITH_MKLDNN)
set(mkldnn_quantizer_src mkldnn_quantizer.cc)
set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
set(mkldnn_quantizer_src mkldnn_quantizer.cc)
set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
endif()
cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder)
......@@ -56,9 +56,7 @@ cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
paddle_pass_builder zero_copy_tensor
reset_tensor_array)
cc_test(test_paddle_inference_api
SRCS api_tester.cc
DEPS paddle_inference_api)
cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)
if(WITH_TESTING)
inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
......@@ -69,13 +67,21 @@ endif()
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy device_context)
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy device_context)
function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction()
anakin_target(inference_anakin_api)
anakin_target(inference_anakin_api_shared)
if(ANAKIN_FOUND)
if (ANAKIN_MLU AND NOT WITH_GPU AND NOT ANAKIN_X86)
message(STATUS "Compile with anakin mlu place.")
add_definitions(-DANAKIN_MLU_PLACE)
elseif(ANAKIN_X86)
message(STATUS "Compile with anakin x86 place.")
add_definitions(-DANAKIN_X86_PLACE)
endif()
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction()
anakin_target(inference_anakin_api)
anakin_target(inference_anakin_api_shared)
endif()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -19,6 +19,7 @@ limitations under the License. */
#pragma once
#include <memory>
#include <vector>
#include "framework/core/net/net.h"
......@@ -30,13 +31,18 @@ limitations under the License. */
namespace paddle {
using contrib::AnakinConfig;
using anakin::Precision;
using anakin::OpRunType;
template <typename Target>
template <typename T, Precision P, OpRunType R>
class PaddleInferenceAnakinPredictor : public PaddlePredictor {
public:
PaddleInferenceAnakinPredictor() {}
PaddleInferenceAnakinPredictor() = default;
explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config);
explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config)
: config_(config) {
this->InitPredictor();
}
// NOTE Unlike the native engine, the buffers of anakin engine's output_data
// should be allocated first.
......@@ -45,21 +51,45 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
int batch_size = -1) override;
std::unique_ptr<PaddlePredictor> Clone() override;
anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
get_executer();
virtual bool ResetConfig(const AnakinConfig& config);
virtual anakin::Net<T, P, R>& ResetExecuter(
std::shared_ptr<anakin::graph::Graph<T, P>> graph_p);
void InitPredictor();
~PaddleInferenceAnakinPredictor() override;
private:
bool Init(const AnakinConfig& config);
anakin::graph::Graph<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
graph_;
anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>*
executor_p_{nullptr};
static std::mutex mutex_;
AnakinConfig config_;
int max_batch_size_{0};
std::shared_ptr<anakin::Context<T>> ctx_p_;
std::shared_ptr<anakin::graph::Graph<T, P>> graph_p_;
anakin::Net<T, P, R>* executor_p_{nullptr};
void InitEnv();
void InitGraph();
virtual void OptimizeGraph();
virtual void InitNet();
virtual void SetContext();
virtual void Predict();
private:
bool RunImpl(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data);
static std::once_flag init_anakin_;
};
#ifdef ANAKIN_MLU_PLACE
template <Precision P, OpRunType R>
class PaddleInferenceAnakinMLUPredictor final
: public PaddleInferenceAnakinPredictor<anakin::MLU, P, R> {
public:
explicit PaddleInferenceAnakinMLUPredictor(const AnakinConfig& config) {
this->ResetConfig(config);
this->InitPredictor();
}
void SetContext() override;
void OptimizeGraph() override;
void InitNet() override;
void Predict() override;
};
#endif
} // namespace paddle
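A rough sketch of using the refactored predictor template directly; CreatePaddlePredictor normally hides this, and anakin::OpRunType::ASYNC is an assumed enumerator (Anakin's usual default run type), so treat the type arguments as illustrative rather than the factory's actual wiring.

#ifdef ANAKIN_MLU_PLACE
void BuildMluPredictorSketch() {
  paddle::contrib::AnakinConfig cfg;
  cfg.target_type = paddle::contrib::AnakinConfig::MLU;
  cfg.model_file = "/path/to/model";  // placeholder path
  // The constructor calls ResetConfig(cfg) and then InitPredictor().
  paddle::PaddleInferenceAnakinMLUPredictor<anakin::Precision::FP32,
                                            anakin::OpRunType::ASYNC>
      predictor(cfg);
}
#endif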
......@@ -64,9 +64,12 @@ static int GetUniqueId() {
}
static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) {
std::vector<std::string> *pieces, bool ignore_null = true) {
pieces->clear();
if (str.empty()) {
if (!ignore_null) {
pieces->push_back(str);
}
return;
}
size_t pos = 0;
......
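A small usage sketch of the extended split helper, covering only the empty-string behavior visible in the hunk above: the default ignore_null=true drops an empty input, while passing false keeps it as a single empty piece.

#include <cassert>
#include <string>
#include <vector>
// Assumes the paddle::inference::split helper declared above.
void SplitEmptyStringExample() {
  std::vector<std::string> pieces;
  paddle::inference::split("", ';', &pieces);         // ignore_null defaults to true
  assert(pieces.empty());
  paddle::inference::split("", ';', &pieces, false);  // keep the empty input as one piece
  assert(pieces.size() == 1 && pieces[0].empty());
}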
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,6 +14,7 @@
#pragma once
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>
......@@ -24,11 +25,22 @@ namespace paddle {
namespace contrib {
// Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config {
enum TargetType { NVGPU = 0, X86 };
int device;
enum TargetType { NVGPU = 0, X86, MLU };
int device_id{0};
std::string model_file;
int max_batch_size{-1};
std::map<std::string, std::vector<int>> init_inputs_shape;
int init_batch_size{-1};
bool re_allocable{true};
int max_stream{4};
int data_stream_id{0};
int compute_stream_id{0};
TargetType target_type;
#ifdef ANAKIN_MLU_PLACE
int model_parallel{8};
int data_parallel{1};
bool op_fuse{false};
bool sparse{false};
#endif
};
} // namespace contrib
......
......@@ -28,6 +28,6 @@ limitations under the License. */
#include "paddle_analysis_config.h" // NOLINT
#include "paddle_api.h" // NOLINT
#ifdef WITH_ANAKIN
#if (defined WITH_ANAKIN) || (defined PADDLE_WITH_ANAKIN)
#include "paddle_anakin_config.h" // NOLINT
#endif
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(model, "", "Directory of the inference model.");
namespace paddle {
contrib::AnakinConfig Config() {
// Set the initial input shapes, which determine the memory allocation.
std::map<std::string, std::vector<int>> init_inputs_shape;
init_inputs_shape["input_0"] = std::vector<int>({1, 3, 112, 112});
contrib::AnakinConfig config;
config.target_type = contrib::AnakinConfig::MLU;
config.model_file = FLAGS_model;
config.init_inputs_shape = init_inputs_shape;
// Determine the device execution context.
config.device_id = 0;
config.data_stream_id = 0;
config.compute_stream_id = 0;
// Set re_allocable and op_fuse TRUE.
config.re_allocable = true;
config.op_fuse = true;
return config;
}
void single_test() {
// 1. Defining basic data structures.
auto config = paddle::Config();
auto predictor =
paddle::CreatePaddlePredictor<paddle::contrib::AnakinConfig,
paddle::PaddleEngineKind::kAnakin>(config);
// 2. Define the data structure of the predictor inputs and outputs.
std::vector<paddle::PaddleTensor> input_tensors;
std::vector<paddle::PaddleTensor> output_tensors;
// 3. Define and fill the inputs tensor.
int num = 1;
int channel = 3;
int height = 112;
int width = 112;
std::vector<float> input(num * channel * height * width, 1);
std::vector<std::vector<float>> inputs({input});
const std::vector<std::string> input_names{"input_0"};
for (auto& name : input_names) {
paddle::PaddleTensor tensor;
tensor.name = name;
tensor.dtype = PaddleDType::FLOAT32;
input_tensors.push_back(tensor);
}
for (size_t j = 0; j < input_tensors.size(); j++) {
input_tensors[j].data =
paddle::PaddleBuf(&inputs[j][0], inputs[j].size() * sizeof(float));
// The shape of each execution can be changed.
input_tensors[j].shape = std::vector<int>({num, channel, height, width});
}
// 4. Set the output placeholder of predictor.
PaddleTensor predict_out, score_out;
predict_out.name = "landmark_predict_out";
score_out.name = "landmark_score_out";
output_tensors.push_back(predict_out);
output_tensors.push_back(score_out);
// 5. Execute the prediction.
predictor->Run(input_tensors, &output_tensors);
// 6. Retrieve the output data.
for (auto out : output_tensors) {
float* data_o = static_cast<float*>(out.data.data());
LOG(INFO) << out.name << " size = " << out.data.length() / sizeof(float);
}
}
} // namespace paddle
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
paddle::single_test();
return 0;
}
......@@ -27,8 +27,8 @@ contrib::AnakinConfig GetConfig() {
// using AnakinConfig::X86 if you need to use cpu to do inference
config.target_type = contrib::AnakinConfig::NVGPU;
config.model_file = FLAGS_model;
config.device = 0;
config.max_batch_size = 1;
config.device_id = 0;
config.init_batch_size = 1;
return config;
}
......
......@@ -100,8 +100,8 @@ contrib::AnakinConfig GetConfig() {
// using AnakinConfig::X86 if you need to use cpu to do inference
config.target_type = contrib::AnakinConfig::X86;
config.model_file = FLAGS_model;
config.device = 0;
config.max_batch_size = 1000; // the max number of token
config.device_id = 0;
config.init_batch_size = 1000; // the max number of token
return config;
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <cmath>
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#define BUFFER_SIZE (10000)
#define COMPARE_OUTPUTS (1)
#define PRINT_INPUTS (0)
DEFINE_string(model, "", "Directory of the inference model.");
DEFINE_string(datapath, "", "Path of the dataset.");
DEFINE_string(truthpath, "", "Path of the ground-truth file.");
DEFINE_int32(batch_size, 1, "Batch size per execution.");
DEFINE_int32(repeats, 1, "Number of iterations.");
DEFINE_int32(
start_line, 0,
"The starting line of the text file read (this line will be read).");
DEFINE_int32(end_line, 1000000,
"The ending line of the text file read (this line will be read).");
DEFINE_int32(init_batch_size, 40,
"Max batch size for Anakin memory allocation.");
DEFINE_int32(threads_num, 2, "Threads num for Anakin.");
class Data {
public:
Data(std::string file_name, size_t batch_size, size_t start = 0,
size_t end = 1000000)
: _batch_size(batch_size), _total_length(0), _inputs_size(6) {
_file.open(file_name);
_file.seekg(0, _file.end);
_total_length = _file.tellg();
_file.seekg(0, _file.beg);
read_file_to_vec(start, end);
reset_current_line();
}
void reset_current_line();
const std::vector<std::string>& get_lines();
void read_file_to_vec(const size_t start, const size_t end);
int get_next_batches(std::vector<std::vector<float>>* inputs,
std::vector<std::vector<size_t>>* seq_offsets);
private:
std::fstream _file;
int _batch_size;
size_t _total_length;
size_t _inputs_size;
std::vector<std::string> _lines;
size_t _current_line;
};
void Data::read_file_to_vec(const size_t start, const size_t end) {
std::string line;
size_t count = 0;
_lines.clear();
while (std::getline(_file, line)) {
if (count >= start && count <= end) {
_lines.push_back(line);
}
count++;
}
}
const std::vector<std::string>& Data::get_lines() { return _lines; }
void Data::reset_current_line() { _current_line = 0; }
int Data::get_next_batches(std::vector<std::vector<float>>* data,
std::vector<std::vector<size_t>>* offsets) {
data->clear();
offsets->clear();
data->resize(_inputs_size);
offsets->resize(_inputs_size);
for (auto& offset : *offsets) {
offset.push_back(0);
}
int seq_num = -1;
int pre_query_index = -1;
while (_current_line < _lines.size()) {
int cur_query_index = -1;
std::vector<std::string> line;
paddle::inference::split(_lines[_current_line], ';', &line);
for (size_t i = 0; i < line.size(); i++) {
std::vector<float> float_v;
paddle::inference::split_to_float(line[i], ' ', &float_v);
if (i == 0) {
cur_query_index = float_v[0];
if (pre_query_index != -1 && cur_query_index != pre_query_index) {
return seq_num;
}
seq_num++;
_current_line++;
} else {
if (float_v.size() == 0) {
float_v.push_back(-1);
}
(*data)[i - 1].insert((*data)[i - 1].end(), float_v.begin(),
float_v.end());
(*offsets)[i - 1].push_back((*offsets)[i - 1][seq_num] +
float_v.size());
}
}
if (seq_num + 1 >= _batch_size) {
return seq_num;
} else {
pre_query_index = cur_query_index;
}
}
return seq_num;
}
namespace paddle {
contrib::AnakinConfig GetConfig() {
contrib::AnakinConfig config;
std::map<std::string, std::vector<int>> init_inputs_shape;
init_inputs_shape["q_basic"] = std::vector<int>({1000, 1, 1, 1});
init_inputs_shape["q_bigram0"] = std::vector<int>({1000, 1, 1, 1});
init_inputs_shape["pt_basic"] = std::vector<int>({2000, 1, 1, 1});
init_inputs_shape["pa_basic"] = std::vector<int>({4000, 1, 1, 1});
init_inputs_shape["pa_bigram0"] = std::vector<int>({4000, 1, 1, 1});
init_inputs_shape["pt_bigram0"] = std::vector<int>({2000, 1, 1, 1});
// use AnakinConfig::X86 if you need to run inference on the CPU
config.target_type = contrib::AnakinConfig::NVGPU;
config.model_file = FLAGS_model;
config.device_id = 0;
config.init_batch_size = FLAGS_init_batch_size;
config.init_inputs_shape = init_inputs_shape;
config.re_allocable = false;
return config;
}
void single_test(PaddlePredictor* predictor_master) {
auto predictor = predictor_master->Clone();
Data data(FLAGS_datapath, FLAGS_batch_size, FLAGS_start_line, FLAGS_end_line);
std::vector<std::vector<float>> inputs;
std::vector<std::vector<size_t>> seq_offsets;
std::vector<float> compare_outputs;
const std::vector<std::string> input_names{"q_basic", "q_bigram0",
"pt_basic", "pt_bigram0",
"pa_basic", "pa_bigram0"};
std::vector<PaddleTensor> input_tensors;
std::vector<PaddleTensor> output_tensors;
for (auto& name : input_names) {
PaddleTensor tensor;
tensor.name = name;
tensor.dtype = PaddleDType::FLOAT32;
input_tensors.push_back(tensor);
}
PaddleTensor tensor_out;
tensor_out.name = "save_infer_model/scale_0";
tensor_out.shape = std::vector<int>({});
tensor_out.data = PaddleBuf();
tensor_out.dtype = PaddleDType::FLOAT32;
output_tensors.push_back(tensor_out);
inference::Timer timer;
for (int i = 0; i < FLAGS_repeats; i++) {
data.reset_current_line();
size_t count = 0;
float time_sum = 0;
while (data.get_next_batches(&inputs, &seq_offsets) >= 0) {
#if PRINT_INPUTS
for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "data " << i;
for (size_t j = 0; j < inputs[i].size(); j++) {
LOG(INFO) << j << ": " << inputs[i][j];
}
for (auto j : seq_offsets[i]) {
LOG(INFO) << "offsets: " << i << ": " << j;
}
}
#endif
for (size_t j = 0; j < input_tensors.size(); j++) {
input_tensors[j].data =
PaddleBuf(&inputs[j][0], inputs[j].size() * sizeof(float));
input_tensors[j].lod =
std::vector<std::vector<size_t>>({seq_offsets[j]});
input_tensors[j].shape =
std::vector<int>({static_cast<int>(inputs[j].size()), 1, 1, 1});
}
timer.tic();
predictor->Run(input_tensors, &output_tensors);
float time = timer.toc();
#if COMPARE_OUTPUTS
float* data_o = static_cast<float*>(output_tensors[0].data.data());
LOG(INFO) << "outputs[0].data.size() = "
<< output_tensors[0].data.length() / sizeof(float);
size_t sum = 1;
for_each(output_tensors[0].shape.begin(), output_tensors[0].shape.end(),
[&](int n) { sum *= n; });
for (size_t j = 0; j < sum; ++j) {
LOG(INFO) << "output[" << j << "]: " << data_o[j];
compare_outputs.push_back(data_o[j]);
}
#endif
LOG(INFO) << "Single Time: " << time;
count++;
if (count > 10) {
time_sum += timer.toc();
}
}
inference::PrintTime(FLAGS_batch_size, FLAGS_repeats, 1, 0,
time_sum / (count - 10));
#if COMPARE_OUTPUTS
Data data(FLAGS_truthpath, 1);
const std::vector<std::string> truth_vals = data.get_lines();
for (size_t j = 0; j < truth_vals.size(); j++) {
float truth = std::atof(truth_vals[j].c_str());
float compa = compare_outputs[j];
float diff = std::abs(truth - compa);
LOG(INFO) << "[DIFF " << j << " ] " << diff;
if (diff > 0.0001) {
LOG(FATAL) << "The result is wrong!";
}
}
LOG(INFO) << "The result is correct!";
#endif
}
}
} // namespace paddle
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
std::vector<std::thread> threads;
auto config = paddle::GetConfig();
config.data_stream_id = 0;
config.compute_stream_id = 0;
std::unique_ptr<paddle::PaddlePredictor> predictor_master =
paddle::CreatePaddlePredictor<paddle::contrib::AnakinConfig,
paddle::PaddleEngineKind::kAnakin>(config);
for (int i = 0; i < FLAGS_threads_num; i++) {
threads.push_back(std::thread(paddle::single_test, predictor_master.get()));
}
for (auto& t : threads) {
t.join();
}
return 0;
}
......@@ -33,7 +33,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
if (ANAKIN_FOUND)
if (ANAKIN_SUBGRAPH)
add_subdirectory(anakin)
endif()
......
......@@ -119,11 +119,15 @@ class AnakinEngineOp : public framework::OperatorBase {
engine->Execute(inputs, outputs, stream);
#endif
} else {
#ifdef ANAKIN_X86_PLACE
auto *engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::X86, PrecisionT>>::Global()
.Get(engine_key_);
engine->Execute(inputs, outputs);
#else
LOG(FATAL) << "Unknown Platform for AnakinEngine!";
#endif
}
}
};
......