Commit 272f3d31 authored by frankwhzhang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into bpr

......@@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH})
return()
ENDIF()
INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph")
......@@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
ExternalProject_Add(
${NGRAPH_PROJECT}
......@@ -63,18 +69,6 @@ ExternalProject_Add(
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
)
if(UNIX AND NOT APPLE)
include(GNUInstallDirs)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
else()
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
endif()
MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
# Workaround for nGraph expecting mklml to be in mkldnn install directory.
ExternalProject_Add_Step(
${NGRAPH_PROJECT}
......
......@@ -129,6 +129,15 @@ if (WITH_MKLDNN)
)
endif ()
if (WITH_NGRAPH)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")
copy(ngraph_lib
SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
DSTS ${dst_dir} ${dst_dir}
DEPS ngraph
)
endif ()
if (NOT WIN32)
if (NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
......
......@@ -166,6 +166,8 @@ function(op_library TARGET)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
......
......@@ -32,6 +32,13 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c
paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None
paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
......@@ -69,7 +76,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
......@@ -175,7 +182,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
......@@ -187,6 +194,9 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
......@@ -291,6 +301,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i
paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None))
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
......@@ -411,3 +422,17 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)
paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None)
paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None)
paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None)
paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,))
paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain'))
paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n'))
paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000))
paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None)
paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,))
......@@ -34,6 +34,7 @@ add_subdirectory(ir)
add_subdirectory(details)
# ddim lib
proto_library(framework_proto SRCS framework.proto)
proto_library(async_executor_proto SRCS data_feed.proto)
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
......@@ -117,8 +118,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler transfer_scope_cache)
shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
......@@ -126,8 +128,9 @@ cc_library(version SRCS version.cc)
cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
if(NOT WIN32)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
endif(NOT WIN32)
......@@ -135,7 +138,7 @@ endif(NOT WIN32)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
py_proto_compile(framework_py_proto SRCS framework.proto)
py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
......@@ -157,18 +160,19 @@ endif(NOT WIN32)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
if(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
else(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
endif(NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
......@@ -176,8 +180,11 @@ endif()
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph build_strategy
fast_threaded_ssa_graph_executor)
fast_threaded_ssa_graph_executor variable_helper)
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
......@@ -185,7 +192,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/async_executor.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace framework {
AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place)
: root_scope_(scope), place_(place) {}
void AsyncExecutor::CreateThreads(
ExecutorThreadWorker* worker, const ProgramDesc& main_program,
const std::shared_ptr<DataFeed>& reader,
const std::vector<std::string>& fetch_var_names, Scope* root_scope,
const int thread_index, const bool debug) {
worker->SetThreadId(thread_index);
worker->SetDebug(debug);
worker->SetRootScope(root_scope);
worker->CreateThreadResource(main_program, place_);
worker->SetDataFeed(reader);
worker->SetFetchVarNames(fetch_var_names);
worker->BindingDataFeedMemory();
}
void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers, // NOLINT
const int thread_num, const DataFeedDesc& data_feed_desc,
const std::vector<std::string>& filelist) {
readers.resize(thread_num);
for (size_t i = 0; i < readers.size(); ++i) {
readers[i] = DataFeedFactory::CreateDataFeed(data_feed_desc.name());
readers[i]->Init(data_feed_desc); // set batch_size and queue_size here
}
readers[0]->SetFileList(filelist);
}
void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
const std::string& data_feed_desc_str,
const std::vector<std::string>& filelist,
const int thread_num,
const std::vector<std::string>& fetch_var_names,
const bool debug) {
std::vector<std::thread> threads;
auto& block = main_program.Block(0);
for (auto var_name : fetch_var_names) {
auto var_desc = block.FindVar(var_name);
auto shapes = var_desc->GetShape();
PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
"var %s: Fetched var has wrong shape, "
"only variables with the last dimension size 1 supported",
var_name);
}
DataFeedDesc data_feed_desc;
google::protobuf::TextFormat::ParseFromString(data_feed_desc_str,
&data_feed_desc);
int actual_thread_num = thread_num;
int file_cnt = filelist.size();
PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty");
if (actual_thread_num > file_cnt) {
VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt
<< ". Changing thread_num = " << file_cnt;
actual_thread_num = file_cnt;
}
/*
readerDesc: protobuf description for reader initialization
arguments: class_name, batch_size, use_slot, queue_size, buffer_size,
padding_index
reader:
1) each thread has a reader; the reader reads input data and
puts it into the input queue
2) each reader has a Next() interface that fetches an instance
from the input queue
*/
// TODO: should be a factory method for creating the DataFeed
std::vector<std::shared_ptr<DataFeed>> readers;
PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist);
std::vector<std::shared_ptr<ExecutorThreadWorker>> workers;
workers.resize(actual_thread_num);
for (auto& worker : workers) {
worker.reset(new ExecutorThreadWorker);
}
// prepare thread resource here
for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
CreateThreads(workers[thidx].get(), main_program, readers[thidx],
fetch_var_names, root_scope_, thidx, debug);
}
// start executing ops in multiple threads
for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
threads.push_back(
std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
}
for (auto& th : threads) {
th.join();
}
root_scope_->DropKids();
}
} // end namespace framework
} // end namespace paddle
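Because DataFeed::filelist_ is static, PrepareReaders only calls SetFileList on readers[0]; every reader then draws files from the same shared list. A minimal sketch of the resulting dynamic distribution (data_feed_desc and filelist are assumed to exist in scope):

// Each reader's ReadThread() calls PickOneFile(), which pops the next
// filename from the shared static filelist under mutex_for_pick_file_.
std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers;
PrepareReaders(readers, /*thread_num=*/4, data_feed_desc, filelist);
// With 6 files and 4 threads, a thread that finishes early simply
// picks up the next remaining file; there is no static partitioning.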
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <string>
#include <thread> // NOLINT
#include <typeinfo>
#include <vector>
#include "paddle/fluid/framework/data_feed.pb.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class AsyncExecutor {
public:
AsyncExecutor(Scope* scope, const platform::Place& place);
virtual ~AsyncExecutor() {}
void RunFromFile(const ProgramDesc& main_program,
const std::string& data_feed_desc_str,
const std::vector<std::string>& filelist,
const int thread_num,
const std::vector<std::string>& fetch_names,
const bool debug = false);
private:
void CreateThreads(ExecutorThreadWorker* worker,
const ProgramDesc& main_program,
const std::shared_ptr<DataFeed>& reader,
const std::vector<std::string>& fetch_var_names,
Scope* root_scope, const int thread_index,
const bool debug);
public:
Scope* root_scope_;
platform::Place place_;
};
} // namespace framework
} // namespace paddle
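For orientation, a minimal sketch of driving this interface; the program, feed description string, fetch variable, and file names are illustrative, not part of this commit:

#include "paddle/fluid/framework/async_executor.h"

void RunAsyncTraining(const paddle::framework::ProgramDesc& main_program,
                      const std::string& data_feed_desc_str) {
  paddle::framework::Scope scope;
  paddle::platform::CPUPlace place;
  paddle::framework::AsyncExecutor exe(&scope, place);
  std::vector<std::string> filelist = {"part-0", "part-1", "part-2"};
  // Fetched vars must have last dimension 1 (checked in RunFromFile).
  std::vector<std::string> fetch_names = {"mean_cost"};
  // thread_num is capped at filelist.size() inside RunFromFile.
  exe.RunFromFile(main_program, data_feed_desc_str, filelist,
                  /*thread_num=*/2, fetch_names, /*debug=*/false);
}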
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
namespace paddle {
namespace framework {
std::vector<std::string> DataFeed::filelist_;
size_t DataFeed::file_idx_;
std::mutex DataFeed::mutex_for_pick_file_;
bool DataFeed::finish_set_filelist_;
void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
CheckInit();
for (size_t i = 0; i < use_slots_.size(); ++i) {
if (name == use_slots_[i]) {
if (use_slots_is_dense_[i]) {
feed_vec_[i] = MixTensor(var->GetMutable<Tensor>());
} else {
feed_vec_[i] = MixTensor(var->GetMutable<LoDTensor>());
}
}
}
}
bool DataFeed::SetFileList(const std::vector<std::string>& files) {
std::unique_lock<std::mutex> lock(mutex_for_pick_file_);
CheckInit();
if (finish_set_filelist_) {
VLOG(3) << "info: you have set the filelist.";
return false;
}
PADDLE_ENFORCE(files.size(), "You have set an empty filelist.");
filelist_.assign(files.begin(), files.end());
file_idx_ = 0;
finish_set_filelist_ = true;
return true;
}
void DataFeed::SetBatchSize(int batch_size) {
PADDLE_ENFORCE(batch_size > 0, "Illegal batch size: %d.", batch_size);
default_batch_size_ = batch_size;
}
bool DataFeed::PickOneFile(std::string* filename) {
std::unique_lock<std::mutex> lock(mutex_for_pick_file_);
if (file_idx_ == filelist_.size()) {
return false;
}
*filename = filelist_[file_idx_++];
return true;
}
void DataFeed::CheckInit() {
PADDLE_ENFORCE(finish_init_, "Initialization did not succeed.");
}
void DataFeed::CheckSetFileList() {
PADDLE_ENFORCE(finish_set_filelist_, "SetFileList has not succeeded.");
}
void DataFeed::CheckStart() {
PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet.");
}
template <typename T>
void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
queue_size_ = queue_size;
queue_ = std::unique_ptr<paddle::operators::reader::BlockingQueue<T>>(
new paddle::operators::reader::BlockingQueue<T>(queue_size_));
}
template <typename T>
bool PrivateQueueDataFeed<T>::Start() {
CheckSetFileList();
read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this);
read_thread_.detach();
finish_start_ = true;
return true;
}
template <typename T>
void PrivateQueueDataFeed<T>::ReadThread() {
std::string filename;
while (PickOneFile(&filename)) {
file_.open(filename.c_str()); // is_text_feed
PADDLE_ENFORCE(file_.good(), "Failed to open file <%s>.", filename.c_str());
T instance;
while (ParseOneInstance(&instance)) {
queue_->Send(instance);
}
file_.close();
}
queue_->Close();
}
template <typename T>
int PrivateQueueDataFeed<T>::Next() {
CheckStart();
int index = 0;
T instance;
T ins_vec;
while (index < default_batch_size_) {
if (!queue_->Receive(&instance)) {
break;
}
AddInstanceToInsVec(&ins_vec, instance, index++);
}
batch_size_ = index;
if (batch_size_ != 0) {
PutToFeedVec(ins_vec);
}
return batch_size_;
}
#ifdef _WIN32
template class PrivateQueueDataFeed<std::vector<MultiSlotType>>;
#endif
void MultiSlotDataFeed::Init(
const paddle::framework::DataFeedDesc& data_feed_desc) {
finish_init_ = false;
finish_set_filelist_ = false;
finish_start_ = false;
PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(),
"Multi_slot_desc has not been set.");
paddle::framework::MultiSlotDesc multi_slot_desc =
data_feed_desc.multi_slot_desc();
SetBatchSize(data_feed_desc.batch_size());
SetQueueSize(data_feed_desc.batch_size());
size_t all_slot_num = multi_slot_desc.slots_size();
all_slots_.resize(all_slot_num);
all_slots_type_.resize(all_slot_num);
use_slots_index_.resize(all_slot_num);
use_slots_.clear();
use_slots_is_dense_.clear();
for (size_t i = 0; i < all_slot_num; ++i) {
const auto& slot = multi_slot_desc.slots(i);
all_slots_[i] = slot.name();
all_slots_type_[i] = slot.type();
use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;
if (slot.is_used()) {
use_slots_.push_back(all_slots_[i]);
use_slots_is_dense_.push_back(slot.is_dense());
}
}
feed_vec_.resize(use_slots_.size());
finish_init_ = true;
}
bool MultiSlotDataFeed::CheckFile(const char* filename) {
CheckInit(); // get info of slots
std::ifstream fin(filename);
if (!fin.good()) {
VLOG(1) << "error: failed to open file<" << filename << ">";
return false;
}
std::string line;
int instance_count = 0;
std::string all_slots_alias = "";
for (const auto& alias : all_slots_) {
all_slots_alias += alias + " ";
}
std::string use_slots_alias = "";
for (const auto& alias : use_slots_) {
use_slots_alias += alias + " ";
}
VLOG(3) << "total slots num: " << all_slots_.size();
VLOG(3) << "total slots alias: " << all_slots_alias;
VLOG(3) << "used slots num: " << use_slots_.size();
VLOG(3) << "used slots alias: " << use_slots_alias;
while (getline(fin, line)) {
++instance_count;
const char* str = line.c_str();
char* endptr = const_cast<char*>(str);
int len = line.length();
for (size_t i = 0; i < all_slots_.size(); ++i) {
int num = strtol(endptr, &endptr, 10);
if (num < 0) {
VLOG(0) << "error: the number of ids is a negative number: " << num;
VLOG(0) << "please check line<" << instance_count << "> in file<"
<< filename << ">";
return false;
} else if (num == 0) {
VLOG(0)
<< "error: the number of ids can not be zero, you need to "
"pad it in the data generator; or if there is something wrong"
" with the data, please check if the data contains unresolvable "
"characters.";
VLOG(0) << "please check line<" << instance_count << "> in file<"
<< filename << ">";
return false;
} else if (errno == ERANGE || num > INT_MAX) {
VLOG(0) << "error: the number of ids is greater than INT_MAX";
VLOG(0) << "please check line<" << instance_count << "> in file<"
<< filename << ">";
return false;
}
if (all_slots_type_[i] == "float") {
for (int i = 0; i < num; ++i) {
strtof(endptr, &endptr);
if (errno == ERANGE) {
VLOG(0) << "error: the value is out of the range of "
"representable values for float";
VLOG(0) << "please check line<" << instance_count << "> in file<"
<< filename << ">";
return false;
}
if (i + 1 != num && endptr - str == len) {
VLOG(0) << "error: there is something wrong with the number of ids.";
VLOG(0) << "please check line<" << instance_count << "> in file<"
<< filename << ">";
return false;
}
}
} else if (all_slots_type_[i] == "uint64") {
for (int i = 0; i < num; ++i) {
strtoull(endptr, &endptr, 10);
if (errno == ERANGE) {
VLOG(0) << "error: the value is out of the range of "
"representable values for uint64_t";
VLOG(0) << "please check line<" << instance_count << "> in file<"
<< filename << ">";
return false;
}
if (i + 1 != num && endptr - str == len) {
VLOG(0) << "error: there is something wrong with the number of ids.";
VLOG(0) << "please check line<" << instance_count << "> in file<"
<< filename << ">";
return false;
}
}
} else {
VLOG(0) << "error: this type<" << all_slots_type_[i]
<< "> is not supported";
return false;
}
}
// Hadoop may append a '\t' character to the end of each line produced
// by a reduce task (when the reduce output has only one field, a
// trailing '\t' is added by default; the option
// `-D mapred.textoutputformat.ignoreseparator=true` avoids it). This
// does not affect the correctness of the data. Therefore, a line is
// judged abnormal only when it ends with characters that are not
// spaces.
while (endptr - str != len) {
if (!isspace(*(endptr++))) {
VLOG(0)
<< "error: there are extra characters at the end of the line.";
VLOG(0) << "please check line<" << instance_count << "> in file<"
<< filename << ">";
return false;
}
}
}
VLOG(3) << "instances cout: " << instance_cout;
VLOG(3) << "The file format is correct";
return true;
}
bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
std::string line;
if (getline(file_, line)) {
int use_slots_num = use_slots_.size();
instance->resize(use_slots_num);
// parse line
const char* str = line.c_str();
char* endptr = const_cast<char*>(str);
int pos = 0;
for (size_t i = 0; i < use_slots_index_.size(); ++i) {
int idx = use_slots_index_[i];
int num = strtol(&str[pos], &endptr, 10);
PADDLE_ENFORCE(
num,
"The number of ids can not be zero, you need padding "
"it in data generator; or if there is something wrong with "
"the data, please check if the data contains unresolvable "
"characters.\nplease check this error line: %s",
str);
if (idx != -1) {
(*instance)[idx].Init(all_slots_type_[i]);
if ((*instance)[idx].GetType()[0] == 'f') { // float
for (int j = 0; j < num; ++j) {
float feasign = strtof(endptr, &endptr);
(*instance)[idx].AddValue(feasign);
}
} else if ((*instance)[idx].GetType()[0] == 'u') { // uint64
for (int j = 0; j < num; ++j) {
uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10);
(*instance)[idx].AddValue(feasign);
}
}
pos = endptr - str;
} else {
for (int j = 0; j <= num; ++j) {
pos = line.find_first_of(' ', pos + 1);
}
}
}
} else {
return false;
}
return true;
}
void MultiSlotDataFeed::AddInstanceToInsVec(
std::vector<MultiSlotType>* ins_vec,
const std::vector<MultiSlotType>& instance, int index) {
if (index == 0) {
ins_vec->resize(instance.size());
for (size_t i = 0; i < instance.size(); ++i) {
(*ins_vec)[i].Init(instance[i].GetType());
(*ins_vec)[i].InitOffset();
}
}
for (size_t i = 0; i < instance.size(); ++i) {
(*ins_vec)[i].AddIns(instance[i]);
}
}
void MultiSlotDataFeed::PutToFeedVec(
const std::vector<MultiSlotType>& ins_vec) {
for (size_t i = 0; i < use_slots_.size(); ++i) {
const auto& type = ins_vec[i].GetType();
const auto& offset = ins_vec[i].GetOffset();
int total_instance = static_cast<int>(offset.back());
if (type[0] == 'f') { // float
const auto& feasign = ins_vec[i].GetFloatData();
if (feed_vec_[i].IsDense()) {
int size_in_each_batch = total_instance / batch_size_;
float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<float>(
{batch_size_, size_in_each_batch}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else {
float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data<float>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
LoD data_lod{offset};
feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
}
} else if (type[0] == 'u') { // uint64
// PaddlePaddle has no uint64_t tensor type, so int64_t is used
const auto& feasign = ins_vec[i].GetUint64Data();
if (feed_vec_[i].IsDense()) {
int size_in_each_batch = total_instance / batch_size_;
int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<int64_t>(
{batch_size_, size_in_each_batch}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
} else {
int64_t* tensor_ptr =
feed_vec_[i].GetLoDTensor()->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
LoD data_lod{offset};
feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
}
}
}
}
} // namespace framework
} // namespace paddle
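A worked example of the layouts PutToFeedVec produces (feasign counts assumed for illustration):

// Sparse uint64 slot, batch of 2 instances with 3 and 1 feasigns:
//   offset = {0, 3, 4} -> LoDTensor of shape {4, 1}, lod = {{0, 3, 4}}
// Dense float slot, 10 values over batch_size = 2:
//   size_in_each_batch = 10 / 2 = 5 -> Tensor of shape {2, 5}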
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/data_feed.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
namespace paddle {
namespace framework {
// MixTensor packs a Tensor or a LoDTensor into a single type, so that
// either kind of tensor can be recorded uniformly.
class MixTensor {
public:
MixTensor() {}
explicit MixTensor(LoDTensor* lodtensor) {
is_dense_ = false;
lodtensor_ = lodtensor;
}
explicit MixTensor(Tensor* tensor) {
is_dense_ = true;
tensor_ = tensor;
}
bool IsDense() { return is_dense_; }
LoDTensor* GetLoDTensor() {
PADDLE_ENFORCE(!is_dense_, "Cannot return a LoDTensor ptr for a dense var.");
return lodtensor_;
}
Tensor* GetTensor() {
PADDLE_ENFORCE(is_dense_, "Cannot return a Tensor ptr for a sparse var.");
return tensor_;
}
private:
bool is_dense_;
LoDTensor* lodtensor_;
Tensor* tensor_;
};
// DataFeed is the abstract base class for all other DataFeeds.
// It reads files and parses the data for the subsequent trainer.
// Example:
// DataFeed* reader =
// paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name);
// reader->Init(data_feed_desc); // data_feed_desc is a protobuf object
// reader->SetFileList(filelist);
// const std::vector<std::string> & use_slot_alias =
// reader->GetUseSlotAlias();
// for (auto name: use_slot_alias){ // for binding memory
// reader->AddFeedVar(scope->Var(name), name);
// }
// reader->Start();
// while (reader->Next()) {
// // trainer do something
// }
class DataFeed {
public:
DataFeed() {}
virtual ~DataFeed() {}
virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
virtual bool CheckFile(const char* filename) {
PADDLE_THROW("This function(CheckFile) is not implemented.");
}
// Set the filelist for the DataFeed.
// Note that all readers must be initialized before this function is
// called; otherwise, Init() will reset the finish_set_filelist_ flag.
virtual bool SetFileList(const std::vector<std::string>& files);
virtual bool Start() = 0;
// The trainer calls the Next() function, and the DataFeed will load a new
// batch to the feed_vec. The return value of this function is the batch
// size of the current batch.
virtual int Next() = 0;
// Get the aliases of all slots defined in the proto file
virtual const std::vector<std::string>& GetAllSlotAlias() {
return all_slots_;
}
// Get the aliases of the used slots defined in the proto file
virtual const std::vector<std::string>& GetUseSlotAlias() {
return use_slots_;
}
// This function is used for binding feed_vec memory
virtual void AddFeedVar(Variable* var, const std::string& name);
protected:
// The following three functions are used to check that calls happen in
// this order:
// Init() -> SetFileList() -> Start() -> Next()
virtual void CheckInit();
virtual void CheckSetFileList();
virtual void CheckStart();
virtual void SetBatchSize(
int batch); // batch size will be set in Init() function
// Pick one file from the global filelist (thread safe).
virtual bool PickOneFile(std::string* filename);
static std::vector<std::string> filelist_;
static size_t file_idx_;
static std::mutex mutex_for_pick_file_;
// the aliases of the used slots; their order is determined by
// data_feed_desc (proto object)
std::vector<std::string> use_slots_;
std::vector<bool> use_slots_is_dense_;
// the aliases of all slots; their order is determined by data_feed_desc
// (proto object)
std::vector<std::string> all_slots_;
std::vector<std::string> all_slots_type_;
std::vector<int>
use_slots_index_; // -1: not used; >=0: the index of use_slots_
// The data read by DataFeed will be stored here
std::vector<MixTensor> feed_vec_;
// the batch size defined by user
int default_batch_size_;
// current batch size
int batch_size_;
bool finish_init_;
static bool finish_set_filelist_;
bool finish_start_;
};
// PrivateQueueDataFeed is the base virtual class for other DataFeeds.
// It uses a read thread to read files and parse data into a private
// (thread-level) queue, and fetches data from this queue when the
// trainer calls Next().
template <typename T>
class PrivateQueueDataFeed : public DataFeed {
public:
PrivateQueueDataFeed() {}
virtual ~PrivateQueueDataFeed() {}
virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
virtual bool Start();
virtual int Next();
protected:
// The thread function that reads files and parses the data.
virtual void ReadThread();
// Set the private queue size; reading is most efficient when the
// queue size is close to the batch size.
virtual void SetQueueSize(int queue_size);
// The reading and parsing method called in the ReadThread.
virtual bool ParseOneInstance(T* instance) = 0;
// Put one parsed instance into vec_ins.
virtual void AddInstanceToInsVec(T* vec_ins, const T& instance,
int index) = 0;
// Put ins_vec into feed_vec.
virtual void PutToFeedVec(const T& ins_vec) = 0;
// The thread that reads files.
std::thread read_thread_;
// Reading and parsing line by line with ifstream is faster than
// reading and parsing buffer by buffer with fread.
// For 601MB of real data:
// ifstream, line-by-line parse: 6034 ms
// fread, buffer-by-buffer parse: 7097 ms
std::ifstream file_;
size_t queue_size_;
// The queue that stores the parsed data
std::unique_ptr<paddle::operators::reader::BlockingQueue<T>> queue_;
};
// This class defines the data type of an instance (ins_vec) in MultiSlotDataFeed
class MultiSlotType {
public:
MultiSlotType() {}
~MultiSlotType() {}
void Init(const std::string& type) {
CheckType(type);
if (type_[0] == 'f') {
float_feasign_.clear();
} else if (type_[0] == 'u') {
uint64_feasign_.clear();
}
type_ = type;
}
void InitOffset() {
offset_.resize(1);
// A LoDTensor's lod starts from 0, so the lod has one more
// element than the data.
offset_[0] = 0;
}
const std::vector<size_t>& GetOffset() const { return offset_; }
void AddValue(const float v) {
CheckFloat();
float_feasign_.push_back(v);
}
void AddValue(const uint64_t v) {
CheckUint64();
uint64_feasign_.push_back(v);
}
void AddIns(const MultiSlotType& ins) {
if (ins.GetType()[0] == 'f') { // float
CheckFloat();
auto& vec = ins.GetFloatData();
offset_.push_back(offset_.back() + vec.size());
float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end());
} else if (ins.GetType()[0] == 'u') { // uint64
CheckUint64();
auto& vec = ins.GetUint64Data();
offset_.push_back(offset_.back() + vec.size());
uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end());
}
}
const std::vector<float>& GetFloatData() const { return float_feasign_; }
const std::vector<uint64_t>& GetUint64Data() const { return uint64_feasign_; }
const std::string& GetType() const { return type_; }
private:
void CheckType(const std::string& type) const {
PADDLE_ENFORCE((type == "uint64") || (type == "float"),
"There is no this type<%s>.", type);
}
void CheckFloat() const {
PADDLE_ENFORCE(type_[0] == 'f', "Cannot add a float value to a %s slot.", type_);
}
void CheckUint64() const {
PADDLE_ENFORCE(type_[0] == 'u', "Cannot add a uint64 value to a %s slot.", type_);
}
std::vector<float> float_feasign_;
std::vector<uint64_t> uint64_feasign_;
std::string type_;
std::vector<size_t> offset_;
};
// This DataFeed is used to feed multi-slot type data.
// The format of multi-slot type data:
// [n feasign_0 feasign_1 ... feasign_n]*
class MultiSlotDataFeed
: public PrivateQueueDataFeed<std::vector<MultiSlotType>> {
public:
MultiSlotDataFeed() {}
virtual ~MultiSlotDataFeed() {}
virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc);
virtual bool CheckFile(const char* filename);
protected:
virtual void AddInstanceToInsVec(std::vector<MultiSlotType>* vec_ins,
const std::vector<MultiSlotType>& instance,
int index);
virtual bool ParseOneInstance(std::vector<MultiSlotType>* instance);
virtual void PutToFeedVec(const std::vector<MultiSlotType>& ins_vec);
};
} // namespace framework
} // namespace paddle
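A minimal sketch of MultiSlotType's offset bookkeeping (values illustrative); the offsets accumulated by AddIns are exactly what PutToFeedVec installs as the LoD:

paddle::framework::MultiSlotType batch;
batch.Init("uint64");
batch.InitOffset();                         // offset_ = {0}
paddle::framework::MultiSlotType ins;
ins.Init("uint64");
ins.AddValue(static_cast<uint64_t>(3978));
ins.AddValue(static_cast<uint64_t>(620));
batch.AddIns(ins);                          // offset_ = {0, 2}
batch.AddIns(ins);                          // offset_ = {0, 2, 4}
// batch.GetOffset() is the per-instance LoD used by PutToFeedVec.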
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.framework;
message Slot {
required string name = 1;
required string type = 2;
optional bool is_dense = 3 [ default = false ];
optional bool is_used = 4 [ default = false ];
}
message MultiSlotDesc { repeated Slot slots = 1; }
message DataFeedDesc {
optional string name = 1;
optional int32 batch_size = 2 [ default = 32 ];
optional MultiSlotDesc multi_slot_desc = 3;
}
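A DataFeedDesc is normally written as a text-format proto and parsed at runtime, as AsyncExecutor::RunFromFile does with TextFormat::ParseFromString; a sketch with illustrative field values:

#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/data_feed.pb.h"

paddle::framework::DataFeedDesc MakeDesc() {
  const char* text =
      "name: \"MultiSlotDataFeed\"\n"
      "batch_size: 32\n"
      "multi_slot_desc {\n"
      "  slots { name: \"click\" type: \"uint64\""
      " is_dense: false is_used: true }\n"
      "}\n";
  paddle::framework::DataFeedDesc desc;
  google::protobuf::TextFormat::ParseFromString(text, &desc);
  return desc;
}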
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_feed_factory.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_feed.h"
namespace paddle {
namespace framework {
typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
data_feedMap g_data_feed_map;
#define REGISTER_DATAFEED_CLASS(data_feed_class) \
namespace { \
std::shared_ptr<DataFeed> Creator_##data_feed_class() { \
return std::shared_ptr<DataFeed>(new data_feed_class); \
} \
class __Registerer_##data_feed_class { \
public: \
__Registerer_##data_feed_class() { \
g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \
} \
}; \
__Registerer_##data_feed_class g_registerer_##data_feed_class; \
} // namespace
std::string DataFeedFactory::DataFeedTypeList() {
std::string data_feed_types;
for (auto iter = g_data_feed_map.begin(); iter != g_data_feed_map.end();
++iter) {
if (iter != g_data_feed_map.begin()) {
data_feed_types += ", ";
}
data_feed_types += iter->first;
}
return data_feed_types;
}
std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
std::string data_feed_class) {
if (g_data_feed_map.count(data_feed_class) < 1) {
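// Unknown DataFeed type name: nothing sensible to construct, so abort.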
exit(-1);
}
return g_data_feed_map[data_feed_class]();
}
REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
} // namespace framework
} // namespace paddle
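REGISTER_DATAFEED_CLASS and g_data_feed_map are local to this translation unit, so new feed types are registered here the same way MultiSlotDataFeed is above. A hypothetical sketch (the subclass name is illustrative, and the code would live inside namespace paddle::framework in this file):

// Hypothetical subclass; the macro's static registerer maps the class
// name to a creator in g_data_feed_map at static-initialization time.
class MyMultiSlotDataFeed : public MultiSlotDataFeed {
  // override Init() / ParseOneInstance() as needed
};
REGISTER_DATAFEED_CLASS(MyMultiSlotDataFeed);
// DataFeedFactory::CreateDataFeed("MyMultiSlotDataFeed") then works.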
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/framework/data_feed.h"
namespace paddle {
namespace framework {
class DataFeedFactory {
public:
static std::string DataFeedTypeList();
static std::shared_ptr<DataFeed> CreateDataFeed(std::string data_feed_class);
};
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/data_feed.h"
#include <fcntl.h>
#include <chrono> // NOLINT
#include <fstream>
#include <iostream>
#include <map>
#include <mutex> // NOLINT
#include <set>
#include <thread> // NOLINT
#include <utility>
#include <vector>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
paddle::framework::DataFeedDesc load_datafeed_param_from_file(
const char* filename) {
paddle::framework::DataFeedDesc data_feed_desc;
int file_descriptor = open(filename, O_RDONLY);
PADDLE_ENFORCE(file_descriptor != -1, "Can not open %s.", filename);
google::protobuf::io::FileInputStream fileInput(file_descriptor);
google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc);
close(file_descriptor);
return data_feed_desc;
}
const std::vector<std::string> load_filelist_from_file(const char* filename) {
std::vector<std::string> filelist;
std::ifstream fin(filename);
PADDLE_ENFORCE(fin.good(), "Can not open %s.", filename);
std::string line;
while (getline(fin, line)) {
filelist.push_back(line);
}
fin.close();
return filelist;
}
void GenerateFileForTest(const char* protofile, const char* filelist) {
std::ofstream w_protofile(protofile);
w_protofile << "name: \"MultiSlotDataFeed\"\n"
"batch_size: 2\n"
"multi_slot_desc {\n"
" slots {\n"
" name: \"uint64_sparse_slot\"\n"
" type: \"uint64\"\n"
" is_dense: false\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"float_sparse_slot\"\n"
" type: \"float\"\n"
" is_dense: false\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"uint64_dense_slot\"\n"
" type: \"uint64\"\n"
" is_dense: true\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"float_dense_slot\"\n"
" type: \"float\"\n"
" is_dense: true\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"not_used_slot\"\n"
" type: \"uint64\"\n"
" is_dense: false\n"
" is_used: false\n"
" }\n"
"}";
w_protofile.close();
std::ofstream w_filelist(filelist);
int total_file = 4;
for (int i = 0; i < total_file; ++i) {
std::string filename = "TestMultiSlotDataFeed.data." + std::to_string(i);
w_filelist << filename;
if (i + 1 != total_file) {
w_filelist << std::endl;
}
std::ofstream w_datafile(filename.c_str());
w_datafile << "3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996\n"
"2 1300 2983353 1 985.211 1 8 1 0.618 1 12\n"
"1 19260827 2 3.14 2.718 1 27 1 2.236 1 28\n";
w_datafile.close();
}
w_filelist.close();
}
class MultiTypeSet {
public:
MultiTypeSet() {
uint64_set_.clear();
float_set_.clear();
}
~MultiTypeSet() {}
void AddValue(uint64_t v) { uint64_set_.insert(v); }
void AddValue(float v) { float_set_.insert(v); }
const std::set<uint64_t>& GetUint64Set() const { return uint64_set_; }
const std::set<float>& GetFloatSet() const { return float_set_; }
private:
std::set<uint64_t> uint64_set_;
std::set<float> float_set_;
};
void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
const paddle::framework::DataFeedDesc& data_feed_desc,
const std::vector<std::string>& filelist,
const int thread_num) {
int used_slot_num = 0;
for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
++used_slot_num;
}
}
reader_elem_set->resize(used_slot_num);
std::vector<std::thread> threads;
std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers;
readers.resize(thread_num);
for (int i = 0; i < thread_num; ++i) {
readers[i] = paddle::framework::DataFeedFactory::CreateDataFeed(
data_feed_desc.name());
readers[i]->Init(data_feed_desc);
}
readers[0]->SetFileList(filelist);
std::mutex mu;
for (int idx = 0; idx < thread_num; ++idx) {
threads.emplace_back(std::thread([&, idx] {
std::unique_ptr<paddle::framework::Scope> scope(
new paddle::framework::Scope());
const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
std::map<std::string, const paddle::framework::LoDTensor*>
lodtensor_targets;
std::map<std::string, const paddle::framework::Tensor*> tensor_targets;
for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
const auto& slot = multi_slot_desc.slots(i);
if (slot.is_used()) {
const auto& name = slot.name();
readers[idx]->AddFeedVar(scope->Var(name), name);
if (slot.is_dense()) {
tensor_targets[name] =
&scope->FindVar(name)->Get<paddle::framework::Tensor>();
} else {
lodtensor_targets[name] =
&scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
}
}
}
readers[idx]->Start();
while (readers[idx]->Next()) {
int index = 0;
for (int k = 0; k < multi_slot_desc.slots_size(); ++k) {
const auto& slot = multi_slot_desc.slots(k);
if (!slot.is_used()) {
continue;
}
if (slot.is_dense()) { // dense branch
const paddle::framework::Tensor* tens = tensor_targets[slot.name()];
if (slot.type() == "uint64") {
const int64_t* data = tens->data<int64_t>();
int batch_size = tens->dims()[0];
int dim = tens->dims()[1];
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < dim; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue(
(uint64_t)data[i * dim + j]);
}
}
} else if (slot.type() == "float") {
const float* data = tens->data<float>();
int batch_size = tens->dims()[0];
int dim = tens->dims()[1];
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < dim; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue(data[i * dim + j]);
}
}
} else {
PADDLE_THROW("Error type in proto file.");
}
} else { // sparse branch
const paddle::framework::LoDTensor* tens =
lodtensor_targets[slot.name()];
if (slot.type() == "uint64") {
const int64_t* data = tens->data<int64_t>();
for (size_t i = 0; i < tens->NumElements(); ++i) {
std::pair<size_t, size_t> element = tens->lod_element(0, i);
for (size_t j = element.first; j < element.second; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue((uint64_t)data[j]);
}
}
} else if (slot.type() == "float") {
const float* data = tens->data<float>();
for (size_t i = 0; i < tens->NumElements(); ++i) {
std::pair<size_t, size_t> element = tens->lod_element(0, i);
for (size_t j = element.first; j < element.second; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue(data[j]);
}
}
} else {
PADDLE_THROW("Error type in proto file.");
}
} // end sparse branch
++index;
} // end slots loop
} // end while Next()
})); // end anonymous function
}
for (auto& th : threads) {
th.join();
}
}
void CheckIsUnorderedSame(const std::vector<MultiTypeSet>& s1,
const std::vector<MultiTypeSet>& s2) {
EXPECT_EQ(s1.size(), s2.size());
for (size_t i = 0; i < s1.size(); ++i) {
// check for uint64
const std::set<uint64_t>& uint64_s1 = s1[i].GetUint64Set();
const std::set<uint64_t>& uint64_s2 = s2[i].GetUint64Set();
EXPECT_EQ(uint64_s1.size(), uint64_s2.size());
auto uint64_it1 = uint64_s1.begin();
auto uint64_it2 = uint64_s2.begin();
while (uint64_it1 != uint64_s1.end()) {
EXPECT_EQ(*uint64_it1, *uint64_it2);
++uint64_it1;
++uint64_it2;
}
// check for float
const std::set<float>& float_s1 = s1[i].GetFloatSet();
const std::set<float>& float_s2 = s2[i].GetFloatSet();
EXPECT_EQ(float_s1.size(), float_s2.size());
auto float_it1 = float_s1.begin();
auto float_it2 = float_s2.begin();
while (float_it1 != float_s1.end()) {
EXPECT_EQ(*float_it1, *float_it2);
++float_it1;
++float_it2;
}
}
}
void GetElemSetFromFile(std::vector<MultiTypeSet>* file_elem_set,
const paddle::framework::DataFeedDesc& data_feed_desc,
const std::vector<std::string>& filelist) {
int used_slot_num = 0;
for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
++used_slot_num;
}
}
file_elem_set->resize(used_slot_num);
for (const auto& file : filelist) {
std::ifstream fin(file.c_str());
    PADDLE_ENFORCE(fin.good(), "Cannot open %s.", file.c_str());
while (1) {
bool end_flag = false;
int index = 0;
for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
int num;
if (fin >> num) {
auto slot = data_feed_desc.multi_slot_desc().slots(i);
auto type = slot.type();
if (type == "uint64") {
while (num--) {
uint64_t feasign;
fin >> feasign;
if (slot.is_used()) {
(*file_elem_set)[index].AddValue(feasign);
}
}
} else if (type == "float") {
while (num--) {
float feasign;
fin >> feasign;
if (slot.is_used()) {
(*file_elem_set)[index].AddValue(feasign);
}
}
} else {
PADDLE_THROW("Error type in proto file.");
}
if (slot.is_used()) {
++index;
}
} else {
end_flag = true;
break;
}
}
if (end_flag) {
break;
}
}
fin.close();
}
}
TEST(DataFeed, MultiSlotUnitTest) {
const char* protofile = "data_feed_desc.prototxt";
const char* filelist_name = "filelist.txt";
GenerateFileForTest(protofile, filelist_name);
const std::vector<std::string> filelist =
load_filelist_from_file(filelist_name);
paddle::framework::DataFeedDesc data_feed_desc =
load_datafeed_param_from_file(protofile);
std::vector<MultiTypeSet> reader_elem_set;
std::vector<MultiTypeSet> file_elem_set;
GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4);
GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist);
CheckIsUnorderedSame(reader_elem_set, file_elem_set);
}
......@@ -48,7 +48,14 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
  // FIXME(typhoonzero): If scope0 (the global scope) has NCCL_ID_VAR,
  // this is a distributed or inter-process call; find a better way.
#ifdef PADDLE_WITH_CUDA
if (NoDummyInputSize() == 1 &&
local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
#else
if (NoDummyInputSize() == 1) {
#endif
    return;  // No need to all-reduce when the GPU count is 1.
} else {
// Wait input done
......
......@@ -62,6 +62,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
multi_devices_pass->Set<int>("num_trainers",
new int(strategy_.num_trainers_));
// Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) {
......
......@@ -133,6 +133,7 @@ static const char kPlaces[] = "places";
static const char kParams[] = "params";
static const char kLocalScopes[] = "local_scopes";
static const char kStrategy[] = "strategy";
static const char kNumTrainers[] = "num_trainers";
void MultiDevSSAGraphBuilder::Init() const {
all_vars_.clear();
......@@ -299,6 +300,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
auto nodes = graph->ReleaseNodes();
ir::Graph &result = *graph;
int num_trainers = Get<int>(kNumTrainers);
for (auto &node : nodes) {
if (node->IsVar() && node->Var()) {
all_vars_.emplace(node->Name(), node->Var());
......@@ -383,7 +386,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateComputationalOps(&result, node, places_.size());
}
if (!is_forwarding && places_.size() > 1) {
if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) {
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
......@@ -862,7 +865,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
if (node->Op()->Type() == "fetch_barrier") {
outvar_dev_id =
GetVarDeviceID(*result, output->Name(), *sharded_var_device);
PADDLE_ENFORCE_NE(outvar_dev_id, -1);
PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name());
}
p = places_[outvar_dev_id];
ir::Node *new_node = nullptr;
......@@ -895,4 +898,5 @@ REGISTER_PASS(multi_devices_pass,
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kParams)
.RequirePassAttr(paddle::framework::details::kLocalScopes)
.RequirePassAttr(paddle::framework::details::kStrategy);
.RequirePassAttr(paddle::framework::details::kStrategy)
.RequirePassAttr(paddle::framework::details::kNumTrainers);
......@@ -16,7 +16,7 @@
#include <stdexcept>
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -114,36 +115,6 @@ void Executor::Close() {
#endif
}
void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope*>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
var_type);
}
}
void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
int block_id) {
auto& global_block = pdesc.Block(block_id);
......
......@@ -26,7 +26,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
template <typename T>
std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace framework {
void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) {
auto& block = program.Block(0);
op_names_.clear();
for (auto& op_desc : block.AllOps()) {
std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
op_names_.push_back(op_desc->Type());
OperatorBase* local_op_ptr = local_op.release();
ops_.push_back(local_op_ptr);
}
}
void ExecutorThreadWorker::CreateThreadResource(
const framework::ProgramDesc& program,
const paddle::platform::Place& place) {
CreateThreadScope(program);
CreateThreadOperators(program);
SetMainProgram(program);
SetPlace(place);
}
void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) {
auto& block = program.Block(0);
PADDLE_ENFORCE_NOT_NULL(
root_scope_, "root_scope should be set before creating thread scope");
thread_scope_ = &root_scope_->NewScope();
for (auto& var : block.AllVars()) {
if (var->Persistable()) {
auto* ptr = root_scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
} else {
auto* ptr = thread_scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
}
}
}
void ExecutorThreadWorker::SetDataFeed(
const std::shared_ptr<DataFeed>& datafeed) {
thread_reader_ = datafeed;
}
void ExecutorThreadWorker::BindingDataFeedMemory() {
const std::vector<std::string>& input_feed =
thread_reader_->GetUseSlotAlias();
for (auto name : input_feed) {
thread_reader_->AddFeedVar(thread_scope_->Var(name), name);
}
}
void ExecutorThreadWorker::SetFetchVarNames(
const std::vector<std::string>& fetch_var_names) {
fetch_var_names_.clear();
fetch_var_names_.insert(fetch_var_names_.end(), fetch_var_names.begin(),
fetch_var_names.end());
}
void ExecutorThreadWorker::SetDevice() {
#if defined _WIN32 || defined __APPLE__
return;
#else
static unsigned concurrency_cap = std::thread::hardware_concurrency();
int thread_id = this->thread_id_;
  if (thread_id >= 0 && static_cast<unsigned>(thread_id) < concurrency_cap) {
unsigned proc = thread_id;
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(proc, &mask);
if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) {
VLOG(1) << "WARNING: Failed to set thread affinity for thread "
<< thread_id;
} else {
CPU_ZERO(&mask);
if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) ||
(CPU_ISSET(proc, &mask) == 0)) {
VLOG(3) << "WARNING: Failed to set thread affinity for thread "
<< thread_id;
}
}
} else {
VLOG(1) << "WARNING: Failed to set thread affinity for thread "
<< thread_id;
}
#endif
}
template <typename T>
void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) {
auto inspect = lod_tensor.data<T>();
auto element_num = lod_tensor.numel();
std::ostringstream sstream;
sstream << var_name << " (element num " << element_num << "): [";
  if (element_num > 0) {
    sstream << inspect[0];
    for (int j = 1; j < element_num; ++j) {
      sstream << " " << inspect[j];
    }
  }
sstream << "]";
std::cout << sstream.str() << std::endl;
}
void print_fetch_var(Scope* scope, std::string var_name) {
const LoDTensor& tensor = scope->FindVar(var_name)->Get<LoDTensor>();
if (std::type_index(tensor.type()) ==
std::type_index(typeid(platform::float16))) {
print_lod_tensor<platform::float16>(var_name, tensor);
} else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) {
print_lod_tensor<float>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(double))) {
print_lod_tensor<double>(var_name, tensor);
} else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) {
print_lod_tensor<int>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(int64_t))) {
print_lod_tensor<int64_t>(var_name, tensor);
} else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) {
print_lod_tensor<bool>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(uint8_t))) {
print_lod_tensor<uint8_t>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(int16_t))) {
print_lod_tensor<int16_t>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(int8_t))) {
print_lod_tensor<int8_t>(var_name, tensor);
} else {
VLOG(1) << "print_fetch_var: unrecognized data type:"
<< tensor.type().name();
}
return;
}
void ExecutorThreadWorker::TrainFiles() {
// todo: configurable
SetDevice();
int fetch_var_num = fetch_var_names_.size();
fetch_values_.clear();
fetch_values_.resize(fetch_var_num);
thread_reader_->Start();
int cur_batch;
int batch_cnt = 0;
while ((cur_batch = thread_reader_->Next()) > 0) {
// executor run here
for (auto& op : ops_) {
op->Run(*thread_scope_, place_);
}
++batch_cnt;
thread_scope_->DropKids();
    if (!debug_ || thread_id_ != 0) {
continue;
}
for (int i = 0; i < fetch_var_num; ++i) {
print_fetch_var(thread_scope_, fetch_var_names_[i]);
} // end for (int i = 0...)
} // end while ()
}
void ExecutorThreadWorker::SetThreadId(int tid) { thread_id_ = tid; }
void ExecutorThreadWorker::SetPlace(const platform::Place& place) {
place_ = place;
}
void ExecutorThreadWorker::SetMainProgram(
const ProgramDesc& main_program_desc) {
main_program_.reset(new ProgramDesc(main_program_desc));
}
void ExecutorThreadWorker::SetRootScope(Scope* g_scope) {
root_scope_ = g_scope;
}
}  // end namespace framework
} // end namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
void CreateTensor(Variable* var, proto::VarType::Type var_type);
class ExecutorThreadWorker {
public:
ExecutorThreadWorker()
: thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
~ExecutorThreadWorker() {}
void CreateThreadResource(const framework::ProgramDesc& program,
const paddle::platform::Place& place);
void SetThreadId(int tid);
void SetDebug(const bool debug) { debug_ = debug; }
void SetRootScope(Scope* g_scope);
  // set the CPU device in this function;
  // CPU core binding is used by default
void SetDevice();
  // Since data is read into memory that the program cannot access directly,
  // we need to bind that memory to the corresponding variables in the program.
  // This function should be called after the data feed is set.
void BindingDataFeedMemory();
// set data feed declared in executor
void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
// A multi-thread training function
void TrainFiles();
  // set the fetch variable names assigned by users via the Python interface
void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
private:
void CreateThreadScope(const framework::ProgramDesc& program);
void CreateThreadOperators(const framework::ProgramDesc& program);
void SetMainProgram(const ProgramDesc& main_program_desc);
void SetPlace(const paddle::platform::Place& place);
protected:
  std::shared_ptr<DataFeed> thread_reader_;  // shared queue, thread buffer
  // thread index
  int thread_id_;
  // operator names
std::vector<std::string> op_names_;
// thread level, local operators for forward and backward
std::vector<OperatorBase*> ops_;
// main program for training
std::unique_ptr<framework::ProgramDesc> main_program_;
// execution place
platform::Place place_;
// root scope for model parameters
Scope* root_scope_;
  // a thread scope; its parent is the shared global scope
Scope* thread_scope_;
private:
std::vector<std::string> fetch_var_names_;
std::vector<std::vector<float>> fetch_values_;
bool debug_;
};
} // namespace framework
} // namespace paddle
......@@ -177,14 +177,13 @@ class Graph {
return nullptr;
}
const ProgramDesc &program() const { return program_; }
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
void ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
private:
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
// This method takes ownership of `node`.
ir::Node *AddNode(ir::Node *node) {
PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
......
......@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (op->HasAttr("is_test")) {
if (n->RuntimeHasAttr("is_test")) {
op->SetAttr("is_test", true);
} else if (std::find(begin(op_list), end(op_list), op->Type()) !=
end(op_list)) {
......
......@@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv3") {
ASSERT_FALSE(op->HasAttr("is_test"));
ASSERT_FALSE(node->RuntimeHasAttr("is_test"));
} else {
ASSERT_TRUE(op->HasAttr("is_test"));
ASSERT_TRUE(node->RuntimeHasAttr("is_test"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
}
}
......
......@@ -22,7 +22,7 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Aplies MKL-DNN placement strategy.";
for (const Node* n : graph->Nodes()) {
if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
n->Op()->SetAttr("use_mkldnn", true);
}
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_info.h"
namespace paddle {
namespace framework {
......@@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[];
const char Node::kControlDepVarName[] = "__control_var";
#endif
std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
Node::Type type) {
return std::unique_ptr<Node>(new Node(name, type));
}
bool Node::RuntimeHasAttr(const std::string &name) const {
if (Op()->HasAttr(name)) {
return true;
} else {
auto &op_info = OpInfoMap::Instance();
auto op_type = Op()->Type();
if (op_info.Has(op_type)) {
auto op_info_ptr = op_info.Get(op_type);
if (op_info_ptr.HasOpProtoAndChecker()) {
const proto::OpProto &proto = op_info_ptr.Proto();
for (int i = 0; i != proto.attrs_size(); ++i) {
const proto::OpProto::Attr &attr = proto.attrs(i);
if (attr.name() == name) {
return true;
}
}
}
}
}
return false;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -108,6 +108,18 @@ class Node {
Name().find(ir::Node::kControlDepVarName) != std::string::npos;
}
  // RuntimeHasAttr is different from HasAttr now.
  // 1. Op()->HasAttr() checks whether the stored program_desc_ has the attr,
  //    so if the stored program_desc_ is old and lacks the attr, a newer
  //    library that has already added the attr will fail in this function.
  //    Details:
  //    https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
  // 2. Op()->RuntimeHasAttr checks the attr at runtime to avoid the above
  //    problem.
// TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
// RuntimeHasAttr.
bool RuntimeHasAttr(const std::string& name) const;
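  // Illustrative sketch (mirrors the is_test_pass change in this commit): a
  // pass probing an attr that may be missing from an old program_desc_ writes
  //   if (n->RuntimeHasAttr("is_test")) { n->Op()->SetAttr("is_test", true); }
  // instead of calling n->Op()->HasAttr("is_test") directly.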
std::vector<Node*> inputs;
std::vector<Node*> outputs;
......
......@@ -21,42 +21,11 @@
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
// This code can be shared with Executor.
static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope *>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
var_type);
}
}
void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
int block_id, bool with_feed_fetch_ops) {
if (!scope) {
......
......@@ -15,23 +15,105 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <functional>
#include <vector>
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
static std::shared_ptr<ngraph::Node> GetNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
const VariableNameMap& var_map,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = var_map.at(prm);
PADDLE_ENFORCE_EQ(var_names.size(), 1,
"op %s prm %s expects one associated var", op->Type(), prm);
if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
return (*ngb_node_map)[var_names[0]];
} else {
return nullptr;
}
}
static std::shared_ptr<ngraph::Node> GetInputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Inputs(), ngb_node_map);
}
static std::shared_ptr<ngraph::Node> GetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Outputs(), ngb_node_map);
}
static void SetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<ngraph::Node> node,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = op->Outputs().at(prm);
if (var_names.size() == 1) {
(*ngb_node_map)[var_names[0]] = node;
  } else if (var_names.empty()) {
    (*ngb_node_map)[""] = node;
  } else {
    PADDLE_THROW("prm %s has more than one var_name.", prm);
  }
}
static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
const std::string prm) {
auto& outputs = op->Outputs();
if (outputs.find(prm) == outputs.end()) return false;
return outputs.at(prm).size() > 0;
}
template <typename T>
static void BuildBinaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = GetInputNode(op, "X", ngb_node_map);
auto y = GetInputNode(op, "Y", ngb_node_map);
auto out = std::make_shared<T>(x, y);
SetOutputNode(op, "Out", out, ngb_node_map);
}
template <typename T>
static void BuildUnaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto input = GetInputNode(op, "X", ngb_node_map);
auto out = std::make_shared<T>(input);
SetOutputNode(op, "Out", out, ngb_node_map);
}
std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {};
NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
{"tanh", BuildUnaryNode<ngraph::op::Tanh>}};
void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map);
NG_NODE_MAP[op_type](op, ngb_node_map_);
}
} // namespace framework
......
......@@ -20,16 +20,14 @@ limitations under the License. */
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
#include "ngraph/node.hpp"
namespace paddle {
namespace framework {
class OperatorBase;
class NgraphBridge {
public:
static std::map<
......@@ -43,14 +41,14 @@ class NgraphBridge {
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map)
: ngb_node_map(var_node_map) {}
: ngb_node_map_(var_node_map) {}
void build_graph(const std::shared_ptr<OperatorBase>& op);
void BuildNgNode(const std::shared_ptr<OperatorBase>& op);
private:
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map;
ngb_node_map_;
};
} // namespace framework
......
......@@ -19,14 +19,29 @@ limitations under the License. */
#include <map>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
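// Converts a paddle DDim into an ngraph::Shape; a 0-valued (e.g. unset batch)
// dimension is promoted to 1, so DDim(0, 3) maps to Shape{1, 3}.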
static ngraph::Shape Ddim2Shape(const DDim& dims) {
ngraph::Shape sp;
for (int i = 0; i < dims.size(); ++i) {
int k = dims[i];
k = k == 0 ? 1 : k;
sp.push_back(k);
}
return sp;
}
static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
{proto::VarType::FP32, ngraph::element::f32},
{proto::VarType::FP64, ngraph::element::f64},
......@@ -42,6 +57,7 @@ typedef enum { /* nGraph support state on ops */
PARTIAL_TEST /* Support partial list of ops for test */
} op_state;
// perform graph build through bridge and execute computation
class NgraphOperator {
public:
explicit NgraphOperator(const Scope& scope, const platform::Place& place,
......@@ -59,13 +75,23 @@ class NgraphOperator {
persistables_(persist),
fetches_(fetches),
post_op_inputs_(post_op_inputs),
ng_op_state_(ng_op_state) {}
ng_op_state_(ng_op_state) {
var_in_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
var_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
BuildNgIO();
GetNgFunction();
}
void Run(const Scope& scope, const platform::Place& place) const;
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache;
func_cache_;
const Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
......@@ -74,6 +100,35 @@ class NgraphOperator {
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
op_state ng_op_state_;
// ngraph backend eg. CPU
static std::shared_ptr<ngraph::runtime::Backend> backend_;
// ngraph function to call and execute
std::shared_ptr<ngraph::Function> ngraph_function_;
// var_name of inputs
std::vector<std::string> var_in_;
// var_name of outputs from fetch in order
std::vector<std::string> var_out_;
// map input vars to nodes
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_in_node_map_;
// map each var name with a ngraph node
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_;
// cache key to check if function is cached
std::shared_ptr<std::string> GetCacheKey();
// get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<OperatorBase> op);
// Call ngraph bridge to map ops
void BuildNgNodes();
// get the ngraph input and output var list
void BuildNgIO();
// build ngraph function call
void BuildNgFunction();
// Check cache for ngraph function or otherwise build the function
void GetNgFunction();
};
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
......@@ -86,7 +141,7 @@ FusedOperator::FusedOpIntervals(
}
size_t size = ops->size();
size_t left = 0;
while (left < size && ops.at(left)->Type() != kFeedOpType) {
while (left < size && ops->at(left)->Type() != kFeedOpType) {
++left;
}
if (left == size) {
......@@ -116,7 +171,7 @@ FusedOperator::FusedOpIntervals(
size_t start = pivot, end = start;
while (pivot < right &&
(paddle::framework::NgraphBridge::NG_NODE_MAP.find(
ops.at(pivot)->Type()) !=
ops->at(pivot)->Type()) !=
paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
++pivot;
++end;
......@@ -136,7 +191,9 @@ FusedOperator::FusedOperator(
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) {
: OperatorBase(type, inputs, outputs, attrs),
pdesc_(prog),
block_(block_id) {
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
it != end; ++it) {
fused_ops_.push_back(std::move(*it));
......@@ -152,7 +209,7 @@ FusedOperator::FusedOperator(
}
if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
is_complete = true;
is_full_ = true;
}
Process();
......@@ -205,7 +262,7 @@ void FusedOperator::RunImpl(const Scope& scope,
}
}
if (is_full) {
if (is_full_) {
ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
}
......@@ -215,6 +272,280 @@ void FusedOperator::RunImpl(const Scope& scope,
ngraph_op.Run(scope, place);
}
std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
NgraphOperator::func_cache_ = {};
std::shared_ptr<ngraph::runtime::Backend> NgraphOperator::backend_ =
ngraph::runtime::Backend::create("CPU");
void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
op->RuntimeInferShape(scope_, place_);
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto sp = Ddim2Shape(tensor_pd->dims());
if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
var_in_.end()) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
(*var_node_map_)[var_name] = prm;
(*var_in_node_map_)[var_name] = prm;
}
}
}
}
}
}
void NgraphOperator::BuildNgNodes() {
for (auto& var_name : var_out_) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
auto ng_shape = Ddim2Shape(ddim);
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, ng_shape, true);
(*var_node_map_)[var_name] = prm;
}
}
}
paddle::framework::NgraphBridge ngb(var_node_map_);
for (auto& op : fused_ops_) {
ngb.BuildNgNode(op);
}
}
void NgraphOperator::BuildNgIO() {
std::unordered_set<std::string> inputs;
std::unordered_set<std::string> outputs;
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
inputs.insert(var_name);
const bool is_output = outputs.find(var_name) != outputs.end();
if (!is_output &&
std::find(var_in_.begin(), var_in_.end(), var_name) ==
var_in_.end()) {
// fill var_in here to keep lhs and rhs order
var_in_.push_back(var_name);
}
}
}
if (op->Type() != "fill_constant") {
GetNgInputShape(op);
}
for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet",
op->Type());
for (auto& var_name : var_name_item.second) {
outputs.insert(var_name);
}
}
}
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet",
op->Type());
for (auto& var_name : var_name_item.second) {
switch (ng_op_state_) {
case PARTIAL_TEST:
if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name);
}
break;
case FULL_TEST:
if (fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name);
}
break;
case PARTIAL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() ||
post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name);
}
break;
case FULL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() ||
persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name);
}
break;
default:
var_out_.push_back(var_name);
}
}
}
}
}
void NgraphOperator::BuildNgFunction() {
BuildNgNodes();
ngraph_function_ = nullptr;
ngraph::NodeVector func_outputs;
ngraph::op::ParameterVector func_inputs;
for (auto& vo : var_out_) {
func_outputs.push_back(var_node_map_->at(vo));
}
for (auto& vi : var_in_) {
std::shared_ptr<ngraph::op::Parameter> prm =
std::dynamic_pointer_cast<ngraph::op::Parameter>(
var_in_node_map_->at(vi));
func_inputs.push_back(prm);
}
ngraph_function_ =
std::make_shared<ngraph::Function>(func_outputs, func_inputs);
}
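// The cache key below concatenates the fused-op count, every op type, each
// input's name/type/shape, and each output's dims, so a compiled
// ngraph::Function is only reused for an identical subgraph with identical
// tensor shapes.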
std::shared_ptr<std::string> NgraphOperator::GetCacheKey() {
auto cache_key = std::make_shared<std::string>("");
*cache_key += std::to_string(fused_ops_.size());
for (auto& op : fused_ops_) {
*cache_key += op->Type();
}
for (auto& var_name : var_in_) {
auto shape = var_node_map_->at(var_name)->get_shape();
*cache_key += var_name;
*cache_key += var_type_map_.at(var_name).c_type_string();
for (size_t i = 0; i < shape.size(); ++i) {
*cache_key += std::to_string(shape.at(i));
}
}
for (auto& var_name : var_out_) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
for (int i = 0; i < ddim.size(); ++i) {
*cache_key += std::to_string(ddim[i]);
}
}
}
return cache_key;
}
void NgraphOperator::GetNgFunction() {
bool cache_on = true;
if (cache_on) {
std::string cache_key_val = *GetCacheKey();
if (func_cache_.find(cache_key_val) != func_cache_.end()) {
ngraph_function_ = func_cache_.at(cache_key_val);
} else {
BuildNgFunction();
func_cache_[cache_key_val] = ngraph_function_;
}
} else {
BuildNgFunction();
}
}
void NgraphOperator::Run(const Scope& scope,
const platform::Place& place) const {
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
for (size_t i = 0; i < var_in_.size(); ++i) {
auto vi = var_in_.at(i);
auto sp = var_node_map_->at(vi)->get_shape();
std::shared_ptr<ngraph::runtime::Tensor> ti;
auto* var = scope.FindVar(vi);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
"Ensure ngraph tensor layout align with paddle tensor");
if (tensor_pd->type().hash_code() ==
typeid(float).hash_code()) { // NOLINT
const float* arr = tensor_pd->data<float>();
ti = backend_->create_tensor(ngraph::element::f32, sp,
const_cast<float*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(int).hash_code()) { // NOLINT
const int* arr = tensor_pd->data<int>();
ti = backend_->create_tensor(ngraph::element::i32, sp,
const_cast<int*>(arr));
} else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) {
const int64_t* arr = tensor_pd->data<int64_t>();
ti = backend_->create_tensor(ngraph::element::i64, sp,
const_cast<int64_t*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(double).hash_code()) { // NOLINT
const double* arr = tensor_pd->data<double>();
ti = backend_->create_tensor(ngraph::element::f64, sp,
const_cast<double*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(bool).hash_code()) { // NOLINT
const bool* arr = tensor_pd->data<bool>();
ti = backend_->create_tensor(ngraph::element::boolean, sp,
const_cast<bool*>(arr));
} else {
PADDLE_THROW("Data type not handling for var %s", vi);
}
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
}
    bool is_test = ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST;
    bool is_persistable = persistables_.find(vi) != persistables_.end();
if (is_test && is_persistable) {
ti->set_stale(false);
}
t_in.push_back(ti);
}
for (size_t i = 0; i < var_out_.size(); ++i) {
auto var_name = var_out_[i];
auto* var = scope.FindVar(var_name);
std::shared_ptr<ngraph::runtime::Tensor> to;
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
auto dd = tensor_pd->dims();
ngraph::Shape sp = Ddim2Shape(dd);
auto ng_type = var_type_map_.at(var_name);
if (ng_type == ngraph::element::f32) {
auto pd_arr = tensor_pd->mutable_data<float>(place);
to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
} else if (ng_type == ngraph::element::i64) {
auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
} else if (ng_type == ngraph::element::f64) {
auto pd_arr = tensor_pd->mutable_data<double>(place);
to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
} else if (ng_type == ngraph::element::boolean) {
auto pd_arr = tensor_pd->mutable_data<bool>(place);
to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
} else {
PADDLE_THROW("Data type not handled in for var %s", var_name);
}
t_out.push_back(to);
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", var_name);
}
}
backend_->call(ngraph_function_, t_out, t_in);
}  // NgraphOperator::Run
} // namespace framework
} // namespace paddle
#endif
......@@ -17,24 +17,19 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/ngraph.hpp"
#include "ngraph/type/element_type.hpp"
namespace paddle {
namespace framework {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_kernel_type.h"
namespace paddle {
namespace framework {
size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
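  // The key fields are packed into disjoint bit ranges, low to high:
  // [place:4][data_type:8][data_layout:4][library_type:4][customized:4],
  // and the combined integer is hashed.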
int cur_loc = 0;
int place = key.place_.which();
cur_loc += OpKernelType::kPlaceBits;
int data_type = static_cast<int>(key.data_type_) << cur_loc;
cur_loc += OpKernelType::kPrimaryDTypeBits;
int data_layout = static_cast<int>(key.data_layout_) << cur_loc;
cur_loc += OpKernelType::kLayoutBits;
int library_type = static_cast<int>(key.library_type_) << cur_loc;
cur_loc += OpKernelType::kLibBits;
int customized_value = key.customized_type_value_;
PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits));
customized_value = customized_value << cur_loc;
cur_loc += OpKernelType::kCustomizeBits;
PADDLE_ENFORCE(cur_loc < 64);
std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type +
customized_value);
}
bool OpKernelType::operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
library_type_ == o.library_type_ &&
customized_type_value_ == o.customized_type_value_;
}
} // namespace framework
} // namespace paddle
......@@ -24,54 +24,55 @@ limitations under the License. */
namespace paddle {
namespace framework {
struct OpKernelType {
struct Hash {
size_t operator()(const OpKernelType& key) const {
int place = key.place_.which();
int data_type = static_cast<int>(key.data_type_) << LEFT_SHIFT;
int data_layout = static_cast<int>(key.data_layout_) << (LEFT_SHIFT * 2);
int library_type = static_cast<int>(key.library_type_)
<< (LEFT_SHIFT * 3);
std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type);
}
};
class OpKernelType {
public:
constexpr static int kDefaultCustomizedTypeValue = 0;
// place, data_type, library_type kinds less than 2^8
constexpr static int LEFT_SHIFT = 8;
proto::VarType::Type data_type_;
DataLayout data_layout_;
platform::Place place_;
LibraryType library_type_;
// In total should be smaller than 64.
constexpr static int kPlaceBits = 4;
constexpr static int kPrimaryDTypeBits = 8;
constexpr static int kLayoutBits = 4;
constexpr static int kLibBits = 4;
constexpr static int kCustomizeBits = 4;
OpKernelType(proto::VarType::Type data_type, platform::Place place,
DataLayout data_layout = DataLayout::kAnyLayout,
LibraryType library_type = LibraryType::kPlain)
LibraryType library_type = LibraryType::kPlain,
int customized_type_value = kDefaultCustomizedTypeValue)
: data_type_(data_type),
data_layout_(data_layout),
place_(place),
library_type_(library_type) {}
library_type_(library_type),
customized_type_value_(customized_type_value) {}
OpKernelType(proto::VarType::Type data_type,
const platform::DeviceContext& dev_ctx,
DataLayout data_layout = DataLayout::kAnyLayout,
LibraryType library_type = LibraryType::kPlain)
LibraryType library_type = LibraryType::kPlain,
int customized_type_value = kDefaultCustomizedTypeValue)
: data_type_(data_type),
data_layout_(data_layout),
place_(dev_ctx.GetPlace()),
library_type_(library_type) {}
library_type_(library_type),
customized_type_value_(customized_type_value) {}
virtual ~OpKernelType() {}
struct Hash {
size_t operator()(const OpKernelType& key) const;
};
size_t hash_key() const { return Hash()(*this); }
bool operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
library_type_ == o.library_type_;
}
bool operator==(const OpKernelType& o) const;
bool operator!=(const OpKernelType& o) const { return !(*this == o); }
proto::VarType::Type data_type_;
DataLayout data_layout_;
platform::Place place_;
LibraryType library_type_;
int customized_type_value_;
};
inline std::ostream& operator<<(std::ostream& os,
......
......@@ -35,6 +35,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Registrar {
public:
// In our design, various kinds of classes, e.g., operators and kernels,
......@@ -78,7 +79,7 @@ struct OpKernelRegistrarFunctor;
template <typename PlaceType, typename T, typename Func>
inline void RegisterKernelClass(const char* op_type, const char* library_type,
Func func) {
int customized_type_value, Func func) {
std::string library(library_type);
std::string data_layout = "ANYLAYOUT";
if (library == "MKLDNN") {
......@@ -86,7 +87,7 @@ inline void RegisterKernelClass(const char* op_type, const char* library_type,
}
OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
StringToDataLayout(data_layout),
StringToLibraryType(library_type));
StringToLibraryType(library_type), customized_type_value);
OperatorWithKernel::AllOpKernels()[op_type][key] = func;
}
......@@ -95,22 +96,26 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
using KERNEL_TYPE =
typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
void operator()(const char* op_type, const char* library_type) const {
void operator()(const char* op_type, const char* library_type,
int customized_type_value) const {
using T = typename KERNEL_TYPE::ELEMENT_TYPE;
RegisterKernelClass<PlaceType, T>(
op_type, library_type, [](const framework::ExecutionContext& ctx) {
op_type, library_type, customized_type_value,
[](const framework::ExecutionContext& ctx) {
KERNEL_TYPE().Compute(ctx);
});
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
func;
func(op_type, library_type);
func(op_type, library_type, customized_type_value);
}
};
template <typename PlaceType, size_t I, typename... KernelType>
struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
void operator()(const char* op_type, const char* library_type) const {}
void operator()(const char* op_type, const char* library_type,
int customized_type_value) const {}
};
// User can register many kernel in one place. The data type could be
......@@ -118,9 +123,10 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
template <typename PlaceType, typename... KernelType>
class OpKernelRegistrar : public Registrar {
public:
explicit OpKernelRegistrar(const char* op_type, const char* library_type) {
explicit OpKernelRegistrar(const char* op_type, const char* library_type,
int customized_type_value) {
OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
func(op_type, library_type);
func(op_type, library_type, customized_type_value);
}
};
......@@ -130,17 +136,19 @@ struct OpKernelRegistrarFunctorEx;
template <typename PlaceType, typename... DataTypeAndKernelType>
class OpKernelRegistrarEx : public Registrar {
public:
explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) {
explicit OpKernelRegistrarEx(const char* op_type, const char* library_type,
int customized_type_value) {
OpKernelRegistrarFunctorEx<PlaceType, false, 0, DataTypeAndKernelType...>
func;
func(op_type, library_type);
func(op_type, library_type, customized_type_value);
}
};
template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
struct OpKernelRegistrarFunctorEx<PlaceType, true, I,
DataTypeAndKernelType...> {
void operator()(const char* op_type, const char* library_type) const {}
void operator()(const char* op_type, const char* library_type,
int customized_type_value) const {}
};
template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
......@@ -153,18 +161,21 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
typename std::tuple_element<I,
std::tuple<DataTypeAndKernelType...>>::type;
void operator()(const char* op_type, const char* library_type) const {
RegisterKernelClass<PlaceType, T>(op_type, library_type, Functor());
void operator()(const char* op_type, const char* library_type,
int customized_type_value) const {
RegisterKernelClass<PlaceType, T>(op_type, library_type,
customized_type_value, Functor());
constexpr auto size =
std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
OpKernelRegistrarFunctorEx<PlaceType, I + 2 >= size, I + 2,
DataTypeAndKernelType...>
func;
func(op_type, library_type);
func(op_type, library_type, customized_type_value);
}
};
// clang-format off
/**
* check if MACRO is used in GLOBAL NAMESPACE.
*/
......@@ -199,42 +210,64 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
/**
* Macro to register OperatorKernel.
*/
#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##library_type##__, \
"REGISTER_OP_KERNEL must be called in global namespace"); \
static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
__op_kernel_registrar_##op_type##_##library_type##__(#op_type, \
#library_type); \
int TouchOpKernelRegistrar_##op_type##_##library_type() { \
__op_kernel_registrar_##op_type##_##library_type##__.Touch(); \
return 0; \
#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type, \
place_class, customized_name, \
customized_type_value, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \
"REGISTER_OP_KERNEL must be called in " \
"global namespace"); \
static ::paddle::framework::OpKernelRegistrar<place_class, \
__VA_ARGS__> \
__op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
#op_type, #library_type, customized_type_value); \
int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
__op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \
.Touch(); \
return 0; \
}
#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( \
op_type, library_type, place_class, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
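// Illustrative usage (mirroring the operator test in this change; the names
// are example-only): a second CPU kernel for the same op can be registered
// under a distinct customized type value, e.g.
//   REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_with_kernel, CPU,
//       paddle::platform::CPUPlace, MY_SPECIAL_NAME, special_type_value,
//       CPUKernel2Test<float, float>);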
#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##library_type##__, \
"REGISTER_OP_KERNEL_EX must be called in global namespace"); \
static ::paddle::framework::OpKernelRegistrarEx<place_class, __VA_ARGS__> \
__op_kernel_registrar_##op_type##_##library_type##__(#op_type, \
#library_type); \
int TouchOpKernelRegistrar_##op_type##_##library_type() { \
__op_kernel_registrar_##op_type##_##library_type##__.Touch(); \
return 0; \
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \
"REGISTER_OP_KERNEL_EX must be called in " \
"global namespace"); \
static ::paddle::framework::OpKernelRegistrarEx<place_class, \
__VA_ARGS__> \
__op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
#op_type, #library_type, customized_type_value); \
int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
__op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \
.Touch(); \
return 0; \
}
#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \
__VA_ARGS__)
REGISTER_OP_KERNEL_EX( \
op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, CPU, ::paddle::platform::CPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
/**
* Macro to mark what Operator and Kernel
......@@ -248,13 +281,19 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
extern int TouchOpRegistrar_##op_type(); \
UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_kernel_##op_type##_##LIBRARY_TYPE##__, \
"USE_OP_DEVICE_KERNEL must be in global namespace"); \
extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_ = \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE()
#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, \
LIBRARY_TYPE, \
customized_name) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##__, \
"USE_OP_DEVICE_KERNEL must be in global namespace"); \
extern int \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name(); \
  UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
      TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, LIBRARY_TYPE, DEFAULT_TYPE)
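// A translation unit linking such a kernel then declares, e.g.
//   USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, LIBRARY_TYPE, custom_name);
// with DEFAULT_TYPE substituted when the plain USE_OP_DEVICE_KERNEL form is
// used (illustrative, matching the registration macros above).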
// TODO(fengjiayi): The following macros
// seems ugly, do we have better method?
......@@ -280,6 +319,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define USE_OP(op_type) \
USE_OP_ITSELF(op_type); \
USE_OP_KERNEL(op_type)
// clang-format on
} // namespace framework
} // namespace paddle
......@@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name,
"Tensor %s contains NAN", name);
}
void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
......
......@@ -128,6 +128,8 @@ class OperatorBase {
virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
virtual void RuntimeInferShape(const Scope& scope,
const platform::Place& place) const {}
protected:
std::string type_;
......@@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase {
OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
}
void RuntimeInferShape(const Scope& scope,
const platform::Place& place) const override;
protected:
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
virtual OpKernelType GetKernelTypeForVar(
......
......@@ -50,6 +50,8 @@ class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
AddInput("input", "input of test op");
AddOutput("output", "output of test op");
AddAttr<float>("scale", "scale of cosine op");
AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
.SetDefault(0);
AddComment("This is test op");
}
};
......@@ -95,6 +97,8 @@ TEST(OperatorBase, all) {
namespace paddle {
namespace framework {
static int special_type_value = 1;
class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
......@@ -103,11 +107,14 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0)
.GreaterThan(0.0);
AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
.SetDefault(0);
AddComment("This is test op");
}
};
static int cpu_kernel_run_num = 0;
static int cpu_kernel2_run_num = 0;
class OpWithKernelTest : public OperatorWithKernel {
public:
......@@ -117,7 +124,10 @@ class OpWithKernelTest : public OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {}
OpKernelType GetExpectedKernelType(
const ExecutionContext& ctx) const override {
return OpKernelType(proto::VarType::FP32, ctx.GetPlace());
int sub_type = ctx.Attr<int>("kernel_sub_type");
return OpKernelType(proto::VarType::FP32, ctx.GetPlace(),
framework::DataLayout::kAnyLayout,
framework::LibraryType::kPlain, sub_type);
}
};
......@@ -132,6 +142,17 @@ class CPUKernelTest : public OpKernel<float> {
}
};
template <typename T1, typename T2>
class CPUKernel2Test : public OpKernel<float> {
public:
void Compute(const ExecutionContext& ctx) const {
std::cout << ctx.op().DebugString() << std::endl;
cpu_kernel2_run_num++;
ASSERT_EQ(ctx.op().Input("x"), "IN1");
ASSERT_EQ(ctx.op().Output("y"), "OUT1");
}
};
class OpKernelTestMultiInputsProtoAndCheckerMaker
: public OpProtoAndCheckerMaker {
public:
......@@ -142,6 +163,8 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0)
.GreaterThan(0.0);
AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
.SetDefault(0);
AddComment("This is test op");
}
};
......@@ -189,9 +212,15 @@ class CPUKernalMultiInputsTest : public OpKernel<float> {
REGISTER_OP_WITHOUT_GRADIENT(
op_with_kernel, paddle::framework::OpWithKernelTest,
paddle::framework::OpKernelTestProtoAndCheckerMaker);
REGISTER_OP_CPU_KERNEL(op_with_kernel,
paddle::framework::CPUKernelTest<float, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
op_with_kernel, CPU, paddle::platform::CPUPlace, MY_SPECIAL_NAME,
paddle::framework::special_type_value,
paddle::framework::CPUKernel2Test<float, float>);
// test with single input
TEST(OpKernel, all) {
paddle::framework::InitDevices(true);
......@@ -211,7 +240,19 @@ TEST(OpKernel, all) {
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
op->Run(scope, cpu_place);
// kernel_sub_type = 0, hence cpu_kernel is called and cpu_kernel2 is not.
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 0);
attr = op_desc.mutable_attrs()->Add();
attr->set_name("kernel_sub_type");
attr->set_type(paddle::framework::proto::AttrType::INT);
attr->set_i(1);
auto op2 = paddle::framework::OpRegistry::CreateOp(op_desc);
op2->Run(scope, cpu_place);
// kernel_sub_type = 1, hence cpu_kernel2 is called and cpu_kernel is not.
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 1);
}
REGISTER_OP_WITHOUT_GRADIENT(
......
......@@ -32,8 +32,7 @@ namespace framework {
class SelectedRows {
/*
* @brief We can use the SelectedRows structure to reproduce a sparse table.
* A sparse table is a key-value structure where the key is an `int64_t`,
* and the value is a Tensor whose first dimension is 0.
* You can use the following interfaces to operate on the sparse table, and
* you can find
......
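// A minimal sketch of the sparse-table idea (assuming the usual SelectedRows
// accessors; exact names may differ):
//   SelectedRows table;
//   table.set_height(100);               // logical height of the full table
//   table.mutable_rows()->push_back(7);  // key: row index 7 is materialized
//   auto* value = table.mutable_value(); // value tensor, one row per key
//   value->mutable_data<float>(framework::make_ddim({1, 8}),
//                              platform::CPUPlace());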
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/variable_helper.h"
#include <vector>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope*>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in the operator.
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
var_type);
}
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
void InitializeVariable(Variable *var, proto::VarType::Type var_type);
}  // namespace framework
}  // namespace paddle
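// A minimal usage sketch (assuming a Scope named `scope`):
//   auto* var = scope.Var("hidden_state");
//   InitializeVariable(var, proto::VarType::LOD_TENSOR);
//   auto* tensor = var->GetMutable<LoDTensor>();  // safe: type already set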
......@@ -46,8 +46,6 @@ class AnalysisPass {
protected:
// User should implement these.
virtual void RunImpl(Argument* argument) = 0;
Argument* argument_{nullptr};
};
} // namespace analysis
......
......@@ -178,11 +178,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
output_mapping.push_back(output_name_map[name]);
}
*block_desc.Proto()->mutable_vars() =
const_cast<framework::ProgramDesc *>(&graph->program())
->Proto()
->blocks(0)
.vars();
auto *vars = block_desc.Proto()->mutable_vars();
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
*vars->Add() = *node->Var()->Proto();
}
}
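// Note: collecting var-descs from the graph nodes (instead of copying all of
// block 0 from the original program) keeps only the variables this subgraph
// actually references.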
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty());
......
cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass)
cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
set(analysis_deps ${analysis_deps}
ir_graph_build_pass
......
......@@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
std::vector<std::string> passes({
"ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass",
});
for (const auto &pass : passes) {
VLOG(2) << "Run pass " << pass;
......
......@@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
// so that the parameters will be on the same device, or they will keep being
// copied between different devices.
platform::Place place;
if (argument->use_gpu()) {
PADDLE_ENFORCE(argument->gpu_device_id_valid());
place = platform::CUDAPlace(argument->gpu_device_id());
} else {
place = platform::CPUPlace();
}
place = platform::CPUPlace();
if (argument->model_dir_valid()) {
auto program =
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace analysis {
void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
PADDLE_ENFORCE(argument->scope_valid());
PADDLE_ENFORCE(argument->use_gpu_valid());
platform::Place place;
// The parameters are on the CPU, so no synchronization is necessary.
if (!argument->use_gpu()) return;
LOG(INFO) << "Sync params from CPU to GPU";
PADDLE_ENFORCE(argument->gpu_device_id_valid());
place = platform::CUDAPlace(argument->gpu_device_id());
auto *scope = argument->scope_ptr();
std::vector<std::string> all_vars = scope->LocalVarNames();
// We get all the vars from the local scope instead of the ProgramDesc,
// because new parameter variables may have been created in the analysis
// passes without being added to the program.
for (auto &var_name : all_vars) {
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE(var != nullptr);
if (var->IsType<framework::LoDTensor>() ||
var->IsType<framework::Tensor>()) {
auto *t = var->GetMutable<framework::LoDTensor>();
platform::CPUPlace cpu_place;
framework::LoDTensor temp_tensor;
temp_tensor.Resize(t->dims());
temp_tensor.mutable_data<float>(cpu_place);
// Copy the parameter data to a tmp tensor.
TensorCopySync(*t, cpu_place, &temp_tensor);
// Reallocate the space on the GPU.
t->mutable_data<float>(place);
// Copy parameter data to newly allocated GPU space.
TensorCopySync(temp_tensor, place, t);
}
}
}
std::string IrParamsSyncAmongDevicesPass::repr() const {
return "ir-params-sync-among-devices-pass";
}
} // namespace analysis
} // namespace inference
} // namespace paddle
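// The copy pattern above, in isolation (a sketch; `src` is a CPU-resident
// LoDTensor and device 0 is assumed):
//   framework::LoDTensor gpu_tensor;
//   gpu_tensor.Resize(src.dims());
//   gpu_tensor.mutable_data<float>(platform::CUDAPlace(0));
//   framework::TensorCopySync(src, platform::CUDAPlace(0), &gpu_tensor);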
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace inference {
namespace analysis {
/*
 * Sync parameters from CPU to GPU.
 */
class IrParamsSyncAmongDevicesPass : public AnalysisPass {
public:
void RunImpl(Argument *argument) override;
std::string repr() const override;
};
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -16,6 +16,7 @@
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
namespace paddle {
namespace inference {
......@@ -27,6 +28,9 @@ PassRegistry::PassRegistry() {
std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
passes_.emplace("ir_analysis_compose_pass",
std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
passes_.emplace(
"ir_params_sync_among_devices_pass",
std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
}
} // namespace analysis
......
......@@ -190,9 +190,13 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.ResetTensorArray();
// All the containers in the scope will be held in inference, but the
// operators assume that the containers will be reset after each batch.
// Here is a bugfix: collect all the container variables and reset them to a
// bool; the next time, the operator will call MutableData and construct a new
// container again, so that the container will be empty for each batch.
tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_);
tensor_array_batch_cleaner_.ResetNoTensorVars();
return true;
}
......@@ -417,7 +421,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
bool AnalysisPredictor::ZeroCopyRun() {
executor_->Run();
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
tensor_array_batch_cleaner_.ResetTensorArray();
return true;
}
......
......@@ -154,9 +154,9 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.ResetTensorArray();
// Some other vector-like containers are not cleaned after each batch.
tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get());
tensor_array_batch_cleaner_.ResetNoTensorVars();
return true;
}
......
......@@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
link_directories("${PADDLE_LIB}/paddle/lib")
if (NOT WIN32)
set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph")
if(EXISTS ${NGRAPH_PATH})
include(GNUInstallDirs)
include_directories("${NGRAPH_PATH}/include")
link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}")
set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()
endif()
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
if(WITH_MKL)
......@@ -106,7 +116,7 @@ endif()
if (NOT WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
glog gflags protobuf snappystream snappy z xxhash
${EXTERNAL_LIB})
else()
......
......@@ -46,5 +46,28 @@ void TensorArrayBatchCleaner::ResetTensorArray() {
}
}
void TensorArrayBatchCleaner::CollectNoTensorVars(framework::Scope *scope) {
if (no_tensor_flag_) {
for (auto &var_name : scope->LocalVarNames()) {
auto *var = scope->FindVar(var_name);
if (!var->IsInitialized()) continue;
if (!valid_types_.count(var->Type())) {
no_tensor_vars_.insert(var);
}
}
for (auto *kid : scope->kids()) {
CollectNoTensorVars(kid);  // recurse into child scopes as well
}
no_tensor_flag_ = false;  // Only collect once.
}
}
void TensorArrayBatchCleaner::ResetNoTensorVars() {
for (auto *var : no_tensor_vars_) {
var->Clear();
}
}
} // namespace details
} // namespace paddle
......@@ -14,9 +14,11 @@
#pragma once
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace details {
......@@ -24,13 +26,28 @@ namespace details {
// Clean the TensorArray each batch to make the behavior the same as in the
// training phase.
struct TensorArrayBatchCleaner {
TensorArrayBatchCleaner() {
valid_types_.insert(typeid(framework::Tensor));
valid_types_.insert(typeid(framework::LoDTensor));
}
// Collect the variables that are not Tensor or LoDTensor, and reset them to a
// bool (a trick): some of them are containers, and some operators just keep
// inserting new items without clearing the containers first, so the memory
// grows larger and larger in inference services deployed online.
void CollectNoTensorVars(framework::Scope *scope);
void ResetNoTensorVars();
// Fix tensor arrays not being cleared in inference scenarios.
void CollectTensorArrays(framework::Scope *scope);
void ResetTensorArray();
private:
bool flag_{true};
bool no_tensor_flag_{true};
std::vector<framework::LoDTensorArray *> arrays_;
std::unordered_set<std::type_index> valid_types_;
std::unordered_set<framework::Variable *> no_tensor_vars_;
};
} // namespace details
......
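// Typical per-batch usage (a sketch, mirroring the predictors above):
//   details::TensorArrayBatchCleaner cleaner;
//   cleaner.CollectNoTensorVars(scope);  // first call records the vars
//   /* ... run one batch ... */
//   cleaner.ResetNoTensorVars();         // containers start empty next batch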
......@@ -116,12 +116,8 @@ class CpuPassStrategy : public PassStrategy {
class GpuPassStrategy : public PassStrategy {
public:
GpuPassStrategy() : PassStrategy({}) {
// TODO(NHZlX) Problem with data synchronization between GPU and CPU.
// When running in GPU mode, the parameters are all on the GPU, but the
// operations of "conv_bn_fuse_pass" are on the CPU.
passes_.assign({
"infer_clean_graph_pass",
// "infer_clean_graph_pass", "conv_bn_fuse_pass",
"infer_clean_graph_pass", "conv_bn_fuse_pass",
});
}
......
......@@ -90,5 +90,4 @@ TEST(prelu_op, test_scalar) {
} // namespace inference
} // namespace paddle
// USE_OP(prelu);
USE_CPU_ONLY_OP(prelu);
USE_OP(prelu);
nv_library(tensorrt_plugin
SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
avg_pool_op_plugin.cu
DEPS enforce tensorrt_engine)
DEPS enforce tensorrt_engine prelu)
......@@ -14,92 +14,16 @@
#include <stdio.h>
#include <cassert>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
#include "paddle/fluid/operators/math/prelu.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535;
inline static int GET_NUM_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
__global__ void PReluChannelWiseKernel(const float *input, const float *alpha,
float *output, int channel,
size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float *out = output + offset;
float scale = alpha[blockIdx.x % channel];
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
__global__ void PReluElementWiseKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
const float *scale = alpha + offset;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale[i] * x;
}
}
__global__ void PReluScalarKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float scale = *alpha;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
static inline void PReluChannelWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, dims.d[0], spatial_size);
}
static inline void PReluElementWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
static inline void PReluScalar(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size, const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims,
int nbInputs) {
......@@ -110,19 +34,31 @@ nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
return output_dims;
}
int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
// The input dims are CHW.
const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]);
const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
float *output = reinterpret_cast<float **>(outputs)[0];
std::vector<int> input_shape;
input_shape.push_back(batch_size);
for (int i = 0; i < input_dims.nbDims; i++) {
input_shape.push_back(input_dims.d[i]);
}
if (mode_ == "channel") {
PReluChannelWise(stream, input, alpha, output, batchSize, input_dims);
operators::math::PreluChannelWiseDirectCUDAFunctor<float>
prelu_channel_wise;
prelu_channel_wise(stream, input, alpha, output, input_shape);
} else if (mode_ == "element") {
PReluElementWise(stream, input, alpha, output, batchSize, input_dims);
operators::math::PreluElementWiseDirectCUDAFunctor<float>
prelu_element_wise;
prelu_element_wise(stream, input, alpha, output, input_shape);
} else {
PReluScalar(stream, input, alpha, output, batchSize, input_dims);
operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
prelu_scalar(stream, input, alpha, output, input_shape);
}
return cudaGetLastError() != cudaSuccess;
}
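// All three modes compute prelu(x) = x > 0 ? x : alpha * x; they differ only
// in alpha's granularity: one alpha per channel, one per element, or a single
// scalar shared by the whole tensor.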
......
......@@ -46,11 +46,18 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
# DAM
# normal DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
# small DAM
set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1)
# chinese_ner
set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
......
......@@ -14,38 +14,54 @@
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_int32(max_turn_num, 9,
"The max turn number: 1 for the small and 9 for the normal.");
namespace paddle {
namespace inference {
using contrib::AnalysisConfig;
#define MAX_TURN_NUM 9
#define MAX_TURN_LEN 50
constexpr int32_t kMaxTurnLen = 50;
static std::vector<float> result_data;
struct DataRecord {
std::vector<std::vector<int64_t>>
turns[MAX_TURN_NUM]; // turns data : MAX_TURN_NUM
std::vector<std::vector<float>>
turns_mask[MAX_TURN_NUM]; // turns mask data : MAX_TURN_NUM
std::vector<std::vector<int64_t>> response; // response data : 1
std::vector<std::vector<int64_t>> *turns;
std::vector<std::vector<float>> *turns_mask;
std::vector<std::vector<int64_t>> response; // response data : 1
std::vector<std::vector<float>> response_mask; // response mask data : 1
size_t batch_iter{0};
size_t batch_size{1};
size_t num_samples; // total number of samples
DataRecord() = default;
DataRecord() {
turns = new std::vector<std::vector<
int64_t>>[FLAGS_max_turn_num]; // turns data : FLAGS_max_turn_num
turns_mask = new std::vector<std::vector<
float>>[FLAGS_max_turn_num]; // turns mask data : FLAGS_max_turn_num
}
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
: DataRecord() {
this->batch_size = batch_size;
Load(path);
}
~DataRecord() {
delete[] turns;
delete[] turns_mask;
}
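// Note: turns/turns_mask are heap-allocated because FLAGS_max_turn_num is
// only known at runtime, unlike the old compile-time MAX_TURN_NUM constant.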
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE: skip the final batch if not enough data is provided.
if (batch_end <= response.size()) {
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
data.turns[i].assign(turns[i].begin() + batch_iter,
turns[i].begin() + batch_end);
}
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
data.turns_mask[i].assign(turns_mask[i].begin() + batch_iter,
turns_mask[i].begin() + batch_end);
}
......@@ -60,6 +76,7 @@ struct DataRecord {
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
......@@ -69,30 +86,30 @@ struct DataRecord {
num_lines++;
std::vector<std::string> data;
split(line, ',', &data);
CHECK_EQ(data.size(), (size_t)(2 * MAX_TURN_NUM + 3));
CHECK_EQ(data.size(), (size_t)(2 * FLAGS_max_turn_num + 3));
// load turn data
std::vector<int64_t> turns_tmp[MAX_TURN_NUM];
for (int i = 0; i < MAX_TURN_NUM; ++i) {
std::vector<int64_t> turns_tmp[FLAGS_max_turn_num];
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
split_to_int64(data[i], ' ', &turns_tmp[i]);
turns[i].push_back(std::move(turns_tmp[i]));
}
// load turn_mask data
std::vector<float> turns_mask_tmp[MAX_TURN_NUM];
for (int i = 0; i < MAX_TURN_NUM; ++i) {
split_to_float(data[MAX_TURN_NUM + i], ' ', &turns_mask_tmp[i]);
std::vector<float> turns_mask_tmp[FLAGS_max_turn_num];
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
split_to_float(data[FLAGS_max_turn_num + i], ' ', &turns_mask_tmp[i]);
turns_mask[i].push_back(std::move(turns_mask_tmp[i]));
}
// load response data
std::vector<int64_t> response_tmp;
split_to_int64(data[2 * MAX_TURN_NUM], ' ', &response_tmp);
split_to_int64(data[2 * FLAGS_max_turn_num], ' ', &response_tmp);
response.push_back(std::move(response_tmp));
// load response_mask data
std::vector<float> response_mask_tmp;
split_to_float(data[2 * MAX_TURN_NUM + 1], ' ', &response_mask_tmp);
split_to_float(data[2 * FLAGS_max_turn_num + 1], ' ', &response_mask_tmp);
response_mask.push_back(std::move(response_mask_tmp));
// load result data
float result_tmp;
result_tmp = std::stof(data[2 * MAX_TURN_NUM + 2]);
result_tmp = std::stof(data[2 * FLAGS_max_turn_num + 2]);
result_data.push_back(result_tmp);
}
num_samples = num_lines;
......@@ -101,8 +118,8 @@ struct DataRecord {
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor turns_tensor[MAX_TURN_NUM];
PaddleTensor turns_mask_tensor[MAX_TURN_NUM];
PaddleTensor turns_tensor[FLAGS_max_turn_num];
PaddleTensor turns_mask_tensor[FLAGS_max_turn_num];
PaddleTensor response_tensor;
PaddleTensor response_mask_tensor;
std::string turn_pre = "turn_";
......@@ -110,16 +127,16 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
auto one_batch = data->NextBatch();
int size = one_batch.response[0].size();
CHECK_EQ(size, MAX_TURN_LEN);
CHECK_EQ(size, kMaxTurnLen);
// turn tensor assignment
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
turns_tensor[i].name = turn_pre + std::to_string(i);
turns_tensor[i].shape.assign({batch_size, size, 1});
turns_tensor[i].dtype = PaddleDType::INT64;
TensorAssignData<int64_t>(&turns_tensor[i], one_batch.turns[i]);
}
// turn mask tensor assignment
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
turns_mask_tensor[i].name = turn_mask_pre + std::to_string(i);
turns_mask_tensor[i].shape.assign({batch_size, size, 1});
turns_mask_tensor[i].dtype = PaddleDType::FLOAT32;
......@@ -137,10 +154,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
TensorAssignData<float>(&response_mask_tensor, one_batch.response_mask);
// Set inputs.
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
input_slots->push_back(std::move(turns_tensor[i]));
}
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
input_slots->push_back(std::move(turns_mask_tensor[i]));
}
input_slots->push_back(std::move(response_tensor));
......@@ -202,8 +219,6 @@ TEST(Analyzer_dam, fuse_statis) {
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 317);
EXPECT_EQ(num_ops, 2020);
}
// Compare result of NativeConfig and AnalysisConfig
......
......@@ -33,7 +33,7 @@ std::string Benchmark::SerializeToString() const {
ss << batch_size_ << "\t";
ss << num_threads_ << "\t";
ss << latency_ << "\t";
ss << 1000 / latency_;
ss << 1000.0 / latency_;
ss << '\n';
return ss.str();
}
......
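// Why the float literal matters (a worked example): latency_ is in ms, so
// 1000.0 / latency_ is throughput in samples per second. With latency_ = 400,
// the old integer expression 1000 / latency_ truncates to 2, while
// 1000.0 / latency_ gives the correct 2.5.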
......@@ -11,9 +11,11 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <fstream>
#include <iostream>
#include <string>
namespace paddle {
namespace inference {
......@@ -31,8 +33,8 @@ struct Benchmark {
bool use_gpu() const { return use_gpu_; }
void SetUseGpu() { use_gpu_ = true; }
int latency() const { return latency_; }
void SetLatency(int x) { latency_ = x; }
float latency() const { return latency_; }
void SetLatency(float x) { latency_ = x; }
const std::string& name() const { return name_; }
void SetName(const std::string& name) { name_ = name; }
......@@ -43,7 +45,7 @@ struct Benchmark {
private:
bool use_gpu_{false};
int batch_size_{0};
int latency_;
float latency_;
int num_threads_{1};
std::string name_;
};
......
......@@ -14,11 +14,13 @@
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by "
......@@ -86,7 +88,7 @@ struct NaiveAllocator {
template <>
void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
VLOG(1) << "Allocate " << size << " bytes on " << platform::Place(place);
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void *p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
......@@ -97,7 +99,7 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
template <>
void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p) {
VLOG(1) << "Free pointer=" << p << " on " << platform::Place(place);
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
......@@ -110,19 +112,21 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static std::once_flag init_flag;
static detail::BuddyAllocator **a_arr = nullptr;
static std::vector<int> devices;
std::call_once(init_flag, [gpu_id]() {
int gpu_num = platform::GetCUDADeviceCount();
PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id,
gpu_num);
devices = platform::GetSelectedDevices();
int gpu_num = devices.size();
a_arr = new BuddyAllocator *[gpu_num];
for (int i = 0; i < gpu_num; i++) {
for (size_t i = 0; i < devices.size(); ++i) {
int dev_id = devices[i];
a_arr[i] = nullptr;
platform::SetDeviceId(i);
a_arr[i] = new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
platform::SetDeviceId(dev_id);
a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
......@@ -134,7 +138,9 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
});
platform::SetDeviceId(gpu_id);
return a_arr[gpu_id];
auto pos = std::distance(devices.begin(),
std::find(devices.begin(), devices.end(), gpu_id));
return a_arr[pos];
}
#endif
......
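// The gpu_id -> array-index mapping above, in isolation (a sketch):
//   std::vector<int> devices = {0, 2, 3};  // e.g. from the selected gpus
//   int gpu_id = 2;
//   auto pos = std::distance(devices.begin(),
//                            std::find(devices.begin(), devices.end(), gpu_id));
//   // pos == 1, so a_arr[1] holds the allocator for physical device 2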
......@@ -37,7 +37,13 @@ if (WITH_GPU)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
endif()
register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS})
SET(OP_PREFETCH_DEPS "")
if (WITH_DISTRIBUTE)
SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
endif()
register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
# warpctc_op needs cudnn 7 above
if (WITH_GPU AND NOT WIN32)
......@@ -64,7 +70,7 @@ endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
if (WITH_GPU)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
endif()
# FIXME(typhoonzero): operator deps may not needed.
......
......@@ -76,8 +76,8 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()),
ctx.GetPlace(), layout, library);
framework::GetDataTypeOfVar(ctx.InputVar(name)), ctx.GetPlace(), layout,
library);
}
class ActivationOp : public framework::OperatorWithKernel {
......
......@@ -41,6 +41,12 @@ static std::unordered_set<std::string> InplaceOpSet = {
"floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
};
/* The following operators can be used to process SelectedRows, because their
 * output for a zero input is zero too (e.g. abs(0) == 0), so the implicitly
 * zero rows of a SelectedRows stay zero.
 */
static std::unordered_set<std::string> CanBeUsedBySelectedRows = {
"abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"};
static bool IsInplace(std::string op) { return InplaceOpSet.count(op); }
template <typename DeviceContext, typename Functor>
......@@ -50,16 +56,38 @@ class ActivationKernel
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& context) const override {
auto& X = detail::Ref(context.Input<framework::Tensor>("X"),
"Cannot get input tensor X, variable name = %s",
context.op().Input("X"));
auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"),
"Cannot get output tensor Out, variable name = %s",
context.op().Output("Out"));
Out.mutable_data<T>(context.GetPlace());
auto x_var = context.InputVar("X");
auto out_var = context.OutputVar("Out");
PADDLE_ENFORCE(x_var != nullptr,
"Cannot get input Variable X, variable name = %s",
context.op().Input("X"));
PADDLE_ENFORCE(out_var != nullptr,
"Cannot get output Variable Out, variable name = %s",
context.op().Output("Out"));
framework::Tensor X, *Out;
if (CanBeUsedBySelectedRows.count(context.op().Type())) {
X = detail::Ref(
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var),
"Cannot get input Tensor X, variable name = %s",
context.op().Input("X"));
Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
out_var);
} else {
X = detail::Ref(context.Input<framework::Tensor>("X"),
"Cannot get input Tensor X, variable name = %s",
context.op().Input("X"));
Out = context.Output<framework::Tensor>("Out");
}
PADDLE_ENFORCE(Out != nullptr,
"Cannot get output tensor Out, variable name = %s",
context.op().Output("Out"));
Out->mutable_data<T>(context.GetPlace());
auto x = framework::EigenVector<T>::Flatten(X);
auto out = framework::EigenVector<T>::Flatten(Out);
auto out = framework::EigenVector<T>::Flatten(*Out);
auto* place =
context.template device_context<DeviceContext>().eigen_device();
Functor functor;
......@@ -78,14 +106,54 @@ class ActivationGradKernel
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& context) const override {
auto* Out = context.Input<framework::Tensor>("Out");
auto* dOut =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto out_var = context.InputVar("Out");
auto out_grad_var = context.InputVar(framework::GradVarName("Out"));
auto x_grad_var = context.OutputVar(framework::GradVarName("X"));
PADDLE_ENFORCE(out_var != nullptr,
"Cannot get input Variable Out, variable name = %s",
context.op().Input("Out"));
PADDLE_ENFORCE(out_grad_var != nullptr,
"Cannot get input Variable %s, variable name = %s",
framework::GradVarName("Out"),
context.op().Input(framework::GradVarName("Out")));
PADDLE_ENFORCE(x_grad_var != nullptr,
"Cannot get output Variable %s, variable name = %s",
framework::GradVarName("X"),
context.op().Output(framework::GradVarName("X")));
framework::Tensor Out, dOut, *dX;
if (CanBeUsedBySelectedRows.count(context.op().Type())) {
Out = detail::Ref(
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var),
"Cannot get input Tensor Out, variable name = %s",
context.op().Input("Out"));
dOut =
detail::Ref(paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(
*out_grad_var),
"Cannot get input Tensor %s, variable name = %s",
framework::GradVarName("Out"),
context.op().Input(framework::GradVarName("Out")));
dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
x_grad_var);
} else {
Out = detail::Ref(context.Input<framework::Tensor>("Out"),
"Cannot get input Tensor Out, variable name = %s",
context.op().Input("Out"));
dOut = detail::Ref(
context.Input<framework::Tensor>(framework::GradVarName("Out")),
"Cannot get input Tensor %s, variable name = %s",
framework::GradVarName("Out"),
context.op().Input(framework::GradVarName("Out")));
dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
}
PADDLE_ENFORCE(dX != nullptr,
"Cannot get output tensor %s, variable name = %s",
framework::GradVarName("X"),
context.op().Output(framework::GradVarName("X")));
dX->mutable_data<T>(context.GetPlace());
auto dout = framework::EigenVector<T>::Flatten(*dOut);
auto out = framework::EigenVector<T>::Flatten(*Out);
auto dout = framework::EigenVector<T>::Flatten(dOut);
auto out = framework::EigenVector<T>::Flatten(Out);
auto dx = framework::EigenVector<T>::Flatten(*dX);
auto* place =
context.template device_context<DeviceContext>().eigen_device();
......@@ -96,8 +164,19 @@ class ActivationGradKernel
}
bool inplace = functor.Inplace();
if (!inplace) {
auto* X = context.Input<framework::Tensor>("X");
auto x = framework::EigenVector<T>::Flatten(*X);
auto x_var = context.InputVar("X");
PADDLE_ENFORCE(x_var != nullptr,
"Cannot get input tensor X, variable name = %s",
context.op().Input("X"));
framework::Tensor X;
if (CanBeUsedBySelectedRows.count(context.op().Type())) {
X = detail::Ref(
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var));
} else {
X = detail::Ref(context.Input<framework::Tensor>("X"));
}
auto x = framework::EigenVector<T>::Flatten(X);
functor(*place, x, out, dout, dx);
} else {
VLOG(10) << " Inplace activation ";
......
......@@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard LSTM.
template <typename T>
inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
if (bias) {
math::vec_add_bias<T, platform::jit::avx>(n, *bias, x, y);
math::vec_relu<T, platform::jit::avx>(n, y, y);
math::vec_add_bias<T, platform::avx>(n, *bias, x, y);
math::vec_relu<T, platform::avx>(n, y, y);
} else {
math::vec_relu<T, platform::jit::avx>(n, x, y);
math::vec_relu<T, platform::avx>(n, x, y);
}
}
......@@ -245,8 +245,8 @@ inline void vec_softmax(const int n, const T* x, T* y) {
for (int i = 1; i < n; ++i) {
scalar = scalar < x[i] ? x[i] : scalar;
}
math::vec_add_bias<T, platform::jit::avx>(n, -scalar, x, y); // sub
math::vec_exp<T>(n, y, y); // exp
math::vec_add_bias<T, platform::avx>(n, -scalar, x, y); // sub
math::vec_exp<T>(n, y, y); // exp
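// Subtracting the running max before exp keeps exp() in range; the result
// y_i = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j) is mathematically
// identical to softmax without the shift.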
// sum
scalar = T(0);
for (int i = 0; i < n; ++i) {
......@@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
if (platform::jit::MayIUse(platform::jit::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor;
if (platform::MayIUse(platform::avx)) {
math::VecActivations<T, platform::avx> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
} else {
math::VecActivations<T, platform::jit::isa_any> act_functor;
math::VecActivations<T, platform::isa_any> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
......@@ -146,7 +146,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
bool global_stats = is_test || use_global_stats;
const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean");
......@@ -177,13 +179,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
T *batch_mean_data = nullptr;
T *batch_variance_data = nullptr;
if (!is_test) {
if (!global_stats) {
batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
}
auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training;
auto propagation = global_stats == true
? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training;
auto src_tz = paddle::framework::vectorize2int(x->dims());
auto scale_tz = paddle::framework::vectorize2int(scale->dims());
......@@ -199,7 +202,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
shift->data<T>() + ic, &scaleshift_data);
unsigned flags = mkldnn::use_scale_shift;
if (is_test) flags |= mkldnn::use_global_stats;
if (global_stats) flags |= mkldnn::use_global_stats;
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor
......@@ -208,7 +211,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// keys for backward pass
const std::string key = BatchNormMKLDNNHandler::GetHash(
src_tz, epsilon, flags, is_test, input_format,
src_tz, epsilon, flags, global_stats, input_format,
ctx.op().Output("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
......@@ -239,7 +242,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data);
std::shared_ptr<batch_norm_fwd> batch_norm_p;
if (is_test) {
if (global_stats) {
// create mkldnn memory for stats (as input)
std::shared_ptr<memory> mean_memory =
handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data));
......@@ -269,7 +272,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*batch_norm_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
if (!is_test) {
if (!global_stats) {
// mkldnn only computes stats for the current batch,
// so we need to compute momentum stats via the Eigen lib
EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
......
......@@ -159,6 +159,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("fuse_with_relu",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("use_global_stats",
"(bool, default false) Whether to use global mean and "
"variance. In inference or test mode, set use_global_stats "
"to true or is_test true. the behavior is equivalent. "
"In train mode, when setting use_global_stats True, the "
"global mean and variance are also used during train time, "
"the BN acts as scaling and shiffting.")
.SetDefault(false);
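// With use_global_stats=true the running statistics are used even during
// training, so BN degenerates to a per-channel affine transform:
//   y = scale * (x - global_mean) / sqrt(global_var + epsilon) + bias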
AddComment(R"DOC(
Batch Normalization.
......@@ -190,6 +198,10 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
bool global_stats = is_test || use_global_stats;
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
......@@ -217,7 +229,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
saved_mean->mutable_data<T>(ctx.GetPlace());
saved_variance->mutable_data<T>(ctx.GetPlace());
if (!is_test) {
if (!global_stats) {
// saved_xx is used just for this batch of data
EigenVectorArrayMap<T> saved_mean_e(
saved_mean->mutable_data<T>(ctx.GetPlace()), C);
......@@ -234,7 +246,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
if ((N * sample_size) == 1) {
LOG(WARNING) << "Only 1 element in normalization dimension, "
<< "we skip the batch norm calculation, let y = x.";
framework::TensorCopySync(*x, ctx.GetPlace(), y);
framework::TensorCopy(*x, ctx.GetPlace(), y);
return;
}
......@@ -277,7 +289,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
// use SavedMean and SavedVariance to do normalize
Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
if (is_test) {
if (global_stats) {
ConstEigenVectorArrayMap<T> var_arr(
ctx.Input<Tensor>("Variance")->data<T>(), C);
inv_std = (var_arr + epsilon).sqrt().inverse();
......@@ -289,8 +301,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
inv_std = saved_inv_std;
}
ConstEigenVectorArrayMap<T> mean_arr(
is_test ? ctx.Input<Tensor>("Mean")->data<T>()
: ctx.Output<Tensor>("SavedMean")->data<T>(),
global_stats ? ctx.Input<Tensor>("Mean")->data<T>()
: ctx.Output<Tensor>("SavedMean")->data<T>(),
C);
// ((x - est_mean) * (inv_var) * scale + bias
......@@ -336,15 +348,27 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override {
// check input
PADDLE_ENFORCE(ctx->HasInput("X"));
PADDLE_ENFORCE(ctx->HasInput("Scale"), "");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "");
PADDLE_ENFORCE(ctx->HasInput("SavedMean"), "");
PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), "");
PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
"Input(SavedMean) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
"Input(SavedVariance) should not be null");
// check output
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Scale")), "");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), "");
if (ctx->HasOutput(framework::GradVarName("Scale"))) {
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
"Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
"null at same time");
}
const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
if (use_global_stats) {
PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
"Using global stats during training is not supported "
"in gradient op kernel of batch_norm_mkldnn_op now.");
}
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
......@@ -354,8 +378,10 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
: x_dims[x_dims.size() - 1]);
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
if (ctx->HasOutput(framework::GradVarName("Scale"))) {
ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
}
}
protected:
......@@ -405,6 +431,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
// SavedVariance have been reverted in forward operator
const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const float epsilon = ctx.Attr<float>("epsilon");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
......@@ -419,38 +447,60 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
: x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / N / C;
ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
ConstEigenVectorArrayMap<T> mean_arr(saved_mean->data<T>(), C);
ConstEigenVectorArrayMap<T> inv_var_arr(saved_inv_variance->data<T>(), C);
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());
d_scale->mutable_data<T>(ctx.GetPlace());
d_bias->mutable_data<T>(ctx.GetPlace());
const T *mean_data = saved_mean->data<T>();
const T *inv_var_data = saved_inv_variance->data<T>();
Tensor inv_var_tensor;
if (use_global_stats) {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_variance = ctx.Input<Tensor>("Variance");
mean_data = running_mean->data<T>();
T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
inv_var_tmp = (var_arr + epsilon).sqrt().inverse().eval();
inv_var_data = running_inv_var_data;
}
ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
T *d_bias_data = nullptr;
T *d_scale_data = nullptr;
if (d_scale && d_bias) {
d_scale->mutable_data<T>(ctx.GetPlace());
d_bias->mutable_data<T>(ctx.GetPlace());
d_bias_data = d_bias->mutable_data<T>(ctx.GetPlace());
d_scale_data = d_scale->mutable_data<T>(ctx.GetPlace());
}
// d_bias = np.sum(d_y, axis=0)
// d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
// d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
// - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
EigenVectorArrayMap<T> d_bias_arr(d_bias->mutable_data<T>(ctx.GetPlace()),
C);
EigenVectorArrayMap<T> d_scale_arr(d_scale->mutable_data<T>(ctx.GetPlace()),
C);
d_bias_arr.setZero();
d_scale_arr.setZero();
if (d_scale && d_bias) {
d_bias_arr.setZero();
d_scale_arr.setZero();
}
if ((N * sample_size) == 1) {
framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x);
if ((N * sample_size) == 1 && !use_global_stats) {
framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
return;
}
const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size);
int scale_coeff = use_global_stats ? 1 : N * sample_size;
const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coeff;
switch (data_layout) {
case DataLayout::kNCHW: {
......@@ -460,19 +510,29 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
sample_size, N * C);
d_x_arr.setZero();
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_bias_arr(c) += d_y_arr.col(nc).sum();
d_scale_arr(c) +=
((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
.sum();
if (d_scale && d_bias) {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_bias_arr(c) += d_y_arr.col(nc).sum();
d_scale_arr(c) += ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) *
d_y_arr.col(nc))
.sum();
}
}
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) +=
scale_inv_var_nhw(c) *
(d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) -
(x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * inv_var_arr(c));
if (!use_global_stats) {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) +=
scale_inv_var_nhw(c) *
(d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) -
(x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) *
inv_var_arr(c));
}
} else {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) += scale_inv_var_nhw(c) * d_y_arr.col(nc);
}
}
break;
}
......@@ -488,15 +548,27 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
const auto d_y_mul_x_minus_mean_row_sum =
(d_y_arr * x_minus_mean).rowwise().sum();
const auto inv_var_sqr = inv_var_arr * inv_var_arr;
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_bias_arr += d_y_arr.col(nhw);
d_scale_arr +=
(x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
d_x_arr.col(nhw) +=
scale_inv_var_nhw *
(d_y_arr.col(nhw) * N * sample_size - d_y_row_sum -
x_minus_mean.col(nhw) * inv_var_sqr *
d_y_mul_x_minus_mean_row_sum);
if (d_scale && d_bias) {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_bias_arr += d_y_arr.col(nhw);
d_scale_arr +=
(x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
}
}
if (!use_global_stats) {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) +=
scale_inv_var_nhw *
(d_y_arr.col(nhw) * N * sample_size - d_y_row_sum -
x_minus_mean.col(nhw) * inv_var_sqr *
d_y_mul_x_minus_mean_row_sum);
}
} else {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) += scale_inv_var_nhw * d_y_arr.col(nhw);
}
}
break;
}
......@@ -522,6 +594,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
op->SetInput("SavedMean", Output("SavedMean"));
op->SetInput("SavedVariance", Output("SavedVariance"));
// used when setting use_global_stats True during training
op->SetInput("Mean", Output("MeanOut"));
op->SetInput("Variance", Output("VarianceOut"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
......
......@@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
......@@ -59,6 +63,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
......@@ -121,7 +126,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
auto handle = dev_ctx.cudnn_handle();
// Now, depending on whether we are running test or not, we have two paths.
if (is_test) {
if (is_test || use_global_stats) {
// Only in test mode or with use_global_stats do we use the input statistics.
const auto *est_mean = ctx.Input<Tensor>("Mean");
const auto *est_var = ctx.Input<Tensor>("Variance");
......@@ -163,7 +168,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
if ((N * H * W * D) == 1) {
LOG(WARNING) << "Only 1 element in normalization dimension, "
<< "we skip the batch norm calculation, let y = x.";
framework::TensorCopySync(*x, ctx.GetPlace(), y);
framework::TensorCopy(*x, ctx.GetPlace(), y);
} else {
double this_factor = 1. - momentum;
......@@ -191,6 +196,58 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
}
};
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon, const int C,
const int HxW, const int num, T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ void KeBNBackwardScaleBias(
const T *dy, const T *x, const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, const double epsilon, const int N,
const int C, const int HxW, BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
BatchNormParamType<T> mean_i = mean[i];
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale[i] = ds_sum * inv_var_i;
dbias[i] = db_sum;
}
__syncthreads();
}
}
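These two kernels implement the frozen-statistics backward pass used when use_global_stats is set: with the running mean \(\hat\mu_c\) and variance \(\hat\sigma_c^2\) treated as constants, the gradients per channel c reduce to

\[
\frac{\partial L}{\partial x_i}
  = \frac{\gamma_c}{\sqrt{\hat\sigma_c^2+\epsilon}}\,\frac{\partial L}{\partial y_i},
\qquad
\frac{\partial L}{\partial \gamma_c}
  = \frac{1}{\sqrt{\hat\sigma_c^2+\epsilon}}
    \sum_{i\in c}\frac{\partial L}{\partial y_i}\,(x_i-\hat\mu_c),
\qquad
\frac{\partial L}{\partial \beta_c}
  = \sum_{i\in c}\frac{\partial L}{\partial y_i}
\]

which is what KeBNBackwardData computes element-wise and KeBNBackwardScaleBias accumulates through the two block reductions (ds_sum * inv_var_i and db_sum).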
template <typename T>
class BatchNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
......@@ -200,6 +257,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
"It must use CUDAPlace.");
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *x = ctx.Input<Tensor>("X");
......@@ -219,42 +278,13 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
if ((N * H * W * D) == 1) {
framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x);
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
if (d_scale && d_bias) {
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
PADDLE_ENFORCE_EQ(scale->dims()[0], C);
// ------------------- cudnn descriptors ---------------------
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#if CUDNN_VERSION_MIN(7, 0, 0)
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
#else
mode_ = CUDNN_BATCHNORM_SPATIAL;
#endif
std::vector<int> dims;
std::vector<int> strides;
if (data_layout == DataLayout::kNCHW) {
......@@ -264,34 +294,114 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_, mode_));
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
const void *saved_mean_data =
saved_mean->template data<BatchNormParamType<T>>();
const void *saved_var_data =
saved_var->template data<BatchNormParamType<T>>();
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
data_desc_, d_y->template data<T>(), data_desc_,
d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
scale->template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
// clean when exit.
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
if (!use_global_stats) {
if ((N * H * W * D) == 1) {
framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
}
// ------------------- cudnn descriptors ---------------------
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#if CUDNN_VERSION_MIN(7, 0, 0)
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
#else
mode_ = CUDNN_BATCHNORM_SPATIAL;
#endif
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_, mode_));
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
const void *saved_mean_data =
saved_mean->template data<BatchNormParamType<T>>();
const void *saved_var_data =
saved_var->template data<BatchNormParamType<T>>();
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
data_desc_, d_y->template data<T>(), data_desc_,
d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
scale->template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
// Clean up on exit.
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
} else {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_var = ctx.Input<Tensor>("Variance");
const auto *running_mean_data =
running_mean->template data<BatchNormParamType<T>>();
const auto *running_var_data =
running_var->template data<BatchNormParamType<T>>();
const int num = x->numel();
const int block = 512;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid1 = (num + block - 1) / block;
int grid2 = std::min(C, max_blocks);
if (data_layout == framework::DataLayout::kNCHW) {
if (d_x) {
KeBNBackwardData<T, framework::DataLayout::kNCHW><<<
grid1, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
KeBNBackwardData<T, framework::DataLayout::kNHWC><<<
grid1, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<T, block, framework::DataLayout::kNHWC><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
}
};
......
......@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims());
if (activation == "identity") {
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else if (!exhaustive_search) {
if (!exhaustive_search) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
......@@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");
// ------------------- cudnn conv+bias+act forward --------------------
ScalingParamType<T> alpha1 = 1.0f;
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
if ((activation == "identity") &&
(algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) &&
(!residual)) {
// cuDNN enables only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
// algo with CUDNN_ACTIVATION_IDENTITY.
// However, tests show it is slower in some cases, so fall back to
// cudnnConvolutionForward followed by cudnnAddTensor.
// ------------- cudnn conv forward and bias add ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} else {
if (activation == "identity") {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
// ------------------- cudnn conv+bias+act forward --------------------
ScalingParamType<T> alpha1 = 1.0f;
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
}
};
#endif
......
......@@ -74,6 +74,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
framework::LibraryType library{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
std::string data_format = ctx.Attr<std::string>("data_format");
......@@ -89,6 +91,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
customized_type_value = kConvMKLDNNFP32;
}
#endif
......@@ -105,7 +108,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
}
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library);
library, customized_type_value);
}
void Conv2DOpMaker::Make() {
......@@ -342,6 +345,8 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
std::string data_format = ctx.Attr<std::string>("data_format");
......@@ -357,12 +362,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
customized_type_value = kConvMKLDNNFP32;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
layout_, library_);
layout_, library_, customized_type_value);
}
} // namespace operators
......
......@@ -27,6 +27,8 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
constexpr int kConvMKLDNNFP32 = 1;
constexpr int kConvMKLDNNINT8 = 2;
// Base convolution operator definitions for other conv
// like operators to reuse the implementation.
......
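For context, these customized type values let a single operator register several MKLDNN kernels for the same place and data type, with kernel dispatch also matching on customized_type_value (as set in GetExpectedKernelType above). A hedged sketch of what such a registration could look like; the macro name and argument order are an assumption inferred from the USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE usage in this commit, not copied from it:

// Hypothetical sketch: register a conv2d MKLDNN kernel keyed by the
// customized type value kConvMKLDNNFP32 in addition to place/data type.
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
                                    ::paddle::platform::CPUPlace, FP32,
                                    paddle::operators::kConvMKLDNNFP32,
                                    paddle::operators::ConvMKLDNNOpKernel<float>);

An INT8 kernel could then coexist under kConvMKLDNNINT8 without colliding with the FP32 registration.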
......@@ -43,6 +43,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
const T* prior_box_var_data = nullptr;
if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
T prior_box_width = prior_box_data[j * len + 2] -
......@@ -96,6 +99,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
const T* prior_box_var_data = nullptr;
if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
size_t offset = i * col * len + j * len;
......
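A side note on the pragma added in both hunks above: collapse(2) fuses the two nested loops into a single row * col iteration space, so OpenMP can balance work across threads even when the outer trip count alone is small. A minimal self-contained sketch, with made-up sizes rather than the operator's real data:

#include <cstdio>
#include <vector>

int main() {
  const int row = 4, col = 3;  // hypothetical sizes for illustration
  std::vector<double> out(row * col, 0.0);
// collapse(2) flattens (i, j) into row*col independent iterations; the
// pragma is simply ignored when the compiler has no OpenMP support.
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
  for (int i = 0; i < row; ++i) {
    for (int j = 0; j < col; ++j) {
      out[i * col + j] = 10.0 * i + j;  // each (i, j) pair is independent
    }
  }
  std::printf("last = %.1f\n", out[row * col - 1]);
  return 0;
}

The guard mirrors the PADDLE_WITH_MKLML check in the operator, which only emits the pragma when an OpenMP-capable MKL build is in use.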
......@@ -194,6 +194,7 @@ class GRPCClient : public RPCClient {
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendBatchBarrier(
......