Commit a5cb79c6 authored by barrierye

commit data_feed for pull

set(PART_CUDA_KERNEL_FILES)
function(op_library TARGET)
# op_library is a function to create an op library. The interface is the same
# as cc_library, but it handles splitting GPU/CPU code and links some common
# libraries for ops.
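# A typical invocation looks like op_library(mul_op SRCS mul_op.cc mul_op.cu
# DEPS math_function) (illustrative example); when SRCS is omitted, the
# sources are inferred from TARGET as shown below.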
set(cc_srcs)
set(cu_srcs)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(pybind_flag 0)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
list(LENGTH op_library_SRCS op_library_SRCS_len)
if (${op_library_SRCS_len} EQUAL 0)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_AMD_GPU)
string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
endif()
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.hip.cu$")
list(APPEND hip_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
list(APPEND miopen_hip_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
endif()
endforeach()
endif()
list(LENGTH cc_srcs cc_srcs_len)
if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
endif()
if (WIN32)
# Remove ops that are unsupported on Windows, since Windows lacks nccl, warpctc, and similar dependencies.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return()
endif()
endforeach()
endif(WIN32)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs")
list(LENGTH op_library_DEPS op_library_DEPS_len)
if (${op_library_DEPS_len} GREATER 0)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
endforeach()
# For the registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it is enough to add just one operator to pybind in a *_op.cc file.
# For detailed pybind information, please see the generated paddle/pybind/pybind.h.
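# For example, a source file containing REGISTER_OPERATOR(mul, ...) leads to a
# line such as "USE_OP(mul);" being appended to the generated pybind file
# (illustrative; the actual name is parsed out of TARGET_CONTENT below).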
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
endif()
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel.
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
# NOTE(*): activation ops use a macro to register their kernels, so set USE_OP manually.
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "fake_quantize")
file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only has MKLDNN and CPU kernels, which would not match the CPU-only condition.
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
endif()
endfunction()
function(register_operators)
set(options "")
set(oneValueArgs "")
set(multiValueArgs EXCLUDES DEPS)
cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)
foreach(src ${OPS})
list(FIND register_operators_EXCLUDES ${src} _index)
if (${_index} EQUAL -1)
if (${register_operators_DEPS_len} GREATER 0)
op_library(${src} DEPS ${register_operators_DEPS})
else()
op_library(${src})
endif()
endif()
endforeach()
endfunction()
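# A hypothetical call site at the bottom of an operators CMakeLists.txt:
#   register_operators(EXCLUDES warpctc_op DEPS ${COMMON_OP_DEPS})
# This globs every *_op.cc next to it except warpctc_op and defines one op
# library per file via op_library.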
@@ -93,11 +93,11 @@ paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized',
 paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
 paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None))
 paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False))
+paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False))
 paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
-paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None))
+paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0))
 paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
...
@@ -36,7 +36,7 @@ add_subdirectory(details)
 endif (NOT WIN32)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
-proto_library(async_executor_param SRCS async_executor_param.proto)
+proto_library(async_executor_proto SRCS data_feed.proto)
 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
@@ -138,31 +138,23 @@ cc_test(version_test SRCS version_test.cc DEPS version)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
 cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
-if(NOT WIN32)
 cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
   shape_inference data_transform lod_tensor profiler)
-endif(NOT WIN32)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
-if (NOT WIN32)
 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
-if (NOT WIN32)
 add_custom_command(TARGET framework_py_proto POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
   COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
   COMMENT "Copy generated python proto into directory paddle/fluid/proto."
   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-else(NOT WIN32)
-  string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
-  add_custom_command(TARGET framework_py_proto POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-    COMMAND copy /Y *.py ${proto_dstpath}
-    COMMENT "Copy generated python proto into directory paddle/fluid/proto."
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif(NOT WIN32)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
@@ -176,11 +168,7 @@ if(WITH_DISTRIBUTE)
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-if(NOT WIN32)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
-else(NOT WIN32)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
-endif(NOT WIN32)
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
 cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
@@ -192,10 +180,11 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
 endif()  # NOT WIN32
 cc_library(async_executor
-  SRCS async_executor.cc data_feed.cc datafeed_creator.cc
+  SRCS async_executor.cc data_feed.cc data_feed_factory.cc
+  executor_thread_worker.cc
   DEPS op_registry device_context scope framework_proto glog
   lod_rank_table feed_fetch_method graph_to_program_pass
-  async_executor_param)
+  async_executor_proto)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
...
@@ -36,43 +36,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/framework/executor_thread_worker.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/pybind/pybind.h"
 namespace paddle {
 namespace framework {
-bool AsyncExecutor::workers_initialized_ = false;
-void CreateTensor(Variable* var, proto::VarType::Type var_type) {
-  if (var_type == proto::VarType::LOD_TENSOR) {
-    var->GetMutable<LoDTensor>();
-  } else if (var_type == proto::VarType::SELECTED_ROWS) {
-    var->GetMutable<SelectedRows>();
-  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::FETCH_LIST) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<Scope>>();
-  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
-    var->GetMutable<LoDRankTable>();
-  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
-    var->GetMutable<LoDTensorArray>();
-  } else if (var_type == proto::VarType::PLACE_LIST) {
-    var->GetMutable<platform::PlaceList>();
-  } else if (var_type == proto::VarType::READER) {
-    var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::RAW) {
-    // GetMutable will be called in operator
-  } else {
-    PADDLE_THROW(
-        "Variable type %d is not in "
-        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
-        var_type);
-  }
-}
 static void ReadBinaryFile(const std::string& filename,
                            std::string* content) {
   std::string &contents = *content;
@@ -139,343 +109,100 @@ static void SaveModel(
   }
 }  // end SaveModel
-void ExecutorThreadWorker::Reset() {
-  inspect_values_.clear();
-}
+AsyncExecutor::AsyncExecutor(Scope& scope, const platform::Place& place)
+    : root_scope_(scope), place_(place) {}
-void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) {
-  auto& block = program.Block(0);
-  op_names_.clear();
-  for (auto& op_desc : block.AllOps()) {
-    std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
-    op_names_.push_back(op_desc->Type());
-    OperatorBase* local_op_ptr = local_op.release();
-    ops_.push_back(local_op_ptr);
-    continue;
-  }
-}
-void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) {
-  auto& block = program.Block(0);
-  thread_scope_ = &root_scope_->NewScope();
-  for (auto& var : block.AllVars()) {
-    if (var->Persistable()) {
-      auto* ptr = root_scope_->Var(var->Name());
-      CreateTensor(ptr, var->GetType());
-    } else {
-      auto* ptr = thread_scope_->Var(var->Name());
-      CreateTensor(ptr, var->GetType());
-    }
-  }
-}
-void ExecutorThreadWorker::SetDataFeed(DataFeed& datafeed) {
-  if (typeid(datafeed) == typeid(TextClassDataFeed)) {
-    local_reader_.reset(
-        new TextClassDataFeed(dynamic_cast<TextClassDataFeed &>(datafeed)));
-    local_reader_->SetThreadId(thread_id_);
-  }
-}
-void ExecutorThreadWorker::BindingDataFeedMemory() {
-  const std::vector<std::string>& input_feed = local_reader_->GetUseSlotAlias();
-  for (auto name : input_feed) {
-    local_reader_->AddFeedVar(thread_scope_->Var(name), name);
-  }
-}
-void ExecutorThreadWorker::SetInspectVarNames(
-    const std::vector<std::string>& inspect_var_names) {
-  inspect_var_names_.clear();
-  inspect_var_names_.insert(inspect_var_names_.end(),
-                            inspect_var_names.begin(), inspect_var_names.end());
-}
+void AsyncExecutor::CreateThreads(
+    ExecutorThreadWorker* worker,
+    const ProgramDesc& main_program,
+    const std::shared_ptr<DataFeed>& reader,
+    const std::vector<std::string>& fetch_var_names,
+    Scope& root_scope,
+    const int thread_index) {
+  worker->SetThreadId(thread_index);
+  worker->SetRootScope(&root_scope);
+  worker->CreateThreadResource(main_program, place_);
+  worker->SetDataFeed(reader);
+  worker->SetFetchVarNames(fetch_var_names);
+  worker->BindingDataFeedMemory();
+}
-void ExecutorThreadWorker::SetModelParamNames(
-    const std::vector<std::string>& param_names) {
-  model_param_names_ = param_names;
-}
+void AsyncExecutor::CheckFiles(
+    const std::vector<std::string>& files) {
+  // function for user to check file formats
+  // should be exposed to users
+}
-void ExecutorThreadWorker::SetDevice() {
-  static unsigned priority[] = {
-    0, 1, 2, 3, 4, 5,
-    6, 7, 8, 9, 10, 11,
-    12, 13, 14, 15, 16, 17,
-    18, 19, 20, 21, 22, 23,
-    24, 25, 26, 27, 28, 29,
-    30, 31, 32, 33, 34, 35,
-    36, 37, 38, 39, 40, 41,
-    42, 43, 44, 45, 46, 47
-  };
-  unsigned int i = this->thread_id_;
-  if (i < sizeof(priority) / sizeof(unsigned)) {
-    unsigned proc = priority[i];
-    cpu_set_t mask;
-    CPU_ZERO(&mask);
-    CPU_SET(proc, &mask);
-    if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) {
-      LOG(ERROR) << "WARNING: Failed to set thread affinity for thread " << i;
-    } else {
-      CPU_ZERO(&mask);
-      if ((0 == sched_getaffinity(0, sizeof(mask), &mask))
-          && CPU_ISSET(proc, &mask)) {
-        LOG(ERROR) << "TRACE: Thread " << i
-                   << " is running on processor " << proc
-                   << "...";
-      }
-    }
-  }
-}
-void ExecutorThreadWorker::Train() {
-  LOG(ERROR) << "begin to train";
-  SetDevice();
-  int inspect_var_num = inspect_var_names_.size();
-  inspect_values_.clear();
-  inspect_values_.resize(inspect_var_num, 0);
-  local_reader_->WaitNextEpoch();
-  int epoch = local_reader_->GetCurrentEpoch();
-  LOG(ERROR) << "epoch: " << epoch;
-  int batch_num = 1;
-  while (true) {
-    const char *file = local_reader_->PickOneFile();
-    if (file == NULL) {
-      break;
-    }
-    if (!local_reader_->SetFile(file)) {
-      break;
-    }
-    while (true) {
-      bool flag = local_reader_->ReadBatch();
-      if (!flag) {
-        break;
-      }
-      for (unsigned int i = 0; i < ops_.size(); ++i) {
-        ops_[i]->Run(*thread_scope_, place_);
-      }
-      batch_num++;
-      float avg_inspect = 0.0;
-      for (int i = 0; i < inspect_var_num; ++i) {
-        avg_inspect = thread_scope_->FindVar(inspect_var_names_[i])
-                                   ->GetMutable<LoDTensor>()
-                                   ->data<float>()[0];
-        inspect_values_[i] += avg_inspect;
-      }
-      thread_scope_->DropKids();
-    }
-    local_reader_->UpdateEpochNum();
-    LOG(ERROR) << "memory used after epoch " << epoch + 1
-               << " called: " << memory::memory_usage(place_);
-  }
-  for (int i = 0; i < inspect_var_num; ++i) {
-    inspect_values_[i] /= batch_num;
-    std::string var = inspect_var_names_[i].substr(
-        0,
-        inspect_var_names_[i].find_first_of("_"));
-    LOG(ERROR) << "mean " << var.c_str()
-               << " of epoch " << i + 1 << ": " << inspect_values_[i];
-  }
-  if (thread_id_ == 0) {
-    char modelfile[1024];
-    snprintf(&modelfile[0], sizeof(modelfile), "%s_epoch%d.model",
-             model_prefix_.c_str(), epoch);
-    std::string model_filename = std::string(modelfile);
-    // this save_inference_model can only save imdbtask, should make this
-    // general
-    //
-    // currently comment it
-    LOG(ERROR) << "Going to save model " << modelfile;
-    SaveModel(main_program_,
-              thread_scope_,
-              model_param_names_,
-              model_filename,
-              true);
-  }
-}
-void ExecutorThreadWorker::SetThreadId(int tid) {
-  thread_id_ = tid;
-}
-void ExecutorThreadWorker::SetPlace(const platform::Place& place) {
-  place_ = place;
-}
-void ExecutorThreadWorker::SetMainProgram(
-    const ProgramDesc& main_program_desc) {
-  main_program_.reset(new ProgramDesc(main_program_desc));
-}
-void ExecutorThreadWorker::SetRootScope(Scope* g_scope) {
-  root_scope_ = g_scope;
-}
-void ExecutorThreadWorker::SetMaxTrainingEpoch(int max_epoch) {
-  max_epoch_ = max_epoch;
-}
-AsyncExecutor::AsyncExecutor(ProgramDesc& main_program,
-                             const std::vector<std::string>& param_names,
-                             TextClassDataFeed& data_feed,
-                             unsigned int thread_num,
-                             const platform::Place& place)
-    : thread_num_(thread_num),
-      place_(place),
-      main_program_(main_program),
-      data_feed_(data_feed) {
-  model_param_names_.clear();
-  model_param_names_.insert(model_param_names_.end(),
-                            param_names.begin(),
-                            param_names.end());
-}
-void AsyncExecutor::InitRootScope(Scope* scope) {
-  root_scope_ = scope;
-}
-void AsyncExecutor::SetMaxTrainingEpoch(int max_epoch) {
-  max_epoch_ = max_epoch;
-}
 void AsyncExecutor::SetModelPrefix(const std::string& model_prefix) {
   model_prefix_ = model_prefix;
 }
-void AsyncExecutor::RunStartupProgram(const ProgramDesc& program,
-                                      Scope* scope) {
-  auto& block = program.Block(0);
-  for (auto& var : block.AllVars()) {
-    if (var->Persistable()) {
-      auto* ptr = scope->Var(var->Name());
-      CreateTensor(ptr, var->GetType());
-      // LOGERR("Persistable Var Name:%s", var->Name().c_str());
-    }
-  }
-  std::map<std::string, int> param_dict;
-  std::vector<OperatorBase *> ops;
-  for (auto& op_desc : block.AllOps()) {
-    std::vector<std::string> param_name_vec = op_desc->OutputArgumentNames();
-    bool need_to_run = false;
-    for (auto& name : param_name_vec) {
-      if (param_dict.find(name) == param_dict.end()) {
-        param_dict[name] = 1;
-        need_to_run = true;
-      }
-    }
-    if (need_to_run) {
-      std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
-      OperatorBase* local_op_ptr = local_op.release();
-      ops.push_back(local_op_ptr);
-    }
-  }
-  // LOGERR("There are %d parameters in startup program, %d op needs to run",
-  //        param_dict.size(), ops.size());
-  for (auto& op : ops) {
-    op->Run(*scope, place_);
-  }
-  // LOGERR("total time for startup program: %fs", timeline.elapsed_sec());
-  for (auto& op : ops) {
-    delete op;
-  }
-  // LOGERR("run startup program done.");
-}
-std::unique_ptr<ProgramDesc> AsyncExecutor::LoadDescFromFile(
-    const std::string& f) {
-  std::string program_desc_str;
-  ReadBinaryFile(f, &program_desc_str);
-  std::unique_ptr<ProgramDesc> program(new ProgramDesc(program_desc_str));
-  return program;
-}
-void AsyncExecutor::SetInspectVarNames(
-    const std::vector<std::string>& inspect_var_names) {
-  inspect_var_names_.clear();
-  inspect_var_names_.insert(inspect_var_names_.end(),
-                            inspect_var_names.begin(), inspect_var_names.end());
-}
-void AsyncExecutor::PrepareThreads(const ProgramDesc& host_program) {
-  workers_.resize(thread_num_);
-  for (int i = 0; i < thread_num_; ++i) {
-    workers_[i].reset(new ExecutorThreadWorker);
-    workers_[i]->SetThreadId(i);
-    workers_[i]->CreateThreadOperators(host_program);
-    workers_[i]->SetRootScope(root_scope_);
-    workers_[i]->SetPlace(place_);
-    workers_[i]->SetMaxTrainingEpoch(max_epoch_);
-    workers_[i]->CreateThreadScope(host_program);
-    workers_[i]->SetInspectVarNames(inspect_var_names_);
-    workers_[i]->SetModelParamNames(model_param_names_);
-    workers_[i]->SetMainProgram(host_program);
-    workers_[i]->SetModelPrefix(model_prefix_);
-    //
-    // new a datafeed here
-    workers_[i]->SetDataFeed(data_feed_);
-    workers_[i]->BindingDataFeedMemory();
-  }
-}
-std::vector<float>& AsyncExecutor::Run(
-    const std::vector<std::string>& inspect_var_names) {
-  SetInspectVarNames(inspect_var_names);
-  threads_.clear();
-  // thread binding here?
-  if (workers_initialized_ == false) {
-    PrepareThreads(main_program_);
-    workers_initialized_ = true;
-  }
-  for (int i = 0; i < thread_num_; ++i) {
-    workers_[i]->Reset();
-    workers_[i]->SetInspectVarNames(inspect_var_names);
-    threads_.push_back(std::thread(&ExecutorThreadWorker::Train,
-                                   workers_[i].get()));
-  }
-  for (auto& th : threads_) {
-    th.join();
-  }
-  inspect_values_.clear();
-  inspect_values_.resize(inspect_var_names_.size(), 0);
-  std::vector<std::vector<float>*> inspect_value_vectors;
-  inspect_value_vectors.resize(thread_num_);
-  for (int i = 0; i < thread_num_; ++i) {
-    inspect_value_vectors[i] = &workers_[i]->GetInspectValues();
-  }
-  for (unsigned int i = 0; i < inspect_var_names_.size(); ++i) {
-    float value = 0.0;
-    for (int j = 0; j < thread_num_; ++j) {
-      value += inspect_value_vectors[j]->at(i);
-    }
-    value /= thread_num_;
-    inspect_values_[i] = value;
-  }
-  return inspect_values_;
-}
+std::vector<float> AsyncExecutor::RunFromFile(
+    const ProgramDesc& main_program,
+    const DataFeedDesc& data_feed_desc,
+    const std::vector<std::string>& filelist,
+    const int thread_num,
+    const std::vector<std::string>& fetch_var_names) {
+  std::vector<std::thread> threads;
+  /*
+    readerDesc: protobuf description for reader initialization
+    argument: class_name, batch_size, use_slot, queue_size, buffer_size, padding_index
+    reader:
+      1) each thread has a reader, reader will read input data and
+         put it into input queue
+      2) each reader has a Next() interface, that can fetch an instance
+         from the input queue
+  */
+  // todo: should be factory method for creating datafeed
+  std::vector<std::shared_ptr<DataFeed> > readers;
+  readers.resize(thread_num);
+  for (unsigned int i = 0; i < readers.size(); ++i) {
+    readers[i] = DataFeedFactory::CreateDataFeed(data_feed_desc.name());
+  }
+  std::vector<std::shared_ptr<ExecutorThreadWorker> > workers;
+  workers.resize(thread_num);
+  for (auto& worker : workers) {
+    worker.reset(new ExecutorThreadWorker);
+  }
+  // prepare thread resource here
+  for (int thidx = 0; thidx < thread_num; ++thidx) {
+    CreateThreads(workers[thidx].get(), main_program,
+                  readers[thidx], fetch_var_names, root_scope_, thidx);
+  }
+  // start executing ops in multiple threads
+  for (int thidx = 0; thidx < thread_num; ++thidx) {
+    threads.push_back(std::thread(&ExecutorThreadWorker::TrainFiles,
+                                  workers[thidx].get()));
+  }
+  for (auto& th : threads) {
+    th.join();
+  }
+  std::vector<float> fetch_values;
+  fetch_values.resize(fetch_var_names.size(), 0);
+  std::vector<std::vector<float>*> fetch_value_vectors;
+  fetch_value_vectors.resize(thread_num);
+  for (int i = 0; i < thread_num; ++i) {
+    fetch_value_vectors[i] = &workers[i]->GetFetchValues();
+  }
+  for (unsigned int i = 0; i < fetch_var_names.size(); ++i) {
+    float value = 0.0;
+    for (int j = 0; j < thread_num; ++j) {
+      value += fetch_value_vectors[j]->at(i);
+    }
+    value /= thread_num;
+    fetch_values[i] = value;
+  }
+  return fetch_values;
+}
 void AsyncExecutor::LoadInitModel() {
...
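Taken together, the commit replaces the stateful Run()/PrepareThreads() pair with a single call that owns its readers and workers for one pass over a filelist. A minimal driver sketch under that reading (the program file name, data files, and fetch variable below are hypothetical):

    #include <vector>
    #include "paddle/fluid/framework/async_executor.h"

    void Demo(const paddle::framework::DataFeedDesc& data_feed_desc) {
      paddle::framework::Scope scope;
      paddle::platform::CPUPlace place;
      paddle::framework::AsyncExecutor executor(scope, place);
      auto main_program =
          paddle::framework::AsyncExecutor::LoadDescFromFile("main_program.bin");
      // One DataFeed reader and one ExecutorThreadWorker per thread; the
      // fetched values are averaged over threads before being returned.
      std::vector<float> fetch_values = executor.RunFromFile(
          *main_program, data_feed_desc,
          {"data/part-00000", "data/part-00001"},  // filelist (hypothetical)
          /*thread_num=*/2,
          {"mean_cost"});  // fetch_var_names (hypothetical)
    }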
@@ -23,7 +23,8 @@ limitations under the License. */
 #include <thread>   // NOLINT
 #include <vector>
 #include <typeinfo>
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/datafeed_creator.h"
+#include "paddle/fluid/framework/data_feed.pb.h"
+#include "paddle/fluid/framework/executor_thread_worker.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -31,93 +32,13 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-void CreateTensor(Variable* var, proto::VarType::Type var_type);
-class ExecutorThreadWorker {
- public:
-  ExecutorThreadWorker() {}
-  ~ExecutorThreadWorker() {}
-  void CreateThreadScope(const ProgramDesc& program);
-  void SetThreadId(int tid);
-  void CreateThreadOperators(const ProgramDesc& program);
-  void SetRootScope(Scope* g_scope);
-  void SetDevice();
-  void AddFidSet();
-  void SetCommBatch(int comm_batch) { comm_batch_ = comm_batch; }
-  void AddTrainFile(const std::string& filename);
-  void SetMainProgram(const ProgramDesc& main_program_desc);
-  void SetPlace(const paddle::platform::Place& place);
-  void SetMaxTrainingEpoch(const int max_epoch);
-  void BindingDataFeedMemory();
-  void SetModelPrefix(const std::string& prefix) { model_prefix_ = prefix; }
-  void SetInspectVarNames(const std::vector<std::string>& inspect_var_names);
-  void SetModelParamNames(const std::vector<std::string>& param_names);
-  void SetDataFeed(DataFeed& datafeed);  // NOLINT
-  void Train();
-  const char* PickOneFile();
-  void UpdateEpochNum();
-  void Reset();
-  void Initialize() {}
-  std::vector<float>& GetInspectValues() {return inspect_values_;}
- protected:
-  // thread index
-  int thread_id_;
-  // max epoch for each thread
-  unsigned int max_epoch_;
-  // instances learned currently
-  int comm_batch_;
-  std::string model_prefix_;
-  std::vector<std::string> op_names_;
-  // local ops for forward and backward
-  std::vector<OperatorBase *> ops_;
-  // main program for training
-  std::unique_ptr<ProgramDesc> main_program_;
-  // binary data reader
-  std::unique_ptr<DataFeed> local_reader_;
-  std::vector<std::string> inspect_var_names_;
-  std::vector<std::string> model_param_names_;
-  // execution place
-  platform::Place place_;
-  // root scope for model parameters
-  Scope* root_scope_;
-  // a thread scope, whose father scope is the global scope that is shared
-  Scope* thread_scope_;
- private:
-  std::vector<float> inspect_values_;
-};
 class AsyncExecutor {
  public:
-  explicit AsyncExecutor(ProgramDesc& main_program,  // NOLINT
-                         const std::vector<std::string>& param_names,
-                         TextClassDataFeed& data_feed,  // NOLINT
-                         unsigned int thread_num,
-                         const platform::Place& place);
+  explicit AsyncExecutor(Scope& scope, const platform::Place& place);  // NOLINT
   virtual ~AsyncExecutor() {}
   static std::unique_ptr<ProgramDesc> LoadDescFromFile(
       const std::string& filename);
-  void InitRootScope(Scope* scope);
-  void SetMaxTrainingEpoch(const int max_epoch);
-  Scope* GetRootScope() { return root_scope_; }
-  void SetBatchSize(const int batch_size) { batch_size_ = batch_size; }
-  void SetCommBatch(int comm_batch) {
-    comm_batch_ = comm_batch;
-  }
+  Scope* GetRootScope() { return &root_scope_; }
   void SetModelPath(const std::string& model_path) {
     model_path_ = model_path;
@@ -132,38 +53,32 @@ class AsyncExecutor {
   }
   void SetModelPrefix(const std::string& model_prefix);
-  virtual void PrepareThreads(const ProgramDesc& host_program);
   void RunStartupProgram(const ProgramDesc& program, Scope* scope);
-  std::vector<float>& Run(const std::vector<std::string>& inspect_var_names);
+  std::vector<float> RunFromFile(const ProgramDesc& main_program,
+                                 const DataFeedDesc& data_feed_desc,
+                                 const std::vector<std::string>& filelist,
+                                 const int thread_num,
+                                 const std::vector<std::string>& fetch_names);
+  void CheckFiles(const std::vector<std::string>& files);
   void LoadInitModel();
  private:
-  void SetInspectVarNames(const std::vector<std::string>& inspect_var_names);
+  void CreateThreads(ExecutorThreadWorker* worker,
+                     const ProgramDesc& main_program,
+                     const std::shared_ptr<DataFeed>& reader,
+                     const std::vector<std::string>& fetch_var_names,
+                     Scope& root_scope,  // NOLINT
+                     const int thread_index);
  public:
-  int thread_num_;
-  int max_epoch_;
-  int batch_size_;
-  int comm_batch_;
-  std::vector<std::shared_ptr<ExecutorThreadWorker> > workers_;
-  std::vector<std::thread> threads_;
-  std::vector<std::string> inspect_var_names_;
-  std::vector<std::string> model_param_names_;
   std::string model_prefix_;
   std::string model_path_;
   std::string init_prog_file_;
   std::string init_model_file_;
-  Scope* root_scope_;
+  Scope& root_scope_;
   platform::Place place_;
- private:
-  ProgramDesc& main_program_;
-  TextClassDataFeed& data_feed_;
-  std::vector<float> inspect_values_;
- private:
-  static bool workers_initialized_;
 };
 }  // namespace framework
...
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
...
@@ -43,7 +43,7 @@ size_t DataFeed::file_idx_;
 std::mutex DataFeed::mutex_for_pick_file_;
 void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
-  if (CheckInit() == false) {return;}
+  CheckInit();
   for (size_t i = 0; i < use_slots_.size(); ++i) {
     if (name == use_slots_[i]) {
       if (use_slots_is_dense_[i]) {
@@ -56,7 +56,7 @@ void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
 }
 bool DataFeed::SetFileList(const std::vector<std::string>& files) {
-  if (CheckInit() == false) {return false;}
+  CheckInit();
   if (files.size() == 0) {
     LOG(ERROR) << "error: you have set an empty filelist";
     return false;
@@ -77,27 +77,27 @@ bool DataFeed::PickOneFile(std::string& filename) {
   return true;
 }
-bool DataFeed::CheckInit() {
-  if (finish_init_) {return true;}
+void DataFeed::CheckInit() {
+  if (finish_init_) {return;}
   LOG(ERROR) << "error: initialization did not succeed";
-  return false;
+  exit(-1);
 }
-bool DataFeed::CheckSetFileList() {
-  if (finish_set_filelist_) {return true;}
+void DataFeed::CheckSetFileList() {
+  if (finish_set_filelist_) {return;}
   LOG(ERROR) << "error: set filelist did not succeed";
-  return false;
+  exit(-1);
 }
-bool DataFeed::CheckStart() {
-  if (finish_start_) {return true;}
+void DataFeed::CheckStart() {
+  if (finish_start_) {return;}
   LOG(ERROR) << "error: Datafeed has not started running yet";
-  return false;
+  exit(-1);
 }
 template<typename T>
 void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
-  if (!CheckInit()) {return;}
+  CheckInit();
   if (queue_size <= 0) {
     LOG(ERROR) << "error: illegal queue size: " << queue_size;
     return;
@@ -108,7 +108,7 @@ void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
 template<typename T>
 bool PrivateQueueDataFeed<T>::Start() {
-  if (!(CheckSetFileList())) {return false;}
+  CheckSetFileList();
   read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this);
   read_thread_.detach();
@@ -121,8 +121,9 @@ void PrivateQueueDataFeed<T>::ReadThread(){
   std::string filename;
   while (PickOneFile(filename)) {
     file_.open(filename.c_str());  // is_text_feed
-    if (!file_.is_open()) {
+    if (!file_.good()) {
       LOG(ERROR) << "error: open file<" << filename << "> fail";
+      continue;
     }
     T instance;
     while (ParseOneInstance(instance)) {
@@ -135,7 +136,7 @@ void PrivateQueueDataFeed<T>::ReadThread(){
 template<typename T>
 bool PrivateQueueDataFeed<T>::Next(){
-  if (!CheckStart()) {return false;}
+  CheckStart();
   int index = 0;
   T instance;
   T ins_vec(use_slots_.size());
@@ -150,15 +151,16 @@ bool PrivateQueueDataFeed<T>::Next(){
   return batch_size_ != 0;
 }
-void MultiSlotDataFeed::Init(paddle::DataFeedDesc& data_feed_desc) {
+void MultiSlotDataFeed::Init(paddle::framework::DataFeedDesc& data_feed_desc) {
   finish_init_ = false;
   finish_set_filelist_ = false;
   finish_start_ = false;
+  /*
   if (!data_feed_desc.has_multi_slot_desc()){
     LOG(ERROR) << "error: multi_slot_desc has not been set";
-    return ;
+    exit(-1);
   }
-  paddle::MultiSlotDesc multi_slot_desc = data_feed_desc.multi_slot_desc();
+  paddle::framework::MultiSlotDesc multi_slot_desc = data_feed_desc.multi_slot_desc();
   size_t all_slot_num = multi_slot_desc.slots_size();
   all_slots_.resize(all_slot_num);
   all_slots_type_.resize(all_slot_num);
@@ -176,10 +178,16 @@ void MultiSlotDataFeed::Init(paddle::DataFeedDesc& data_feed_desc) {
     }
   }
   feed_vec_.resize(use_slots_.size());
+  */
   finish_init_ = true;
 }
+bool MultiSlotDataFeed::CheckFile(const char* filename) {
+  // check with protobuf ?
+  std::cerr << "Check error" << std::endl;
+  return false;
+}
 bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>& instance) {
   std::string line;
   if (getline(file_, line)) {
@@ -233,6 +241,7 @@ void MultiSlotDataFeed::AddInstanceToInsVec(std::vector<MultiSlotType>& ins_vec,
     ins_vec[i].AddIns(instance[i]);
   }
 }
+
 void MultiSlotDataFeed::PutToFeedVec(std::vector<MultiSlotType>& ins_vec) {
   for (size_t i = 0; i < use_slots_.size(); ++i) {
     auto& type = ins_vec[i].GetType();
@@ -253,7 +262,7 @@ void MultiSlotDataFeed::PutToFeedVec(std::vector<MultiSlotType>& ins_vec) {
         feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
       }
     } else if (type[0] == 'u') {  // uint64
-      // no uint64_t type
+      // no uint64_t type in paddle
       auto& feasign = ins_vec[i].GetUint64Data();
       if (feed_vec_[i].IsDense()) {
         int size_in_each_batch = total_instance / batch_size_;
@@ -263,7 +272,7 @@ void MultiSlotDataFeed::PutToFeedVec(std::vector<MultiSlotType>& ins_vec) {
       } else {
         int64_t* tensor_ptr = feed_vec_[i].GetLoDTensor()->
             mutable_data<int64_t>({total_instance, 1}, platform::CPUPlace());
-        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(uint64_t));
+        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
         LoD data_lod{offset};
         feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
       }
...
@@ -53,14 +53,14 @@ class MixTensor {
   LoDTensor* GetLoDTensor(){
     if (is_dense_) {
       LOG(ERROR) << "error: let a dense var return a LoDTensor ptr";
-      return NULL;
+      exit(-1);
     }
     return lodtensor_;
   }
   Tensor* GetTensor(){
     if (!is_dense_) {
       LOG(ERROR) << "error: let a sparse var return a Tensor ptr";
-      return NULL;
+      exit(-1);
     }
     return tensor_;
   }
@@ -155,7 +155,7 @@ class DataFeed {
  public:
   DataFeed() {}
   virtual ~DataFeed() {}
-  virtual void Init(paddle::DataFeedDesc& data_feed_desc) = 0;
+  virtual void Init(paddle::framework::DataFeedDesc& data_feed_desc) = 0;
   // for some datafeeds may not be able to implement this interface
   virtual bool CheckFile(const char* filename) {
     LOG(ERROR) << "error: The function CheckFile is not implemented";
@@ -169,10 +169,12 @@ class DataFeed {
   // for subclass with queue
   virtual void SetQueueSize(int queue_size) {
     LOG(ERROR) << "error: The function SetQueueSize is not implemented";
+    exit(-1);
   }
   // for subclass with buffer
   virtual void SetBufferSize(int buffer_size) {
     LOG(ERROR) << "error: The function SetBufferSize is not implemented";
+    exit(-1);
   }
   virtual const std::vector<std::string>& GetAllSlots() {return all_slots_;}
   virtual const std::vector<std::string>& GetUseSlots() {return use_slots_;}
@@ -181,9 +183,9 @@ class DataFeed {
  protected:
   // Check if it is executed in this order:
   // Init -> SetFileList/BindingMemory -> Start -> Next
-  virtual bool CheckInit();
-  virtual bool CheckSetFileList();
-  virtual bool CheckStart();
+  virtual void CheckInit();
+  virtual void CheckSetFileList();
+  virtual void CheckStart();
   virtual bool PickOneFile(std::string& filename);
   static std::vector<std::string> filelist_;
@@ -213,7 +215,7 @@ class PrivateQueueDataFeed : public DataFeed {
  public:
   PrivateQueueDataFeed() {}
   virtual ~PrivateQueueDataFeed() {}
-  virtual void Init(paddle::DataFeedDesc& data_feed_desc) = 0;
+  virtual void Init(paddle::framework::DataFeedDesc& data_feed_desc) = 0;
   virtual bool Start();
   virtual bool Next();  // no buffer
   virtual void SetQueueSize(int queue_size);
@@ -247,28 +249,28 @@ class MultiSlotType {
   }
   ~MultiSlotType() {}
   void SetType(std::string& type) {
-    if (!CheckType(type)) {return;}
+    CheckType(type);
     type_ = type;
   }
   std::vector<size_t>& GetOffset() {
     return offset_;
   }
   void AddValue(float v) {
-    if (!CheckFloat()) {return;}
+    CheckFloat();
    float_feasign_.push_back(v);
   }
   void AddValue(uint64_t v) {
-    if (!CheckUint64()) {return;}
+    CheckUint64();
    uint64_feasign_.push_back(v);
   }
   void AddIns(MultiSlotType& ins) {
     if (ins.GetType()[0] == 'f') {  // float
-      if (!CheckFloat()) {return;}
+      CheckFloat();
       auto& vec = ins.GetFloatData();
       offset_.push_back(offset_.back() + vec.size());
       float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end());
     } else if (ins.GetType()[0] == 'u') {  // uint64
-      if (!CheckUint64()) {return;}
+      CheckUint64();
       auto& vec = ins.GetUint64Data();
       offset_.push_back(offset_.back() + vec.size());
       uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end());
@@ -284,27 +286,24 @@ class MultiSlotType {
     return type_;
   }
  private:
-  bool CheckType(std::string& type) {
+  void CheckType(std::string& type) {
     if (type != "uint64" && type != "float") {
       // check in here
       LOG(ERROR) << "error: there is no such type";
-      return false;
+      exit(-1);
     }
-    return true;
   }
-  bool CheckFloat() {
+  void CheckFloat() {
     if (type_[0] != 'f') {  // float
       LOG(ERROR) << "error: add " << type_ << " value to float slot";
-      return false;
+      exit(-1);
     }
-    return true;
   }
-  bool CheckUint64() {
+  void CheckUint64() {
     if (type_[0] != 'u') {  // uint64
       LOG(ERROR) << "error: add " << type_ << " value to uint64 slot";
-      return false;
+      exit(-1);
     }
-    return true;
   }
   std::vector<float> float_feasign_;
   std::vector<uint64_t> uint64_feasign_;
@@ -316,8 +315,8 @@ class MultiSlotDataFeed : public PrivateQueueDataFeed<std::vector<MultiSlotType>
  public:
   MultiSlotDataFeed() {}
   virtual ~MultiSlotDataFeed() {}
-  virtual void Init(paddle::DataFeedDesc& data_feed_desc);
-  // TODO: virtual bool CheckFile();
+  virtual void Init(paddle::framework::DataFeedDesc& data_feed_desc);
+  virtual bool CheckFile(const char* filename);
  protected:
   virtual void AddInstanceToInsVec(std::vector<MultiSlotType>& vec_ins,
                                    std::vector<MultiSlotType>& instance, int index);
@@ -325,39 +324,6 @@ class MultiSlotDataFeed : public PrivateQueueDataFeed<std::vector<MultiSlotType>
   virtual void PutToFeedVec(std::vector<MultiSlotType>& ins_vec);
 };
-// TODO: to be deleted
-class TextClassDataFeed : public DataFeed {
- public:
-  virtual ~TextClassDataFeed() {}
-  virtual void Init(paddle::DataFeedDesc& data_feed_desc) {}
-  virtual bool Start() {return false;}  // TODO
-  virtual bool Next() {return false;}  // TODO
-  virtual bool ReadBatch() {return false;}
-  virtual void AddFeedVar(Variable* feed, const std::string& name) {}
-  virtual void BindScope(Scope* scope) {}
-  virtual bool SetFile(const char* filename) {return false;}
-  virtual bool CheckFile(const char* filename) {
-    // TODO(xxx)
-    return false;
-  }
-  void SetBatchSize(int batch) {batch_size_ = batch;}
- private:
-  int ReadWholeFile(const std::string& filename, char* buffer) {return -1;}
-  char* file_content_buffer_;
-  char* file_content_buffer_ptr_;
-  int* batch_id_buffer_;
-  int* label_ptr_;
-  int file_size_;
-  std::vector<std::string> names_;
-  std::shared_ptr<char> file_content_buffer_host_;
-  std::shared_ptr<int> batch_id_host_;
-  std::shared_ptr<int> label_host_;
-};
 }  // namespace framework
 }  // namespace paddle
...
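The CheckInit/CheckSetFileList/CheckStart guards above pin every reader to a fixed life cycle. A minimal sketch of that order, using only the interface visible in this diff (the desc, scope, and file name are assumed to come from elsewhere):

    #include "paddle/fluid/framework/data_feed.h"

    void DriveReader(paddle::framework::DataFeedDesc& desc,
                     paddle::framework::Scope* scope) {
      paddle::framework::MultiSlotDataFeed reader;
      reader.Init(desc);                        // 1) parse slot layout from the desc
      reader.SetFileList({"data/part-00000"});  // 2) required before Start()
      for (const auto& name : reader.GetUseSlotAlias()) {
        reader.AddFeedVar(scope->Var(name), name);  // bind feed memory to the scope
      }
      reader.Start();                           // 3) spawns the detached ReadThread
      while (reader.Next()) {                   // 4) consume one batch at a time
        // run ops against the bound feed variables here
      }
    }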
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 syntax = "proto2";
-package paddle;
+package paddle.framework;
 message DataFeedDesc {
   optional string name = 1;
@@ -28,5 +28,5 @@ message Slot {
   required string name = 1;
   required string type = 2;
   optional bool dense = 3 [default = false];
-  optional bool use = 4 [default = true];
+  optional bool use = 4 [default = false];
 }
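Flipping the default of `use` to false means a slot now has to opt in to being fed to the model. A hypothetical text-format DataFeedDesc showing that (the `slots` field name inside MultiSlotDesc is inferred from the multi_slot_desc()/slots_size() accessors used in data_feed.cc):

    #include "google/protobuf/text_format.h"
    #include "paddle/fluid/framework/data_feed.pb.h"

    bool MakeDemoDesc(paddle::framework::DataFeedDesc* desc) {
      // "MultiSlotDataFeed" matches the class name looked up in DataFeedFactory.
      const char* text =
          "name: 'MultiSlotDataFeed' "
          "multi_slot_desc { "
          "  slots { name: 'label' type: 'uint64' dense: true use: true } "
          "  slots { name: 'words' type: 'uint64' use: true } "
          "}";
      return google::protobuf::TextFormat::ParseFromString(text, desc);
    }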
@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/framework/data_feed_factory.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/data_feed.h"
 namespace paddle {
 namespace framework {
-typedef shared_ptr<DataFeed> (*Createdata_feedFunction)();
+typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
 typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
 data_feedMap g_data_feed_map;
 #define REGISTER_DATAFEED_CLASS(data_feed_class) \
   namespace { \
-  shared_ptr<DataFeed> Creator_##data_feed_class() { \
-    return shared_ptr<DataFeed>(new data_feed_class); \
+  std::shared_ptr<DataFeed> Creator_##data_feed_class() { \
+    return std::shared_ptr<DataFeed>(new data_feed_class); \
   } \
   class __Registerer_##data_feed_class { \
    public: \
@@ -35,8 +40,8 @@ data_feedMap g_data_feed_map;
   }  // namespace
-string DataFeedFactory::DataFeedTypeList() {
-  string data_feed_types;
+std::string DataFeedFactory::DataFeedTypeList() {
+  std::string data_feed_types;
   for (auto iter = g_data_feed_map.begin();
        iter != g_data_feed_map.end(); ++iter) {
     if (iter != g_data_feed_map.begin()) {
@@ -47,9 +52,9 @@ string DataFeedFactory::DataFeedTypeList() {
   return data_feed_types;
 }
-shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
-    const char* data_feed_class) {
-  if (g_data_feed_map.count(string(data_feed_class)) < 1) {
+std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
+    std::string data_feed_class) {
+  if (g_data_feed_map.count(data_feed_class) < 1) {
     exit(-1);
   }
   return g_data_feed_map[data_feed_class]();
...
@@ -16,14 +16,15 @@ limitations under the License. */
#define PADDLE_FLUID_FRAMEWORK_DATA_FEED_FACTORY_H_
#include <string>
-#include "paddle/framework/data_feed.h"
+#include <memory>
+#include "paddle/fluid/framework/data_feed.h"
namespace paddle {
namespace framework {
class DataFeedFactory {
public:
static std::string DataFeedTypeList();
-static shared_ptr<DataFeed> CreateDataFeed(const char* data_feed_class);
+static std::shared_ptr<DataFeed> CreateDataFeed(std::string data_feed_class);
};
} // namespace framework
} // namespace paddle
...
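Given the factory declared above, creating a reader is a name lookup in g_data_feed_map; note that CreateDataFeed currently calls exit(-1) for an unregistered name instead of reporting an error. A usage sketch (any class registered via REGISTER_DATAFEED_CLASS would resolve; "TextLineDataFeed" is the hypothetical reader sketched earlier):

std::shared_ptr<paddle::framework::DataFeed> reader =
    paddle::framework::DataFeedFactory::CreateDataFeed("TextLineDataFeed");
reader->SetBatchSize(32);
// DataFeedTypeList() joins the registered class names, useful for diagnostics:
std::string available = paddle::framework::DataFeedFactory::DataFeedTypeList();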
@@ -93,6 +93,11 @@ void ExecutorThreadWorker::CreateThreadResource(
void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) {
auto& block = program.Block(0);
PADDLE_ENFORCE_NOT_NULL(
root_scope_,
"root_scope should be set before creating thread scope");
thread_scope_ = &root_scope_->NewScope();
for (auto& var : block.AllVars()) {
if (var->Persistable()) {
@@ -107,17 +112,24 @@ void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) {
void ExecutorThreadWorker::SetDataFeed(
const std::shared_ptr<DataFeed>& datafeed) {
-local_reader_ = datafeed;
+thread_reader_ = datafeed;
}
void ExecutorThreadWorker::BindingDataFeedMemory() {
const std::vector<std::string>& input_feed =
thread_reader_->GetUseSlotAlias();
for (auto name : input_feed) {
-local_reader_->AddFeedVar(thread_scope_->Var(name), name);
+thread_reader_->AddFeedVar(thread_scope_->Var(name), name);
}
}
void ExecutorThreadWorker::SetFetchVarNames(
const std::vector<std::string>& fetch_var_names) {
fetch_var_names_.clear();
fetch_var_names_.insert(fetch_var_names_.end(),
fetch_var_names.begin(), fetch_var_names.end());
}
void ExecutorThreadWorker::SetDevice() {
// at most 48 threads binding currently
static unsigned priority[] = {
@@ -156,12 +168,28 @@ void ExecutorThreadWorker::SetDevice() {
void ExecutorThreadWorker::TrainFiles() {
// todo: configurable
SetDevice();
int fetch_var_num = fetch_var_names_.size();
fetch_values_.clear();
fetch_values_.resize(fetch_var_num, 0);
thread_reader_->Start();
-while (int cur_batch = thread_reader_->Next()) {
+int cur_batch;
+while ((cur_batch = thread_reader_->Next()) > 0) {
// executor run here
for (auto& op : ops_) {
op->Run(*thread_scope_, place_);
}
float avg_inspect = 0.0;
for (int i = 0; i < fetch_var_num; ++i) {
avg_inspect = thread_scope_->FindVar(fetch_var_names_[i])
->GetMutable<LoDTensor>()
->data<float>()[0];
fetch_values_[i] += avg_inspect;
}
thread_scope_->DropKids();
}
}
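TrainFiles() therefore leaves one accumulated value per fetch variable in fetch_values_. A sketch of how a driver thread could wire this up, assuming ops_, root_scope_ and place_ are prepared elsewhere and reader is a std::shared_ptr<DataFeed> (the variable names here are illustrative, not from this commit):

ExecutorThreadWorker worker;
worker.SetDataFeed(reader);
worker.SetFetchVarNames({"mean_cost"});  // hypothetical fetch variable name
worker.BindingDataFeedMemory();          // bind feed slots into the thread scope
worker.TrainFiles();                     // runs ops_ batch by batch
std::vector<float>& fetched = worker.GetFetchValues();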
...
@@ -43,6 +43,9 @@ class ExecutorThreadWorker {
void SetDevice();
void BindingDataFeedMemory();
void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
void TrainFiles();
void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
std::vector<float>& GetFetchValues() {return fetch_values_;}
private:
void CreateThreadScope(const framework::ProgramDesc& program);
@@ -66,9 +69,13 @@ class ExecutorThreadWorker {
Scope* root_scope_;
// a thread scope; its parent is the global scope, which is shared
Scope* thread_scope_;
private:
std::vector<std::string> fetch_var_names_;
std::vector<float> fetch_values_;
};
} // namespace framework
} // namespace paddle
-#endif  // PADDLE_FLUID_FRAMEWORK_ASYNC_EXECUTOR_H_
+#endif  // PADDLE_FLUID_FRAMEWORK_EXECUTOR_THREAD_WORKER_H_
/* vim: set expandtab ts=2 sw=2 sts=2 tw=100: */
@@ -15,7 +15,7 @@ cc_library(inference_io SRCS io.cc)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api
SRCS io.cc
-DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
...
@@ -45,7 +45,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-"elementwise_add", "dropout", "split"});
+"elementwise_add", "dropout", "split", "prelu", "conv2d_transpose"});
if (!node->IsOp()) return false;
if (teller_set.count(node->Op()->Type())) {
...
@@ -549,4 +549,6 @@ USE_TRT_CONVERTER(concat);
USE_TRT_CONVERTER(dropout);
USE_TRT_CONVERTER(pad);
USE_TRT_CONVERTER(split);
USE_TRT_CONVERTER(prelu);
USE_TRT_CONVERTER(conv2d_transpose);
#endif
-nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context)
+nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
add_subdirectory(plugin)
...
@@ -2,35 +2,38 @@
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
-pad_op.cc split_op.cc
+pad_op.cc split_op.cc prelu_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
-${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
+${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter)
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op SERIAL)
nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op SERIAL)
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine elementwise_add_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL)
nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine batch_norm_op SERIAL)
nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine concat_op SERIAL)
nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine dropout_op SERIAL)
nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL)
nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
split_op concat_op SERIAL)
nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
prelu_op SERIAL)
@@ -18,92 +18,139 @@ namespace paddle {
namespace inference {
namespace tensorrt {
-bool to_skip_merging_optimize(TensorRTEngine* engine_,
+bool to_skip_merging_optimize(TensorRTEngine* engine,
const std::vector<int>& filters,
const std::vector<int>& strides,
const std::vector<int>& paddings,
std::string input_name) {
-if (engine_->itensor_quote_num[input_name] > 0) {
+if (engine->itensor_quote_num[input_name] > 0) {
return true;
}
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
-engine_->itensor_quote_num[input_name] += 1;
+engine->itensor_quote_num[input_name] += 1;
return false;
}
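// Editorial reading of the helper above (not a comment from the commit):
// to_skip_merging_optimize appears to return true once input_name has been
// referenced before, and a 1x1, stride-1, unpadded conv registers such a
// reference; the callers below use a true result to DeclareOutput the tensor,
// keeping it out of TensorRT's layer-merging optimization.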
template <typename RegistFunc, typename SetDilationFunc>
void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode,
RegistFunc fadd_layer, SetDilationFunc fset_dilation,
const std::string& name) {
VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias";
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
PADDLE_ENFORCE(engine != nullptr);
auto* X = engine->GetITensor(op_desc.Input("Input").front());
// Declare weights
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
platform::CPUPlace cpu_place;
std::unique_ptr<framework::LoDTensor> weight_tensor(
new framework::LoDTensor());
weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
const int n_output = weight_tensor->dims()[0];
const int n_input = weight_tensor->dims()[1];
const int filter_h = weight_tensor->dims()[2];
const int filter_w = weight_tensor->dims()[3];
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
const std::vector<int> dilations =
boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<size_t>(weight_tensor->numel())};
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input,
nv_ksize, weight, bias);
PADDLE_ENFORCE(layer != nullptr);
layer->setStride(nv_strides);
layer->setPadding(nv_paddings);
layer->setNbGroups(groups);
// set dilations
fset_dilation(layer, nv_dilations);
auto output_name = op_desc.Output("Output").front();
layer->setName((name + " (Output: " + output_name + ")").c_str());
engine->weight_map[op_desc.Input("Filter").front()] =
std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str());
engine->SetITensor(output_name, layer->getOutput(0));
if (test_mode ||
to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
op_desc.Input("Input").front())) {
engine->DeclareOutput(output_name);
}
}
class Conv2dOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
-VLOG(3) << "convert a fluid conv2d op to tensorrt conv layer without bias";
-framework::OpDesc op_desc(op, nullptr);
-PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
-PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
-PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
-auto* X = engine_->GetITensor(op_desc.Input("Input").front());
-// Declare weights
-auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
-PADDLE_ENFORCE_NOT_NULL(Y_v);
-auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-platform::CPUPlace cpu_place;
-std::unique_ptr<framework::LoDTensor> weight_tensor(
-new framework::LoDTensor());
-weight_tensor->Resize(Y_t->dims());
-TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
-auto* weight_data =
-weight_tensor->mutable_data<float>(platform::CPUPlace());
-PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
-const int n_output = weight_tensor->dims()[0];
-const int filter_h = weight_tensor->dims()[2];
-const int filter_w = weight_tensor->dims()[3];
-const int groups = boost::get<int>(op_desc.GetAttr("groups"));
-const std::vector<int> dilations =
-boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
-const std::vector<int> strides =
-boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-const std::vector<int> paddings =
-boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
-nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
-nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
-nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
-TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
-static_cast<void*>(weight_data),
-weight_tensor->memory_size() / sizeof(float)};
-TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
-auto* layer = TRT_ENGINE_ADD_LAYER(
-engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
-nv_ksize, weight.get(), bias.get());
-PADDLE_ENFORCE(layer != nullptr);
-layer->setStride(nv_strides);
-layer->setPadding(nv_paddings);
-layer->setDilation(nv_dilations);
-layer->setNbGroups(groups);
-auto output_name = op_desc.Output("Output").front();
-layer->setName(("conv2d (Output: " + output_name + ")").c_str());
-engine_->weight_map[op_desc.Input("Filter").front()] =
-std::move(weight_tensor);
-layer->getOutput(0)->setName(output_name.c_str());
-engine_->SetITensor(output_name, layer->getOutput(0));
-if (test_mode ||
-to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
-paddings, op_desc.Input("Input").front())) {
-engine_->DeclareOutput(output_name);
-}
+ConvertConv2d(
+engine_, op, scope, test_mode,
+[&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */
+int n_input, /* Conv input maps */
+nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
+TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
+auto* layer =
+TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output,
+ksize, weight.get(), bias.get());
+return layer;
+},
+[](nvinfer1::IConvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
+layer->setDilation(dilations);
+},
+"conv2d");
+}
+};
+
+class Deconv2dOpConverter : public OpConverter {
+public:
+void operator()(const framework::proto::OpDesc& op,
+const framework::Scope& scope, bool test_mode) override {
+ConvertConv2d(
+engine_, op, scope, test_mode,
+[&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */
+int n_input, /* Deconv output maps */
+nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
+TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
+auto* layer =
+TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input,
+ksize, weight.get(), bias.get());
+return layer;
+},
+[](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
+PADDLE_ENFORCE(
+dilations.d[0] == 1 && dilations.d[1] == 1,
+"Dilations must be (1, 1) for tensorRT, but given (%d, %d)",
+dilations.d[0], dilations.d[1]);
+},
+"conv2d_transpose");
}
};
@@ -112,3 +159,4 @@ class Conv2dOpConverter : public OpConverter {
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
REGISTER_TRT_OP_CONVERTER(conv2d_transpose, Deconv2dOpConverter);
@@ -34,7 +34,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
auto* X = engine_->GetITensor(op_desc.Input("X").front());
nvinfer1::Dims dims_x = X->getDimensions();
-PADDLE_ENFORCE(dims_x.nbDims >= 3);
+PADDLE_ENFORCE(dims_x.nbDims >= 3, "x dims expects at least 3, but %d is given.",
+               dims_x.nbDims);
auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* PRelu converter from fluid to tensorRT.
*/
class PReluOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert fluid prelu op to tensorrt prelu layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
int input_num = op_desc.Input("X").size();
PADDLE_ENFORCE(input_num == 1);
auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
// Get output
size_t output_num = op_desc.Output("Out").size();
PADDLE_ENFORCE(output_num == 1);
// Get attrs
std::string mode = boost::get<std::string>(op_desc.GetAttr("mode"));
// Get the alpha (weight) tensor
auto* alpha_var = scope.FindVar(op_desc.Input("Alpha")[0]);
PADDLE_ENFORCE_NOT_NULL(alpha_var);
auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();
platform::CUDAPlace place;
std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
new framework::LoDTensor());
alpha_tensor_device->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
// Transform alpha to TensorRTEngine::Weight
TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data),
alpha_tensor_device->numel());
PReluPlugin* plugin = new PReluPlugin(alpha_rt, mode);
nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin);
// keep alpha tensor to avoid releasing its memory
engine_->weight_map[op_desc.Input("Alpha")[0]] =
std::move(alpha_tensor_device);
std::string layer_name = "prelu (Output: ";
auto output_name = op_desc.Output("Out")[0];
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
layer_name += output_name;
if (test_mode) {
engine_->DeclareOutput(output_name);
}
layer->setName((layer_name + ")").c_str());
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(prelu, PReluOpConverter);
@@ -26,7 +26,7 @@ class SplitOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
-VLOG(40) << "convert a fluid split op to tensorrt split layer";
+VLOG(4) << "convert a fluid split op to tensorrt split layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
...
@@ -16,6 +16,9 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
USE_OP(conv2d);
USE_OP(conv2d_transpose);
namespace paddle {
namespace inference {
namespace tensorrt {
@@ -51,7 +54,37 @@ TEST(conv2d_op, test) {
validator.Execute(3);
}
TEST(conv2d_transpose_op, test) {
std::unordered_set<std::string> parameters({"deconv2d-Y"});
framework::Scope scope;
TRTConvertValidation validator(5, parameters, scope, 1 << 15);
validator.DeclInputVar("deconv2d-X", nvinfer1::Dims3(3, 5, 5));
validator.DeclParamVar("deconv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
validator.DeclOutputVar("deconv2d-Out", nvinfer1::Dims3(2, 5, 5));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("conv2d_transpose");
desc.SetInput("Input", {"deconv2d-X"});
desc.SetInput("Filter", {"deconv2d-Y"});
desc.SetOutput("Output", {"deconv2d-Out"});
const std::vector<int> strides({1, 1});
const std::vector<int> paddings({1, 1});
const std::vector<int> dilations({1, 1});
const int groups = 1;
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("dilations", dilations);
desc.SetAttr("groups", groups);
validator.SetOp(*desc.Proto());
validator.Execute(3);
}
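// Shape sanity check for the test above (an added note, not in the original
// commit): conv2d_transpose filters are laid out as (C_in, C_out, k_h, k_w),
// so the (3, 2, 3, 3) filter turns the (3, 5, 5) input into 2 channels, and
// H_out = (H_in - 1) * stride - 2 * pad + k_h = (5 - 1) * 1 - 2 * 1 + 3 = 5,
// which matches the declared (2, 5, 5) output.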
} // namespace tensorrt
} // namespace inference
} // namespace paddle
-USE_OP(conv2d);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(prelu_op, test_channel_wise) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(3, 1, 1));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("channel"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(prelu_op, test_element_wise) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims4(10, 3, 2, 2));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("element"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(prelu_op, test_scalar) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(1, 1, 1));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("all"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// USE_OP(prelu);
USE_CPU_ONLY_OP(prelu);
@@ -200,7 +200,8 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
Buffer &TensorRTEngine::buffer(const std::string &name) {
PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
auto it = buffer_sizes_.find(name);
-PADDLE_ENFORCE(it != buffer_sizes_.end());
+PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
+               name);
auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
return buffers_[slot_offset];
}
...
@@ -40,6 +40,7 @@ class TensorRTEngine : public EngineBase {
// Weight is model parameter.
class Weight {
public:
Weight() = default;
Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
w_.type = dtype;
w_.values = value;
...
-nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu DEPS enforce)
+nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <cassert>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535;
inline static int GET_NUM_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
__global__ void PReluChannelWiseKernel(const float *input, const float *alpha,
float *output, int channel,
size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float *out = output + offset;
float scale = alpha[blockIdx.x % channel];
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
__global__ void PReluElementWiseKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
const float *scale = alpha + offset;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale[i] * x;
}
}
__global__ void PReluScalarKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float scale = *alpha;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
static inline void PReluChannelWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, dims.d[0], spatial_size);
}
static inline void PReluElementWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
static inline void PReluScalar(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size, const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims,
int nbInputs) {
assert(nbInputs == 1);
assert(index < this->getNbOutputs());
nvinfer1::Dims const &input_dims = inputDims[0];
nvinfer1::Dims output_dims = input_dims;
return output_dims;
}
int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
// input dims is CHW.
const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]);
const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
float *output = reinterpret_cast<float **>(outputs)[0];
if (mode_ == "channel") {
PReluChannelWise(stream, input, alpha, output, batchSize, input_dims);
} else if (mode_ == "element") {
PReluElementWise(stream, input, alpha, output, batchSize, input_dims);
} else {
PReluScalar(stream, input, alpha, output, batchSize, input_dims);
}
return cudaGetLastError() != cudaSuccess;
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
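All three kernels above share the same launch shape: one block per (batch × channel) slice and CUDA_NUM_THREADS threads striding over the spatial elements, with CHECK_LT guarding the 65535-block grid limit. For reference, a host-side sketch of the channel-wise path (an added illustration, not part of the commit):

// CPU mirror of PReluChannelWiseKernel: block index b * channel + c maps to
// alpha[(b * channel + c) % channel] == alpha[c].
void PReluChannelWiseRef(const float* input, const float* alpha, float* output,
                         int batch_size, int channel, size_t spatial_size) {
  for (int b = 0; b < batch_size; ++b) {
    for (int c = 0; c < channel; ++c) {
      size_t offset = (b * channel + c) * spatial_size;
      for (size_t i = 0; i < spatial_size; ++i) {
        float x = input[offset + i];
        output[offset + i] = x > 0 ? x : alpha[c] * x;
      }
    }
  }
}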
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class PReluPlugin : public PluginTensorRT {
TensorRTEngine::Weight alpha_;
std::string mode_;
protected:
size_t getSerializationSize() override {
// return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
return 0;
}
// TRT will call this func when we need to serialize the configuration of
// tensorrt.
// It should not be called by users.
void serialize(void *buffer) override {
// serializeBase(buffer);
// SerializeValue(&buffer, alpha_);
// SerializeValue(&buffer, mode_);
}
public:
PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode)
: alpha_(alpha), mode_(mode) {}
// It is used for TensorRT deserialization.
// It should not be called by users.
PReluPlugin(void const *serialData, size_t serialLength) {
// deserializeBase(serialData, serialLength);
// DeserializeValue(&serialData, &serialLength, &alpha_);
// DeserializeValue(&serialData, &serialLength, &mode_);
}
PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); }
const char *getPluginType() const override { return "prelu"; }
int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
int nbInputDims) override;
int enqueue(int batchSize, const void *const *inputs, void **outputs,
void *workspace, cudaStream_t stream) override;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
-file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
+include(operators)
string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "")
set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
set(PART_CUDA_KERNEL_FILES)
function(op_library TARGET)
# op_library is a function to create op library. The interface is same as
# cc_library. But it handle split GPU/CPU code and link some common library
# for ops.
set(cc_srcs)
set(cu_srcs)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(pybind_flag 0)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
list(LENGTH op_library_SRCS op_library_SRCS_len)
if (${op_library_SRCS_len} EQUAL 0)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_AMD_GPU)
string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
endif()
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.hip.cu$")
list(APPEND hip_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
list(APPEND miopen_hip_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
endif()
endforeach()
endif()
list(LENGTH cc_srcs cc_srcs_len)
if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
endif()
if (WIN32)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op"
"fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return()
endif()
endforeach()
endif(WIN32)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
-list(LENGTH op_library_DEPS op_library_DEPS_len)
-if (${op_library_DEPS_len} GREATER 0)
-set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
-endif()
-if (WITH_GPU)
+# clean cache and pybind_file content first when rebuild
+unset(GLOB_OP_LIB CACHE)
+unset(OP_LIBRARY CACHE)
+set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h CACHE INTERNAL "pybind.h file")
+file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
endforeach()
# The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just adding one operator to pybind in a *_op.cc file.
# And for detail pybind information, please see generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
endif()
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
# NOTE(*): activation use macro to regist the kernels, set use_op manually.
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "fake_quantize")
file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
endif()
endfunction()
add_subdirectory(math)
-if (NOT WIN32)
-add_subdirectory(nccl)
-if(WITH_GPU)
-op_library(nccl_op DEPS nccl_common)
-file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
-else()
-set(DEPS_OPS ${DEPS_OPS} nccl_op)
-endif()
-endif() # NOT WIN32
+add_subdirectory(controlflow)
+add_subdirectory(csp)
+add_subdirectory(detection)
+add_subdirectory(elementwise)
+add_subdirectory(fused)
+add_subdirectory(metrics)
+add_subdirectory(optimizers)
+add_subdirectory(reduce_ops)
+add_subdirectory(sequence_ops)
set(DISTRIBUTE_DEPS "")
if(WITH_DISTRIBUTE)
add_subdirectory(distributed)
-set(DISTRIBUTE_DEPS "")
-if(WITH_GRPC)
+add_subdirectory(distributed_ops)
+endif()
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
else()
set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
if(WITH_BRPC_RDMA)
find_library(IBVERBS_LIBRARY NAMES ibverbs)
ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
find_library(RDMACM_LIBRARY NAMES rdmacm)
ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm)
endif()
endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
foreach(dist_op "prefetch_op" "checkpoint_notify_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op")
op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endforeach()
-#set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-#cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
-# listen_and_serv_op sum_op executor SERIAL)
+if (NOT WIN32)
+add_subdirectory(reader)
if(WITH_GPU AND NOT WIN32)
set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL)
if(WITH_GRPC)
op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
else()
op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_brpc)
endif()
set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
endif() # WITH_GPU AND NOT WIN32
else()
set(DEPS_OPS ${DEPS_OPS} checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
endif()
-op_library(cross_entropy_op DEPS cross_entropy)
-if(WITH_GPU)
+if (NOT WIN32)
+add_subdirectory(nccl)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax cub)
op_library(sequence_softmax_op DEPS cub)
else()
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
endif()
op_library(softmax_op DEPS softmax)
if (WITH_GPU AND TENSORRT_FOUND)
-op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
+add_subdirectory(tensorrt)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n")
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op
analysis)
else()
set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
endif()
op_library(hash_op DEPS xxhash)
-op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows)
-op_library(sum_op DEPS selected_rows_functor)
-op_library(sgd_op DEPS selected_rows_functor)
+register_operators(EXCLUDES warpctc_op)
+
+# warpctc_cudnn need cudnn 7 above
op_library(print_op DEPS lod_tensor)
op_library(adagrad_op DEPS selected_rows_functor)
op_library(maxout_op DEPS maxouting)
op_library(unpool_op DEPS unpooling)
op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op DEPS lod_rank_table)
op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
if (NOT WIN32)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
endif(NOT WIN32)
op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor)
op_library(unsqueeze_op DEPS reshape_op)
op_library(squeeze_op DEPS reshape_op)
op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory)
if (NOT WIN32)
op_library(crf_decoding_op DEPS jit_kernel)
op_library(fusion_lstm_op DEPS jit_kernel)
endif(NOT WIN32)
if (WITH_GPU)
-op_library(conv_op DEPS vol2col depthwise_conv im2col)
-op_library(layer_norm_op DEPS cub)
-op_library(reduce_mean_op DEPS cub)
-op_library(affine_channel_op DEPS cub)
+if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
+else()
+op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
+endif()
else()
-op_library(conv_op DEPS vol2col im2col)
+op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()
op_library(conv_transpose_op DEPS vol2col im2col)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor)
op_library(concat_op DEPS concat_and_split)
op_library(tensor_array_to_tensor_op DEPS concat_op)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
endforeach()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
set(COMMON_OP_DEPS "")
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
if (NOT WIN32)
-add_subdirectory(reader)
-endif(NOT WIN32)
-foreach(src ${READER_LIBRARY})
-set(OP_LIBRARY ${src} ${OP_LIBRARY})
-endforeach()
-add_subdirectory(detection)
-foreach(src ${DETECTION_LIBRARY})
-set(OP_LIBRARY ${src} ${OP_LIBRARY})
-endforeach()
-set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
-set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency")
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
+endif()
+if (WITH_GPU)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)
+endif()
+# FIXME(typhoonzero): operator deps may not needed.
+# op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
+# op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
+# op_library(unsqueeze_op DEPS reshape_op)
+# op_library(squeeze_op DEPS reshape_op)
+# op_library(flatten_op DEPS reshape_op)
+# op_library(unstack_op DEPS stack_op)
+# op_library(tensor_array_to_tensor_op DEPS concat_op)
+set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS})
+set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")
cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
@@ -362,18 +76,6 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
if(NOT WIN32)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
-if(WITH_GPU)
+set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
foreach(CUDA_KERNEL_FILE ${PART_CUDA_KERNEL_FILES})
file(READ ${CUDA_KERNEL_FILE} TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\(\\n?([^,]+),.*" MATCHED ${TARGET_CONTENT})
if (MATCHED)
string(STRIP ${CMAKE_MATCH_1} MATCHED)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n")
endif()
endforeach()
endif()
include(operators)
register_operators()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/compare_op.h"
+#include "paddle/fluid/operators/controlflow/compare_op.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/compare_op.h"
+#include "paddle/fluid/operators/controlflow/compare_op.h"
REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
...
@@ -16,7 +16,7 @@ limitations under the License. */
#include <math.h>
#include <type_traits>
#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/logical_op.h"
+#include "paddle/fluid/operators/controlflow/logical_op.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/logical_op.h"
+#include "paddle/fluid/operators/controlflow/logical_op.h"
REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA,
                               paddle::operators::LogicalAndFunctor);
...
include(operators)
register_operators()
@@ -40,4 +40,8 @@ endif()
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
#Export local libraries to parent
-set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
+# set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
+foreach(src ${LOCAL_DETECTION_LIBS})
+  set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")
+endforeach()
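The PARENT_SCOPE export above is commented out in favor of writing each detection library straight into the OP_LIBRARY cache variable; a CACHE INTERNAL set is visible in every directory of the build, so no intermediate CMakeLists has to re-export the list. A toy sketch of the pattern, with made-up target names:

# toy sketch, names are illustrative only
set(OP_LIBRARY "" CACHE INTERNAL "op libs")                    # top level, once
set(OP_LIBRARY foo_op ${OP_LIBRARY} CACHE INTERNAL "op libs")  # in subdir a
set(OP_LIBRARY bar_op ${OP_LIBRARY} CACHE INTERNAL "op libs")  # in subdir b
message(STATUS "OP_LIBRARY = ${OP_LIBRARY}")                   # prints: bar_op;foo_op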
include(operators)
set(DISTRIBUTE_DEPS "")
if(WITH_GRPC)
  set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
else()
  set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
  if(WITH_BRPC_RDMA)
    find_library(IBVERBS_LIBRARY NAMES ibverbs)
    ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
    SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
    find_library(RDMACM_LIBRARY NAMES rdmacm)
    ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
    SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
    set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm)
  endif()
endif()
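find_library plus ADD_LIBRARY(... SHARED IMPORTED GLOBAL) wraps prebuilt system libraries so later targets can link them by name like any other CMake target. A hedged sketch of a consumer, where the executable name and its source file are hypothetical:

# hypothetical consumer of the imported targets declared above
add_executable(rdma_probe rdma_probe.cc)          # rdma_probe.cc is assumed to exist
target_link_libraries(rdma_probe ibverbs rdmacm)  # resolves to the IMPORTED_LOCATION paths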
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
list(REMOVE_DUPLICATES OPS)
foreach(src ${OPS})
  set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endforeach()
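set_source_files_properties scopes the relaxed non-virtual-dtor warning flags to the globbed *_op.cc files rather than to a whole target. A purely illustrative check that reads the property back for the first globbed source:

# illustrative only: confirm the per-source property was applied
list(GET OPS 0 FIRST_OP_SRC)
get_source_file_property(FIRST_OP_FLAGS ${FIRST_OP_SRC} COMPILE_FLAGS)
message(STATUS "${FIRST_OP_SRC}: ${FIRST_OP_FLAGS}")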
register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS})
if(WITH_GPU AND NOT WIN32)
  set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common)
  op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common)
endif()
set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE)
set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency")
@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h"
-#include "paddle/fluid/operators/send_recv_util.h"
+#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
...
@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/operators/listen_and_serv_op.h"
+#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send");
DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get");
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/merge_ids_op.h"
+#include "paddle/fluid/operators/distributed_ops/merge_ids_op.h"
namespace paddle {
namespace operators {
...
@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h"
-#include "paddle/fluid/operators/send_recv_util.h"
+#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
namespace paddle {
namespace operators {
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/ref_by_trainer_id_op.h"
+#include "paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h"
#include <string>
namespace paddle {
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/ref_by_trainer_id_op.h"
+#include "paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h"
REGISTER_OP_CUDA_KERNEL(
    ref_by_trainer_id,
...
@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h"
-#include "paddle/fluid/operators/send_recv_util.h"
+#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
...
@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/listen_and_serv_op.h"
+#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/string/printf.h"
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/split_byref_op.h"
+#include "paddle/fluid/operators/distributed_ops/split_byref_op.h"
#include "paddle/fluid/operators/split_op.h"
namespace paddle {
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/split_byref_op.h"
+#include "paddle/fluid/operators/distributed_ops/split_byref_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    split_byref,
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/split_ids_op.h"
+#include "paddle/fluid/operators/distributed_ops/split_ids_op.h"
namespace paddle {
namespace operators {
...
@@ -22,14 +22,14 @@ limitations under the License. */
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/operators/listen_and_serv_op.h"
+#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/string/printf.h"
#ifdef PADDLE_WITH_GRPC
-#include "paddle/fluid/operators/send_recv_util.h"
+#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#endif
USE_NO_KERNEL_OP(listen_and_serv);
...
include(operators)
register_operators()
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/elementwise_add_op.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/elementwise_add_op.h"
-#include "paddle/fluid/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators;
REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out",
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
-#include "paddle/fluid/operators/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
...
@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/elementwise_div_op.h"
-#include "paddle/fluid/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y");
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
-#include "paddle/fluid/operators/elementwise_div_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
namespace ops = paddle::operators;
...
@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
-#include "paddle/fluid/operators/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace operators {
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/elementwise_max_op.h"
-#include "paddle/fluid/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_max, "Max", "Out = max(X, Y)");
REGISTER_OP_CPU_KERNEL(
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
-#include "paddle/fluid/operators/elementwise_max_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
namespace ops = paddle::operators;
...
@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
-#include "paddle/fluid/operators/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace operators {
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/elementwise_min_op.h"
-#include "paddle/fluid/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_min, "Min", "Out = min(X, Y)");
REGISTER_OP_CPU_KERNEL(
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
-#include "paddle/fluid/operators/elementwise_min_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
namespace ops = paddle::operators;
...
@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
-#include "paddle/fluid/operators/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace operators {
...
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/elementwise_mul_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include <string>
-#include "paddle/fluid/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
-#include "paddle/fluid/operators/elementwise_mul_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
namespace ops = paddle::operators;
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
-#include "paddle/fluid/operators/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
...
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/elementwise_pow_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
#include <string>
-#include "paddle/fluid/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
...
@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
-#include "paddle/fluid/operators/elementwise_pow_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
namespace ops = paddle::operators;
...
@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <cmath>
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace operators {
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/elementwise_sub_op.h"
-#include "paddle/fluid/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators;
REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out",
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
-#include "paddle/fluid/operators/elementwise_sub_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
namespace ops = paddle::operators;
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
-#include "paddle/fluid/operators/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace operators {
...
@@ -47,6 +47,11 @@ class ExpandOp : public framework::OperatorWithKernel {
       out_shape[i] = x_dims[i] * expand_times[i];
     }
+    // set the first dim to -1 in compile time
+    if (!ctx->IsRuntime()) {
+      out_shape[0] = x_dims[0];
+    }
     ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
     if (out_shape[0] == x_dims[0]) {
       ctx->ShareLoD("X", "Out");
@@ -109,7 +114,16 @@ class ExpandGradOp : public framework::OperatorWithKernel {
         ctx->Attrs().Get<std::vector<int>>("expand_times");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    for (size_t i = 0; i < expand_times.size(); ++i) {
+    size_t start_pos = 0u;
+    if (!ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(
+          x_dims[0], out_dims[0],
+          "The first dimension size of Input(Out@GRAD) should be "
+          "equal to the corresponding dimension size of Input(X)");
+      start_pos = 1u;
+    }
+    for (size_t i = start_pos; i < expand_times.size(); ++i) {
       PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
                         "Each dimension size of Input(Out@GRAD) should be "
                         "equal to multiplication of corresponding dimension "
...
include(operators)
register_operators()
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
+#include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h"
namespace paddle {
namespace operators {
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
+#include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
...
... (ten further file diffs are collapsed in this view)