Commit 98bbfc17 authored by Yu Yang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into rewrite_allocation

test=develop
...@@ -315,7 +315,6 @@ endif()
if (ON_INFER)
message(STATUS "On inference mode, will take place some specific optimization.")
add_definitions(-DPADDLE_ON_INFERENCE)
else()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
......
...@@ -218,3 +218,7 @@ endif(WITH_GRPC)
if(WITH_BRPC_RDMA)
add_definitions(-DPADDLE_WITH_BRPC_RDMA)
endif(WITH_BRPC_RDMA)
if(ON_INFER)
add_definitions(-DPADDLE_ON_INFERENCE)
endif(ON_INFER)
set(PART_CUDA_KERNEL_FILES)
function(op_library TARGET)
# op_library is a function to create an op library. The interface is the same as
# cc_library, but it handles splitting GPU/CPU code and linking some common
# libraries for ops.
set(cc_srcs)
set(cu_srcs)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(pybind_flag 0)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
list(LENGTH op_library_SRCS op_library_SRCS_len)
if (${op_library_SRCS_len} EQUAL 0)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_AMD_GPU)
string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
endif()
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.hip.cu$")
list(APPEND hip_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
list(APPEND miopen_hip_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
endif()
endforeach()
endif()
list(LENGTH cc_srcs cc_srcs_len)
if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
endif()
if (WIN32)
# Remove ops not supported on Windows, since Windows has no nccl, no warpctc, and similar dependencies.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return()
endif()
endforeach()
endif(WIN32)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs")
list(LENGTH op_library_DEPS op_library_DEPS_len)
if (${op_library_DEPS_len} GREATER 0)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
endforeach()
# For the registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to add just one operator to pybind in a *_op.cc file.
# And for detail pybind information, please see generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
endif()
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
# NOTE(*): activation uses macros to register the kernels, so set USE_OP manually.
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "fake_quantize")
file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only has mkldnn and cpu kernels, which would not match the cpu-only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
endif()
endfunction()
function(register_operators)
set(options "")
set(oneValueArgs "")
set(multiValueArgs EXCLUDES DEPS)
cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)
foreach(src ${OPS})
list(FIND register_operators_EXCLUDES ${src} _index)
if (${_index} EQUAL -1)
if (${register_operators_DEPS_len} GREATER 0)
op_library(${src} DEPS ${register_operators_DEPS})
else()
op_library(${src})
endif()
endif()
endforeach()
endfunction()
...@@ -93,11 +93,11 @@ paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized',
paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None))
paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False))
+paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False))
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
-paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None))
+paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
...@@ -128,6 +128,7 @@ paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates',
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.selu ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
......
...@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
......
...@@ -41,6 +41,7 @@ pass_library(seq_concat_fc_fuse_pass inference)
pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(is_test_pass base)
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base)
...@@ -62,6 +63,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <string>
#include <utility>
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it "
"for activations and pooling.";
auto op_list = {"pool2d", "sigmoid", "logsigmoid",
"softshrink", "exp", "brelu",
"pow", "leaky_relu", "stanh",
"relu", "tanh", "tanh_shrink",
"sqrt", "abs", "ceil",
"elu", "floor", "cos",
"sin", "round", "reciprocal",
"hard_shrink", "hard_sigmoid", "relu6",
"soft_relu", "swish", "thresholded_relu",
"log", "square", "softplus",
"softsign"};
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (op->HasAttr("is_test")) {
op->SetAttr("is_test", true);
} else if (std::find(begin(op_list), end(op_list), op->Type()) !=
end(op_list)) {
op->MutableAttrMap()->insert(
std::pair<std::string, Attribute>("is_test", true));
}
}
}
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(is_test_pass, paddle::framework::ir::IsTestPass);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class IsTestPass : public Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
enum class ISTEST_STATE { FALSE, TRUE, UNSET };
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn = false,
ISTEST_STATE is_test = ISTEST_STATE::UNSET) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("name", name);
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
op->SetAttr("use_mkldnn", use_mkldnn);
if (is_test == ISTEST_STATE::UNSET)
op->MutableAttrMap()->erase("is_test");
else if (is_test == ISTEST_STATE::FALSE)
op->SetAttr("is_test", false);
else
op->SetAttr("is_test", true);
}
// a->pool2d->b
// b->relu->c
// (c,weights1)->conv2d->d
//
// d->pool2d->e
// e->hard_sigmoid->f
// (f,weights2)->conv2d->g
//
// g->pool2d->h
// h->tanh->i
// (i,weights3)->conv2d->j
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h", "i",
"j", "weights1", "weights2", "weights3"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights1" || v == "weights2" || v == "weights3") {
var->SetPersistable(true);
}
}
SetOp(&prog, "pool2d", "pooling1", std::vector<std::string>({"a"}),
std::vector<std::string>({"b"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "relu", "activation1", std::vector<std::string>({"b"}),
std::vector<std::string>({"c"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "conv2d", "conv1", std::vector<std::string>({"c", "weights1"}),
std::vector<std::string>({"d"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "pool2d", "pooling2", std::vector<std::string>({"d"}),
std::vector<std::string>({"e"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "hard_sigmoid", "activation2", std::vector<std::string>({"e"}),
std::vector<std::string>({"f"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "conv2d", "conv2", std::vector<std::string>({"f", "weights2"}),
std::vector<std::string>({"g"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "pool2d", "pooling3", std::vector<std::string>({"g"}),
std::vector<std::string>({"h"}), false, ISTEST_STATE::UNSET);
SetOp(&prog, "tanh", "activation3", std::vector<std::string>({"h"}),
std::vector<std::string>({"i"}), true, ISTEST_STATE::UNSET);
SetOp(&prog, "conv2d", "conv3", std::vector<std::string>({"i", "weights3"}),
std::vector<std::string>({"j"}), false, ISTEST_STATE::UNSET);
return prog;
}
TEST(IsTestPass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("is_test_pass");
graph = pass->Apply(std::move(graph));
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv3") {
ASSERT_FALSE(op->HasAttr("is_test"));
} else {
ASSERT_TRUE(op->HasAttr("is_test"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
}
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(is_test_pass);
...@@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
}
void NaiveExecutor::Run() {
#ifndef PADDLE_ON_INFERENCE
LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set.";
LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation "
"overhead.";
LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference";
#endif // PADDLE_ON_INFERENCE
for (auto &op : ops_) {
VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
<< " on scope " << scope_;
......
...@@ -63,6 +63,8 @@ struct OpKernelType {
place_(dev_ctx.GetPlace()),
library_type_(library_type) {}
size_t hash_key() const { return Hash()(*this); }
bool operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
......
...@@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle {
namespace framework {
// Combine two hash values to a single hash.
inline size_t CombineHash(size_t seed, size_t a) {
return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
...@@ -794,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack(
Scope* OperatorWithKernel::TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const {
// In the inference scenario, the scopes will be reused across the batches, so
// the `new_scope` here will result in GPU memory explosion over the running of
// operators.
// We use a thread_local cache to fix that issue; the key in the cache is the
// combination of the `scope` argument, from_kernel_type, and target_kernel_type.
// Have a discussion with @Superjomn or the inference developers before changing
// this logic, since the code guarded by this macro is not tested in the other
// scenarios.
#ifdef PADDLE_ON_INFERENCE
thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
#endif
Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) {
...@@ -824,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData(
VLOG(30) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
#ifdef PADDLE_ON_INFERENCE
size_t infer_cache_key =
CombineHash(OpKernelType::Hash()(kernel_type_for_var),
OpKernelType::Hash()(expected_kernel_key));
infer_cache_key =
CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
auto it = infer_transfer_scope_cache.find(infer_cache_key);
if (it != infer_transfer_scope_cache.end()) {
new_scope = infer_transfer_scope_cache[infer_cache_key];
} else {
new_scope = &scope.NewScope();
infer_transfer_scope_cache[infer_cache_key] = new_scope;
}
#endif
if (new_scope == nullptr) {
new_scope = &scope.NewScope();
}
auto* trans_var = new_scope->Var(var_name);
Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
SetTensorToVariable(*var, out, trans_var);
......
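To make the transfer-scope caching above easier to follow, here is a minimal, self-contained C++ sketch of how such a cache key can be built. It is not part of the commit: `from_kernel_hash`, `to_kernel_hash`, and `scope_stand_in` are made-up placeholders for the two `OpKernelType::Hash()` values and the `Scope` address that `TryTransferData` combines.

#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>

// Same mixing formula as the CombineHash() added above (boost::hash_combine style).
inline size_t CombineHash(size_t seed, size_t a) {
  return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

int main() {
  // Hypothetical stand-ins for the "from" and "to" kernel type hashes and the
  // input scope; the real code uses OpKernelType::Hash() and &scope.
  size_t from_kernel_hash = std::hash<std::string>()("CPUPlace/float/NCHW/plain");
  size_t to_kernel_hash = std::hash<std::string>()("CUDAPlace/float/NCHW/cudnn");
  int scope_stand_in = 0;

  // key = combine(combine(hash(from), hash(to)), hash(&scope))
  size_t key = CombineHash(from_kernel_hash, to_kernel_hash);
  key = CombineHash(key, std::hash<const int*>()(&scope_stand_in));

  // One cached transfer scope per (from, to, scope) triple, so repeated batches
  // reuse the same scope instead of allocating a new one on every run.
  std::unordered_map<size_t, std::string> transfer_scope_cache;
  transfer_scope_cache.emplace(key, "reused transfer scope");
  return 0;
}

In the actual change the map is `thread_local`, holds `Scope*` values, and is only compiled under `PADDLE_ON_INFERENCE`, so the training path keeps allocating a fresh transfer scope as before.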
...@@ -42,7 +42,7 @@ DEFINE_double(
// a mean time, but a scope may be read by multiple threads concurrently, and
// the mutex will cause serious performance issue.
// So the mutex is disabled when `ON_INFER`.
-#ifdef ON_INFER
+#ifdef PADDLE_ON_INFERENCE
#define SCOPE_LOCK_GUARD
#else
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
......
...@@ -13,7 +13,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api
SRCS io.cc
-DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
......
...@@ -45,7 +45,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-"elementwise_add", "dropout", "split"});
+"elementwise_add", "dropout", "split", "prelu", "conv2d_transpose"});
if (!node->IsOp()) return false;
if (teller_set.count(node->Op()->Type())) {
......
...@@ -549,4 +549,6 @@ USE_TRT_CONVERTER(concat);
USE_TRT_CONVERTER(dropout);
USE_TRT_CONVERTER(pad);
USE_TRT_CONVERTER(split);
USE_TRT_CONVERTER(prelu);
USE_TRT_CONVERTER(conv2d_transpose);
#endif
...@@ -86,6 +86,7 @@ class CpuPassStrategy : public PassStrategy {
"fc_fuse_pass",               //
"conv_bn_fuse_pass",          //
"conv_eltwiseadd_bn_fuse_pass",  //
"is_test_pass",               //
});
}
......
-nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context)
+nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
add_subdirectory(plugin)
......
...@@ -2,35 +2,38 @@
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
-pad_op.cc split_op.cc
+pad_op.cc split_op.cc prelu_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
-${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
+${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter)
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op SERIAL)
nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op SERIAL)
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine elementwise_add_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL)
nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine batch_norm_op SERIAL)
nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine concat_op SERIAL)
nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine dropout_op SERIAL)
nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL)
nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
-DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
+DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
split_op concat_op SERIAL)
nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
prelu_op SERIAL)
...@@ -18,92 +18,139 @@ namespace paddle {
namespace inference {
namespace tensorrt {
-bool to_skip_merging_optimize(TensorRTEngine* engine_,
+bool to_skip_merging_optimize(TensorRTEngine* engine,
const std::vector<int>& filters,
const std::vector<int>& strides,
const std::vector<int>& paddings,
std::string input_name) {
-if (engine_->itensor_quote_num[input_name] > 0) {
+if (engine->itensor_quote_num[input_name] > 0) {
return true;
}
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
-engine_->itensor_quote_num[input_name] += 1;
+engine->itensor_quote_num[input_name] += 1;
return false;
}
template <typename RegistFunc, typename SetDilationFunc>
void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode,
RegistFunc fadd_layer, SetDilationFunc fset_dilation,
const std::string& name) {
VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias";
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
PADDLE_ENFORCE(engine != nullptr);
auto* X = engine->GetITensor(op_desc.Input("Input").front());
// Declare weights
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
platform::CPUPlace cpu_place;
std::unique_ptr<framework::LoDTensor> weight_tensor(
new framework::LoDTensor());
weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
const int n_output = weight_tensor->dims()[0];
const int n_input = weight_tensor->dims()[1];
const int filter_h = weight_tensor->dims()[2];
const int filter_w = weight_tensor->dims()[3];
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
const std::vector<int> dilations =
boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<size_t>(weight_tensor->numel())};
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input,
nv_ksize, weight, bias);
PADDLE_ENFORCE(layer != nullptr);
layer->setStride(nv_strides);
layer->setPadding(nv_paddings);
layer->setNbGroups(groups);
// set dilations
fset_dilation(layer, nv_dilations);
auto output_name = op_desc.Output("Output").front();
layer->setName((name + " (Output: " + output_name + ")").c_str());
engine->weight_map[op_desc.Input("Filter").front()] =
std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str());
engine->SetITensor(output_name, layer->getOutput(0));
if (test_mode ||
to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
op_desc.Input("Input").front())) {
engine->DeclareOutput(output_name);
}
}
class Conv2dOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
-VLOG(3) << "convert a fluid conv2d op to tensorrt conv layer without bias";
-framework::OpDesc op_desc(op, nullptr);
-PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
-PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
-PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
-auto* X = engine_->GetITensor(op_desc.Input("Input").front());
-// Declare weights
-auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
-PADDLE_ENFORCE_NOT_NULL(Y_v);
-auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-platform::CPUPlace cpu_place;
-std::unique_ptr<framework::LoDTensor> weight_tensor(
-new framework::LoDTensor());
-weight_tensor->Resize(Y_t->dims());
-TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
-auto* weight_data =
-weight_tensor->mutable_data<float>(platform::CPUPlace());
-PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
-const int n_output = weight_tensor->dims()[0];
-const int filter_h = weight_tensor->dims()[2];
-const int filter_w = weight_tensor->dims()[3];
-const int groups = boost::get<int>(op_desc.GetAttr("groups"));
-const std::vector<int> dilations =
-boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
-const std::vector<int> strides =
-boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-const std::vector<int> paddings =
-boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
-nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
-nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
-nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
-TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
-static_cast<void*>(weight_data),
-weight_tensor->memory_size() / sizeof(float)};
-TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
-auto* layer = TRT_ENGINE_ADD_LAYER(
-engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
-nv_ksize, weight.get(), bias.get());
-PADDLE_ENFORCE(layer != nullptr);
-layer->setStride(nv_strides);
-layer->setPadding(nv_paddings);
-layer->setDilation(nv_dilations);
-layer->setNbGroups(groups);
-auto output_name = op_desc.Output("Output").front();
-layer->setName(("conv2d (Output: " + output_name + ")").c_str());
-engine_->weight_map[op_desc.Input("Filter").front()] =
-std::move(weight_tensor);
-layer->getOutput(0)->setName(output_name.c_str());
-engine_->SetITensor(output_name, layer->getOutput(0));
-if (test_mode ||
-to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
-paddings, op_desc.Input("Input").front())) {
-engine_->DeclareOutput(output_name);
-}
+ConvertConv2d(
+engine_, op, scope, test_mode,
+[&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */
+int n_input, /* Conv input maps */
+nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
+TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
+auto* layer =
+TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output,
+ksize, weight.get(), bias.get());
+return layer;
+},
+[](nvinfer1::IConvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
+layer->setDilation(dilations);
+},
+"conv2d");
+}
+};
+
+class Deconv2dOpConverter : public OpConverter {
+public:
+void operator()(const framework::proto::OpDesc& op,
+const framework::Scope& scope, bool test_mode) override {
+ConvertConv2d(
+engine_, op, scope, test_mode,
+[&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */
+int n_input, /* Deconv output maps */
+nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
+TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
+auto* layer =
+TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input,
+ksize, weight.get(), bias.get());
+return layer;
+},
+[](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
+PADDLE_ENFORCE(
+dilations.d[0] == 1 && dilations.d[1] == 1,
+"Dilations must be (1, 1) for tensorRT, but given (%d, %d)",
+dilations.d[0], dilations.d[1]);
+},
+"conv2d_transpose");
}
};
...@@ -112,3 +159,4 @@ class Conv2dOpConverter : public OpConverter {
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
REGISTER_TRT_OP_CONVERTER(conv2d_transpose, Deconv2dOpConverter);
...@@ -34,7 +34,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
auto* X = engine_->GetITensor(op_desc.Input("X").front());
nvinfer1::Dims dims_x = X->getDimensions();
-PADDLE_ENFORCE(dims_x.nbDims >= 3);
+PADDLE_ENFORCE(dims_x.nbDims >= 3, "x dims expects 3, but %d is given.",
+               dims_x.nbDims);
auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* PRelu converter from fluid to tensorRT.
*/
class PReluOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert fluid prelu op to tensorrt prelu layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
int input_num = op_desc.Input("X").size();
PADDLE_ENFORCE(input_num == 1);
auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
// Get output
size_t output_num = op_desc.Output("Out").size();
PADDLE_ENFORCE(output_num == 1);
// Get attrs
std::string mode = boost::get<std::string>(op_desc.GetAttr("mode"));
//
auto* alpha_var = scope.FindVar(op_desc.Input("Alpha")[0]);
PADDLE_ENFORCE_NOT_NULL(alpha_var);
auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();
platform::CUDAPlace place;
std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
new framework::LoDTensor());
alpha_tensor_device->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
// Transform alpha to TensorRTEngine::Weight
TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data),
alpha_tensor_device->numel());
PReluPlugin* plugin = new PReluPlugin(alpha_rt, mode);
nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin);
// keep alpha tensor to avoid releasing its memory
engine_->weight_map[op_desc.Input("Alpha")[0]] =
std::move(alpha_tensor_device);
std::string layer_name = "prelu (Output: ";
auto output_name = op_desc.Output("Out")[0];
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
layer_name += output_name;
if (test_mode) {
engine_->DeclareOutput(output_name);
}
layer->setName((layer_name + ")").c_str());
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(prelu, PReluOpConverter);
...@@ -26,7 +26,7 @@ class SplitOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
-VLOG(40) << "convert a fluid split op to tensorrt split layer";
+VLOG(4) << "convert a fluid split op to tensorrt split layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
......
...@@ -16,6 +16,9 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
USE_OP(conv2d);
USE_OP(conv2d_transpose);
namespace paddle {
namespace inference {
namespace tensorrt {
...@@ -51,7 +54,37 @@ TEST(conv2d_op, test) {
validator.Execute(3);
}
TEST(conv2d_transpose_op, test) {
std::unordered_set<std::string> parameters({"deconv2d-Y"});
framework::Scope scope;
TRTConvertValidation validator(5, parameters, scope, 1 << 15);
validator.DeclInputVar("deconv2d-X", nvinfer1::Dims3(3, 5, 5));
validator.DeclParamVar("deconv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
validator.DeclOutputVar("deconv2d-Out", nvinfer1::Dims3(2, 5, 5));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("conv2d_transpose");
desc.SetInput("Input", {"deconv2d-X"});
desc.SetInput("Filter", {"deconv2d-Y"});
desc.SetOutput("Output", {"deconv2d-Out"});
const std::vector<int> strides({1, 1});
const std::vector<int> paddings({1, 1});
const std::vector<int> dilations({1, 1});
const int groups = 1;
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("dilations", dilations);
desc.SetAttr("groups", groups);
validator.SetOp(*desc.Proto());
validator.Execute(3);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(conv2d);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(prelu_op, test_channel_wise) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(3, 1, 1));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("channel"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(prelu_op, test_element_wise) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims4(10, 3, 2, 2));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("element"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(prelu_op, test_scalar) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(1, 1, 1));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("all"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// USE_OP(prelu);
USE_CPU_ONLY_OP(prelu);
...@@ -200,7 +200,8 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
Buffer &TensorRTEngine::buffer(const std::string &name) {
PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
auto it = buffer_sizes_.find(name);
-PADDLE_ENFORCE(it != buffer_sizes_.end());
+PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
+               name);
auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
return buffers_[slot_offset];
}
......
...@@ -40,6 +40,7 @@ class TensorRTEngine : public EngineBase {
// Weight is model parameter.
class Weight {
public:
Weight() = default;
Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
w_.type = dtype;
w_.values = value;
......
-nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu DEPS enforce)
+nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <cassert>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535;
inline static int GET_NUM_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
__global__ void PReluChannelWiseKernel(const float *input, const float *alpha,
float *output, int channel,
size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float *out = output + offset;
float scale = alpha[blockIdx.x % channel];
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
__global__ void PReluElementWiseKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
const float *scale = alpha + offset;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale[i] * x;
}
}
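// Scalar PReLU ("all" mode): a single alpha value is shared by every element.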
__global__ void PReluScalarKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float scale = *alpha;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
static inline void PReluChannelWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, dims.d[0], spatial_size);
}
static inline void PReluElementWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
static inline void PReluScalar(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size, const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims,
int nbInputs) {
assert(nbInputs == 1);
assert(index < this->getNbOutputs());
nvinfer1::Dims const &input_dims = inputDims[0];
nvinfer1::Dims output_dims = input_dims;
return output_dims;
}
int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
// Input dims are CHW.
const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]);
const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
float *output = reinterpret_cast<float **>(outputs)[0];
if (mode_ == "channel") {
PReluChannelWise(stream, input, alpha, output, batchSize, input_dims);
} else if (mode_ == "element") {
PReluElementWise(stream, input, alpha, output, batchSize, input_dims);
} else {
PReluScalar(stream, input, alpha, output, batchSize, input_dims);
}
return cudaGetLastError() != cudaSuccess;
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
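The three launchers above cover the PReLU modes dispatched in enqueue. As a quick cross-check of that mapping, the same arithmetic can be written as a host-side reference; this is only an editor's sketch under the alpha layouts implied by the kernels (one value per channel, one per element, or a single shared value), not part of the plugin:

#include <string>
#include <vector>

// CPU reference of the PReLU modes implemented by the kernels above.
static std::vector<float> PReluReference(const std::vector<float> &x,
                                         const std::vector<float> &alpha,
                                         const std::string &mode, int channel,
                                         int spatial_size) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    float scale;
    if (mode == "channel") {
      scale = alpha[(i / spatial_size) % channel];  // shared within a channel
    } else if (mode == "element") {
      scale = alpha[i];  // alpha laid out like the input, one value per element
    } else {
      scale = alpha[0];  // single shared alpha ("all" mode)
    }
    y[i] = x[i] > 0 ? x[i] : scale * x[i];
  }
  return y;
}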
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class PReluPlugin : public PluginTensorRT {
TensorRTEngine::Weight alpha_;
std::string mode_;
protected:
size_t getSerializationSize() override {
// return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
return 0;
}
// TRT will call this function when it needs to serialize the plugin's
// configuration.
// It should not be called by users.
void serialize(void *buffer) override {
// serializeBase(buffer);
// SerializeValue(&buffer, alpha_);
// SerializeValue(&buffer, mode_);
}
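// NOTE: both methods above are stubs for now (size 0 and an empty body), so an
// engine that contains this plugin cannot yet be serialized and reloaded.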
public:
PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode)
: alpha_(alpha), mode_(mode) {}
// This constructor is used by TensorRT during deserialization.
// It should not be called by users.
PReluPlugin(void const *serialData, size_t serialLength) {
// deserializeBase(serialData, serialLength);
// DeserializeValue(&serialData, &serialLength, &alpha_);
// DeserializeValue(&serialData, &serialLength, &mode_);
}
PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); }
const char *getPluginType() const override { return "prelu"; }
int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
int nbInputDims) override;
int enqueue(int batchSize, const void *const *inputs, void **outputs,
void *workspace, cudaStream_t stream) override;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
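For context, a converter builds this plugin from the op's Alpha weight and hands it to the TensorRT network. The snippet below only sketches that flow; AddPluginToNetwork and the engine's AddPlugin helper are hypothetical names used for illustration, and the alpha data is assumed to stay alive for the lifetime of the engine:

// Editor's sketch; AddPlugin is a placeholder for whatever helper forwards the
// plugin to nvinfer1::INetworkDefinition.
inline void AddPluginToNetwork(
    paddle::inference::tensorrt::TensorRTEngine *engine,
    nvinfer1::ITensor *input, float *alpha_data, size_t alpha_len,
    const std::string &mode) {
  using paddle::inference::tensorrt::PReluPlugin;
  using paddle::inference::tensorrt::TensorRTEngine;
  TensorRTEngine::Weight alpha(nvinfer1::DataType::kFLOAT, alpha_data,
                               alpha_len);
  auto *plugin = new PReluPlugin(alpha, mode);  // "all", "channel" or "element"
  engine->AddPlugin(&input, 1, plugin);  // hypothetical registration helper
}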
...@@ -78,6 +78,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te ...@@ -78,6 +78,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
"${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
# mobilenet with depthwise_conv op
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
# anakin # anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# anakin rnn1 # anakin rnn1
......
file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") include(operators)
string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "")
set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
set(PART_CUDA_KERNEL_FILES)
function(op_library TARGET)
# op_library is a function to create an op library. The interface is the same as
# cc_library, but it handles splitting GPU/CPU code and links some common
# libraries for ops.
set(cc_srcs)
set(cu_srcs)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(pybind_flag 0)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
list(LENGTH op_library_SRCS op_library_SRCS_len)
if (${op_library_SRCS_len} EQUAL 0)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_AMD_GPU)
string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
endif()
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.hip.cu$")
list(APPEND hip_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
list(APPEND miopen_hip_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
endif()
endforeach()
endif()
list(LENGTH cc_srcs cc_srcs_len)
if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contain at least one .cc file")
endif()
if (WIN32)
# remove ops unsupported on Windows, because Windows has no nccl, warpctc and similar ops.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op"
"fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return()
endif()
endforeach()
endif(WIN32)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
list(LENGTH op_library_DEPS op_library_DEPS_len) # clean cache and pybind_file content first when rebuild
if (${op_library_DEPS_len} GREATER 0) unset(GLOB_OP_LIB CACHE)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) unset(OP_LIBRARY CACHE)
endif() set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h CACHE INTERNAL "pybind.h file")
if (WITH_GPU) file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
endforeach()
# For the registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to add just one operator to pybind in a *_op.cc file.
# For detailed pybind information, please see the generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
endif()
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
# NOTE(*): activation uses a macro to register the kernels, so set USE_OP manually.
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "fake_quantize")
file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only has mkldnn and cpu kernels, which would mismatch the cpu-only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
endif()
endfunction()
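To make the pybind bookkeeping above concrete, this is roughly what op_library appends to the generated pybind.h, taking the special cases from the branches above plus one hypothetical operator (my_op) that has both .cc and .cu sources:

// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!

USE_CPU_ONLY_OP(fc);                 // fc is special-cased: MKLDNN/CPU kernels only
USE_OP(relu);                        // the "activation" target registers relu manually
USE_OP_DEVICE_KERNEL(relu, MKLDNN);  // first implemented MKLDNN activation kernel
USE_OP(fake_dequantize_max_abs);     // special case for the fake_dequantize target
USE_OP(my_op);                       // hypothetical op with CPU and CUDA kernels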
add_subdirectory(math) add_subdirectory(math)
if (NOT WIN32) add_subdirectory(controlflow)
add_subdirectory(nccl) add_subdirectory(csp)
if(WITH_GPU) add_subdirectory(detection)
op_library(nccl_op DEPS nccl_common) add_subdirectory(elementwise)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") add_subdirectory(fused)
else() add_subdirectory(metrics)
set(DEPS_OPS ${DEPS_OPS} nccl_op) add_subdirectory(optimizers)
endif() add_subdirectory(reduce_ops)
endif() # NOT WIN32 add_subdirectory(sequence_ops)
set(DISTRIBUTE_DEPS "")
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
add_subdirectory(distributed) add_subdirectory(distributed)
set(DISTRIBUTE_DEPS "") add_subdirectory(distributed_ops)
if(WITH_GRPC) endif()
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
else()
set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
if(WITH_BRPC_RDMA)
find_library(IBVERBS_LIBRARY NAMES ibverbs)
ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
find_library(RDMACM_LIBRARY NAMES rdmacm)
ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm)
endif()
endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
foreach(dist_op "prefetch_op" "checkpoint_notify_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op")
op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endforeach()
#set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) if (NOT WIN32)
#cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op add_subdirectory(reader)
# listen_and_serv_op sum_op executor SERIAL)
if(WITH_GPU AND NOT WIN32)
set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL)
if(WITH_GRPC)
op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
else()
op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_brpc)
endif()
set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
endif() # WITH_GPU AND NOT WIN32
else()
set(DEPS_OPS ${DEPS_OPS} checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
endif() endif()
op_library(cross_entropy_op DEPS cross_entropy) if (NOT WIN32)
if(WITH_GPU) add_subdirectory(nccl)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax cub)
op_library(sequence_softmax_op DEPS cub)
else()
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
endif() endif()
op_library(softmax_op DEPS softmax)
if (WITH_GPU AND TENSORRT_FOUND) if (WITH_GPU AND TENSORRT_FOUND)
op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) add_subdirectory(tensorrt)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n")
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op
analysis)
else()
set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
endif() endif()
op_library(hash_op DEPS xxhash)
op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows) register_operators(EXCLUDES warpctc_op)
op_library(sum_op DEPS selected_rows_functor)
op_library(sgd_op DEPS selected_rows_functor) # warpctc_cudnn need cudnn 7 above
op_library(print_op DEPS lod_tensor)
op_library(adagrad_op DEPS selected_rows_functor)
op_library(maxout_op DEPS maxouting)
op_library(unpool_op DEPS unpooling)
op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op DEPS lod_rank_table)
op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
if (NOT WIN32)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
endif(NOT WIN32)
op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor)
op_library(unsqueeze_op DEPS reshape_op)
op_library(squeeze_op DEPS reshape_op)
op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory)
if (NOT WIN32)
op_library(crf_decoding_op DEPS jit_kernel)
op_library(fusion_lstm_op DEPS jit_kernel)
endif(NOT WIN32)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
op_library(layer_norm_op DEPS cub) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
op_library(reduce_mean_op DEPS cub) else()
op_library(affine_channel_op DEPS cub) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()
else() else()
op_library(conv_op DEPS vol2col im2col) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif() endif()
op_library(conv_transpose_op DEPS vol2col im2col)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor)
op_library(concat_op DEPS concat_and_split)
op_library(tensor_array_to_tensor_op DEPS concat_op)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
endforeach()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
set(COMMON_OP_DEPS "")
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
if (NOT WIN32) if (NOT WIN32)
add_subdirectory(reader) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
endif(NOT WIN32) endif()
foreach(src ${READER_LIBRARY}) if (WITH_GPU)
set(OP_LIBRARY ${src} ${OP_LIBRARY}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)
endforeach() endif()
add_subdirectory(detection) # FIXME(typhoonzero): operator deps may not needed.
foreach(src ${DETECTION_LIBRARY}) # op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
set(OP_LIBRARY ${src} ${OP_LIBRARY}) # op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
endforeach() # op_library(unsqueeze_op DEPS reshape_op)
# op_library(squeeze_op DEPS reshape_op)
# op_library(flatten_op DEPS reshape_op)
# op_library(unstack_op DEPS stack_op)
# op_library(tensor_array_to_tensor_op DEPS concat_op)
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS})
set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency") set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")
cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
...@@ -362,18 +76,6 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea ...@@ -362,18 +76,6 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
if(NOT WIN32)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
if(WITH_GPU) set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
foreach(CUDA_KERNEL_FILE ${PART_CUDA_KERNEL_FILES})
file(READ ${CUDA_KERNEL_FILE} TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\(\\n?([^,]+),.*" MATCHED ${TARGET_CONTENT})
if (MATCHED)
string(STRIP ${CMAKE_MATCH_1} MATCHED)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n")
endif()
endforeach()
endif()
...@@ -71,6 +71,10 @@ class MKLDNNActivationGradKernel ...@@ -71,6 +71,10 @@ class MKLDNNActivationGradKernel
diff_y->format() != memory::format::format_undef, diff_y->format() != memory::format::format_undef,
"Wrong layout/format set for Input OutGrad tensor"); "Wrong layout/format set for Input OutGrad tensor");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
Functor functor; Functor functor;
auto attrs = functor.GetAttrs(); auto attrs = functor.GetAttrs();
...@@ -115,11 +119,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx, ...@@ -115,11 +119,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
const std::string key_fwd = key_with_layout + "@eltwise_fwd"; const std::string key_fwd = key_with_layout + "@eltwise_fwd";
const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd"; const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd";
bool is_test = ctx.Attr<bool>("is_test");
// save input data and layout to be referred in backward path // save input data and layout to be referred in backward path
auto p_src_data = std::make_shared<const T *>(x_data); auto p_src_data = std::make_shared<const T *>(x_data);
dev_ctx.SetBlob(key_src_data, p_src_data);
auto p_src_layout = std::make_shared<memory::format>(src_format); auto p_src_layout = std::make_shared<memory::format>(src_format);
dev_ctx.SetBlob(key_src_layout, p_src_layout); if (!is_test) {
dev_ctx.SetBlob(key_src_data, p_src_data);
dev_ctx.SetBlob(key_src_layout, p_src_layout);
}
auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>( auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
dev_ctx.GetBlob(key_fwd)); dev_ctx.GetBlob(key_fwd));
...@@ -136,14 +144,17 @@ void eltwise_forward(const framework::ExecutionContext &ctx, ...@@ -136,14 +144,17 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
dev_ctx.SetBlob(key_src_mem, src_memory); dev_ctx.SetBlob(key_src_mem, src_memory);
// create primitive descriptor for activation forward and save it // create primitive descriptor for activation forward and save it
auto mkldnn_forward_prop_kind = is_test
? mkldnn::prop_kind::forward_inference
: mkldnn::prop_kind::forward_training;
auto forward_desc = mkldnn::eltwise_forward::desc( auto forward_desc = mkldnn::eltwise_forward::desc(
mkldnn::prop_kind::forward_training, algorithm, mkldnn_forward_prop_kind, algorithm,
src_memory->get_primitive_desc().desc(), alpha, beta); src_memory->get_primitive_desc().desc(), alpha, beta);
auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>( auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
forward_desc, mkldnn_engine); forward_desc, mkldnn_engine);
// save prim desc into global device context to be referred in backward path // save prim desc into global device context to be referred in backward path
dev_ctx.SetBlob(key_fwd_pd, forward_pd); if (!is_test) dev_ctx.SetBlob(key_fwd_pd, forward_pd);
// create mkldnn memory for output y // create mkldnn memory for output y
dst_memory = dst_memory =
......
...@@ -22,18 +22,23 @@ namespace operators { ...@@ -22,18 +22,23 @@ namespace operators {
using paddle::framework::Tensor; using paddle::framework::Tensor;
#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \
class OP_NAME##OpMaker \ class OP_NAME##OpMaker \
: public ::paddle::framework::OpProtoAndCheckerMaker { \ : public ::paddle::framework::OpProtoAndCheckerMaker { \
public: \ public: \
void Make() override { \ void Make() override { \
AddInput("X", "Input of " #OP_NAME " operator"); \ AddInput("X", "Input of " #OP_NAME " operator"); \
AddOutput("Out", "Output of " #OP_NAME " operator"); \ AddOutput("Out", "Output of " #OP_NAME " operator"); \
AddAttr<bool>("use_mkldnn", \ AddAttr<bool>("use_mkldnn", \
"(bool, default false) Only used in mkldnn kernel") \ "(bool, default false) Only used in mkldnn kernel") \
.SetDefault(false); \ .SetDefault(false); \
AddComment(#OP_COMMENT); \ AddAttr<bool>( \
} \ "is_test", \
"(bool, default false) Set to true for inference only, false " \
"for training. Some layers may run faster when this is true.") \
.SetDefault(false); \
AddComment(#OP_COMMENT); \
} \
} }
#define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \ #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \
...@@ -269,7 +274,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -269,7 +274,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
:strong:`Softshrink Activation Operator` :strong:`Softshrink Activation Operator`
.. math:: .. math::
out = \begin{cases} out = \begin{cases}
x - \lambda, \text{if } x > \lambda \\ x - \lambda, \text{if } x > \lambda \\
x + \lambda, \text{if } x < -\lambda \\ x + \lambda, \text{if } x < -\lambda \\
0, \text{otherwise} 0, \text{otherwise}
...@@ -435,7 +440,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -435,7 +440,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
HardSigmoid Activation Operator. HardSigmoid Activation Operator.
Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391),
which is much faster than sigmoid. which is much faster than sigmoid.
$out = \max(0, \min(1, slope * x + shift))$ $out = \max(0, \min(1, slope * x + shift))$
......
...@@ -113,7 +113,10 @@ class BatchNormOp : public framework::OperatorWithKernel { ...@@ -113,7 +113,10 @@ class BatchNormOp : public framework::OperatorWithKernel {
class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<float>("momentum", "").SetDefault(0.9); AddAttr<float>("momentum", "").SetDefault(0.9);
AddAttr<float>("epsilon", "") AddAttr<float>("epsilon", "")
.SetDefault(1e-5) .SetDefault(1e-5)
......
include(operators)
register_operators()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/compare_op.h" #include "paddle/fluid/operators/controlflow/compare_op.h"
#include <string> #include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/compare_op.h" #include "paddle/fluid/operators/controlflow/compare_op.h"
REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <math.h> #include <math.h>
#include <type_traits> #include <type_traits>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/logical_op.h" #include "paddle/fluid/operators/controlflow/logical_op.h"
#include <string> #include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/logical_op.h" #include "paddle/fluid/operators/controlflow/logical_op.h"
REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA,
paddle::operators::LogicalAndFunctor); paddle::operators::LogicalAndFunctor);
......
...@@ -92,7 +92,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -92,7 +92,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
"variables generated in the i'th step."); "variables generated in the i'th step.");
AddAttr<framework::BlockDesc *>(kStepBlock, AddAttr<framework::BlockDesc *>(kStepBlock,
"The step block inside WhileOp"); "The step block inside WhileOp");
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
)DOC"); )DOC");
} }
......
...@@ -383,20 +383,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -383,20 +383,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// create a conv primitive descriptor and save it for usage in backward // create a conv primitive descriptor and save it for usage in backward
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd; std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
: mkldnn::prop_kind::forward_training;
if (bias) { if (bias) {
bias_tz = paddle::framework::vectorize2int(bias->dims()); bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc( auto bias_md = platform::MKLDNNMemDesc(
bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x); bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, conv_pd = ConvFwdPrimitiveDesc(
strides, paddings, mkldnn_engine, src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
fuse_relu, fuse_residual_conn); fuse_relu, fuse_residual_conn, fwd_prop_kind);
} else { } else {
conv_pd = conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, paddings, mkldnn_engine, fuse_relu,
mkldnn_engine, fuse_relu, fuse_residual_conn); fuse_residual_conn, fwd_prop_kind);
} }
// Save conv_pd/src_memory/weights_memory for backward pass // Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd); if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd);
ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
...@@ -512,14 +514,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -512,14 +514,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const memory::desc& dst, const std::vector<int>& strides, const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn) const { const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc( auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst,
dst, stride_dims, padding_dims, padding_dims, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, fuse_residual_conn); CreatePostOps(fuse_relu, fuse_residual_conn);
...@@ -537,14 +539,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -537,14 +539,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn) const { const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc( auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst,
bias, dst, stride_dims, padding_dims, padding_dims, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, fuse_residual_conn); CreatePostOps(fuse_relu, fuse_residual_conn);
...@@ -589,6 +591,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -589,6 +591,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
output_grad->format() != memory::format::format_undef, output_grad->format() != memory::format::format_undef,
"Wrong layout/format set for output_grad tensor"); "Wrong layout/format set for output_grad tensor");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
if (!input_grad && !filter_grad) return; if (!input_grad && !filter_grad) return;
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides"); std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
......
...@@ -109,7 +109,10 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -109,7 +109,10 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
} }
void Conv2DOpMaker::Make() { void Conv2DOpMaker::Make() {
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddInput( AddInput(
"Input", "Input",
"(Tensor) The input tensor of convolution operator. " "(Tensor) The input tensor of convolution operator. "
......
include(operators)
register_operators()
...@@ -40,4 +40,8 @@ endif() ...@@ -40,4 +40,8 @@ endif()
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
#Export local libraries to parent #Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) # set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
foreach(src ${LOCAL_DETECTION_LIBS})
set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")
endforeach()
include(operators)
set(DISTRIBUTE_DEPS "")
if(WITH_GRPC)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
else()
set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
if(WITH_BRPC_RDMA)
find_library(IBVERBS_LIBRARY NAMES ibverbs)
ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
find_library(RDMACM_LIBRARY NAMES rdmacm)
ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm)
endif()
endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
list(REMOVE_DUPLICATES OPS)
foreach(src ${OPS})
set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endforeach()
register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS})
if(WITH_GPU AND NOT WIN32)
set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common)
op_library(gen_nccl_id_op ${DISTRIBUTE_DEPS} nccl_common)
endif()
set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE)
set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency")
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
......
...@@ -25,7 +25,7 @@ limitations under the License. */ ...@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send"); DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send");
DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get"); DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get");
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/merge_ids_op.h" #include "paddle/fluid/operators/distributed_ops/merge_ids_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/ref_by_trainer_id_op.h" #include "paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h"
#include <string> #include <string>
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/ref_by_trainer_id_op.h" #include "paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
ref_by_trainer_id, ref_by_trainer_id,
......
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/split_byref_op.h" #include "paddle/fluid/operators/distributed_ops/split_byref_op.h"
#include "paddle/fluid/operators/split_op.h" #include "paddle/fluid/operators/split_op.h"
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/split_byref_op.h" #include "paddle/fluid/operators/distributed_ops/split_byref_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
split_byref, split_byref,
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/split_ids_op.h" #include "paddle/fluid/operators/distributed_ops/split_ids_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -22,14 +22,14 @@ limitations under the License. */ ...@@ -22,14 +22,14 @@ limitations under the License. */
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#ifdef PADDLE_WITH_GRPC #ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#endif #endif
USE_NO_KERNEL_OP(listen_and_serv); USE_NO_KERNEL_OP(listen_and_serv);
......
...@@ -49,7 +49,10 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -49,7 +49,10 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f, PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f,
"'dropout_prob' must be between 0.0 and 1.0."); "'dropout_prob' must be between 0.0 and 1.0.");
}); });
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<bool>("fix_seed", AddAttr<bool>("fix_seed",
"A flag indicating whether to use a fixed seed to generate " "A flag indicating whether to use a fixed seed to generate "
"random mask. NOTE: DO NOT set this flag to true in " "random mask. NOTE: DO NOT set this flag to true in "
......
include(operators)
register_operators()
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out", REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out",
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y"); REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y");
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise_max_op.h" #include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_max, "Max", "Out = max(X, Y)"); REGISTER_ELEMWISE_OP(elementwise_max, "Max", "Out = max(X, Y)");
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_max_op.h" #include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise_min_op.h" #include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_min, "Min", "Out = min(X, Y)"); REGISTER_ELEMWISE_OP(elementwise_min, "Min", "Out = min(X, Y)");
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_min_op.h" #include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
......
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise_pow_op.h" #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_pow_op.h" #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <cmath> #include <cmath>
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out", REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out",
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -47,6 +47,11 @@ class ExpandOp : public framework::OperatorWithKernel { ...@@ -47,6 +47,11 @@ class ExpandOp : public framework::OperatorWithKernel {
out_shape[i] = x_dims[i] * expand_times[i]; out_shape[i] = x_dims[i] * expand_times[i];
} }
// at compile time, keep the first dim equal to X's (typically -1, the unknown batch size)
if (!ctx->IsRuntime()) {
out_shape[0] = x_dims[0];
}
ctx->SetOutputDim("Out", framework::make_ddim(out_shape)); ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
if (out_shape[0] == x_dims[0]) { if (out_shape[0] == x_dims[0]) {
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
...@@ -109,7 +114,16 @@ class ExpandGradOp : public framework::OperatorWithKernel { ...@@ -109,7 +114,16 @@ class ExpandGradOp : public framework::OperatorWithKernel {
ctx->Attrs().Get<std::vector<int>>("expand_times"); ctx->Attrs().Get<std::vector<int>>("expand_times");
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
for (size_t i = 0; i < expand_times.size(); ++i) { size_t start_pos = 0u;
if (!ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(
x_dims[0], out_dims[0],
"The first dimension size of Input(Out@GRAD) should be "
"equal to the crroresponding dimension size of Input(X)");
start_pos = 1u;
}
for (size_t i = start_pos; i < expand_times.size(); ++i) {
PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i], PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
"Each dimension size of Input(Out@GRAD) should be " "Each dimension size of Input(Out@GRAD) should be "
"equal to multiplication of crroresponding dimension " "equal to multiplication of crroresponding dimension "
......
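For context on the shape arithmetic guarded above, a minimal standalone sketch (illustrative only, not the operator code): every output dimension is the corresponding input dimension multiplied by its expand_times entry, and at compile time the first (batch) dimension is carried over from X unchanged.

#include <cstddef>
#include <vector>

// Illustrative sketch of ExpandOp shape inference; not the operator code itself.
std::vector<int> InferExpandShape(const std::vector<int>& x_dims,
                                  const std::vector<int>& expand_times,
                                  bool is_runtime) {
  std::vector<int> out_shape(x_dims.size());
  for (std::size_t i = 0; i < x_dims.size(); ++i) {
    out_shape[i] = x_dims[i] * expand_times[i];
  }
  if (!is_runtime) {
    // at compile time the batch size is unknown (-1), so keep X's first dim as-is
    out_shape[0] = x_dims[0];
  }
  return out_shape;
}
// e.g. x_dims = {-1, 3}, expand_times = {2, 2} gives {-1, 6} at compile time
// and {2 * batch, 6} once the real batch size is known at runtime.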
...@@ -138,7 +138,7 @@ class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -138,7 +138,7 @@ class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
FakeQuantize operator FakeQuantize operator
$$scale = max(abs(X))$$ $$scale = max(abs(X))$$
$$range = 2^{bit_length - 1} - 1$$ $$range = 2^{bit_length - 1} - 1$$
$$Out = round(X/scale * range)$$ $$Out = round(X/scale * range)$$
...@@ -199,11 +199,14 @@ class FakeQuantizeRangeAbsMaxOpMaker ...@@ -199,11 +199,14 @@ class FakeQuantizeRangeAbsMaxOpMaker
PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
"'bit_length' should be between 1 and 16."); "'bit_length' should be between 1 and 16.");
}); });
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
FakeQuantize operator is used in static quantization. FakeQuantize operator is used in static quantization.
$$scale = max(max(abs(x)), history_abs_max)$$ $$scale = max(max(abs(x)), history_abs_max)$$
$$range = 2^{bit_length - 1} - 1$$ $$range = 2^{bit_length - 1} - 1$$
$$Out = round(X/scale * range)$$ $$Out = round(X/scale * range)$$
......
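A worked instance of the formulas above (the numbers are illustrative): with bit_length = 8 and a tensor whose largest absolute value is 2.0,
$$range = 2^{8 - 1} - 1 = 127$$
$$scale = max(abs(X)) = 2.0$$
$$Out = round(0.5 / 2.0 * 127) = round(31.75) = 32 \text{ for an element } X = 0.5$$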
include(operators)
register_operators()
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fused_elemwise_activation_op.h" #include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fused_elemwise_activation_op.h" #include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/compound_functors.h" #include "paddle/fluid/operators/math/compound_functors.h"
#include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/operators/math/functors.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fused_embedding_fc_lstm_op.h" #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fusion_gru_op.h" #include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include <cstring> // for memcpy #include <cstring> // for memcpy
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h" #include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc_compute.h" #include "paddle/fluid/operators/math/fc_compute.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h" #include "paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h"
#include <algorithm> // for min, max #include <algorithm> // for min, max
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h" #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -46,7 +46,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> { ...@@ -46,7 +46,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> {
int pre_pad = (n - 1) / 2; int pre_pad = (n - 1) / 2;
// compute batches one by one // compute batches one by one
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
blas.VSQR(fea_size, idata + i * fea_size, sdata + pre_pad * img_size); blas.VSQUARE(fea_size, idata + i * fea_size, sdata + pre_pad * img_size);
// init the first channel of mid // init the first channel of mid
for (int c = 0; c < n; ++c) { for (int c = 0; c < n; ++c) {
blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i * fea_size); blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i * fea_size);
...@@ -229,8 +229,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -229,8 +229,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
"the input will be transformed automatically. ") "the input will be transformed automatically. ")
.SetDefault("AnyLayout"); .SetDefault("AnyLayout");
AddAttr<bool>("is_test", AddAttr<bool>("is_test",
"Turns on memory optimization that optimizes away " "(bool, default false) Set to true for inference only, false "
"unnecessary memory allocations. Used by MKLDNN.") "for training. Some layers may run faster when this is true.")
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
......
...@@ -41,6 +41,7 @@ math_library(cross_entropy) ...@@ -41,6 +41,7 @@ math_library(cross_entropy)
math_library(cos_sim_functor) math_library(cos_sim_functor)
math_library(depthwise_conv) math_library(depthwise_conv)
math_library(im2col) math_library(im2col)
math_library(sampler)
if (NOT WIN32) # windows do not support avx functions yet. if (NOT WIN32) # windows do not support avx functions yet.
math_library(gru_compute DEPS activation_functions math_function) math_library(gru_compute DEPS activation_functions math_function)
......
...@@ -153,7 +153,7 @@ class Blas { ...@@ -153,7 +153,7 @@ class Blas {
void VEXP(int n, const T* x, T* y) const; void VEXP(int n, const T* x, T* y) const;
template <typename T> template <typename T>
void VSQR(int n, const T* x, T* y) const; void VSQUARE(int n, const T* x, T* y) const;
template <typename T> template <typename T>
void VPOW(int n, const T* x, T alpha, T* y) const; void VPOW(int n, const T* x, T alpha, T* y) const;
...@@ -245,8 +245,8 @@ class BlasT : private Blas<DeviceContext> { ...@@ -245,8 +245,8 @@ class BlasT : private Blas<DeviceContext> {
} }
template <typename... ARGS> template <typename... ARGS>
void VSQR(ARGS... args) const { void VSQUARE(ARGS... args) const {
Base()->template VSQR<T>(args...); Base()->template VSQUARE<T>(args...);
} }
template <typename... ARGS> template <typename... ARGS>
......
...@@ -105,7 +105,7 @@ struct CBlas<float> { ...@@ -105,7 +105,7 @@ struct CBlas<float> {
} }
template <typename... ARGS> template <typename... ARGS>
static void VSQR(ARGS... args) { static void VSQUARE(ARGS... args) {
platform::dynload::vsSqr(args...); platform::dynload::vsSqr(args...);
} }
...@@ -195,7 +195,7 @@ struct CBlas<double> { ...@@ -195,7 +195,7 @@ struct CBlas<double> {
} }
template <typename... ARGS> template <typename... ARGS>
static void VSQR(ARGS... args) { static void VSQUARE(ARGS... args) {
platform::dynload::vdSqr(args...); platform::dynload::vdSqr(args...);
} }
...@@ -262,7 +262,9 @@ struct CBlas<platform::float16> { ...@@ -262,7 +262,9 @@ struct CBlas<platform::float16> {
} }
static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
static void VSQR(...) { PADDLE_THROW("float16 VSQR not supported on CPU"); } static void VSQUARE(...) {
PADDLE_THROW("float16 VSQUARE not supported on CPU");
}
static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
...@@ -423,12 +425,12 @@ void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const { ...@@ -423,12 +425,12 @@ void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const {
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CPUDeviceContext>::VSQR(int n, const T *x, T *y) const { void Blas<platform::CPUDeviceContext>::VSQUARE(int n, const T *x, T *y) const {
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
CBlas<T>::VSQR(n, x, y); CBlas<T>::VSQUARE(n, x, y);
#else #else
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = std::sqrt(x[i]); y[i] = x[i] * x[i];
} }
#endif #endif
} }
......
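To make the renamed primitive's semantics explicit: VSQUARE is an element-wise square (MKL's vsSqr/vdSqr square each element; they do not take a square root), which is why the non-MKL fallback above now computes x[i] * x[i] instead of std::sqrt(x[i]). A reference-only scalar equivalent of that fallback:

// Reference-only sketch of the element-wise square that VSQUARE performs.
template <typename T>
void VSquareRef(int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] * x[i];
  }
}
// e.g. x = {1, 2, 3} -> y = {1, 4, 9}; the LRN functor uses this to accumulate squared activations.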
...@@ -33,11 +33,11 @@ namespace math { ...@@ -33,11 +33,11 @@ namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0 #define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0 #define SIGMOID_THRESHOLD_MAX 13.0
#define AVX_FLOAT_BLOCK 8 #define YMM_FLOAT_BLOCK 8
#define AVX_DOUBLE_BLOCK 4 #define AVX_DOUBLE_BLOCK 4
#define AVX2_FLOAT_BLOCK 8 #define YMM_FLOAT_BLOCK 8
#define AVX2_DOUBLE_BLOCK 4 #define AVX2_DOUBLE_BLOCK 4
#define AVX512_FLOAT_BLOCK 16 #define ZMM_FLOAT_BLOCK 16
#define AVX512_DOUBLE_BLOCK 8 #define AVX512_DOUBLE_BLOCK 8
template <typename T> template <typename T>
...@@ -88,7 +88,7 @@ template <> ...@@ -88,7 +88,7 @@ template <>
inline void vec_scal<float, platform::jit::avx>(const int n, const float a, inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_scal<float, platform::jit::isa_any>(n, a, x, y); vec_scal<float, platform::jit::isa_any>(n, a, x, y);
return; return;
...@@ -142,7 +142,7 @@ template <> ...@@ -142,7 +142,7 @@ template <>
inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a, inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y); vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
return; return;
...@@ -200,7 +200,7 @@ inline void vec_cross<float, platform::jit::avx>(const int n, const float* x, ...@@ -200,7 +200,7 @@ inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
const float* y, const float* z, const float* y, const float* z,
float* out) { float* out) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_cross<float, platform::jit::isa_any>(n, x, y, z, out); vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
return; return;
...@@ -257,7 +257,7 @@ template <> ...@@ -257,7 +257,7 @@ template <>
inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a, inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_add_bias<float, platform::jit::isa_any>(n, a, x, y); vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
return; return;
...@@ -326,7 +326,7 @@ template <> ...@@ -326,7 +326,7 @@ template <>
inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x, inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
float* y) { float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_sigmoid<float, platform::jit::isa_any>(n, x, y); vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
return; return;
...@@ -415,7 +415,7 @@ template <> ...@@ -415,7 +415,7 @@ template <>
inline void vec_relu<float, platform::jit::avx>(const int n, const float* x, inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
float* y) { float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block * 4) { if (n < block * 4) {
vec_relu<float, platform::jit::isa_any>(n, x, y); vec_relu<float, platform::jit::isa_any>(n, x, y);
return; return;
......
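The renamed block-size constants simply encode how many packed 32-bit floats fit in each SIMD register class: XMM is 128-bit, YMM is 256-bit (shared by AVX and AVX2, which is why the old AVX and AVX2 names collapsed into one), and ZMM is 512-bit. A compile-time sanity check of that arithmetic, illustrative only:

// 128 / 32 = 4, 256 / 32 = 8, 512 / 32 = 16 packed floats per register.
static_assert(128 / (8 * sizeof(float)) == 4, "XMM holds 4 floats");
static_assert(256 / (8 * sizeof(float)) == 8, "YMM holds 8 floats");
static_assert(512 / (8 * sizeof(float)) == 16, "ZMM holds 16 floats");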
...@@ -41,7 +41,7 @@ void VXXJitCode::generate() { ...@@ -41,7 +41,7 @@ void VXXJitCode::generate() {
} else if (scalar_index_ == 2) { } else if (scalar_index_ == 2) {
vbroadcastss(ymm_src2, ptr[param2]); vbroadcastss(ymm_src2, ptr[param2]);
} }
for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) { for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
if (scalar_index_ != 1) { if (scalar_index_ != 1) {
vmovups(ymm_src1, ptr[param1 + offset]); vmovups(ymm_src1, ptr[param1 + offset]);
} }
...@@ -57,9 +57,9 @@ void VXXJitCode::generate() { ...@@ -57,9 +57,9 @@ void VXXJitCode::generate() {
vmaxps(ymm_dst, ymm_zero, ymm_dst); vmaxps(ymm_dst, ymm_zero, ymm_dst);
} }
vmovups(ptr[param3 + offset], ymm_dst); vmovups(ptr[param3 + offset], ymm_dst);
offset += sizeof(float) * AVX_FLOAT_BLOCK; offset += sizeof(float) * YMM_FLOAT_BLOCK;
} }
int rest = num_ % AVX_FLOAT_BLOCK; int rest = num_ % YMM_FLOAT_BLOCK;
if (rest >= 4) { if (rest >= 4) {
if (scalar_index_ != 1) { if (scalar_index_ != 1) {
vmovups(xmm_src1, ptr[param1 + offset]); vmovups(xmm_src1, ptr[param1 + offset]);
...@@ -118,18 +118,237 @@ void VXXJitCode::generate() { ...@@ -118,18 +118,237 @@ void VXXJitCode::generate() {
ret(); ret();
} }
bool ReluJitCode::init(int d) { return MayIUse(avx); } #define ALIGN32 __attribute__((aligned(32)))
#define EXP_HIG 88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define CEPHES_LOG2EF 1.44269504088896341
#define CEPHES_EXP_C1 0.693359375
#define CEPHES_EXP_C2 -2.12194440e-4
#define CEPHES_EXP_P0 1.9875691500E-4
#define CEPHES_EXP_P1 1.3981999507E-3
#define CEPHES_EXP_P2 8.3334519073E-3
#define CEPHES_EXP_P3 4.1665795894E-2
#define CEPHES_EXP_P4 1.6666665459E-1
#define CEPHES_EXP_P5 5.0000001201E-1
void ReluJitCode::generate() { #define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val
int offset = 0;
#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
static const float exp_float_consts[] ALIGN32 = {
REPEAT_8TIMES(1.f),
REPEAT_8TIMES(2.f),
REPEAT_8TIMES(0.5f),
REPEAT_8TIMES(EXP_HIG),
REPEAT_8TIMES(EXP_LOW),
REPEAT_8TIMES(CEPHES_LOG2EF),
REPEAT_8TIMES(CEPHES_EXP_C1),
REPEAT_8TIMES(CEPHES_EXP_C2),
REPEAT_8TIMES(CEPHES_EXP_P0),
REPEAT_8TIMES(CEPHES_EXP_P1),
REPEAT_8TIMES(CEPHES_EXP_P2),
REPEAT_8TIMES(CEPHES_EXP_P3),
REPEAT_8TIMES(CEPHES_EXP_P4),
REPEAT_8TIMES(CEPHES_EXP_P5),
REPEAT_8TIMES(EXP_MAX_INPUT),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
static const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
static int g_tmp_mem[16] ALIGN32 = {0};
bool VActJitCode::init(int d, operand_type type) {
bool ok = MayIUse(avx);
if (type == operand_type::relu) {
return ok;
} else if (type == operand_type::exp) {
// the JIT exp kernel is slower than MKL when d >= 256
return ok && d % 8 == 0 && d < 256;
} else {
// TODO(TJ): support more
return ok && d % 8 == 0;
}
}
void VActJitCode::relu_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, ymm_t& ymm_zero) {
vmaxps(ymm_dst, ymm_zero, ymm_src);
}
void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
assert(ymm_src.getIdx() != ymm_dst.getIdx());  // TODO(TJ): use enforce and
// check that none of the register indices are equal
ymm_t ymm_fx = ymm_t(fx_idx);
ymm_t ymm_fy = ymm_t(fy_idx);
ymm_t ymm_mask = ymm_t(mask_idx);
ymm_t ymm_tmp = ymm_t(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
vminps(ymm_src, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
vmaxps(ymm_src, ymm_src, ymm_tmp);
// express exp(x) as exp(g + n*log(2))
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
vmulps(ymm_fx, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
vaddps(ymm_fx, ymm_fx, ymm_tmp);
vroundps(ymm_fy, ymm_fx, 0x01);
// if greater, subtract 1
vcmpgtps(ymm_mask, ymm_fy, ymm_fx);
vmovaps(ymm_tmp, ptr[reg_ptr_global]);
vandps(ymm_mask, ymm_mask, ymm_tmp);
vsubps(ymm_fx, ymm_fy, ymm_mask);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]);
vmulps(ymm_fy, ymm_fx, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]);
ymm_t ymm_z = ymm_t(ymm_mask.getIdx());
vmulps(ymm_z, ymm_fx, ymm_tmp);
vsubps(ymm_src, ymm_src, ymm_fy);
vsubps(ymm_src, ymm_src, ymm_z);
vmulps(ymm_z, ymm_src, ymm_src);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
vmulps(ymm_dst, ymm_src, ymm_tmp);
for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
i += (YMM_FLOAT_BLOCK * sizeof(float))) {
vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmulps(ymm_dst, ymm_dst, ymm_src);
}
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmulps(ymm_dst, ymm_dst, ymm_z);
vaddps(ymm_dst, ymm_dst, ymm_src);
vmovaps(ymm_tmp, ptr[reg_ptr_global]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
// build 2^n
ymm_t ymm_int = ymm_fx;
vcvttps2dq(ymm_int, ymm_fx);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
vmovdqa(ymm_tmp, ptr[reg_ptr_global]);
if (MayIUse(avx2)) {
vpaddd(ymm_int, ymm_int, ymm_tmp);
vpslld(ymm_int, ymm_int, 23);
} else if (MayIUse(avx)) {
xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
xmm_t xtmp2 = xmm_t(ymm_tmp.getIdx());
reg64_t reg_ptr_tmp = reg_ptr_global;
mov(reg_ptr_tmp, reinterpret_cast<size_t>(g_tmp_mem));
vmovdqa(ptr[reg_ptr_tmp], ymm_int);
vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], ymm_tmp);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp], xtmp1);
// next 128bits
vmovdqa(xtmp1, ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)]);
vmovdqa(xtmp2,
ptr[reg_ptr_tmp +
(YMM_FLOAT_BLOCK + 4 /*xmm float block*/) * sizeof(float)]);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)], xtmp1);
// load out
vmovdqa(ymm_int, ptr[reg_ptr_tmp]);
}
vmulps(ymm_dst, ymm_dst, ymm_int);
pop(reg_ptr_global);
}
void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
// y = 1 / (1 + e^-x)
ymm_t ymm_tmp = ymm_t(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]);
vminps(ymm_src, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]);
vmaxps(ymm_src, ymm_src, ymm_tmp);
vxorps(ymm_tmp, ymm_tmp, ymm_tmp);
vsubps(ymm_src, ymm_tmp, ymm_src);
exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vdivps(ymm_dst, ymm_tmp, ymm_dst);
pop(reg_ptr_global);
}
void VActJitCode::tanh_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
// y = 2 / (1 + e^(-2x)) - 1
ymm_t ymm_tmp = ymm_t(tmp_idx);
ymm_t ymm_zero = ymm_t(mask_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vxorps(ymm_zero, ymm_zero, ymm_zero); vxorps(ymm_zero, ymm_zero, ymm_zero);
for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) { vsubps(ymm_tmp, ymm_zero, ymm_tmp);
vmulps(ymm_src, ymm_src, ymm_tmp);
exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vdivps(ymm_dst, ymm_tmp, ymm_dst);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vsubps(ymm_dst, ymm_dst, ymm_tmp);
pop(reg_ptr_global);
}
void VActJitCode::generate() {
xmm_t xmm_zero = xmm_t(2);
ymm_t ymm_zero = ymm_t(2);
if (type_ == operand_type::relu) {
vxorps(ymm_zero, ymm_zero, ymm_zero);
}
int offset = 0;
for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
vmovups(ymm_src, ptr[param1 + offset]); vmovups(ymm_src, ptr[param1 + offset]);
vmaxps(ymm_dst, ymm_zero, ymm_src); switch (type_) {
case operand_type::relu:
relu_ymm(ymm_dst, ymm_src, ymm_zero);
break;
case operand_type::exp:
exp_ymm(ymm_dst, ymm_src, 2, 3, 4, 5);
break;
case operand_type::sigmoid:
sigmoid_ymm(ymm_dst, ymm_src, 2, 3, 4, 5);
break;
case operand_type::tanh:
tanh_ymm(ymm_dst, ymm_src, 2, 3, 4, 5);
break;
case operand_type::identity:
break;
default:
break;
}
vmovups(ptr[param2 + offset], ymm_dst); vmovups(ptr[param2 + offset], ymm_dst);
offset += sizeof(float) * AVX_FLOAT_BLOCK; offset += sizeof(float) * YMM_FLOAT_BLOCK;
}
if (type_ != operand_type::relu) {
// TODO(TJ): remove me
ret();
return;
} }
int rest = num_ % AVX_FLOAT_BLOCK; int rest = num_ % YMM_FLOAT_BLOCK;
if (rest >= 4) { if (rest >= 4) {
vmovups(xmm_src, ptr[param1 + offset]); vmovups(xmm_src, ptr[param1 + offset]);
vmaxps(xmm_dst, xmm_zero, xmm_src); vmaxps(xmm_dst, xmm_zero, xmm_src);
...@@ -151,6 +370,7 @@ void ReluJitCode::generate() { ...@@ -151,6 +370,7 @@ void ReluJitCode::generate() {
} }
ret(); ret();
} }
} // namespace gen } // namespace gen
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
......
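The exp_ymm routine above follows the classic Cephes-style range reduction: exp(x) is rewritten as 2^n * exp(g), with n approximately round(x * log2(e)), g the residual after subtracting n * ln(2) (split into the C1/C2 constants for precision), exp(g) evaluated by the degree-5 polynomial in CEPHES_EXP_P0..P5, and 2^n built by adding 127 to n and shifting it into the float exponent field. A scalar sketch of the same idea, for readability only; it mirrors the constants defined above but is not the JIT code:

#include <cmath>
#include <cstdint>
#include <cstring>

// Scalar illustration of the range reduction used by exp_ymm; not the JIT path.
float ExpApprox(float x) {
  x = std::fmin(std::fmax(x, -88.3762626647949f), 88.3762626647949f);  // clamp to EXP_LOW/EXP_HIG
  float n = std::floor(x * 1.44269504088896341f + 0.5f);   // n ~= round(x * log2(e))
  float g = x - n * 0.693359375f - n * (-2.12194440e-4f);  // g = x - n * ln(2), ln(2) split as C1 + C2
  float p = 1.9875691500E-4f;  // Horner evaluation of the degree-5 polynomial P0..P5
  p = p * g + 1.3981999507E-3f;
  p = p * g + 8.3334519073E-3f;
  p = p * g + 4.1665795894E-2f;
  p = p * g + 1.6666665459E-1f;
  p = p * g + 5.0000001201E-1f;
  float eg = p * g * g + g + 1.0f;  // approximation of exp(g)
  uint32_t bits = static_cast<uint32_t>(static_cast<int>(n) + 127) << 23;  // 2^n as a float
  float two_n;
  std::memcpy(&two_n, &bits, sizeof(two_n));
  return eg * two_n;
}
// sigmoid_ymm and tanh_ymm then reuse this: sigmoid(x) = 1 / (1 + exp(-x)) and
// tanh(x) = 2 / (1 + exp(-2x)) - 1, exactly as the comments in those routines state.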
...@@ -29,7 +29,16 @@ using ymm_t = const Xbyak::Ymm; ...@@ -29,7 +29,16 @@ using ymm_t = const Xbyak::Ymm;
using zmm_t = const Xbyak::Zmm; using zmm_t = const Xbyak::Zmm;
using Label = Xbyak::Label; using Label = Xbyak::Label;
typedef enum { mul = 0, add } operand_type; typedef enum {
mul = 0,
add,
sub,
relu,
exp,
sigmoid,
tanh,
identity
} operand_type;
// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
class VXXJitCode : public JitCode { class VXXJitCode : public JitCode {
...@@ -85,26 +94,65 @@ class VXXJitCode : public JitCode { ...@@ -85,26 +94,65 @@ class VXXJitCode : public JitCode {
ymm_t ymm_zero = ymm_t(3); ymm_t ymm_zero = ymm_t(3);
}; };
class ReluJitCode : public JitCode { class VActJitCode : public JitCode {
public: public:
DECLARE_JIT_CODE(ReluJitCode); const char* name() const override {
explicit ReluJitCode(int d, size_t code_size = 256 * 1024, std::string base = "VActJitCode";
switch (type_) {
case operand_type::relu:
base += "_Relu";
break;
case operand_type::exp:
base += "_Exp";
break;
case operand_type::sigmoid:
base += "_Sigmoid";
break;
case operand_type::tanh:
base += "_Tanh";
break;
case operand_type::identity:
base += "_Identity";
break;
default:
break;
}
return base.c_str();
}
explicit VActJitCode(int d, operand_type type, size_t code_size = 256 * 1024,
void* code_ptr = nullptr) void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), num_(d) {} : JitCode(code_size, code_ptr), num_(d), type_(type) {}
static bool init(int d); static bool init(int d, operand_type type);
void generate() override; void generate() override;
private: protected:
// compute relu with ymm
void relu_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src,
const Xbyak::Ymm& zero);
// compute exp with ymm
void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2,
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5);
// compute sigmoid with ymm
void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2,
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5);
// compute tanh with ymm
void tanh_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2,
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5);
protected:
int num_; int num_;
operand_type type_;
reg64_t param1{abi_param1}; reg64_t param1{abi_param1};
reg64_t param2{abi_param2}; reg64_t param2{abi_param2};
xmm_t xmm_zero = xmm_t(0); xmm_t xmm_src = xmm_t(0);
xmm_t xmm_src = xmm_t(1); ymm_t ymm_src = ymm_t(0);
xmm_t xmm_dst = xmm_t(1);
ymm_t ymm_zero = ymm_t(0); xmm_t xmm_dst = xmm_t(1);
ymm_t ymm_src = ymm_t(1);
ymm_t ymm_dst = ymm_t(1); ymm_t ymm_dst = ymm_t(1);
}; };
......
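A usage sketch of the new VActJitCode class, mirroring how the kernel implementations elsewhere in this patch wire it up (the size heuristic and getCode call are taken from those call sites; x and y are assumed to be caller-owned float buffers of length d, so treat this as illustrative rather than canonical):

// Illustrative only: build a relu JIT kernel for vectors of length d and run it.
int d = 256;
if (gen::VActJitCode::init(d, gen::operand_type::relu)) {
  size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;  // rough code-size estimate
  std::unique_ptr<gen::VActJitCode> code(
      new gen::VActJitCode(d, gen::operand_type::relu, sz > 4096 ? sz : 4096));
  auto compute = code->getCode<void (*)(const float*, float*, int)>();
  compute(x, y, d);  // y[i] = max(x[i], 0.f) for i in [0, d)
}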
...@@ -29,9 +29,9 @@ namespace jitkernel { ...@@ -29,9 +29,9 @@ namespace jitkernel {
#define SIGMOID_THRESHOLD_MIN -40.0 #define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0 #define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0 #define EXP_MAX_INPUT 40.0
#define AVX_FLOAT_BLOCK 8 #define XMM_FLOAT_BLOCK 4
#define AVX2_FLOAT_BLOCK 8 #define YMM_FLOAT_BLOCK 8
#define AVX512_FLOAT_BLOCK 16 #define ZMM_FLOAT_BLOCK 16
typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block; typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block;
...@@ -97,39 +97,23 @@ class VAddBiasKernel : public Kernel { ...@@ -97,39 +97,23 @@ class VAddBiasKernel : public Kernel {
template <typename T> template <typename T>
class VActKernel : public Kernel { class VActKernel : public Kernel {
public: public:
virtual void ComputeDeprecated(const T *x, T *y) const = 0; void (*Compute)(const T *, T *, int);
}; };
template <typename T> template <typename T>
class VReluKernel : public VActKernel<T> { class VReluKernel : public VActKernel<T> {};
public:
virtual void ComputeDeprecated(const T *x, T *y) const = 0;
void (*Compute)(const T *, T *, int);
};
template <typename T> template <typename T>
class VIdentityKernel : public VActKernel<T> { class VIdentityKernel : public VActKernel<T> {};
public:
virtual void ComputeDeprecated(const T *x, T *y) const = 0;
};
template <typename T> template <typename T>
class VExpKernel : public VActKernel<T> { class VExpKernel : public VActKernel<T> {};
public:
virtual void ComputeDeprecated(const T *x, T *y) const = 0;
};
template <typename T> template <typename T>
class VSigmoidKernel : public VActKernel<T> { class VSigmoidKernel : public VActKernel<T> {};
public:
virtual void ComputeDeprecated(const T *x, T *y) const = 0;
};
template <typename T> template <typename T>
class VTanhKernel : public VActKernel<T> { class VTanhKernel : public VActKernel<T> {};
public:
virtual void ComputeDeprecated(const T *x, T *y) const = 0;
};
template <typename T> template <typename T>
class LSTMKernel : public Kernel { class LSTMKernel : public Kernel {
......
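With the interface change above, callers now go through the plain Compute function pointer instead of the removed virtual ComputeDeprecated. A minimal call-site sketch; the KernelPool::Instance().Get lookup is the pre-existing pool helper assumed here, not something introduced by this hunk, and x/y are caller-owned buffers of length n:

// Assumed usage sketch of the new function-pointer interface.
const auto& relu =
    jitkernel::KernelPool::Instance().Get<jitkernel::VReluKernel<float>>(n);
relu->Compute(x, y, n);  // previously relu->ComputeDeprecated(x, y)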
...@@ -25,10 +25,6 @@ limitations under the License. */ ...@@ -25,10 +25,6 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/mklml.h" #include "paddle/fluid/platform/dynload/mklml.h"
#endif #endif
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -128,23 +124,16 @@ void VScalMKL<double>(const double* a, const double* x, double* y, int n) { ...@@ -128,23 +124,16 @@ void VScalMKL<double>(const double* a, const double* x, double* y, int n) {
#endif #endif
#define DECLARE_STATIC_FUNC \
static inline std::string name(int d) { \
PADDLE_THROW("DType should be either float or double"); \
} \
static inline bool useJIT(int d) { return false; } \
static inline bool useMKL(int d) { return false; }
/* VMUL JitKernel */ /* VMUL JitKernel */
template <typename T> template <typename T>
class VMulKernelImpl : public VMulKernel<T> { class VMulKernelImpl : public VMulKernel<T> {
public: public:
DECLARE_STATIC_FUNC; JITKERNEL_DECLARE_STATIC_FUNC;
explicit VMulKernelImpl(int d) : VMulKernel<T>() { explicit VMulKernelImpl(int d) : VMulKernel<T>() {
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) { if (useJIT(d)) {
// roughly estimate the size of code // roughly estimate the size of code
size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 0, false, jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 0, false,
sz > 4096 ? sz : 4096)); sz > 4096 ? sz : 4096));
this->Compute = this->Compute =
...@@ -191,11 +180,11 @@ bool VMulKernelImpl<double>::useMKL(int d) { ...@@ -191,11 +180,11 @@ bool VMulKernelImpl<double>::useMKL(int d) {
template <typename T> template <typename T>
class VAddKernelImpl : public VAddKernel<T> { class VAddKernelImpl : public VAddKernel<T> {
public: public:
DECLARE_STATIC_FUNC; JITKERNEL_DECLARE_STATIC_FUNC;
explicit VAddKernelImpl(int d) : VAddKernel<T>() { explicit VAddKernelImpl(int d) : VAddKernel<T>() {
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) { if (useJIT(d)) {
size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, false, jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, false,
sz > 4096 ? sz : 4096)); sz > 4096 ? sz : 4096));
this->Compute = this->Compute =
...@@ -241,11 +230,11 @@ bool VAddKernelImpl<double>::useMKL(int d) { ...@@ -241,11 +230,11 @@ bool VAddKernelImpl<double>::useMKL(int d) {
template <typename T> template <typename T>
class VAddReluKernelImpl : public VAddReluKernel<T> { class VAddReluKernelImpl : public VAddReluKernel<T> {
public: public:
DECLARE_STATIC_FUNC; JITKERNEL_DECLARE_STATIC_FUNC;
explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() { explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() {
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) { if (useJIT(d)) {
size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, true, jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, true,
sz > 4096 ? sz : 4096)); sz > 4096 ? sz : 4096));
this->Compute = this->Compute =
...@@ -273,11 +262,11 @@ bool VAddReluKernelImpl<float>::useJIT(int d) { ...@@ -273,11 +262,11 @@ bool VAddReluKernelImpl<float>::useJIT(int d) {
template <typename T> template <typename T>
class VScalKernelImpl : public VScalKernel<T> { class VScalKernelImpl : public VScalKernel<T> {
public: public:
DECLARE_STATIC_FUNC; JITKERNEL_DECLARE_STATIC_FUNC;
explicit VScalKernelImpl(int d) : VScalKernel<T>() { explicit VScalKernelImpl(int d) : VScalKernel<T>() {
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) { if (useJIT(d)) {
size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 1, false, jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 1, false,
sz > 4096 ? sz : 4096)); sz > 4096 ? sz : 4096));
this->Compute = this->Compute =
...@@ -322,11 +311,11 @@ bool VScalKernelImpl<double>::useMKL(int d) { ...@@ -322,11 +311,11 @@ bool VScalKernelImpl<double>::useMKL(int d) {
template <typename T> template <typename T>
class VAddBiasKernelImpl : public VAddBiasKernel<T> { class VAddBiasKernelImpl : public VAddBiasKernel<T> {
public: public:
DECLARE_STATIC_FUNC; JITKERNEL_DECLARE_STATIC_FUNC;
explicit VAddBiasKernelImpl(int d) : VAddBiasKernel<T>() { explicit VAddBiasKernelImpl(int d) : VAddBiasKernel<T>() {
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) { if (useJIT(d)) {
size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 1, false, jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 1, false,
sz > 4096 ? sz : 4096)); sz > 4096 ? sz : 4096));
this->Compute = this->Compute =
...@@ -355,15 +344,15 @@ bool VAddBiasKernelImpl<float>::useJIT(int d) { ...@@ -355,15 +344,15 @@ bool VAddBiasKernelImpl<float>::useJIT(int d) {
template <typename T> template <typename T>
class VReluKernelImpl : public VReluKernel<T> { class VReluKernelImpl : public VReluKernel<T> {
public: public:
DECLARE_STATIC_FUNC; JITKERNEL_DECLARE_STATIC_FUNC;
explicit VReluKernelImpl(int d) : VReluKernel<T>() { explicit VReluKernelImpl(int d) : VReluKernel<T>() {
this->num_ = d; // TODO(TJ): remove me when ComputeDeprecated done
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) { if (useJIT(d)) {
size_t sz = 96 /*init*/ + size_t sz = 96 /* init size */ +
d / AVX_FLOAT_BLOCK * 4 /* instructions*/ * d / YMM_FLOAT_BLOCK * 4 /* instructions */ *
8 /*everage byte for each instruction*/; 8 /* average bytes for each instruction */;
jitcode_.reset(new gen::ReluJitCode(d, sz > 4096 ? sz : 4096)); jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::relu,
sz > 4096 ? sz : 4096));
this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>(); this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
return; return;
} }
...@@ -371,24 +360,32 @@ class VReluKernelImpl : public VReluKernel<T> { ...@@ -371,24 +360,32 @@ class VReluKernelImpl : public VReluKernel<T> {
this->Compute = VReluRefer<T>; this->Compute = VReluRefer<T>;
} }
void ComputeDeprecated(const T* x, T* y) const override {
VReluRefer(x, y, this->num_);
}
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
private: private:
std::unique_ptr<gen::ReluJitCode> jitcode_{nullptr}; std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
#endif #endif
}; };
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
template <> template <>
bool VReluKernelImpl<float>::useJIT(int d) { bool VReluKernelImpl<float>::useJIT(int d) {
return gen::ReluJitCode::init(d); return gen::VActJitCode::init(d, gen::operand_type::relu);
} }
#endif #endif
#undef DECLARE_STATIC_FUNC template <typename T>
inline void VIdentityRefer(const T* x, T* y, int n) {}
/* An empty JitKernel */
template <typename T>
class VIdentityKernelImpl : public VIdentityKernel<T> {
public:
JITKERNEL_DECLARE_STATIC_FUNC;
explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() {
this->Compute = VIdentityRefer<T>;
}
};
REGISTER_JITKERNEL(vmul, VMulKernel); REGISTER_JITKERNEL(vmul, VMulKernel);
REGISTER_JITKERNEL(vadd, VAddKernel); REGISTER_JITKERNEL(vadd, VAddKernel);
...@@ -396,16 +393,7 @@ REGISTER_JITKERNEL(vaddrelu, VAddReluKernel); ...@@ -396,16 +393,7 @@ REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
REGISTER_JITKERNEL(vscal, VScalKernel); REGISTER_JITKERNEL(vscal, VScalKernel);
REGISTER_JITKERNEL(vaddbias, VAddBiasKernel); REGISTER_JITKERNEL(vaddbias, VAddBiasKernel);
REGISTER_JITKERNEL(vrelu, VReluKernel); REGISTER_JITKERNEL(vrelu, VReluKernel);
REGISTER_JITKERNEL(videntity, VIdentityKernel);
/* An empty JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VIdentityKernelImpl : public VIdentityKernel<T> {
public:
explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() { this->num_ = d; }
void ComputeDeprecated(const T* x, T* y) const override {}
};
REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel);
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
......
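The code-buffer size heuristic that appears throughout these kernels (96 bytes of prologue, then roughly 4 instructions of about 8 bytes each per 8-float YMM block, with a 4096-byte floor) is worth spelling out with concrete numbers; the restatement below is illustrative only:

#include <cstddef>

// Illustrative restatement of the JIT code-size heuristic used above.
constexpr std::size_t EstimateCodeSize(int d) {
  return 96 + static_cast<std::size_t>(d) / 8 /* YMM_FLOAT_BLOCK */ *
                  4 /* instructions per block */ * 8 /* avg bytes per instruction */;
}
// EstimateCodeSize(256)  == 1120, below the 4096-byte floor, so 4096 bytes are allocated;
// EstimateCodeSize(1024) == 4192, above the floor, so 4192 bytes are allocated.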
...@@ -105,14 +105,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -105,14 +105,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
int tag_num) \ int tag_num) \
: CRFDecodeKernel<float>() { \ : CRFDecodeKernel<float>() { \
this->num_ = tag_num; \ this->num_ = tag_num; \
this->end_ = this->num_ / AVX_FLOAT_BLOCK; \ this->end_ = this->num_ / YMM_FLOAT_BLOCK; \
this->rest_ = this->num_ % AVX_FLOAT_BLOCK; \ this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \
} \ } \
template <> \ template <> \
void CRFDecodeKernelImpl<float, jit::avx, block>::Compute( \ void CRFDecodeKernelImpl<float, jit::avx, block>::Compute( \
const int seq_len, const float* x, const float* w, float* alpha, \ const int seq_len, const float* x, const float* w, float* alpha, \
int* track) const { \ int* track) const { \
INIT_ALPHA(AVX_FLOAT_BLOCK) \ INIT_ALPHA(YMM_FLOAT_BLOCK) \
/* Use the column-major strategy to get the location of maximum score.*/ \ /* Use the column-major strategy to get the location of maximum score.*/ \
int seq_offset = 0; \ int seq_offset = 0; \
constexpr int state_trans_base_idx = 2; \ constexpr int state_trans_base_idx = 2; \
...@@ -150,7 +150,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -150,7 +150,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
max_score = _mm256_max_ps(max_score, score_v); \ max_score = _mm256_max_ps(max_score, score_v); \
trans_offset += this->num_; \ trans_offset += this->num_; \
} \ } \
UPDATE_ALPHA(AVX_FLOAT_BLOCK) \ UPDATE_ALPHA(YMM_FLOAT_BLOCK) \
} \ } \
seq_offset += this->num_; \ seq_offset += this->num_; \
} \ } \
...@@ -161,14 +161,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -161,14 +161,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
CRFDecodeKernelImpl<float, isa, block>::CRFDecodeKernelImpl(int tag_num) \ CRFDecodeKernelImpl<float, isa, block>::CRFDecodeKernelImpl(int tag_num) \
: CRFDecodeKernel<float>() { \ : CRFDecodeKernel<float>() { \
this->num_ = tag_num; \ this->num_ = tag_num; \
this->end_ = this->num_ / AVX2_FLOAT_BLOCK; \ this->end_ = this->num_ / YMM_FLOAT_BLOCK; \
this->rest_ = this->num_ % AVX2_FLOAT_BLOCK; \ this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \
} \ } \
template <> \ template <> \
void CRFDecodeKernelImpl<float, isa, block>::Compute( \ void CRFDecodeKernelImpl<float, isa, block>::Compute( \
const int seq_len, const float* x, const float* w, float* alpha, \ const int seq_len, const float* x, const float* w, float* alpha, \
int* track) const { \ int* track) const { \
INIT_ALPHA(AVX2_FLOAT_BLOCK) \ INIT_ALPHA(YMM_FLOAT_BLOCK) \
/* Use the column-major strategy to get the location of maximum score.*/ \ /* Use the column-major strategy to get the location of maximum score.*/ \
int seq_offset = 0; \ int seq_offset = 0; \
constexpr int state_trans_base_idx = 2; \ constexpr int state_trans_base_idx = 2; \
...@@ -196,7 +196,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -196,7 +196,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
max_score = _mm256_max_ps(max_score, score_v); \ max_score = _mm256_max_ps(max_score, score_v); \
trans_offset += this->num_; \ trans_offset += this->num_; \
} \ } \
UPDATE_ALPHA(AVX2_FLOAT_BLOCK) \ UPDATE_ALPHA(YMM_FLOAT_BLOCK) \
} \ } \
seq_offset += this->num_; \ seq_offset += this->num_; \
} \ } \
...@@ -208,14 +208,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -208,14 +208,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
int tag_num) \ int tag_num) \
: CRFDecodeKernel<float>() { \ : CRFDecodeKernel<float>() { \
this->num_ = tag_num; \ this->num_ = tag_num; \
this->end_ = this->num_ / AVX512_FLOAT_BLOCK; \ this->end_ = this->num_ / ZMM_FLOAT_BLOCK; \
this->rest_ = this->num_ % AVX512_FLOAT_BLOCK; \ this->rest_ = this->num_ % ZMM_FLOAT_BLOCK; \
} \ } \
template <> \ template <> \
void CRFDecodeKernelImpl<float, jit::avx512f, block>::Compute( \ void CRFDecodeKernelImpl<float, jit::avx512f, block>::Compute( \
const int seq_len, const float* x, const float* w, float* alpha, \ const int seq_len, const float* x, const float* w, float* alpha, \
int* track) const { \ int* track) const { \
INIT_ALPHA(AVX512_FLOAT_BLOCK) \ INIT_ALPHA(ZMM_FLOAT_BLOCK) \
/* Use the column-major strategy to get the location of maximum score.*/ \ /* Use the column-major strategy to get the location of maximum score.*/ \
int seq_offset = 0; \ int seq_offset = 0; \
constexpr int state_trans_base_idx = 2; \ constexpr int state_trans_base_idx = 2; \
...@@ -250,7 +250,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -250,7 +250,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
this->num_ + j_offset), \ this->num_ + j_offset), \
max_j); \ max_j); \
/* Calculate the offset of next step*/ \ /* Calculate the offset of next step*/ \
j_offset += AVX512_FLOAT_BLOCK; \ j_offset += ZMM_FLOAT_BLOCK; \
if (j == this->end_ - 1) { \ if (j == this->end_ - 1) { \
if (this->rest_ > 0) { \ if (this->rest_ > 0) { \
j_offset += last_offset; \ j_offset += last_offset; \
......
...@@ -16,6 +16,11 @@ limitations under the License. */ ...@@ -16,6 +16,11 @@ limitations under the License. */
#include <cmath> // for exp #include <cmath> // for exp
#include <string> #include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h" #include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef PADDLE_WITH_XBYAK
#include "paddle/fluid/operators/math/jit_code.h"
#endif
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h" #include "paddle/fluid/platform/dynload/mklml.h"
#endif #endif
...@@ -30,38 +35,238 @@ namespace math { ...@@ -30,38 +35,238 @@ namespace math {
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit; namespace jit = platform::jit;
// TODO(TJ): move the reference code into one file
// Reference code focuses only on correctness
template <typename T>
void VExpRefer(const T* x, T* y, int n) {
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
}
template <typename T>
void VSigmoidRefer(const T* x, T* y, int n) {
// y = 1 / (1 + e^-x)
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < n; ++i) {
T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
}
}
template <typename T>
void VTanhRefer(const T* x, T* y, int n) {
// y = 2 * sigmoid(2x) - 1
for (int i = 0; i < n; ++i) {
y[i] = static_cast<T>(2) * x[i];
}
VSigmoidRefer(y, y, n);
for (int i = 0; i < n; ++i) {
y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
}
}
#ifdef PADDLE_WITH_MKLML
// try to use MKL to speed this up
template <typename T>
void VExpMKL(const T* x, T* y, int n);
template <>
void VExpMKL<float>(const float* x, float* y, int n) {
platform::dynload::vsExp(n, x, y);
}
template <>
void VExpMKL<double>(const double* x, double* y, int n) {
platform::dynload::vdExp(n, x, y);
}
template <typename T>
void VSigmoidMKL(const T* x, T* y, int n) {
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < n; ++i) {
y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = static_cast<T>(0) - y[i];
}
VExpMKL(y, y, n);
for (int i = 0; i < n; ++i) {
y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
}
}
template <typename T>
void VTanhMKL(const T* x, T* y, int n) {
for (int i = 0; i < n; ++i) {
y[i] = static_cast<T>(2) * x[i];
}
VSigmoidMKL(y, y, n);
for (int i = 0; i < n; ++i) {
y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
}
}
#endif
/* VExp JitKernel */ /* VExp JitKernel */
template <typename T, jit::cpu_isa_t isa, jit_block> template <typename T>
class VExpKernelImpl : public VExpKernel<T> { class VExpKernelImpl : public VExpKernel<T> {
public: public:
explicit VExpKernelImpl(int d) : VExpKernel<T>() { this->num_ = d; } JITKERNEL_DECLARE_STATIC_FUNC;
void ComputeDeprecated(const T* x, T* y) const override { explicit VExpKernelImpl(int d) : VExpKernel<T>() {
for (int i = 0; i < this->num_; ++i) { #ifdef PADDLE_WITH_XBYAK
y[i] = std::exp(x[i]); if (useJIT(d)) {
size_t sz = 96 + d / YMM_FLOAT_BLOCK * 70 * 8;
jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::exp,
sz > 4096 ? sz : 4096));
this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
return;
}
#endif
#ifdef PADDLE_WITH_MKLML
if (useMKL(d)) {
this->Compute = VExpMKL<T>;
return;
} }
#endif
this->Compute = VExpRefer<T>;
} }
#ifdef PADDLE_WITH_XBYAK
private:
std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
#endif
}; };
#ifdef PADDLE_WITH_XBYAK
template <>
bool VExpKernelImpl<float>::useJIT(int d) {
return gen::VActJitCode::init(d, gen::operand_type::exp);
}
#endif
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \ template <>
template <> \ bool VExpKernelImpl<float>::useMKL(int d) {
void VExpKernelImpl<float, isa, block>::ComputeDeprecated(const float* x, \ return d > 512;
float* y) const { \ }
platform::dynload::vsExp(this->num_, x, y); \
template <>
bool VExpKernelImpl<double>::useMKL(int d) {
return true;
}
#endif
/* VSigmoid JitKernel */
template <typename T>
class VSigmoidKernelImpl : public VSigmoidKernel<T> {
public:
JITKERNEL_DECLARE_STATIC_FUNC;
explicit VSigmoidKernelImpl(int d) : VSigmoidKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) {
size_t sz = 96 + d / YMM_FLOAT_BLOCK * 82 * 8;
jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::sigmoid,
sz > 4096 ? sz : 4096));
this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
return;
}
#endif
#ifdef PADDLE_WITH_MKLML
// the MKL implementation is strictly better than the refer one, so prefer it
if (useMKL(d)) {
this->Compute = VSigmoidMKL<T>;
return;
}
#endif
this->Compute = VSigmoidRefer<T>;
} }
#define MKL_DOUBLE(isa, block) \ #ifdef PADDLE_WITH_XBYAK
template <> \
void VExpKernelImpl<double, isa, block>::ComputeDeprecated( \ private:
const double* x, double* y) const { \ std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
platform::dynload::vdExp(this->num_, x, y); \ #endif
};
#ifdef PADDLE_WITH_XBYAK
template <>
bool VSigmoidKernelImpl<float>::useJIT(int d) {
return gen::VActJitCode::init(d, gen::operand_type::sigmoid);
}
#endif
#ifdef PADDLE_WITH_MKLML
template <>
bool VSigmoidKernelImpl<float>::useMKL(int d) {
return d > 512;
}
template <>
bool VSigmoidKernelImpl<double>::useMKL(int d) {
return true;
}
#endif
/* VTanh JitKernel */
template <typename T>
class VTanhKernelImpl : public VTanhKernel<T> {
public:
JITKERNEL_DECLARE_STATIC_FUNC;
explicit VTanhKernelImpl(int d) : VTanhKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) {
size_t sz = 96 + d / YMM_FLOAT_BLOCK * 84 * 8;
jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::tanh,
sz > 4096 ? sz : 4096));
this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
return;
}
#endif
#ifdef PADDLE_WITH_MKLML
// the MKL implementation is strictly better than the refer one, so prefer it
if (useMKL(d)) {
this->Compute = VTanhMKL<T>;
return;
}
#endif
this->Compute = VTanhRefer<T>;
} }
FOR_EACH_ISA(MKL_FLOAT, kLT8);
FOR_EACH_ISA(MKL_FLOAT, kGT8LT16); #ifdef PADDLE_WITH_XBYAK
FOR_EACH_ISA(MKL_FLOAT, kGT16);
FOR_EACH_ISA_BLOCK(MKL_DOUBLE); private:
std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
#endif
};
#ifdef PADDLE_WITH_XBYAK
template <>
bool VTanhKernelImpl<float>::useJIT(int d) {
return gen::VActJitCode::init(d, gen::operand_type::tanh);
}
#endif
#ifdef PADDLE_WITH_MKLML
template <>
bool VTanhKernelImpl<float>::useMKL(int d) {
return d > 512;
}
template <>
bool VTanhKernelImpl<double>::useMKL(int d) {
return true;
}
#endif #endif
REGISTER_JITKERNEL(vexp, VExpKernel);
REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel);
REGISTER_JITKERNEL(vtanh, VTanhKernel);
namespace detail { namespace detail {
#ifdef __AVX__ #ifdef __AVX__
...@@ -210,334 +415,6 @@ __m256 ExpAVX2(__m256 x) { ...@@ -210,334 +415,6 @@ __m256 ExpAVX2(__m256 x) {
#endif #endif
} // namespace detail } // namespace detail
#define INTRI8_FLOAT(isa, expisa) \
template <> \
void VExpKernelImpl<float, isa, kEQ8>::ComputeDeprecated(const float* x, \
float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
_mm256_storeu_ps(y, expisa(tmp)); \
}
#define INTRI16_FLOAT(isa, expisa) \
template <> \
void VExpKernelImpl<float, isa, kEQ16>::ComputeDeprecated(const float* x, \
float* y) const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = expisa(tmp0); \
tmp1 = expisa(tmp1); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx, detail::ExpAVX);
INTRI16_FLOAT(jit::avx, detail::ExpAVX);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx2, detail::ExpAVX2);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
REGISTER_JITKERNEL_DEPRECATED(vexp, VExpKernel);
/* VSigmoid JitKernel */
template <typename T, jit::cpu_isa_t isa, jit_block>
class VSigmoidKernelImpl : public VSigmoidKernel<T> {
public:
explicit VSigmoidKernelImpl(int d) : VSigmoidKernel<T>() {
this->num_ = d;
vexp_ = KernelPool::Instance().template Get<VExpKernel<T>>(d);
}
void ComputeDeprecated(const T* x, T* y) const override {
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < this->num_; ++i) {
y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = static_cast<T>(0) - y[i];
}
vexp_->ComputeDeprecated(y, y);
for (int i = 0; i < this->num_; ++i) {
y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
}
}
private:
std::shared_ptr<const VExpKernel<T>> vexp_;
};
#define INTRI_SIGMOID(tmp, min, max, expisa) \
tmp = _mm256_max_ps(tmp, min); \
tmp = _mm256_min_ps(tmp, max); \
tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \
tmp = expisa(tmp); \
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp)
#define INTRI8_FLOAT(isa, expisa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ8>::ComputeDeprecated( \
const float* x, float* y) const { \
/* TODO(TJ): try to use static const*/ \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max, expisa); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa, expisa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ16>::ComputeDeprecated( \
const float* x, float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_SIGMOID(tmp0, min, max, expisa); \
INTRI_SIGMOID(tmp1, min, max, expisa); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa, expisa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT8LT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vexp_ = \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::ComputeDeprecated( \
const float* x, float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max, expisa); \
_mm256_storeu_ps(y, tmp); \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->ComputeDeprecated(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#define INTRI_GT16_FLOAT(isa, expisa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vexp_ = \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::ComputeDeprecated( \
const float* x, float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_SIGMOID(tmp, min, max, expisa); \
_mm256_storeu_ps(y + i, tmp); \
} \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->ComputeDeprecated(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx, detail::ExpAVX);
INTRI16_FLOAT(jit::avx, detail::ExpAVX);
INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX);
INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx2, detail::ExpAVX2);
// maybe use avx at gt8lt16 and gt16
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
// maybe use avx2 at gt8lt16 and gt16
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
#undef INTRI_GT16_FLOAT
#undef INTRI_VSIGMOID
REGISTER_JITKERNEL_DEPRECATED(vsigmoid, VSigmoidKernel);
/* VTanh JitKernel */
template <typename T, jit::cpu_isa_t isa, jit_block>
class VTanhKernelImpl : public VTanhKernel<T> {
public:
explicit VTanhKernelImpl(int d) : VTanhKernel<T>() {
this->num_ = d;
vscal_ = KernelPool::Instance().template Get<VScalKernel<T>>(d);
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<T>>(d);
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<T>>(d);
}
void ComputeDeprecated(const T* x, T* y) const override {
const T a = static_cast<T>(2), b = static_cast<T>(-1);
vscal_->Compute(&a, x, y, this->num_);
vsigmoid_->ComputeDeprecated(y, y);
vscal_->Compute(&a, y, y, this->num_);
vaddbias_->Compute(&b, y, y, this->num_);
}
private:
std::shared_ptr<const VScalKernel<T>> vscal_;
std::shared_ptr<const VSigmoidKernel<T>> vsigmoid_;
std::shared_ptr<const VAddBiasKernel<T>> vaddbias_;
};
#define INTRI_VTANH(tmp, expisa) \
tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp); \
tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \
tmp = expisa(tmp); \
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \
tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f))
#define INTRI8_FLOAT(isa, expisa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ8>::ComputeDeprecated(const float* x, \
float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp, expisa); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa, expisa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ16>::ComputeDeprecated(const float* x, \
float* y) const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_VTANH(tmp0, expisa); \
INTRI_VTANH(tmp1, expisa); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa, expisa) \
template <> \
VTanhKernelImpl<float, isa, kGT8LT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT8LT16>::ComputeDeprecated( \
const float* x, float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp, expisa); \
_mm256_storeu_ps(y, tmp); \
x += AVX_FLOAT_BLOCK; \
y += AVX_FLOAT_BLOCK; \
const float a = 2.f, b = -1.f; \
vscal_->Compute(&a, x, y, this->num_); \
vsigmoid_->ComputeDeprecated(y, y); \
vscal_->Compute(&a, y, y, this->num_); \
vaddbias_->Compute(&b, y, y, this->num_); \
}
#define INTRI_GT16_FLOAT(isa, expisa) \
template <> \
VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT16>::ComputeDeprecated(const float* x, \
float* y) const { \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_VTANH(tmp, expisa); \
_mm256_storeu_ps(y + i, tmp); \
} \
x += this->end_; \
y += this->end_; \
const float a = 2.f, b = -1.f; \
vscal_->Compute(&a, x, y, this->num_); \
vsigmoid_->ComputeDeprecated(y, y); \
vscal_->Compute(&a, y, y, this->num_); \
vaddbias_->Compute(&b, y, y, this->num_); \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx, detail::ExpAVX);
INTRI16_FLOAT(jit::avx, detail::ExpAVX);
INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX);
INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx2, detail::ExpAVX2);
// maybe use avx at gt8lt16 and gt16
#endif
#ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
// maybe use avx at gt8lt16 and gt16
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
#undef INTRI_GT16_FLOAT
#undef INTRI_VTANH
REGISTER_JITKERNEL_DEPRECATED(vtanh, VTanhKernel);
#undef JITKERNEL_NEW_ACT_IMPL
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
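The hunk above replaces the per-ISA ComputeDeprecated specializations with a single Compute function pointer chosen once in the constructor: generated xbyak code when it can be built for the size, the MKL routine for large sizes (d > 512 for float, always for double), and the refer implementation otherwise. A minimal usage sketch, assuming jit_kernel.h is the right header for KernelPool:

#include <vector>
#include "paddle/fluid/operators/math/jit_kernel.h"  // assumed include path

void VExpExample() {
  namespace jk = paddle::operators::math::jitkernel;
  const int n = 256;
  std::vector<float> x(n, 1.0f), y(n);
  // Get a cached kernel for this size; Compute is a plain function pointer
  // that already points at the JIT, MKL, or refer implementation.
  const auto& ker = jk::KernelPool::Instance().Get<jk::VExpKernel<float>>(n);
  ker->Compute(x.data(), y.data(), n);  // y[i] = exp(x[i])
}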
...@@ -15,12 +15,20 @@ limitations under the License. */ ...@@ -15,12 +15,20 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
#define JITKERNEL_DECLARE_STATIC_FUNC \
static inline std::string name(int d) { \
PADDLE_THROW("DType should be either float or double"); \
} \
static inline bool useJIT(int d) { return false; } \
static inline bool useMKL(int d) { return false; }
#define JITKERNEL_DEFINE_NAME(ker_key, ker_class) \ #define JITKERNEL_DEFINE_NAME(ker_key, ker_class) \
template <> \ template <> \
std::string ker_class##Impl<float>::name(int d) { \ std::string ker_class##Impl<float>::name(int d) { \
...@@ -86,17 +94,17 @@ namespace jitkernel { ...@@ -86,17 +94,17 @@ namespace jitkernel {
namespace jit = platform::jit; namespace jit = platform::jit;
// TODO(TJ): below defines are deprecated, will be removed soon // TODO(TJ): below defines are deprecated, will be removed soon
#define SEARCH_BLOCK(macro_, ker, dtype, isa) \ #define SEARCH_BLOCK(macro_, ker, dtype, isa) \
if (d < AVX_FLOAT_BLOCK) { \ if (d < YMM_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kLT8); \ macro_(ker, dtype, isa, kLT8); \
} else if (d == AVX_FLOAT_BLOCK) { \ } else if (d == YMM_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kEQ8); \ macro_(ker, dtype, isa, kEQ8); \
} else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \ } else if (d > YMM_FLOAT_BLOCK && d < ZMM_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kGT8LT16); \ macro_(ker, dtype, isa, kGT8LT16); \
} else if (d == AVX512_FLOAT_BLOCK) { \ } else if (d == ZMM_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kEQ16); \ macro_(ker, dtype, isa, kEQ16); \
} else { \ } else { \
macro_(ker, dtype, isa, kGT16); \ macro_(ker, dtype, isa, kGT16); \
} }
#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ #define SEARCH_ISA_BLOCK(macro_, ker, dtype) \
......
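The (deprecated) SEARCH_BLOCK macro above only swaps the constants it compares against; the dispatch itself still buckets a vector length into five block classes. An equivalent plain-function sketch, with illustrative stand-ins for the kLT8..kGT16 constants used in the code:

enum BlockClass { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 };  // illustrative names

inline BlockClass ClassifyBlock(int d) {
  if (d < 8) return kLT8;       // d <  YMM_FLOAT_BLOCK
  if (d == 8) return kEQ8;      // d == YMM_FLOAT_BLOCK
  if (d < 16) return kGT8LT16;  // YMM_FLOAT_BLOCK < d < ZMM_FLOAT_BLOCK
  if (d == 16) return kEQ16;    // d == ZMM_FLOAT_BLOCK
  return kGT16;                 // d >  ZMM_FLOAT_BLOCK
}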
...@@ -175,26 +175,26 @@ class LSTMKernelImpl : public LSTMKernel<T> { ...@@ -175,26 +175,26 @@ class LSTMKernelImpl : public LSTMKernel<T> {
void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
T* checked) const override { T* checked) const override {
// gates: W_ch, W_ih, W_fh, W_oh // gates: W_ch, W_ih, W_fh, W_oh
act_gate_d3_->ComputeDeprecated(gates + d_, gates + d_); act_gate_d3_->Compute(gates + d_, gates + d_, d3_);
/* C_t = C_t-1 * fgated + cand_gated * igated */ /* C_t = C_t-1 * fgated + cand_gated * igated */
act_cand_d_->ComputeDeprecated(gates, gates); act_cand_d_->Compute(gates, gates, d_);
vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
act_cell_d_->ComputeDeprecated(ct, gates + d2_); act_cell_d_->Compute(ct, gates + d2_, d_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
} }
void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
/* C_t = igated * cgated*/ /* C_t = igated * cgated*/
act_gate_d_->ComputeDeprecated(gates + d_, gates + d_); act_gate_d_->Compute(gates + d_, gates + d_, d_);
act_cand_d_->ComputeDeprecated(gates, gates); act_cand_d_->Compute(gates, gates, d_);
vmul_d_->Compute(gates, gates + d_, ct, d_); vmul_d_->Compute(gates, gates + d_, ct, d_);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); act_gate_d_->Compute(gates + d3_, gates + d3_, d_);
act_cell_d_->ComputeDeprecated(ct, gates + d2_); act_cell_d_->Compute(ct, gates + d2_, d_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
} }
...@@ -292,32 +292,32 @@ class PeepholeKernelImpl : public LSTMKernel<T> { ...@@ -292,32 +292,32 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
vmul_d_->Compute(wp_data, ct_1, checked, d_); vmul_d_->Compute(wp_data, ct_1, checked, d_);
vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_);
vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_); vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_);
act_gate_d2_->ComputeDeprecated(gates + d_, gates + d_); act_gate_d2_->Compute(gates + d_, gates + d_, d2_);
/* C_t = C_t-1 * fgated + cand_gated * igated*/ /* C_t = C_t-1 * fgated + cand_gated * igated*/
act_cand_d_->ComputeDeprecated(gates, gates); act_cand_d_->Compute(gates, gates, d_);
vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
/* get ogated*/ /* get ogated*/
vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); act_gate_d_->Compute(gates + d3_, gates + d3_, d_);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
act_cell_d_->ComputeDeprecated(ct, gates + d2_); act_cell_d_->Compute(ct, gates + d2_, d_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
} }
void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
/* C_t = igated * cgated*/ /* C_t = igated * cgated*/
act_gate_d_->ComputeDeprecated(gates + d_, gates + d_); act_gate_d_->Compute(gates + d_, gates + d_, d_);
act_cand_d_->ComputeDeprecated(gates, gates); act_cand_d_->Compute(gates, gates, d_);
vmul_d_->Compute(gates, gates + d_, ct, d_); vmul_d_->Compute(gates, gates + d_, ct, d_);
/* get outgated, put W_oc * C_t on igated */ /* get outgated, put W_oc * C_t on igated */
vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); act_gate_d_->Compute(gates + d3_, gates + d3_, d_);
act_cell_d_->ComputeDeprecated(ct, gates + d2_); act_cell_d_->Compute(ct, gates + d2_, d_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
} }
...@@ -376,20 +376,20 @@ class GRUKernelImpl : public GRUKernel<T> { ...@@ -376,20 +376,20 @@ class GRUKernelImpl : public GRUKernel<T> {
} }
void ComputeH1(T* gates, T* ht) const override { void ComputeH1(T* gates, T* ht) const override {
act_gate_d_->ComputeDeprecated(gates, gates); act_gate_d_->Compute(gates, gates, d_);
act_state_d_->ComputeDeprecated(gates + d2_, gates + d2_); act_state_d_->Compute(gates + d2_, gates + d2_, d_);
vmul_d_->Compute(gates, gates + d2_, ht, d_); vmul_d_->Compute(gates, gates + d2_, ht, d_);
} }
void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override {
// W: {W_update, W_reset; W_state} // W: {W_update, W_reset; W_state}
act_gate_d2_->ComputeDeprecated(gates, gates); act_gate_d2_->Compute(gates, gates, d2_);
vmul_d_->Compute(ht_1, gates + d_, ht, d_); vmul_d_->Compute(ht_1, gates + d_, ht, d_);
} }
void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override { void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override {
T* y = gates + d2_; T* y = gates + d2_;
act_state_d_->ComputeDeprecated(y, y); act_state_d_->Compute(y, y, d_);
// out = zt*ht~ + (1-zt)*ht_1 // out = zt*ht~ + (1-zt)*ht_1
for (int i = 0; i < d_; ++i) { for (int i = 0; i < d_; ++i) {
ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i]; ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
......
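The calls above only switch from ComputeDeprecated(x, y) to Compute(x, y, n) with an explicit length; the LSTM math is unchanged. A scalar reference sketch of what ComputeCtHt computes for one step, assuming the gate layout [cand, i, f, o] implied by the d_, d2_, d3_ offsets:

void LstmCtHtRef(float* gates, const float* ct_1, float* ct, float* ht, int d,
                 float (*act_gate)(float), float (*act_cand)(float),
                 float (*act_cell)(float)) {
  float* cand = gates;       // candidate pre-activations
  float* i = gates + d;      // input gate
  float* f = gates + 2 * d;  // forget gate
  float* o = gates + 3 * d;  // output gate
  for (int k = 0; k < d; ++k) {
    i[k] = act_gate(i[k]);
    f[k] = act_gate(f[k]);
    o[k] = act_gate(o[k]);
    cand[k] = act_cand(cand[k]);
    ct[k] = ct_1[k] * f[k] + cand[k] * i[k];  // C_t = C_{t-1} * f + cand * i
    ht[k] = act_cell(ct[k]) * o[k];           // H_t = act_cell(C_t) * o
  }
}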
...@@ -181,7 +181,8 @@ TEST(JitKernel, vexp) { ...@@ -181,7 +181,8 @@ TEST(JitKernel, vexp) {
auto ttgts = GetCurrentUS(); auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
ker->ComputeDeprecated(x_data, ztgt_data); // ker->Compute(x_data, ztgt_data);
ker->Compute(x_data, ztgt_data, d);
} }
auto ttgte = GetCurrentUS(); auto ttgte = GetCurrentUS();
...@@ -222,7 +223,7 @@ void vsigmoid_better( ...@@ -222,7 +223,7 @@ void vsigmoid_better(
y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = 0.f - y[i]; y[i] = 0.f - y[i];
} }
vexp->ComputeDeprecated(y, y); vexp->Compute(y, y, n);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = 1.f / (1.f + y[i]); y[i] = 1.f / (1.f + y[i]);
} }
...@@ -253,7 +254,7 @@ TEST(JitKernel, vsigmoid) { ...@@ -253,7 +254,7 @@ TEST(JitKernel, vsigmoid) {
auto trefe = GetCurrentUS(); auto trefe = GetCurrentUS();
auto ttgts = GetCurrentUS(); auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
ker->ComputeDeprecated(x_data, ztgt_data); ker->Compute(x_data, ztgt_data, d);
} }
auto ttgte = GetCurrentUS(); auto ttgte = GetCurrentUS();
...@@ -287,7 +288,7 @@ void vtanh_better( ...@@ -287,7 +288,7 @@ void vtanh_better(
const int n, const float* x, float* y) { const int n, const float* x, float* y) {
const float a = 2.f, b = -1.f; const float a = 2.f, b = -1.f;
vscal->Compute(&a, x, y, n); vscal->Compute(&a, x, y, n);
vsigmoid->ComputeDeprecated(y, y); vsigmoid->Compute(y, y, n);
vscal->Compute(&a, y, y, n); vscal->Compute(&a, y, y, n);
vaddbias->Compute(&b, y, y, n); vaddbias->Compute(&b, y, y, n);
} }
...@@ -321,7 +322,7 @@ TEST(JitKernel, vtanh) { ...@@ -321,7 +322,7 @@ TEST(JitKernel, vtanh) {
auto trefe = GetCurrentUS(); auto trefe = GetCurrentUS();
auto ttgts = GetCurrentUS(); auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
ker->ComputeDeprecated(x_data, ztgt_data); ker->Compute(x_data, ztgt_data, d);
} }
auto ttgte = GetCurrentUS(); auto ttgte = GetCurrentUS();
...@@ -344,8 +345,8 @@ void lstm_ctht_ref( ...@@ -344,8 +345,8 @@ void lstm_ctht_ref(
const std::shared_ptr< const std::shared_ptr<
const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp_1, const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp_1,
const int d, float* gates, const float* ct_1, float* ct, float* ht) { const int d, float* gates, const float* ct_1, float* ct, float* ht) {
vsigmoid_3d->ComputeDeprecated(gates + d, gates + d); vsigmoid_3d->Compute(gates + d, gates + d, 3 * d);
vtanh_d->ComputeDeprecated(gates, gates); vtanh_d->Compute(gates, gates, d);
const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3;
const float min = SIGMOID_THRESHOLD_MIN; const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX; const float max = SIGMOID_THRESHOLD_MAX;
...@@ -355,7 +356,7 @@ void lstm_ctht_ref( ...@@ -355,7 +356,7 @@ void lstm_ctht_ref(
// H_t = act_cell(C_t) * ogated // H_t = act_cell(C_t) * ogated
float tmp = ct[k] * 2; float tmp = ct[k] * 2;
tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
vexp_1->ComputeDeprecated(&tmp, &tmp); vexp_1->Compute(&tmp, &tmp, 1);
tmp = 2.f / (1.f + tmp) - 1.f; tmp = 2.f / (1.f + tmp) - 1.f;
ht[k] = tmp * o[k]; ht[k] = tmp * o[k];
} }
...@@ -373,13 +374,13 @@ void lstm_ctht_better( ...@@ -373,13 +374,13 @@ void lstm_ctht_better(
const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd_d, const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd_d,
const int d, float* gates, const float* ct_1, float* ct, float* ht) { const int d, float* gates, const float* ct_1, float* ct, float* ht) {
int d2 = d * 2; int d2 = d * 2;
vsigmoid_3d->ComputeDeprecated(gates + d, gates + d); vsigmoid_3d->Compute(gates + d, gates + d, 3 * d);
vtanh_d->ComputeDeprecated(gates, gates); vtanh_d->Compute(gates, gates, d);
vmul_d->Compute(gates, gates + d, gates + d, d); vmul_d->Compute(gates, gates + d, gates + d, d);
vmul_d->Compute(ct_1, gates + d2, gates + d2, d); vmul_d->Compute(ct_1, gates + d2, gates + d2, d);
vadd_d->Compute(gates + d, gates + d2, ct, d); vadd_d->Compute(gates + d, gates + d2, ct, d);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
vtanh_d->ComputeDeprecated(ct, gates + d2); vtanh_d->Compute(ct, gates + d2, d);
vmul_d->Compute(gates + d2, gates + d * 3, ht, d); vmul_d->Compute(gates + d2, gates + d * 3, ht, d);
} }
...@@ -736,7 +737,7 @@ void vaddrelu_better( ...@@ -736,7 +737,7 @@ void vaddrelu_better(
const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu, const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu,
const float* x, const float* y, float* z, int d) { const float* x, const float* y, float* z, int d) {
vadd->Compute(x, y, z, d); vadd->Compute(x, y, z, d);
vrelu->ComputeDeprecated(z, z); vrelu->Compute(z, z, d);
} }
TEST(JitKernel, vaddrelu) { TEST(JitKernel, vaddrelu) {
......
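The vtanh_better helper in the benchmark composes vscal, vsigmoid and vaddbias through the identity tanh(x) = 2 * sigmoid(2x) - 1, the same identity the refer code uses. A scalar sketch of the three steps (the threshold clamping inside vsigmoid is omitted here):

#include <cmath>

float TanhViaSigmoid(float x) {
  float y = 2.0f * x;                // vscal:    y = 2x
  y = 1.0f / (1.0f + std::exp(-y));  // vsigmoid: y = sigmoid(2x)
  return 2.0f * y - 1.0f;            // vscal + vaddbias: 2 * y - 1
}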
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -13,52 +13,46 @@ See the License for the specific language governing permissions and ...@@ -13,52 +13,46 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/operators/math/sampler.h"
#include <iostream>
#include <queue>
#include <utility>
#include <vector>
namespace paddle { namespace paddle {
namespace random { namespace operators {
namespace math {
Sampler::~Sampler() {} Sampler::~Sampler() {}
UniformSampler::UniformSampler(int64 range) UniformSampler::UniformSampler(int64_t range, unsigned int seed)
: Sampler(range), inv_range_(1.0 / range) { : Sampler(range, seed), inv_range_(1.0 / (range + 1)) {
random_engine_ = std::make_shared<std::mt19937>(seed_); random_engine_ = std::make_shared<std::mt19937_64>(seed_);
dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range); dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
} }
UniformSampler::UniformSampler(int64 range, unsigned int seed) int64_t UniformSampler::Sample() const { return (*dist_)(*random_engine_); }
: Sampler(range, seed), inv_range_(1.0 / range) {
random_engine_ = std::make_shared<std::mt19937>(seed_);
dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
}
int64 UniformSampler::Sample() const { return (*dist_)(*random_engine_); }
float UniformSampler::Probability(int64 value) const { return inv_range_; } float UniformSampler::Probability(int64_t value) const { return inv_range_; }
LogUniformSampler::LogUniformSampler(int64 range) LogUniformSampler::LogUniformSampler(int64_t range, unsigned int seed)
: Sampler(range), log_range_(log(range + 1)) {
random_engine_ = std::make_shared<std::mt19937>(seed_);
dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
}
LogUniformSampler::LogUniformSampler(int64 range, unsigned int seed)
: Sampler(range, seed), log_range_(log(range + 1)) { : Sampler(range, seed), log_range_(log(range + 1)) {
random_engine_ = std::make_shared<std::mt19937>(seed_); random_engine_ = std::make_shared<std::mt19937_64>(seed_);
dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1); dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
} }
int64 LogUniformSampler::Sample() const {
int64_t LogUniformSampler::Sample() const {
// Obtain the log-uniform distribution from the uniform distribution via // Obtain the log-uniform distribution from the uniform distribution via
// the inverse transform sampling method // the inverse transform sampling method
// More details: // More details:
// https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/ // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/
const int64 value = const int64_t value =
static_cast<int64>(exp((*dist_)(*random_engine_) * log_range_)) - 1; static_cast<int64_t>(exp((*dist_)(*random_engine_) * log_range_)) - 1;
// Mathematically, value should be <= range_, but might not be due to some // Mathematically, value should be <= range_, but might not be due to some
// floating point roundoff, so we mod by range_. // floating point roundoff, so we mod by range_.
return value % range_; return value % range_;
} }
float LogUniformSampler::Probability(int64 value) const { float LogUniformSampler::Probability(int64_t value) const {
// Given f(x) = 1/[(x+1) * log_range_] // Given f(x) = 1/[(x+1) * log_range_]
// The value's probability is integral of f(x) from value to (value + 1) // The value's probability is integral of f(x) from value to (value + 1)
// More details: // More details:
...@@ -66,5 +60,76 @@ float LogUniformSampler::Probability(int64 value) const { ...@@ -66,5 +60,76 @@ float LogUniformSampler::Probability(int64 value) const {
return (log((value + 2.0) / (value + 1.0))) / log_range_; return (log((value + 2.0) / (value + 1.0))) / log_range_;
} }
} // namespace random CustomSampler::CustomSampler(int64_t range, const float* probabilities,
unsigned int seed)
: Sampler(range, seed) {
random_engine_ = std::make_shared<std::mt19937_64>(seed_);
real_dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
int_dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
alias_probs_ = std::make_shared<std::vector<float>>(range + 1);
alias_ = std::make_shared<std::vector<int64_t>>(range + 1);
probs_ = std::make_shared<std::vector<float>>(range + 1);
std::queue<std::pair<int64_t, float>> bigs;
std::queue<std::pair<int64_t, float>> littles;
for (int64_t i = 0; i <= range; ++i) {
(*probs_)[i] = probabilities[i];
float normal_prob = probabilities[i] * (range + 1);
if (normal_prob - 1.0 > 1e-4) {
bigs.emplace(i, normal_prob);
} else if (1.0 - normal_prob > 1e-4) {
littles.emplace(i, normal_prob);
} else {
(*alias_probs_)[i] = normal_prob;
(*alias_)[i] = -1;
}
}
while ((!littles.empty()) && (!bigs.empty())) {
auto big = bigs.front();
auto little = littles.front();
bigs.pop();
littles.pop();
(*alias_probs_)[little.first] = little.second;
(*alias_)[little.first] = big.first;
auto big_left = big.second - (1 - little.second);
if (big_left - 1.0 > 1e-4) {
bigs.emplace(big.first, big_left);
} else if (1.0 - big_left > 1e-4) {
littles.emplace(big.first, big_left);
} else {
(*alias_probs_)[big.first] = big_left;
(*alias_)[big.first] = -1;
}
}
if (!littles.empty()) { // littles.second is close to 1.0
auto little = littles.front();
(*alias_probs_)[little.first] = 1.0;
(*alias_)[little.first] = -1;
}
if (!bigs.empty()) { // bigs.second is close to 1.0
auto big = bigs.front();
(*alias_probs_)[big.first] = 1.0;
(*alias_)[big.first] = -1;
}
}
int64_t CustomSampler::Sample() const {
auto index = (*int_dist_)(*random_engine_);
auto p = (*real_dist_)(*random_engine_);
if (p > (*alias_probs_)[index]) {
return (*alias_)[index];
} else {
return index;
}
}
float CustomSampler::Probability(int64_t value) const {
return (*probs_)[value];
}
} // namespace math
} // namespace operators
} // namespace paddle } // namespace paddle
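The CustomSampler constructor above builds Walker's alias tables: each slot i keeps a threshold alias_probs_[i] and a fallback class alias_[i], so drawing a sample costs O(1) regardless of the number of classes. A minimal self-contained sketch of that sampling step (the names here are illustrative, not the members above):

#include <cstdint>
#include <random>
#include <vector>

int64_t AliasSample(const std::vector<float>& alias_probs,
                    const std::vector<int64_t>& alias, std::mt19937_64* rng) {
  std::uniform_int_distribution<int64_t> pick(
      0, static_cast<int64_t>(alias_probs.size()) - 1);
  std::uniform_real_distribution<float> coin(0.0f, 1.0f);
  const int64_t slot = pick(*rng);
  // Keep the slot itself with probability alias_probs[slot]; otherwise fall
  // back to its alias, exactly as CustomSampler::Sample does above.
  return coin(*rng) <= alias_probs[slot] ? slot : alias[slot];
}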
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#include <cstdint> #include <cstdint>
#include <memory> #include <memory>
#include <random> #include <random>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -27,14 +29,14 @@ namespace math { ...@@ -27,14 +29,14 @@ namespace math {
*/ */
class Sampler { class Sampler {
public: public:
explicit Sampler(int64_t range) : range_(range) { explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
PADDLE_ENFORCE_GT(range, 0); // PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0.");
std::random_device r; if (seed == 0) {
seed_ = r(); std::random_device r;
} seed_ = r();
explicit Sampler(int64_t range, unsigned int seed) } else {
: range_(range), seed_(seed) { seed_ = seed;
PADDLE_ENFORCE_GT(range, 0); }
} }
virtual ~Sampler(); virtual ~Sampler();
// Sample a single value // Sample a single value
...@@ -42,7 +44,7 @@ class Sampler { ...@@ -42,7 +44,7 @@ class Sampler {
// The probability that a single call to Sample() returns the given value. // The probability that a single call to Sample() returns the given value.
virtual float Probability(int64_t value) const = 0; virtual float Probability(int64_t value) const = 0;
int64 range() { return range_; } int64_t range() { return range_; }
protected: protected:
const int64_t range_; const int64_t range_;
...@@ -56,13 +58,11 @@ class Sampler { ...@@ -56,13 +58,11 @@ class Sampler {
*/ */
class UniformSampler : public Sampler { class UniformSampler : public Sampler {
public: public:
explicit UniformSampler(int64_t range); explicit UniformSampler(int64_t range, unsigned int seed = 0UL);
explicit UniformSampler(int64_t range, unsigned int seed);
~UniformSampler() override {} ~UniformSampler() override {}
int64 Sample() const override; int64_t Sample() const override;
float Probability(int64_t value) const override; float Probability(int64_t value) const override;
...@@ -79,13 +79,11 @@ class UniformSampler : public Sampler { ...@@ -79,13 +79,11 @@ class UniformSampler : public Sampler {
*/ */
class LogUniformSampler : public Sampler { class LogUniformSampler : public Sampler {
public: public:
explicit LogUniformSampler(int64_t range); explicit LogUniformSampler(int64_t range, unsigned int seed = 0UL);
explicit LogUniformSampler(int64_t range, unsigned int seed);
~LogUniformSampler() override {} ~LogUniformSampler() override {}
int64 Sample() const override; int64_t Sample() const override;
float Probability(int64_t value) const override; float Probability(int64_t value) const override;
...@@ -95,6 +93,29 @@ class LogUniformSampler : public Sampler { ...@@ -95,6 +93,29 @@ class LogUniformSampler : public Sampler {
std::shared_ptr<std::uniform_real_distribution<>> dist_; std::shared_ptr<std::uniform_real_distribution<>> dist_;
}; };
/**
* Sample integers from [0, range] according to a custom distribution.
*/
class CustomSampler : public Sampler {
public:
explicit CustomSampler(int64_t range, const float* probabilities,
unsigned int seed = 0UL);
~CustomSampler() override {}
int64_t Sample() const override;
float Probability(int64_t value) const override;
private:
std::shared_ptr<std::vector<float>> alias_probs_;
std::shared_ptr<std::vector<int64_t>> alias_;
std::shared_ptr<std::vector<float>> probs_;
std::shared_ptr<std::mt19937_64> random_engine_;
std::shared_ptr<std::uniform_real_distribution<>> real_dist_;
std::shared_ptr<std::uniform_int_distribution<>> int_dist_;
};
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
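A minimal usage sketch of the unified interface above: every sampler now takes the range and an optional seed in a single constructor, and a seed of 0 asks for a random seed. The probabilities below are made up for illustration.

#include <cstdint>
#include <vector>
#include "paddle/fluid/operators/math/sampler.h"

void SamplerExample() {
  using paddle::operators::math::CustomSampler;
  using paddle::operators::math::LogUniformSampler;
  using paddle::operators::math::UniformSampler;

  UniformSampler uniform(/*range=*/9);  // seed 0 -> random seed
  LogUniformSampler log_uniform(/*range=*/9, /*seed=*/1);
  std::vector<float> probs(10, 0.1f);   // range + 1 entries
  CustomSampler custom(/*range=*/9, probs.data(), /*seed=*/1);

  int64_t c = custom.Sample();      // class in [0, 9]
  float p = custom.Probability(c);  // == probs[c]
  (void)uniform; (void)log_uniform; (void)p;
}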
include(operators)
register_operators()
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/accuracy_op.h" #include "paddle/fluid/operators/metrics/accuracy_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include <thrust/execution_policy.h> #include <thrust/execution_policy.h>
#include <thrust/reduce.h> #include <thrust/reduce.h>
#include "paddle/fluid/operators/accuracy_op.h" #include "paddle/fluid/operators/metrics/accuracy_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/auc_op.h" #include "paddle/fluid/operators/metrics/auc_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/precision_recall_op.h" #include "paddle/fluid/operators/metrics/precision_recall_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
if(WITH_GPU AND NOT WIN32) if(WITH_GPU AND NOT WIN32)
nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
endif() endif()
if(WITH_GPU)
op_library(nccl_op DEPS nccl_common)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE)
endif()
if(NOT WIN32)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
...@@ -35,6 +35,7 @@ class NCEOp : public framework::OperatorWithKernel { ...@@ -35,6 +35,7 @@ class NCEOp : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("Input"); auto x_dims = ctx->GetInputDim("Input");
auto label_dims = ctx->GetInputDim("Label"); auto label_dims = ctx->GetInputDim("Label");
auto w_dims = ctx->GetInputDim("Weight");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1; int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1;
if (ctx->HasInput("Bias")) { if (ctx->HasInput("Bias")) {
...@@ -98,6 +99,13 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -98,6 +99,13 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
"each sample. And it is a dispensable input. The default value of " "each sample. And it is a dispensable input. The default value of "
"sample is 1.") "sample is 1.")
.AsDispensable(); .AsDispensable();
AddInput(
"CustomDistribution",
"(Tensor) It is used in 'CostumDist' sampler. "
"It is a tensor with shape [num_total_classes]."
"The i-th element is the probsbility of the i-th class being sampled.")
.AsDispensable();
AddOutput("Cost", AddOutput("Cost",
"(Tensor) A tensor of shape [batch_size, 1]. Cost of samples."); "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples.");
AddOutput("SampleLogits", AddOutput("SampleLogits",
...@@ -121,6 +129,17 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -121,6 +129,17 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("num_neg_samples", AddAttr<int>("num_neg_samples",
"The number of negative classes. The default value is 10.") "The number of negative classes. The default value is 10.")
.SetDefault(10); .SetDefault(10);
AddAttr<int>("sampler",
"(int) Which sampler to be used to sample negative class."
"0: Uniform; 1: LogUniform; 2: CostumDist.")
.SetDefault(0);
AddAttr<int>("seed",
"(int) The seed used in sampler. If it is 0, "
"the sampler will generate a seed randomly.")
.SetDefault(0);
AddAttr<std::vector<int>>("custom_neg_classes", AddAttr<std::vector<int>>("custom_neg_classes",
"This attribute only be used in unitest. Classes " "This attribute only be used in unitest. Classes "
"in this list wiil be used as negative classes " "in this list wiil be used as negative classes "
......
...@@ -19,29 +19,28 @@ limitations under the License. */ ...@@ -19,29 +19,28 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/sampler.h"
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using Sampler = math::Sampler;
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void PrepareSamples(const framework::ExecutionContext& context) { void PrepareSamples(const framework::ExecutionContext& context,
Sampler* sampler) {
auto label = context.Input<Tensor>("Label"); auto label = context.Input<Tensor>("Label");
const int64_t* label_data = label->data<int64_t>(); const int64_t* label_data = label->data<int64_t>();
auto label_dims = label->dims(); auto label_dims = label->dims();
int num_total_classes = context.Attr<int>("num_total_classes"); // int num_total_classes = context.Attr<int>("num_total_classes");
// for unit test // for unit test
std::vector<int> custom_neg_classes = std::vector<int> custom_neg_classes =
context.Attr<std::vector<int>>("custom_neg_classes"); context.Attr<std::vector<int>>("custom_neg_classes");
// random machine
std::random_device rd;
std::mt19937 rng(rd());
std::uniform_int_distribution<int> rand(0, num_total_classes - 1);
auto sample_labels = context.Output<Tensor>("SampleLabels"); auto sample_labels = context.Output<Tensor>("SampleLabels");
auto sample_labels_dims = sample_labels->dims(); auto sample_labels_dims = sample_labels->dims();
...@@ -62,7 +61,7 @@ void PrepareSamples(const framework::ExecutionContext& context) { ...@@ -62,7 +61,7 @@ void PrepareSamples(const framework::ExecutionContext& context) {
} else { } else {
for (; j < sample_labels_dims[1]; ++j) { for (; j < sample_labels_dims[1]; ++j) {
// TODO(wanghaoshuang): support more distribution sampling // TODO(wanghaoshuang): support more distribution sampling
sample_labels_data[index++] = rand(rng); sample_labels_data[index++] = sampler->Sample();
} }
} }
} }
...@@ -72,7 +71,33 @@ template <typename DeviceContext, typename T> ...@@ -72,7 +71,33 @@ template <typename DeviceContext, typename T>
class NCEKernel : public framework::OpKernel<T> { class NCEKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
PrepareSamples<DeviceContext, T>(context); int sampler_type = context.Attr<int>("sampler");
int seed = context.Attr<int>("seed");
int num_total_classes = context.Attr<int>("num_total_classes");
int num_neg_samples = context.Attr<int>("num_neg_samples");
Sampler* sampler;
switch (sampler_type) {
case 0: {
sampler = new math::UniformSampler(num_total_classes - 1, seed);
break;
}
case 1: {
sampler = new math::LogUniformSampler(num_total_classes - 1, seed);
break;
}
case 2: {
auto custom_dist = context.Input<Tensor>("CustomDistribution");
const float* custom_dist_data = custom_dist->data<float>();
PADDLE_ENFORCE_EQ(custom_dist->numel(), num_total_classes);
sampler = new math::CustomSampler(num_total_classes - 1,
custom_dist_data, seed);
break;
}
default: { PADDLE_THROW("Unsupported SamplerType."); }
}
PrepareSamples<DeviceContext, T>(context, sampler);
auto sample_labels = context.Output<Tensor>("SampleLabels"); auto sample_labels = context.Output<Tensor>("SampleLabels");
const int64_t* sample_labels_data = sample_labels->data<int64_t>(); const int64_t* sample_labels_data = sample_labels->data<int64_t>();
auto sample_out = context.Output<Tensor>("SampleLogits"); auto sample_out = context.Output<Tensor>("SampleLogits");
...@@ -85,13 +110,12 @@ class NCEKernel : public framework::OpKernel<T> { ...@@ -85,13 +110,12 @@ class NCEKernel : public framework::OpKernel<T> {
} }
auto out = context.Output<Tensor>("Cost"); auto out = context.Output<Tensor>("Cost");
T* out_data = out->mutable_data<T>(context.GetPlace()); T* out_data = out->mutable_data<T>(context.GetPlace());
int num_neg_samples = context.Attr<int>("num_neg_samples");
int num_total_classes = context.Attr<int>("num_total_classes");
int64_t num_true_class = 1; int64_t num_true_class = 1;
if (label != nullptr) { if (label != nullptr) {
num_true_class = label->dims()[1]; num_true_class = label->dims()[1];
} }
T b = 1. / num_total_classes * num_neg_samples; int64_t sampled_labels_num = sample_labels->dims()[1];
// T b = 1. / num_total_classes * num_neg_samples;
// forward bias // forward bias
auto bias = context.Input<Tensor>("Bias"); auto bias = context.Input<Tensor>("Bias");
if (bias != nullptr) { if (bias != nullptr) {
...@@ -117,22 +141,17 @@ class NCEKernel : public framework::OpKernel<T> { ...@@ -117,22 +141,17 @@ class NCEKernel : public framework::OpKernel<T> {
} }
// forward cost // forward cost
for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) {
int64_t j = 0;
out_data[i] = 0; out_data[i] = 0;
T w = sample_weight == nullptr ? 1. : sample_weight_data[i]; T w = sample_weight == nullptr ? 1. : sample_weight_data[i];
// for true classes for (int64_t j = 0; j < sampled_labels_num; ++j) {
for (; j < num_true_class; ++j) { int64_t target = sample_labels_data[i * sampled_labels_num + j];
T o = sample_out_data[i * sample_out->dims()[1] + j]; T o = sample_out_data[i * sampled_labels_num + j];
T cost = -log(o / (o + b)); float b = sampler->Probability(target) * num_neg_samples;
out_data[i] += w * cost; T cost = (j < num_true_class) ? -log(o / (o + b)) : -log(b / (o + b));
}
// for sampled neg classes
for (; j < sample_labels->dims()[1]; ++j) {
T o = sample_out_data[i * sample_out->dims()[1] + j];
T cost = -log(b / (o + b));
out_data[i] += w * cost; out_data[i] += w * cost;
} }
} }
delete sampler;
} }
}; };
...@@ -158,20 +177,45 @@ class NCEGradKernel : public framework::OpKernel<T> { ...@@ -158,20 +177,45 @@ class NCEGradKernel : public framework::OpKernel<T> {
if (label != nullptr) { if (label != nullptr) {
num_true_class = label->dims()[1]; num_true_class = label->dims()[1];
} }
T b = 1. / num_total_classes * num_neg_samples;
int sampler_type = context.Attr<int>("sampler");
int seed = context.Attr<int>("seed");
Sampler* sampler;
switch (sampler_type) {
case 0: {
sampler = new math::UniformSampler(num_total_classes - 1, seed);
break;
}
case 1: {
sampler = new math::LogUniformSampler(num_total_classes - 1, seed);
break;
}
case 2: {
auto custom_dist = context.Input<Tensor>("CustomDistribution");
const float* custom_dist_data = custom_dist->data<float>();
PADDLE_ENFORCE_EQ(custom_dist->numel(), num_total_classes);
sampler = new math::CustomSampler(num_total_classes - 1,
custom_dist_data, seed);
break;
}
default: { PADDLE_THROW("Unsupported SamplerType."); }
}
// T b = 1. / num_total_classes * num_neg_samples;
Tensor sample_grad; // tmp tensor Tensor sample_grad; // tmp tensor
T* sample_grad_data = T* sample_grad_data =
sample_grad.mutable_data<T>(sample_labels->dims(), context.GetPlace()); sample_grad.mutable_data<T>(sample_labels->dims(), context.GetPlace());
// backward cost // backward cost
for (int64_t i = 0; i < sample_labels->numel(); ++i) { for (int64_t i = 0; i < sample_labels->numel(); ++i) {
int64_t label_idx = i % sample_labels->dims()[1];
int64_t sample_idx = i / sample_labels->dims()[1];
float b = sampler->Probability(sample_labels_data[i]) * num_neg_samples;
T o = sample_out_data[i]; T o = sample_out_data[i];
T w = sample_weight == nullptr T w = sample_weight == nullptr ? 1 : sample_weight_data[sample_idx];
? 1 sample_grad_data[i] = label_idx < num_true_class
: sample_weight_data[i / sample_labels->dims()[1]];
sample_grad_data[i] = (i % sample_labels->dims()[1]) < num_true_class
? w * (b / (o + b)) * (o - 1) ? w * (b / (o + b)) * (o - 1)
: w * (o * (1 - o) / (o + b)); : w * (o * (1 - o) / (o + b));
sample_grad_data[i] *= d_out_data[i / sample_labels->dims()[1]]; sample_grad_data[i] *= d_out_data[sample_idx];
} }
// get d_bias // get d_bias
auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias")); auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias"));
...@@ -207,6 +251,7 @@ class NCEGradKernel : public framework::OpKernel<T> { ...@@ -207,6 +251,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
} }
} }
delete sampler;
} }
}; };
} // namespace operators } // namespace operators
......
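In both kernels above, b is the expected noise count for the sampled class, b = P_noise(target) * num_neg_samples, taken from the sampler instead of the old uniform constant 1/num_total_classes * num_neg_samples. A scalar sketch of the per-logit cost and the derivative the backward kernel applies, treating o as the sigmoid of the sample logit (which is what the gradient formula implies):

#include <cmath>

// Per-sample-label NCE cost, matching the forward loop above.
float NceCost(float o, float b, bool is_true_class) {
  return is_true_class ? -std::log(o / (o + b))   // true-class term
                       : -std::log(b / (o + b));  // sampled-negative term
}

// Derivative of the cost w.r.t. the pre-sigmoid logit, matching
// sample_grad_data in the backward kernel above.
float NceGrad(float o, float b, bool is_true_class) {
  return is_true_class ? (b / (o + b)) * (o - 1.0f)
                       : o * (1.0f - o) / (o + b);
}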
include(operators)
register_operators()
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/adadelta_op.h" #include "paddle/fluid/operators/optimizers/adadelta_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/adadelta_op.h" #include "paddle/fluid/operators/optimizers/adadelta_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/adagrad_op.h" #include "paddle/fluid/operators/optimizers/adagrad_op.h"
#include <vector> #include <vector>
#include <cmath> #include <cmath>
......
...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/adagrad_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/operators/optimizers/adagrad_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/adam_op.h" #include "paddle/fluid/operators/optimizers/adam_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/adam_op.h" #include "paddle/fluid/operators/optimizers/adam_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/adamax_op.h" #include "paddle/fluid/operators/optimizers/adamax_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/adamax_op.h" #include "paddle/fluid/operators/optimizers/adamax_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/decayed_adagrad_op.h" #include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/decayed_adagrad_op.h" #include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/ftrl_op.h" #include "paddle/fluid/operators/optimizers/ftrl_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ CONDITIONS OF ANY KIND, either express or implied. See the License for the ...@@ -12,7 +12,7 @@ CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. */ specific language governing permissions and limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/ftrl_op.h" #include "paddle/fluid/operators/optimizers/ftrl_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/lars_momentum_op.h" #include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
#include "paddle/fluid/operators/momentum_op.h" #include "paddle/fluid/operators/optimizers/momentum_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/lars_momentum_op.h" #include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/momentum_op.h" #include "paddle/fluid/operators/optimizers/momentum_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/momentum_op.h" #include "paddle/fluid/operators/optimizers/momentum_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/proximal_adagrad_op.h" #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ CONDITIONS OF ANY KIND, either express or implied. See the License for the ...@@ -12,7 +12,7 @@ CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. */ specific language governing permissions and limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/proximal_adagrad_op.h" #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/proximal_gd_op.h" #include "paddle/fluid/operators/optimizers/proximal_gd_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ CONDITIONS OF ANY KIND, either express or implied. See the License for the ...@@ -12,7 +12,7 @@ CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. */ specific language governing permissions and limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/proximal_gd_op.h" #include "paddle/fluid/operators/optimizers/proximal_gd_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/rmsprop_op.h" #include "paddle/fluid/operators/optimizers/rmsprop_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/rmsprop_op.h" #include "paddle/fluid/operators/optimizers/rmsprop_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sgd_op.h" #include "paddle/fluid/operators/optimizers/sgd_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/operators/sgd_op.h" #include "paddle/fluid/operators/optimizers/sgd_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
......
...@@ -87,6 +87,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -87,6 +87,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize"); std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides"); std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
bool is_test = ctx.Attr<bool>("is_test");
if (ctx.Attr<bool>("global_pooling")) { if (ctx.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) { for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0; paddings[i] = 0;
...@@ -142,16 +143,10 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -142,16 +143,10 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd = std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top,
padding_right_bottom, ksize, pooling_type, padding_right_bottom, ksize, pooling_type,
mkldnn_engine, ceil_mode); mkldnn_engine, ceil_mode, is_test);
// save pool_pd into global device context to be referred in backward path // save pool_pd into global device context to be referred in backward path
dev_ctx.SetBlob(key_pool_pd, pool_pd); if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd);
std::shared_ptr<mkldnn::memory> workspace_memory =
CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
// save pool_workspace_memory to be referred in backward path
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
auto src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(), auto src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
to_void_cast<T>(input_data)); to_void_cast<T>(input_data));
...@@ -161,9 +156,19 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -161,9 +156,19 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); dev_ctx.SetBlob(key_pool_src_mem_p, src_memory);
dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory);
pool_p = std::make_shared<pooling_forward>(*pool_pd, *(src_memory.get()), if (is_test) {
*(dst_memory.get()), pool_p = std::make_shared<pooling_forward>(*pool_pd, *src_memory,
*workspace_memory); *dst_memory);
} else {
std::shared_ptr<mkldnn::memory> workspace_memory =
CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
// save pool_workspace_memory to be referred in backward path
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
pool_p = std::make_shared<pooling_forward>(
*pool_pd, *src_memory, *dst_memory, *workspace_memory);
}
dev_ctx.SetBlob(key_pool_p, pool_p); dev_ctx.SetBlob(key_pool_p, pool_p);
...@@ -201,9 +206,12 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -201,9 +206,12 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const std::vector<int>& stride, const std::vector<int>& padding_left_top, const std::vector<int>& stride, const std::vector<int>& padding_left_top,
const std::vector<int>& padding_right_bot, const std::vector<int>& kernel, const std::vector<int>& padding_right_bot, const std::vector<int>& kernel,
const std::string& pooling_type, const mkldnn::engine& engine, const std::string& pooling_type, const mkldnn::engine& engine,
bool ceil_mode) const { bool ceil_mode, bool is_test) const {
auto mkldnn_forward_prop_kind = is_test
? mkldnn::prop_kind::forward_inference
: mkldnn::prop_kind::forward_training;
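// forward_inference lets MKL-DNN skip the max-pooling workspace, which is
// only kept for the backward pass (see the is_test branch above).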
auto pool_desc = mkldnn::pooling_forward::desc( auto pool_desc = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward, mkldnn_forward_prop_kind,
pooling_type == "max" ? mkldnn::algorithm::pooling_max pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg, : mkldnn::algorithm::pooling_avg,
src, dst, stride, kernel, padding_left_top, padding_right_bot, src, dst, stride, kernel, padding_left_top, padding_right_bot,
...@@ -248,6 +256,10 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -248,6 +256,10 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
out_grad->format() != memory::format::format_undef, out_grad->format() != memory::format::format_undef,
"Wrong layout/format set for Input output_grad tensor"); "Wrong layout/format set for Input output_grad tensor");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
std::string pooling_type = ctx.Attr<std::string>("pooling_type"); std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize"); std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides"); std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
......
...@@ -206,6 +206,11 @@ void Pool2dOpMaker::Make() { ...@@ -206,6 +206,11 @@ void Pool2dOpMaker::Make() {
"Defaults to \"NHWC\". Specify the data format of the output data, " "Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ") "the input will be transformed automatically. ")
.SetDefault("AnyLayout"); .SetDefault("AnyLayout");
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
// TODO(dzhwinter): need to registered layout transform function // TODO(dzhwinter): need to registered layout transform function
AddComment(R"DOC( AddComment(R"DOC(
......
include(operators)
cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader) cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader)
set(LOCAL_READER_LIBS) set(LOCAL_READER_LIBS)
...@@ -28,4 +30,10 @@ reader_library(create_py_reader_op SRCS create_py_reader_op.cc) ...@@ -28,4 +30,10 @@ reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
# Export local libraries to parent # Export local libraries to parent
set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE) # set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
op_library(read_op)
foreach(src ${LOCAL_READER_LIBS})
set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")
endforeach()
include(operators)
register_operators()
if(WITH_GPU)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu")
string(REPLACE ".part.cu" "" OPS "${OPS}")
foreach(src ${OPS})
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.part.cu)
set(CUDA_KERNEL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${src}.part.cu)
file(READ ${CUDA_KERNEL_FILE} TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\(\\n?([^,]+),.*" MATCHED ${TARGET_CONTENT})
if (MATCHED)
string(STRIP ${CMAKE_MATCH_1} MATCHED)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n")
endif()
endif()
endforeach()
endif()
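A minimal sketch of the kind of file this CMake loop scans (hypothetical my_op.part.cu; only the REGISTER_OP_CUDA_KERNEL call matters to the regex above):
// my_op.part.cu -- hypothetical example. A *.part.cu file holds just the CUDA
// kernel registration so nvcc can compile it separately and speed up the build.
#include "paddle/fluid/operators/my_op.h"  // assumed header, for illustration only
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    my_op, ops::MyOpKernel<paddle::platform::CUDADeviceContext, float>);
// The regex captures the first argument ("my_op"), so CMake appends
//   USE_OP_DEVICE_KERNEL(my_op, CUDA);
// to the generated pybind file.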
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
REGISTER_REDUCE_OP(reduce_max); REGISTER_REDUCE_OP(reduce_max);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_max, REGISTER_OP_CUDA_KERNEL(reduce_max,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, ops::ReduceKernel<paddle::platform::CUDADeviceContext,
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_mean_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
REGISTER_REDUCE_OP(reduce_mean); REGISTER_REDUCE_OP(reduce_mean);
REGISTER_OP_CPU_KERNEL(reduce_mean, REGISTER_OP_CPU_KERNEL(reduce_mean,
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
// limitations under the License. // limitations under the License.
#include <vector> #include <vector>
#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
#include "paddle/fluid/operators/reduce_mean_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#pragma once #pragma once
#include "paddle/fluid/operators/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// limitations under the License. // limitations under the License.
// .part used to speed up nvcc compile // .part used to speed up nvcc compile
#include "paddle/fluid/operators/reduce_mean_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "paddle/fluid/operators/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
REGISTER_REDUCE_OP(reduce_min); REGISTER_REDUCE_OP(reduce_min);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_min, REGISTER_OP_CUDA_KERNEL(reduce_min,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, ops::ReduceKernel<paddle::platform::CUDADeviceContext,
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/operators/reduce_op_function.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
REGISTER_REDUCE_OP(reduce_prod); REGISTER_REDUCE_OP(reduce_prod);
REGISTER_OP_CPU_KERNEL(reduce_prod, REGISTER_OP_CPU_KERNEL(reduce_prod,
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_prod, REGISTER_OP_CUDA_KERNEL(reduce_prod,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, ops::ReduceKernel<paddle::platform::CUDADeviceContext,
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#pragma once #pragma once
#include "paddle/fluid/operators/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_sum_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
REGISTER_REDUCE_OP(reduce_sum); REGISTER_REDUCE_OP(reduce_sum);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
......
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
#include "paddle/fluid/operators/reduce_sum_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/operators/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
#include "paddle/fluid/operators/reduce_sum_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/selu_op.h"
#include <string>
namespace paddle {
namespace operators {
class SeluOp : public framework::OperatorWithKernel {
public:
SeluOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SeluOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SeluOp should not be null.");
ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.GetPlace());
}
};
class SeluOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
}
};
class SeluOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input tensor of selu operator.");
AddOutput("Out", "The output tensor of selu operator.");
AddAttr<float>("scale",
"(float) the default value is 1.0507~. For more "
"information about this value, please refer to:"
"https://arxiv.org/abs/1706.02515.")
.SetDefault(1.0507009873554804934193349852946);
AddAttr<float>("alpha",
"(float) the default value is 1.6732~. For more "
"information about this value, please refer to:"
"https://arxiv.org/abs/1706.02515.")
.SetDefault(1.6732632423543772848170429916717);
AddComment(R"DOC(
Selu Operator.
The equation is:
$$
f(x) =\lambda*
\begin{cases}
\quad \quad x, \quad \quad \quad \text{if} \ x > 0 \\
\alpha * e^x - \alpha, \qquad \text{if} \ x <= 0
\end{cases}
$$
The input `X` can carry the LoD (Level of Details) information,
or not. And the output shares the LoD information with input `X`.
)DOC");
}
};
class SeluGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("selu_grad");
grad_op->SetInput("Out", Output("Out"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(this->Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class SeluGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null");
auto x_grad_name = framework::GradVarName("X");
ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("Out"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::GetDataTypeOfVar(ctx.InputVar("Out")), ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType,
ops::SeluGradMaker);
REGISTER_OPERATOR(selu_grad, ops::SeluGradOp);
REGISTER_OP_CPU_KERNEL(
selu, ops::SeluKernel<paddle::platform::CPUDeviceContext, float>,
ops::SeluKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
selu_grad, ops::SeluGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SeluGradKernel<paddle::platform::CPUDeviceContext, double>);
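A quick numeric sanity check of the formula and the default scale/alpha above (an illustration only, not part of the operator):
$$
f(1) = \lambda \approx 1.0507, \qquad
f(-1) = \lambda(\alpha e^{-1} - \alpha) \approx -1.111, \qquad
\lim_{x \to -\infty} f(x) = -\lambda\alpha \approx -1.758
$$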
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/selu_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
selu, ops::SeluKernel<paddle::platform::CUDADeviceContext, float>,
ops::SeluKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
selu_grad, ops::SeluGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SeluGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
static HOSTDEVICE float real_exp(float x) { return expf(x); }
static HOSTDEVICE float real_exp(double x) { return exp(x); }
template <typename T>
struct SeluFunctor {
SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr)
: x_data_ptr_(x_data_ptr),
alpha_(alpha),
scale_(scale),
y_data_ptr_(y_data_ptr) {}
HOSTDEVICE void operator()(size_t idx) const {
T x_ele = x_data_ptr_[idx];
if (x_ele <= 0) {
x_ele = alpha_ * real_exp(x_ele) - alpha_;
}
y_data_ptr_[idx] = scale_ * x_ele;
}
const T* x_data_ptr_;
const float alpha_;
const float scale_;
T* y_data_ptr_;
};
template <typename T>
struct SeluGradFunctor {
SeluGradFunctor(const T* y_data_ptr, const T* dy_data_ptr, float alpha,
float scale, T* dx_data_ptr)
: y_data_ptr_(y_data_ptr),
dy_data_ptr_(dy_data_ptr),
alpha_(alpha),
scale_(scale),
la_(alpha * scale),
dx_data_ptr_(dx_data_ptr) {}
HOSTDEVICE void operator()(size_t idx) const {
T y_ele = y_data_ptr_[idx];
T dy_ele = dy_data_ptr_[idx];
float tmp = scale_;
if (y_ele <= 0) {
tmp = y_ele + la_;
}
dx_data_ptr_[idx] = dy_ele * tmp;
}
const T* y_data_ptr_;
const T* dy_data_ptr_;
const float alpha_;
const float scale_;
const float la_;
T* dx_data_ptr_;
};
template <typename DeviceContext, typename T>
class SeluKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using Tensor = framework::Tensor;
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
float alpha = context.Attr<float>("alpha");
float scale = context.Attr<float>("scale");
auto out_ptr = out->mutable_data<T>(context.GetPlace());
SeluFunctor<T> functor(x->data<T>(), alpha, scale, out_ptr);
auto& dev_ctx = context.template device_context<DeviceContext>();
size_t limit = static_cast<size_t>(x->numel());
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
for_range(functor);
}
};
template <typename DeviceContext, typename T>
class SeluGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using Tensor = framework::Tensor;
auto* out = context.Input<Tensor>("Out");
auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
float alpha = context.Attr<float>("alpha");
float scale = context.Attr<float>("scale");
auto dx_ptr = dx->mutable_data<T>(context.GetPlace());
SeluGradFunctor<T> functor(out->data<T>(), dout->data<T>(), alpha, scale,
dx_ptr);
auto& dev_ctx = context.template device_context<DeviceContext>();
size_t limit = static_cast<size_t>(out->numel());
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
for_range(functor);
}
};
} // namespace operators
} // namespace paddle
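A short note on why SeluGradMaker in selu_op.cc feeds Out rather than X into the backward op: the derivative can be written purely in terms of the forward output y = f(x),
$$
\frac{\partial y}{\partial x} =
\begin{cases}
\lambda, & y > 0 \\
\lambda \alpha e^{x} = y + \lambda\alpha, & y \le 0
\end{cases}
$$
which is exactly what SeluGradFunctor computes: dx = dy * scale_ when y > 0, and dx = dy * (y + la_) with la_ = alpha * scale otherwise.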
include(operators)
register_operators()
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/sequence_concat_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h"
#include <vector> #include <vector>
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/sequence_concat_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h"
template <typename T> template <typename T>
using Kernel = using Kernel =
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_conv_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h"
#include <algorithm> #include <algorithm>
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_conv_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/sequence_enumerate_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_enumerate_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_erase_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h"
#include <vector> #include <vector>
namespace paddle { namespace paddle {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_erase_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_expand_as_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/operators/sequence_expand_as_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_expand_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/operators/sequence_expand_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/sequence_mask_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h"
REGISTER_OPERATOR(sequence_mask, paddle::operators::SequenceMaskOp, REGISTER_OPERATOR(sequence_mask, paddle::operators::SequenceMaskOp,
paddle::operators::SequenceMaskOpMaker, paddle::operators::SequenceMaskOpMaker,
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/sequence_mask_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
sequence_mask, sequence_mask,
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_pad_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_pad_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_pool_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h"
#include <string> #include <string>
namespace paddle { namespace paddle {
...@@ -47,7 +47,10 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -47,7 +47,10 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor<int>) This tensor is used for the sequence max-pooling " "(Tensor<int>) This tensor is used for the sequence max-pooling "
"to record the max indexes.") "to record the max indexes.")
.AsIntermediate(); .AsIntermediate();
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"pooltype", "pooltype",
"(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.") "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/sequence_pool_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/sequence_reshape_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_reshape_op.h"
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_reshape_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_reshape_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/sequence_reverse_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_reverse_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/sequence_reverse_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_reverse_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_scatter_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_scatter_op.h"
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/gather.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_slice_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_slice_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_softmax_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"
#include <string> #include <string>
namespace paddle { namespace paddle {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <cub/cub.cuh> // NOLINT #include <cub/cub.cuh> // NOLINT
#include "paddle/fluid/operators/sequence_softmax_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_unpad_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_unpad_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -96,20 +96,21 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -96,20 +96,21 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("is_test", AddAttr<bool>("is_test",
"Disable epsilon adding to softmax results. Used by MKLDNN.") "(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Softmax Operator. Softmax Operator.
The input of the softmax operator is a tensor of any rank. The output tensor The input of the softmax operator is a tensor of any rank. The output tensor
has the same shape as the input. has the same shape as the input.
The input tensor will first be logically flattened to a 2-D matrix. The matrix's The input tensor will first be logically flattened to a 2-D matrix. The matrix's
second dimension(row length) is as same as the last dimension of the input second dimension(row length) is as same as the last dimension of the input
tensor, and the first dimension(column length) is the product of all other tensor, and the first dimension(column length) is the product of all other
dimensions of the input tensor. For each row of the matrix, the softmax operator dimensions of the input tensor. For each row of the matrix, the softmax operator
squashes the K-dimensional(K is the width of the matrix, which is also the size squashes the K-dimensional(K is the width of the matrix, which is also the size
of the input tensor's last dimension) vector of arbitrary real values to a of the input tensor's last dimension) vector of arbitrary real values to a
K-dimensional vector of real values in the range [0, 1] that add up to 1. K-dimensional vector of real values in the range [0, 1] that add up to 1.
It computes the exponential of the given dimension and the sum of exponential It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input. values of all the other dimensions in the K-dimensional vector input.
......
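In formula form, with the input flattened to an N x K matrix as described above (K is the size of the last dimension, N the product of the remaining dimensions), the operator computes for every row i:
$$
Out_{i,j} = \frac{e^{X_{i,j}}}{\sum_{k=1}^{K} e^{X_{i,k}}}, \qquad j = 1, \dots, K
$$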
...@@ -86,7 +86,7 @@ class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -86,7 +86,7 @@ class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker {
.GreaterThan(1); .GreaterThan(1);
AddComment(R"DOC( AddComment(R"DOC(
reorg operator used in Yolo v2. reorg operator used in Yolo v2.
The equation is: C2 = C1/blocksize * blocksize, W2 = W1 ∗ blocksize + offset % blocksize, H2 = H1 ∗ blocksize + offset / blocksize, The equation is: C2 = C1/blocksize * blocksize, W2 = W1 * blocksize + offset % blocksize, H2 = H1 * blocksize + offset / blocksize,
Reshape Input(X) into the shape according to Attr(blocksize). The Reshape Input(X) into the shape according to Attr(blocksize). The
data in Input(X) are unchanged. data in Input(X) are unchanged.
......
op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n")
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op
analysis)
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/operators/tensorrt_engine_op.h" #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/tensorrt_engine_op.h" #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/tensorrt_engine_op.h" #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/warpctc_op.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
#if CUDNN_VERSION >= 7001
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedCTCLossDescriptor = platform::ScopedCTCLossDescriptor;
using DataLayout = platform::DataLayout;
template <typename DeviceContext, typename T>
class CudnnCTCKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// =====================Copied code from warpctc===========================
auto* logits = ctx.Input<LoDTensor>("Logits");
auto* label = ctx.Input<LoDTensor>("Label");
auto* warpctc_grad = ctx.Output<LoDTensor>("WarpCTCGrad");
auto* loss = ctx.Output<LoDTensor>("Loss");
const size_t level = 0;
auto logits_lod = framework::ToAbsOffset(logits->lod());
auto logits_dims = logits->dims();
PADDLE_ENFORCE_EQ(logits_dims[0],
static_cast<int64_t>(logits_lod[level].back()),
"The first dimension of Input(Logits) should be equal to "
"the sum of all sequences' lengths.");
auto label_lod = framework::ToAbsOffset(label->lod());
auto label_dims = label->dims();
PADDLE_ENFORCE_EQ(
label_dims[0], label->numel(),
"The width of each timestep in Input(Label) should be 1.");
const size_t num_sequences = logits_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1,
"The number of sequences of Input(Logits) should be "
"equal to that of Input(Label).");
PADDLE_ENFORCE_LE(num_sequences, 256,
"The labelLengths must less than 256 for cudnn call.");
const size_t sequence_width = logits->numel() / logits_dims[0];
auto loss_dims =
framework::make_ddim({static_cast<int64_t>(num_sequences), 1});
// NOTE: cuDNN expects softmax output as its input; compute softmax first, then do padding
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
LoDTensor softmax_logits;
softmax_logits.mutable_data<T>(logits->dims(), ctx.GetPlace());
softmax_logits.set_lod(logits_lod);
int rank = logits->dims().size();
Tensor in_2d = framework::ReshapeToMatrix(*logits, rank - 1);
Tensor out_2d = framework::ReshapeToMatrix(softmax_logits, rank - 1);
math::SoftmaxFunctor<DeviceContext, T, false>()(dev_ctx, &in_2d, &out_2d);
// ctc needs sequences data stored in transposed padding format
// logits and grad using padding data of layout 'TNC'
// T: max_sequence_length
// N: batch_size (num_sequences)
// C: width
LoDTensor warpctc_logits;
const size_t max_sequence_length =
math::MaximumSequenceLength(logits_lod[level]);
auto warpctc_logits_dims =
framework::make_ddim({static_cast<int64_t>(max_sequence_length),
static_cast<int64_t>(num_sequences),
static_cast<int64_t>(sequence_width)});
warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
LoDTensor cpu_pad_value;
T* pad_value_data =
cpu_pad_value.mutable_data<T>({1}, platform::CPUPlace());
*pad_value_data = static_cast<T>(0);
LoDTensor pad_value;
if (platform::is_cpu_place(ctx.GetPlace())) {
pad_value = cpu_pad_value;
} else {
TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value);
}
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), softmax_logits,
&warpctc_logits, pad_value, -1, 0, false /* norm_by_times */,
math::kLengthBatchWidth);
const T* warpctc_logits_data = warpctc_logits.data<T>();
std::vector<int> warpctc_label_lengths(num_sequences);
std::vector<int> warpctc_logits_lengths(num_sequences);
for (size_t i = 0; i < num_sequences; ++i) {
warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i];
warpctc_logits_lengths[i] =
logits_lod[level][i + 1] - logits_lod[level][i];
}
T* warpctc_grad_data =
warpctc_grad->mutable_data<T>(warpctc_logits.dims(), ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), warpctc_grad,
static_cast<T>(0));
Tensor warpctc_label;
TensorCopySync(*label, platform::CPUPlace(), &warpctc_label);
const int* warpctc_label_data = warpctc_label.data<int>();
// ========================================================================
ScopedTensorDescriptor logits_desc;
ScopedTensorDescriptor grad_desc;
ScopedCTCLossDescriptor ctcloss_desc;
// The layout has no effect here.
DataLayout layout = DataLayout::kNCHW;
auto cu_logits_desc = logits_desc.descriptor<T>(
layout, framework::vectorize2int(warpctc_logits.dims()));
auto cu_grad_desc = grad_desc.descriptor<T>(
layout, framework::vectorize2int(warpctc_grad->dims()));
auto cu_ctcloss_desc = ctcloss_desc.descriptor<T>();
auto handle = dev_ctx.cudnn_handle();
size_t workspace_size;
CUDNN_ENFORCE(platform::dynload::cudnnGetCTCLossWorkspaceSize(
handle, cu_logits_desc, cu_grad_desc, warpctc_label_data,
warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size));
T* loss_data = loss->mutable_data<T>(loss_dims, ctx.GetPlace());
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss(
handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data,
warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
loss_data, cu_grad_desc, warpctc_grad_data,
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace,
workspace_size));
};
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
};
template <typename DeviceContext, typename T>
class CudnnCTCGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
logits_grad->mutable_data<T>(ctx.GetPlace());
bool norm_by_times = ctx.Attr<bool>("norm_by_times");
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *warpctc_grad,
logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
const T* loss_grad_data = loss_grad->data<T>();
math::ScaleLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), loss_grad_data,
logits_grad);
}
};
#endif
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#if CUDNN_VERSION >= 7001
REGISTER_OP_KERNEL(
warpctc, CUDNN, plat::CUDAPlace,
ops::CudnnCTCKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_KERNEL(
warpctc_grad, CUDNN, plat::CUDAPlace,
ops::CudnnCTCGradKernel<paddle::platform::CUDADeviceContext, float>);
#endif
...@@ -14,6 +14,10 @@ limitations under the License. */ ...@@ -14,6 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/warpctc_op.h" #include "paddle/fluid/operators/warpctc_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -45,9 +49,16 @@ class WarpCTCOp : public framework::OperatorWithKernel { ...@@ -45,9 +49,16 @@ class WarpCTCOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Logits")->type()), framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
ctx.device_context()); ctx.device_context(), layout_, library_);
} }
}; };
...@@ -86,6 +97,10 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -86,6 +97,10 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
"normalize the gradients by the number of time-step, " "normalize the gradients by the number of time-step, "
"which is also the sequence's length.") "which is also the sequence's length.")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("use_cudnn",
"(bool, default: false), whether to "
"use cudnn kernel.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
An operator integrating the open-source An operator integrating the open-source
[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in [warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in
......
...@@ -380,5 +380,28 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { ...@@ -380,5 +380,28 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
return use_cudnn; return use_cudnn;
} }
#if CUDNN_VERSION >= 7001
class ScopedCTCLossDescriptor {
public:
ScopedCTCLossDescriptor() {
PADDLE_ENFORCE(dynload::cudnnCreateCTCLossDescriptor(&desc_));
}
~ScopedCTCLossDescriptor() {
PADDLE_ENFORCE(dynload::cudnnDestroyCTCLossDescriptor(desc_));
}
template <typename T>
inline cudnnCTCLossDescriptor_t descriptor() {
PADDLE_ENFORCE(
dynload::cudnnSetCTCLossDescriptor(desc_, CudnnDataType<T>::type));
return desc_;
}
private:
cudnnCTCLossDescriptor_t desc_;
DISABLE_COPY_AND_ASSIGN(ScopedCTCLossDescriptor);
};
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -154,7 +154,13 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -154,7 +154,13 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount); \ __macro(cudnnSetConvolutionGroupCount); \
__macro(cudnnSetConvolutionMathType); __macro(cudnnSetConvolutionMathType); \
__macro(cudnnCreateCTCLossDescriptor); \
__macro(cudnnDestroyCTCLossDescriptor); \
__macro(cudnnGetCTCLossDescriptor); \
__macro(cudnnSetCTCLossDescriptor); \
__macro(cudnnGetCTCLossWorkspaceSize); \
__macro(cudnnCTCLoss);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
......
...@@ -10,12 +10,12 @@ if(WITH_PYTHON) ...@@ -10,12 +10,12 @@ if(WITH_PYTHON)
hip_library(paddle_pybind SHARED hip_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS} SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS} DEPS ${PYBIND_DEPS}
${GLOB_OP_LIB}) ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
else() else()
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS} SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS} DEPS ${PYBIND_DEPS}
${GLOB_OP_LIB}) ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
if(NOT APPLE AND NOT ANDROID AND NOT WIN32) if(NOT APPLE AND NOT ANDROID AND NOT WIN32)
target_link_libraries(paddle_pybind rt) target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(NOT APPLE AND NOT ANDROID AND NOT WIN32)
......
...@@ -110,6 +110,7 @@ __all__ = [ ...@@ -110,6 +110,7 @@ __all__ = [
'random_crop', 'random_crop',
'mean_iou', 'mean_iou',
'relu', 'relu',
'selu',
'log', 'log',
'crop', 'crop',
'rank_loss', 'rank_loss',
...@@ -4186,7 +4187,7 @@ def ctc_greedy_decoder(input, blank, name=None): ...@@ -4186,7 +4187,7 @@ def ctc_greedy_decoder(input, blank, name=None):
return ctc_out return ctc_out
def warpctc(input, label, blank=0, norm_by_times=False): def warpctc(input, label, blank=0, norm_by_times=False, use_cudnn=False):
""" """
An operator integrating the open source Warp-CTC library An operator integrating the open source Warp-CTC library
(https://github.com/baidu-research/warp-ctc) (https://github.com/baidu-research/warp-ctc)
...@@ -4211,6 +4212,7 @@ def warpctc(input, label, blank=0, norm_by_times=False): ...@@ -4211,6 +4212,7 @@ def warpctc(input, label, blank=0, norm_by_times=False):
by the number of time-step, which is also the sequence's length. by the number of time-step, which is also the sequence's length.
There is no need to normalize the gradients if warpctc layer was There is no need to normalize the gradients if warpctc layer was
follewed by a mean_op. follewed by a mean_op.
use_cudnn (bool, default false): Whether to use cudnn.
Returns: Returns:
Variable: The Connectionist Temporal Classification (CTC) loss, Variable: The Connectionist Temporal Classification (CTC) loss,
...@@ -4234,8 +4236,11 @@ def warpctc(input, label, blank=0, norm_by_times=False): ...@@ -4234,8 +4236,11 @@ def warpctc(input, label, blank=0, norm_by_times=False):
'Label': [label]}, 'Label': [label]},
outputs={'WarpCTCGrad': [grad_out], outputs={'WarpCTCGrad': [grad_out],
'Loss': [loss_out]}, 'Loss': [loss_out]},
attrs={'blank': blank, attrs={
'norm_by_times': norm_by_times}) 'blank': blank,
'norm_by_times': norm_by_times,
'use_cudnn': use_cudnn
})
return loss_out return loss_out
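A minimal usage sketch of the extended Python API (the data shapes and variable names below are illustrative assumptions, not taken from this commit):

import paddle.fluid as fluid

num_classes = 8
logits = fluid.layers.data(name='logits', shape=[num_classes + 1],
                           dtype='float32', lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int32', lod_level=1)
# use_cudnn=True requests the new cuDNN CTC kernel; the op falls back to the
# warp-ctc kernel when cuDNN cannot be used.
cost = fluid.layers.warpctc(input=logits, label=label, blank=0,
                            norm_by_times=False, use_cudnn=True)
avg_cost = fluid.layers.mean(cost)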
...@@ -4308,7 +4313,10 @@ def nce(input, ...@@ -4308,7 +4313,10 @@ def nce(input,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
num_neg_samples=None, num_neg_samples=None,
name=None): name=None,
sampler="uniform",
custom_dist=None,
seed=0):
""" """
${comment} ${comment}
...@@ -4331,6 +4339,14 @@ def nce(input, ...@@ -4331,6 +4339,14 @@ def nce(input,
num_neg_samples (int): ${num_neg_samples_comment} num_neg_samples (int): ${num_neg_samples_comment}
name (str|None): A name for this layer(optional). If set None, the layer name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None. will be named automatically. Default: None.
sampler (str): The sampler used to sample a class from the negative classes.
It can be 'uniform', 'log_uniform' or 'custom_dist'.
default: 'uniform'.
custom_dist (Variable): A tensor with shape [num_total_classes].
It is used when sampler is set to 'custom_dist'.
custom_dist[i] is the probability of the i-th class being sampled.
default: None.
seed (int): The seed used in sampler. default: 0.
Returns: Returns:
Variable: The output nce loss. Variable: The output nce loss.
...@@ -4360,6 +4376,16 @@ def nce(input, ...@@ -4360,6 +4376,16 @@ def nce(input,
loss = layers.nce(input=embs, label=words[label_word], loss = layers.nce(input=embs, label=words[label_word],
num_total_classes=dict_size, param_attr='nce.w', num_total_classes=dict_size, param_attr='nce.w',
bias_attr='nce.b') bias_attr='nce.b')
#or use custom distribution
dist = fluid.layers.assign(input=np.array([0.05,0.5,0.1,0.3,0.05]).astype("float32"))
loss = layers.nce(input=embs, label=words[label_word],
num_total_classes=5, param_attr='nce.w',
bias_attr='nce.b',
num_neg_samples=3,
sampler="custom_dist",
custom_dist=dist)
""" """
helper = LayerHelper('nce', **locals()) helper = LayerHelper('nce', **locals())
assert isinstance(input, Variable) assert isinstance(input, Variable)
...@@ -4394,9 +4420,31 @@ def nce(input, ...@@ -4394,9 +4420,31 @@ def nce(input,
else: else:
num_neg_samples = int(num_neg_samples) num_neg_samples = int(num_neg_samples)
inputs = {
'Input': input,
'Label': label,
'Weight': w,
'Bias': b,
'SampleWeight': sample_weight if sample_weight is not None else []
}
if sampler == "uniform":
sampler = 0
elif sampler == "log_uniform":
sampler = 1
elif sampler == "custom_dist":
assert custom_dist is not None
assert isinstance(custom_dist, Variable)
inputs['CustomDistribution'] = custom_dist
sampler = 2
else:
raise Exception("Unsupported sampler type.")
attrs = { attrs = {
'num_total_classes': int(num_total_classes), 'num_total_classes': int(num_total_classes),
'num_neg_samples': num_neg_samples 'num_neg_samples': num_neg_samples,
'seed': seed,
'sampler': sampler
} }
helper.append_op( helper.append_op(
...@@ -6182,6 +6230,47 @@ def relu(x, name=None): ...@@ -6182,6 +6230,47 @@ def relu(x, name=None):
return out return out
@templatedoc()
def selu(x, scale=None, alpha=None, name=None):
"""
${comment}
Args:
x (Variable): The input tensor.
scale(float, None): If the scale is not set,
the default value is 1.0507009873554804934193349852946.
For more information about this value, please refer
to: https://arxiv.org/abs/1706.02515.
alpha(float, None): If the alpha is not set,
the default value is 1.6732632423543772848170429916717.
For more information about this value, please refer
to: https://arxiv.org/abs/1706.02515.
name (str|None, default None): A name for this layer. If set None,
the layer will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
Examples:
.. code-block:: python
output = fluid.layers.selu(x)
"""
helper = LayerHelper('selu', **locals())
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype)
attrs = {}
if scale is not None:
attrs["scale"] = scale
if alpha is not None:
attrs["alpha"] = alpha
helper.append_op(
type="selu", inputs={"X": x}, outputs={"Out": out}, attrs=attrs)
return out
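For reference, a NumPy sketch of the SELU transform this layer wraps, using the default constants listed above (an illustration under the assumption that the kernel follows the standard SELU definition):

import numpy as np

def selu_ref(x,
             scale=1.0507009873554804934193349852946,
             alpha=1.6732632423543772848170429916717):
    # scale * x for positive inputs, scale * alpha * (exp(x) - 1) otherwise.
    return scale * np.where(x > 0, x, alpha * (np.exp(x) - 1.0))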
def mean_iou(input, label, num_classes): def mean_iou(input, label, num_classes):
""" """
Mean Intersection-Over-Union is a common evaluation metric for Mean Intersection-Over-Union is a common evaluation metric for
......
...@@ -38,7 +38,7 @@ depth = 8 ...@@ -38,7 +38,7 @@ depth = 8
mix_hidden_lr = 1e-3 mix_hidden_lr = 1e-3
IS_SPARSE = True IS_SPARSE = True
PASS_NUM = 1 PASS_NUM = 2
BATCH_SIZE = 10 BATCH_SIZE = 10
embedding_name = 'emb' embedding_name = 'emb'
...@@ -196,7 +196,7 @@ def train(use_cuda, save_dirname=None, is_local=True): ...@@ -196,7 +196,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
print("second per batch: " + str((time.time( print("second per batch: " + str((time.time(
) - start_time) / batch_id)) ) - start_time) / batch_id))
# Set the threshold low to speed up the CI test # Set the threshold low to speed up the CI test
if float(cost) < 60.0: if float(cost) < 80.0:
if save_dirname is not None: if save_dirname is not None:
# TODO(liuyiqun): Change the target to crf_decode # TODO(liuyiqun): Change the target to crf_decode
fluid.io.save_inference_model(save_dirname, [ fluid.io.save_inference_model(save_dirname, [
...@@ -208,6 +208,10 @@ def train(use_cuda, save_dirname=None, is_local=True): ...@@ -208,6 +208,10 @@ def train(use_cuda, save_dirname=None, is_local=True):
batch_id = batch_id + 1 batch_id = batch_id + 1
raise RuntimeError(
"This model should save_inference_model and return, but not reach here, please check!"
)
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
......
...@@ -83,6 +83,34 @@ class TestInferShape(unittest.TestCase): ...@@ -83,6 +83,34 @@ class TestInferShape(unittest.TestCase):
mul_op_desc.infer_shape(block) mul_op_desc.infer_shape(block)
self.assertEqual(out.shape(), [x_shape[0], y_shape[1]]) self.assertEqual(out.shape(), [x_shape[0], y_shape[1]])
def test_expand_op(self):
prog = core.ProgramDesc()
self.assertIsNotNone(prog)
block = prog.block(0)
self.assertIsNotNone(block)
shape = [-1, 20]
expand_times = [3, 1]
# prepare input/output
x1 = block.var(six.b("x"))
x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
x1.set_shape(shape)
out = block.var(six.b("out"))
out.set_type(core.VarDesc.VarType.LOD_TENSOR)
# prepare the operator
sum_op_desc = block.append_op()
sum_op_desc.set_type("expand")
sum_op_desc.set_input("X", ["x"])
sum_op_desc.set_output("Out", ["out"])
sum_op_desc._set_attr('expand_times', expand_times)
sum_op_desc.check_attrs()
sum_op_desc.infer_shape(block)
self.assertEqual(out.shape(), shape)
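The expected shape stays [-1, 20] because expand multiplies each dimension by the matching entry in expand_times, while a -1 (unknown at compile time) dimension is preserved; 20 * 1 = 20. A small NumPy sketch of the same tiling on a fully known shape (np.tile is assumed here as a stand-in for the runtime behaviour of expand):

import numpy as np

x = np.random.rand(4, 20).astype("float32")  # concrete stand-in for shape [-1, 20]
out = np.tile(x, [3, 1])                     # tile dim 0 three times, dim 1 once
assert out.shape == (12, 20)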
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -68,7 +68,9 @@ class TestNCE(OpTest): ...@@ -68,7 +68,9 @@ class TestNCE(OpTest):
self.attrs = { self.attrs = {
'num_total_classes': num_classes, 'num_total_classes': num_classes,
'num_neg_samples': num_neg_samples, 'num_neg_samples': num_neg_samples,
'custom_neg_classes': list(range(num_neg_samples)) 'custom_neg_classes': list(range(num_neg_samples)),
'seed': 0,
'sampler': 0
} }
self.inputs = { self.inputs = {
'Input': input, 'Input': input,
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import six
from op_test import OpTest
class SeluTest(OpTest):
def setUp(self):
self.op_type = "selu"
self.x_shape = [3, 5, 5, 10]
self.dtype = np.float32
self.init_x_shape()
self.init_dtype()
alpha = 1.6732632423543772848170429916717
scale = 1.0507009873554804934193349852946
x = np.random.normal(size=self.x_shape).astype(self.dtype)
# Since selu is not differentiable at zero, avoid sampling values that are
# too close to zero.
x[np.abs(x) < 0.005] = 0.02
x_flat = x.flatten()
for i in range(x_flat.size):
if x_flat[i] < 0:
x_flat[i] = alpha * np.exp(x_flat[i]) - alpha
x_flat[i] = scale * x_flat[i]
out_np = x_flat.reshape(self.x_shape)
self.inputs = {'X': x}
self.outputs = {'Out': out_np}
self.attrs = {
'alpha': alpha,
'scale': scale,
}
def init_x_shape(self):
pass
def init_dtype(self):
pass
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
if __name__ == "__main__":
unittest.main()
...@@ -183,6 +183,7 @@ class TestWarpCTCOp(OpTest): ...@@ -183,6 +183,7 @@ class TestWarpCTCOp(OpTest):
self.labels_lod = [[3, 1, 4, 4]] self.labels_lod = [[3, 1, 4, 4]]
self.blank = self.num_classes - 1 self.blank = self.num_classes - 1
self.norm_by_times = False self.norm_by_times = False
self.use_cudnn = False
def setUp(self): def setUp(self):
self.op_type = "warpctc" self.op_type = "warpctc"
...@@ -215,7 +216,11 @@ class TestWarpCTCOp(OpTest): ...@@ -215,7 +216,11 @@ class TestWarpCTCOp(OpTest):
"Label": (labels, self.labels_lod) "Label": (labels, self.labels_lod)
} }
self.outputs = {"Loss": loss} self.outputs = {"Loss": loss}
self.attrs = {"blank": self.blank, "norm_by_times": self.norm_by_times} self.attrs = {
"blank": self.blank,
"norm_by_times": self.norm_by_times,
"use_cudnn": self.use_cudnn
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
...@@ -233,6 +238,22 @@ class TestWarpCTCOpCase1(TestWarpCTCOp): ...@@ -233,6 +238,22 @@ class TestWarpCTCOpCase1(TestWarpCTCOp):
self.labels_lod = [[3, 1, 4, 4]] self.labels_lod = [[3, 1, 4, 4]]
self.blank = 0 self.blank = 0
self.norm_by_times = False self.norm_by_times = False
self.use_cudnn = False
class TestCudnnCTCOp(TestWarpCTCOp):
def config(self):
self.batch_size = 4
self.num_classes = 8
self.logits_lod = [[4, 1, 3, 3]]
self.labels_lod = [[3, 1, 4, 4]]
self.blank = 0
self.norm_by_times = False
self.use_cudnn = True
def test_check_grad(self):
self.outputs['WarpCTCGrad'] = self.gradient
self.check_grad(["Logits"], "Loss", max_relative_error=0.01)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -34,6 +34,7 @@ def wait_server_ready(endpoints): ...@@ -34,6 +34,7 @@ def wait_server_ready(endpoints):
""" """
while True: while True:
all_ok = True all_ok = True
not_ready_endpoints = []
for ep in endpoints: for ep in endpoints:
ip_port = ep.split(":") ip_port = ep.split(":")
with closing(socket.socket(socket.AF_INET, with closing(socket.socket(socket.AF_INET,
...@@ -42,8 +43,11 @@ def wait_server_ready(endpoints): ...@@ -42,8 +43,11 @@ def wait_server_ready(endpoints):
result = sock.connect_ex((ip_port[0], int(ip_port[1]))) result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0: if result != 0:
all_ok = False all_ok = False
not_ready_endpoints.append(ep)
if not all_ok: if not all_ok:
sys.stderr.write("pserver not ready, wait 3 sec to retry...\n") sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) +
"\n")
sys.stderr.flush() sys.stderr.flush()
time.sleep(3) time.sleep(3)
else: else:
......
...@@ -73,6 +73,38 @@ class InferenceTranspiler(object): ...@@ -73,6 +73,38 @@ class InferenceTranspiler(object):
program) # ResNet residual block merging program) # ResNet residual block merging
self._fuse_bn_relu_mkldnn(program) self._fuse_bn_relu_mkldnn(program)
self._is_test_pass(program)
def _is_test_pass(self, program):
'''
Transpile the program by setting is_test = True for all layers that
already have the attribute and by inserting an is_test attribute into
pooling and activation layers. As a result, some operators may run faster.
:param program: program to transpile
:type program: Program
'''
self.block = program.block(0)
i = 0
while i < len(self.block.ops):
current_op = self.block.ops[i]
if current_op.has_attr("is_test"):
current_op._set_attr("is_test", True)
elif current_op.type in [
"pool2d", "sigmoid", "logsigmoid", "softshrink", "exp",
"brelu", "pow", "leaky_relu", "stanh", "relu", "tanh",
"tanh_shrink", "sqrt", "abs", "ceil", "elu", "floor", "cos",
"sin", "round", "reciprocal", "hard_shrink", "hard_sigmoid",
"relu6", "soft_relu", "swish", "thresholded_relu", "log",
"square", "softplus", "softsign"
]:
current_op._set_attr("is_test", True)
i = i + 1
# TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately.
# And a better solution will be considered later.
program = program.clone()
def _depthwise_conv_mkldnn(self, program): def _depthwise_conv_mkldnn(self, program):
''' '''
Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program. Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program.
......
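A minimal sketch of how the new pass is reached from user code (assuming the usual InferenceTranspiler entry point; everything apart from InferenceTranspiler.transpile itself is illustrative):

import paddle.fluid as fluid

place = fluid.CPUPlace()
inference_program = fluid.default_main_program().clone(for_test=True)

t = fluid.transpiler.InferenceTranspiler()
# transpile() runs the existing MKLDNN fusions and then _is_test_pass, which
# sets is_test=True on every op that declares the attribute.
t.transpile(inference_program, place)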
...@@ -142,7 +142,7 @@ if os.name == 'nt': ...@@ -142,7 +142,7 @@ if os.name == 'nt':
if '${WITH_FLUID_ONLY}'== 'OFF': if '${WITH_FLUID_ONLY}'== 'OFF':
package_data['paddle.v2.master']=['libpaddle_master' + ext_name] package_data['paddle.v2.master']=['libpaddle_master' + ext_name]
package_data['py_paddle']=['*.py','_swig_paddle' + + ext_name] package_data['py_paddle']=['*.py','_swig_paddle' + ext_name]
package_dir={ package_dir={
'': '${PADDLE_BINARY_DIR}/python', '': '${PADDLE_BINARY_DIR}/python',
......