Merge remote-tracking branch 'upstream/develop' into windows/build

6d0d5a76 · peizhilin · 162f2d41 · b984c709 · 6d0d5a76 · 6d0d5a76
35 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -315,7 +315,6 @@ endif()
 if (ON_INFER)
    message(STATUS "On inference mode, will take place some specific optimization.")
-    add_definitions(-DPADDLE_ON_INFERENCE)
 else()
    #TODO(luotao), combine this warning with `make inference_lib_dist` command.
    message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -218,3 +218,7 @@ endif(WITH_GRPC)
 if(WITH_BRPC_RDMA)
    add_definitions(-DPADDLE_WITH_BRPC_RDMA)
 endif(WITH_BRPC_RDMA)
+if(ON_INFER)
+    add_definitions(-DPADDLE_ON_INFERENCE)
+endif(ON_INFER)
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -128,6 +128,7 @@ paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates',
 paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.selu ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -41,6 +41,7 @@ pass_library(seq_concat_fc_fuse_pass inference)
 pass_library(multi_batch_merge_pass base)
 pass_library(conv_bn_fuse_pass inference)
 pass_library(seqconv_eltadd_relu_fuse_pass inference)
+pass_library(is_test_pass base)
 if(WITH_MKLDNN)
    pass_library(mkldnn_placement_pass base)
    pass_library(depthwise_conv_mkldnn_pass base)
@@ -62,6 +63,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
+cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
 if (WITH_MKLDNN)
    cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)

--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/ir/is_test_pass.h"
+#include <algorithm>
+#include <string>
+#include <utility>
+namespace paddle {
+namespace framework {
+namespace ir {
+// Sets the "is_test" attribute to true on every op node of `graph` that
+// already has the attribute. Ops that lack it get it inserted (as true) only
+// when their type is one of the pooling/activation types listed in `op_list`;
+// all other ops without the attribute are left untouched.
+// Returns the graph that was passed in.
+std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Sets is_test attribute to true and if it is missing, inserts it "
+             "for activations and pooling.";
+  // Op types that receive a freshly inserted "is_test" attribute.
+  auto op_list = {"pool2d",      "sigmoid",      "logsigmoid",
+                  "softshrink",  "exp",          "brelu",
+                  "pow",         "leaky_relu",   "stanh",
+                  "relu",        "tanh",         "tanh_shrink",
+                  "sqrt",        "abs",          "ceil",
+                  "elu",         "floor",        "cos",
+                  "sin",         "round",        "reciprocal",
+                  "hard_shrink", "hard_sigmoid", "relu6",
+                  "soft_relu",   "swish",        "thresholded_relu",
+                  "log",         "square",       "softplus",
+                  "softsign"};
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp()) {
+      auto* op = n->Op();
+      if (op->HasAttr("is_test")) {
+        // Attribute already present: overwrite whatever value it had.
+        op->SetAttr("is_test", true);
+      } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
+                 end(op_list)) {
+        // Attribute missing on a listed op: add it directly to the attr map.
+        op->MutableAttrMap()->insert(
+            std::pair<std::string, Attribute>("is_test", true));
+      }
+    }
+  }
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(is_test_pass, paddle::framework::ir::IsTestPass);
--- a/paddle/fluid/framework/ir/is_test_pass.h
+++ b/paddle/fluid/framework/ir/is_test_pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+// Graph pass that sets the "is_test" attribute to true on ops that already
+// have it and inserts it for the activation and pooling op types listed in
+// is_test_pass.cc.
+class IsTestPass : public Pass {
+ protected:
+  // Applies the pass to `graph` and returns that graph.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/is_test_pass.h"
+#include <gtest/gtest.h>
+namespace paddle {
+namespace framework {
+namespace ir {
+// Initial state of the "is_test" attribute for an op created by SetOp():
+// explicitly false, explicitly true, or not present at all.
+enum class ISTEST_STATE { FALSE, TRUE, UNSET };
+// Appends an op of the given `type` to block 0 of `prog`.
+// The op gets a "name" attribute, input slot "X", output slot "Out" and a
+// "use_mkldnn" attribute. `is_test` selects whether the "is_test" attribute
+// is erased (UNSET), set to false (FALSE) or set to true (anything else).
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn = false,
+           ISTEST_STATE is_test = ISTEST_STATE::UNSET) {
+  auto* new_op = prog->MutableBlock(0)->AppendOp();
+  new_op->SetType(type);
+  new_op->SetAttr("name", name);
+  new_op->SetInput("X", inputs);
+  new_op->SetOutput("Out", outputs);
+  new_op->SetAttr("use_mkldnn", use_mkldnn);
+  switch (is_test) {
+    case ISTEST_STATE::UNSET:
+      new_op->MutableAttrMap()->erase("is_test");
+      break;
+    case ISTEST_STATE::FALSE:
+      new_op->SetAttr("is_test", false);
+      break;
+    default:
+      new_op->SetAttr("is_test", true);
+      break;
+  }
+}
+// a->pool2d->b
+// b->relu->c
+// (c,weights1)->conv2d->d
+//
+// d->pool2d->e
+// e->hard_sigmoid->f
+// (f,weights2)->conv2d->g
+//
+// g->pool2d->h
+// h->tanh->i
+// (i,weights3)->conv2d->j
+// Builds a program with three pool2d -> activation -> conv2d chains. The ops
+// of the first chain have is_test set to true, those of the second chain have
+// it set to false, and those of the third chain have no is_test attribute.
+ProgramDesc BuildProgramDesc() {
+  using Names = std::vector<std::string>;
+  ProgramDesc prog;
+  const Names var_names = {"a", "b", "c", "d", "e", "f", "g", "h", "i",
+                           "j", "weights1", "weights2", "weights3"};
+  for (const auto& var_name : var_names) {
+    auto* var_desc = prog.MutableBlock(0)->Var(var_name);
+    var_desc->SetType(proto::VarType::SELECTED_ROWS);
+    // Only the convolution weights are marked persistable.
+    const bool is_weight = var_name == "weights1" || var_name == "weights2" ||
+                           var_name == "weights3";
+    if (is_weight) {
+      var_desc->SetPersistable(true);
+    }
+  }
+  // Chain 1: is_test explicitly true.
+  SetOp(&prog, "pool2d", "pooling1", Names{"a"}, Names{"b"}, true,
+        ISTEST_STATE::TRUE);
+  SetOp(&prog, "relu", "activation1", Names{"b"}, Names{"c"}, true,
+        ISTEST_STATE::TRUE);
+  SetOp(&prog, "conv2d", "conv1", Names{"c", "weights1"}, Names{"d"}, true,
+        ISTEST_STATE::TRUE);
+  // Chain 2: is_test explicitly false.
+  SetOp(&prog, "pool2d", "pooling2", Names{"d"}, Names{"e"}, false,
+        ISTEST_STATE::FALSE);
+  SetOp(&prog, "hard_sigmoid", "activation2", Names{"e"}, Names{"f"}, false,
+        ISTEST_STATE::FALSE);
+  SetOp(&prog, "conv2d", "conv2", Names{"f", "weights2"}, Names{"g"}, false,
+        ISTEST_STATE::FALSE);
+  // Chain 3: is_test attribute absent.
+  SetOp(&prog, "pool2d", "pooling3", Names{"g"}, Names{"h"}, false,
+        ISTEST_STATE::UNSET);
+  SetOp(&prog, "tanh", "activation3", Names{"h"}, Names{"i"}, true,
+        ISTEST_STATE::UNSET);
+  SetOp(&prog, "conv2d", "conv3", Names{"i", "weights3"}, Names{"j"}, false,
+        ISTEST_STATE::UNSET);
+  return prog;
+}
+// Runs is_test_pass over the sample program and checks the outcome: the op
+// named "conv3" (a conv2d built without is_test) must still lack the
+// attribute, while every other op must have is_test set to true.
+TEST(IsTestPass, basic) {
+  auto program = BuildProgramDesc();
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(program));
+  auto pass_under_test = PassRegistry::Instance().Get("is_test_pass");
+  graph = pass_under_test->Apply(std::move(graph));
+  for (auto* node : graph->Nodes()) {
+    if (!node->IsOp()) continue;
+    auto* op_desc = node->Op();
+    const bool is_conv3 =
+        boost::get<std::string>(op_desc->GetAttr("name")) == "conv3";
+    if (is_conv3) {
+      ASSERT_FALSE(op_desc->HasAttr("is_test"));
+      continue;
+    }
+    ASSERT_TRUE(op_desc->HasAttr("is_test"));
+    EXPECT_TRUE(boost::get<bool>(op_desc->GetAttr("is_test")));
+  }
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+USE_PASS(is_test_pass);
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
 }
 void NaiveExecutor::Run() {
+#ifndef PADDLE_ON_INFERENCE
+  LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
+                              "cmake flag ON_INFER is not set.";
+  LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
+                              "variables will be reused to save the allocation "
+                              "overhead.";
+  LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
+                              "setting the cmake flag ON_INFER=ON if you are "
+                              "running Paddle Inference";
+#endif  // PADDLE_ON_INFERENCE
  for (auto &op : ops_) {
    VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
            << " on scope " << scope_;

--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -63,6 +63,8 @@ struct OpKernelType {
        place_(dev_ctx.GetPlace()),
        library_type_(library_type) {}
+  // Returns the hash of this kernel type as computed by the `Hash` functor.
+  size_t hash_key() const { return Hash()(*this); }
  bool operator==(const OpKernelType& o) const {
    return platform::places_are_same_class(place_, o.place_) &&
           data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false,
 namespace paddle {
 namespace framework {
+// Mixes the hash value `a` into `seed` and returns the combined hash.
+inline size_t CombineHash(size_t seed, size_t a) {
+  const size_t xored = seed ^ a;
+  const size_t shifted = (seed << 6) + (seed >> 2);
+  return xored + 0x9e3779b9 + shifted;
+}
 std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
    std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
    std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
@@ -794,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack(
 Scope* OperatorWithKernel::TryTransferData(
    const Scope& scope, const OpKernelType& expected_kernel_key,
    std::vector<std::string>* transfered_inplace_vars) const {
+// In the inference scenario, the scopes are reused across batches, so
+// creating a `new_scope` here on every run makes GPU memory usage explode as
+// the operators keep running.
+// We use a thread_local cache to fix that issue; the key in the cache is the
+// combination of the `scope` argument, from_kernel_type and target_kernel_type.
+// Have a discussion with @Superjomn or the inference developers before changing
+// the logic under this macro, since it might not be tested in other scenarios.
+#ifdef PADDLE_ON_INFERENCE
+  thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
+#endif
  Scope* new_scope = nullptr;
  for (auto& var_name_item : Inputs()) {
    for (auto& var_name : var_name_item.second) {
@@ -824,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData(
      VLOG(30) << "Transform Variable " << var_name << " from "
               << kernel_type_for_var << " to " << expected_kernel_key;
+#ifdef PADDLE_ON_INFERENCE
+      size_t infer_cache_key =
+          CombineHash(OpKernelType::Hash()(kernel_type_for_var),
+                      OpKernelType::Hash()(expected_kernel_key));
+      infer_cache_key =
+          CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
+      auto it = infer_transfer_scope_cache.find(infer_cache_key);
+      if (it != infer_transfer_scope_cache.end()) {
+        new_scope = infer_transfer_scope_cache[infer_cache_key];
+      } else {
+        new_scope = &scope.NewScope();
+        infer_transfer_scope_cache[infer_cache_key] = new_scope;
+      }
+#endif
      if (new_scope == nullptr) {
        new_scope = &scope.NewScope();
      }
      auto* trans_var = new_scope->Var(var_name);
      Tensor out;
      TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
      SetTensorToVariable(*var, out, trans_var);

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -42,7 +42,7 @@ DEFINE_double(
 // a mean time, but a scope may be read by multiple threads concurrently, and
 // the mutex will cause serious performance issue.
 // So the mutex is disabled when `ON_INFER`.
-#ifdef ON_INFER
+#ifdef PADDLE_ON_INFERENCE
 #define SCOPE_LOCK_GUARD
 #else
 #define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);

--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -86,6 +86,7 @@ class CpuPassStrategy : public PassStrategy {
        "fc_fuse_pass",                  //
        "conv_bn_fuse_pass",             //
        "conv_eltwiseadd_bn_fuse_pass",  //
+        "is_test_pass",                  //
    });
  }

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -78,6 +78,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 
  "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
+# mobilenet with depthwise_conv op
+inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet 
+  "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
   # anakin rnn1

--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -71,6 +71,10 @@ class MKLDNNActivationGradKernel
                       diff_y->format() != memory::format::format_undef,
                   "Wrong layout/format set for Input OutGrad tensor");
+    PADDLE_ENFORCE(
+        !ctx.Attr<bool>("is_test"),
+        "is_test attribute should be set to False in training phase.");
    Functor functor;
    auto attrs = functor.GetAttrs();
@@ -115,11 +119,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
  const std::string key_fwd = key_with_layout + "@eltwise_fwd";
  const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd";
+  bool is_test = ctx.Attr<bool>("is_test");
  // save input data and layout to be referred in backward path
  auto p_src_data = std::make_shared<const T *>(x_data);
-  dev_ctx.SetBlob(key_src_data, p_src_data);
  auto p_src_layout = std::make_shared<memory::format>(src_format);
-  dev_ctx.SetBlob(key_src_layout, p_src_layout);
+  if (!is_test) {
+    dev_ctx.SetBlob(key_src_data, p_src_data);
+    dev_ctx.SetBlob(key_src_layout, p_src_layout);
+  }
  auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
      dev_ctx.GetBlob(key_fwd));
@@ -136,14 +144,17 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
    dev_ctx.SetBlob(key_src_mem, src_memory);
    // create primitive descriptor for activation forward and save it
+    auto mkldnn_forward_prop_kind = is_test
+                                        ? mkldnn::prop_kind::forward_inference
+                                        : mkldnn::prop_kind::forward_training;
    auto forward_desc = mkldnn::eltwise_forward::desc(
-        mkldnn::prop_kind::forward_training, algorithm,
+        mkldnn_forward_prop_kind, algorithm,
        src_memory->get_primitive_desc().desc(), alpha, beta);
    auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
        forward_desc, mkldnn_engine);
    // save prim desc into global device context to be referred in backward path
-    dev_ctx.SetBlob(key_fwd_pd, forward_pd);
+    if (!is_test) dev_ctx.SetBlob(key_fwd_pd, forward_pd);
    // create mkldnn memory for output y
    dst_memory =

--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -22,18 +22,23 @@ namespace operators {
 using paddle::framework::Tensor;
-#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)               \
+#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)                \
-  class OP_NAME##OpMaker                                                \
+  class OP_NAME##OpMaker                                                 \
-      : public ::paddle::framework::OpProtoAndCheckerMaker {            \
+      : public ::paddle::framework::OpProtoAndCheckerMaker {             \
-   public:                                                              \
+   public:                                                               \
-    void Make() override {                                              \
+    void Make() override {                                               \
-      AddInput("X", "Input of " #OP_NAME " operator");                  \
+      AddInput("X", "Input of " #OP_NAME " operator");                   \
-      AddOutput("Out", "Output of " #OP_NAME " operator");              \
+      AddOutput("Out", "Output of " #OP_NAME " operator");               \
-      AddAttr<bool>("use_mkldnn",                                       \
+      AddAttr<bool>("use_mkldnn",                                        \
-                    "(bool, default false) Only used in mkldnn kernel") \
+                    "(bool, default false) Only used in mkldnn kernel")  \
-          .SetDefault(false);                                           \
+          .SetDefault(false);                                            \
-      AddComment(#OP_COMMENT);                                          \
+      AddAttr<bool>(                                                     \
-    }                                                                   \
+          "is_test",                                                     \
+          "(bool, default false) Set to true for inference only, false " \
+          "for training. Some layers may run faster when this is true.") \
+          .SetDefault(false);                                            \
+      AddComment(#OP_COMMENT);                                           \
+    }                                                                    \
  }
 #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE)              \
@@ -269,7 +274,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
 :strong:`Softshrink Activation Operator`
 ..  math::
-    out = \begin{cases} 
+    out = \begin{cases}
         x - \lambda, \text{if } x > \lambda \\
         x + \lambda, \text{if } x < -\lambda \\
         0,  \text{otherwise}
@@ -435,7 +440,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 HardSigmoid Activation Operator.
-Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), 
+Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391),
 which is much faster than sigmoid.
 $out = \max(0, \min(1, slope * x + shift))$

--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -113,7 +113,10 @@ class BatchNormOp : public framework::OperatorWithKernel {
 class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddAttr<bool>("is_test", "").SetDefault(false);
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
    AddAttr<float>("momentum", "").SetDefault(0.9);
    AddAttr<float>("epsilon", "")
        .SetDefault(1e-5)

--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -383,20 +383,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    // create a conv primitive descriptor and save it for usage in backward
    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
+    auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
+                                 : mkldnn::prop_kind::forward_training;
    if (bias) {
      bias_tz = paddle::framework::vectorize2int(bias->dims());
      auto bias_md = platform::MKLDNNMemDesc(
          bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
-      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
+      conv_pd = ConvFwdPrimitiveDesc(
-                                     strides, paddings, mkldnn_engine,
+          src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
-                                     fuse_relu, fuse_residual_conn);
+          fuse_relu, fuse_residual_conn, fwd_prop_kind);
    } else {
-      conv_pd =
+      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
-          ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
+                                     paddings, mkldnn_engine, fuse_relu,
-                               mkldnn_engine, fuse_relu, fuse_residual_conn);
+                                     fuse_residual_conn, fwd_prop_kind);
    }
    // Save conv_pd/src_memory/weights_memory for backward pass
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
+    if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd);
    ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
@@ -510,14 +512,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                       const memory::desc& dst, const std::vector<int>& strides,
                       const std::vector<int>& paddings,
                       const mkldnn::engine& engine, const bool fuse_relu,
-                       const bool fuse_residual_conn) const {
+                       const bool fuse_residual_conn,
+                       mkldnn::prop_kind fwd_prop_kind) const {
    memory::dims stride_dims = {strides[0], strides[1]};
    memory::dims padding_dims = {paddings[0], paddings[1]};
    auto conv_desc = mkldnn::convolution_forward::desc(
-        mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
+        fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst,
-        dst, stride_dims, padding_dims, padding_dims,
+        stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
-        mkldnn::padding_kind::zero);
    mkldnn::primitive_attr conv_attr =
        CreatePostOps(fuse_relu, fuse_residual_conn);
@@ -535,14 +537,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                       const std::vector<int>& strides,
                       const std::vector<int>& paddings,
                       const mkldnn::engine& engine, const bool fuse_relu,
-                       const bool fuse_residual_conn) const {
+                       const bool fuse_residual_conn,
+                       mkldnn::prop_kind fwd_prop_kind) const {
    memory::dims stride_dims = {strides[0], strides[1]};
    memory::dims padding_dims = {paddings[0], paddings[1]};
    auto conv_desc = mkldnn::convolution_forward::desc(
-        mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
+        fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst,
-        bias, dst, stride_dims, padding_dims, padding_dims,
+        stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
-        mkldnn::padding_kind::zero);
    mkldnn::primitive_attr conv_attr =
        CreatePostOps(fuse_relu, fuse_residual_conn);
@@ -587,6 +589,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
                       output_grad->format() != memory::format::format_undef,
                   "Wrong layout/format set for output_grad tensor");
+    PADDLE_ENFORCE(
+        !ctx.Attr<bool>("is_test"),
+        "is_test attribute should be set to False in training phase.");
    if (!input_grad && !filter_grad) return;
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");

--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -109,7 +109,10 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
 }
 void Conv2DOpMaker::Make() {
-  AddAttr<bool>("is_test", "").SetDefault(false);
+  AddAttr<bool>("is_test",
+                "(bool, default false) Set to true for inference only, false "
+                "for training. Some layers may run faster when this is true.")
+      .SetDefault(false);
  AddInput(
      "Input",
      "(Tensor) The input tensor of convolution operator. "

--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -49,7 +49,10 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
          PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f,
                         "'dropout_prob' must be between 0.0 and 1.0.");
        });
-    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
    AddAttr<bool>("fix_seed",
                  "A flag indicating whether to use a fixed seed to generate "
                  "random mask. NOTE: DO NOT set this flag to true in "

--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -138,7 +138,7 @@ class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 FakeQuantize operator
-$$scale = max(abs(X))$$ 
+$$scale = max(abs(X))$$
 $$range = 2^{bit_length - 1} - 1$$
 $$Out = round(X/scale * range)$$
@@ -199,11 +199,14 @@ class FakeQuantizeRangeAbsMaxOpMaker
          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
                         "'bit_length' should be between 1 and 16.");
        });
-    AddAttr<bool>("is_test", "").SetDefault(false);
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
    AddComment(R"DOC(
 FakeQuantize operator is used in static quantization.
-$$scale = max(max(abs(x)), history_abs_max)$$ 
+$$scale = max(max(abs(x)), history_abs_max)$$
 $$range = 2^{bit_length - 1} - 1$$
 $$Out = round(X/scale * range)$$

--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -46,7 +46,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> {
    int pre_pad = (n - 1) / 2;
    // compute batches one by one
    for (int i = 0; i < N; ++i) {
-      blas.VSQR(fea_size, idata + i * fea_size, sdata + pre_pad * img_size);
+      blas.VSQUARE(fea_size, idata + i * fea_size, sdata + pre_pad * img_size);
      // init the first channel of mid
      for (int c = 0; c < n; ++c) {
        blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i * fea_size);
@@ -229,8 +229,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
        "the input will be transformed automatically. ")
        .SetDefault("AnyLayout");
    AddAttr<bool>("is_test",
-                  "Turns on memory optimization that optimizes away "
+                  "(bool, default false) Set to true for inference only, false "
-                  "unnecessary memory allocations. Used by MKLDNN.")
+                  "for training. Some layers may run faster when this is true.")
        .SetDefault(false);
    AddComment(R"DOC(

--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -153,7 +153,7 @@ class Blas {
  void VEXP(int n, const T* x, T* y) const;
  template <typename T>
-  void VSQR(int n, const T* x, T* y) const;
+  void VSQUARE(int n, const T* x, T* y) const;
  template <typename T>
  void VPOW(int n, const T* x, T alpha, T* y) const;
@@ -245,8 +245,8 @@ class BlasT : private Blas<DeviceContext> {
  }
  template <typename... ARGS>
-  void VSQR(ARGS... args) const {
+  void VSQUARE(ARGS... args) const {
-    Base()->template VSQR<T>(args...);
+    Base()->template VSQUARE<T>(args...);
  }
  template <typename... ARGS>

--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -105,7 +105,7 @@ struct CBlas<float> {
  }
  template <typename... ARGS>
-  static void VSQR(ARGS... args) {
+  static void VSQUARE(ARGS... args) {
    platform::dynload::vsSqr(args...);
  }
@@ -195,7 +195,7 @@ struct CBlas<double> {
  }
  template <typename... ARGS>
-  static void VSQR(ARGS... args) {
+  static void VSQUARE(ARGS... args) {
    platform::dynload::vdSqr(args...);
  }
@@ -262,7 +262,9 @@ struct CBlas<platform::float16> {
  }
  static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
  static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
-  static void VSQR(...) { PADDLE_THROW("float16 VSQR not supported on CPU"); }
+  static void VSQUARE(...) {
+    PADDLE_THROW("float16 VSQUARE not supported on CPU");
+  }
  static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
  static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
  static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
@@ -423,12 +425,12 @@ void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const {
 template <>
 template <typename T>
-void Blas<platform::CPUDeviceContext>::VSQR(int n, const T *x, T *y) const {
+void Blas<platform::CPUDeviceContext>::VSQUARE(int n, const T *x, T *y) const {
 #ifdef PADDLE_WITH_MKLML
-  CBlas<T>::VSQR(n, x, y);
+  CBlas<T>::VSQUARE(n, x, y);
 #else
  for (int i = 0; i < n; ++i) {
-    y[i] = std::sqrt(x[i]);
+    y[i] = x[i] * x[i];
  }
 #endif
 }

--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
@@ -87,6 +87,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    bool is_test = ctx.Attr<bool>("is_test");
    if (ctx.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
@@ -142,16 +143,10 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
          CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top,
                              padding_right_bottom, ksize, pooling_type,
-                              mkldnn_engine, ceil_mode);
+                              mkldnn_engine, ceil_mode, is_test);
      // save pool_pd into global device context to be referred in backward path
-      dev_ctx.SetBlob(key_pool_pd, pool_pd);
+      if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd);
-      std::shared_ptr<mkldnn::memory> workspace_memory =
-          CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
-      // save pool_workspace_memory to be referred in backward path
-      dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
      auto src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
                                                 to_void_cast<T>(input_data));
@@ -161,9 +156,19 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      dev_ctx.SetBlob(key_pool_src_mem_p, src_memory);
      dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory);
-      pool_p = std::make_shared<pooling_forward>(*pool_pd, *(src_memory.get()),
+      if (is_test) {
-                                                 *(dst_memory.get()),
+        pool_p = std::make_shared<pooling_forward>(*pool_pd, *src_memory,
-                                                 *workspace_memory);
+                                                   *dst_memory);
+      } else {
+        std::shared_ptr<mkldnn::memory> workspace_memory =
+            CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
+        // save pool_workspace_memory to be referred in backward path
+        dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
+        pool_p = std::make_shared<pooling_forward>(
+            *pool_pd, *src_memory, *dst_memory, *workspace_memory);
+      }
      dev_ctx.SetBlob(key_pool_p, pool_p);
@@ -201,9 +206,12 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      const std::vector<int>& stride, const std::vector<int>& padding_left_top,
      const std::vector<int>& padding_right_bot, const std::vector<int>& kernel,
      const std::string& pooling_type, const mkldnn::engine& engine,
-      bool ceil_mode) const {
+      bool ceil_mode, bool is_test) const {
+    auto mkldnn_forward_prop_kind = is_test
+                                        ? mkldnn::prop_kind::forward_inference
+                                        : mkldnn::prop_kind::forward_training;
    auto pool_desc = mkldnn::pooling_forward::desc(
-        mkldnn::prop_kind::forward,
+        mkldnn_forward_prop_kind,
        pooling_type == "max" ? mkldnn::algorithm::pooling_max
                              : mkldnn::algorithm::pooling_avg,
        src, dst, stride, kernel, padding_left_top, padding_right_bot,
@@ -248,6 +256,10 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
                       out_grad->format() != memory::format::format_undef,
                   "Wrong layout/format set for Input output_grad tensor");
+    PADDLE_ENFORCE(
+        !ctx.Attr<bool>("is_test"),
+        "is_test attribute should be set to False in training phase.");
    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");

--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -206,6 +206,11 @@ void Pool2dOpMaker::Make() {
      "Defaults to \"NHWC\". Specify the data format of the output data, "
      "the input will be transformed automatically. ")
      .SetDefault("AnyLayout");
+  AddAttr<bool>("is_test",
+                "(bool, default false) Set to true for inference only, false "
+                "for training. Some layers may run faster when this is true.")
+      .SetDefault(false);
  // TODO(dzhwinter): need to registered layout transform function
  AddComment(R"DOC(

--- a/paddle/fluid/operators/selu_op.cc
+++ b/paddle/fluid/operators/selu_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/selu_op.h"
+#include <string>
+namespace paddle {
+namespace operators {
+// Forward selu operator. It requires input X and output Out, and makes Out
+// share the dimensions and the LoD of X.
+class SeluOp : public framework::OperatorWithKernel {
+ public:
+  SeluOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  // Verifies that X and Out are present, then shares X's dims and LoD with
+  // Out.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SeluOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SeluOp should not be null.");
+    ctx->ShareDim("X", /*->*/ "Out");
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+ protected:
+  // The kernel type is built from the data type of variable X and the place
+  // of the execution context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.GetPlace());
+  }
+};
+// Variable-type inference for selu: reports the single pair X -> Out to the
+// PassInDtypeAndVarTypeToOutput base class.
+class SeluOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    std::unordered_map<std::string, std::string> same_type_pairs;
+    same_type_pairs.emplace("X", /*->*/ "Out");
+    return same_type_pairs;
+  }
+};
+// Declares the interface of the selu operator: one input (X), one output
+// (Out) and two float attributes (scale, alpha), plus the operator comment.
+class SeluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor of selu operator.");
+    AddOutput("Out", "The output tensor of selu operator.");
+    // scale is the outer multiplier (lambda in the equation below).
+    AddAttr<float>("scale",
+                   "(float) the default value is 1.0507~. For more "
+                   "information about this value, please refer to:"
+                   "https://arxiv.org/abs/1706.02515.")
+        .SetDefault(1.0507009873554804934193349852946);
+    // alpha is the coefficient of the exponential branch used for x <= 0.
+    AddAttr<float>("alpha",
+                   "(float) the default value is 1.6732~. For more "
+                   "information about this value, please refer to:"
+                   "https://arxiv.org/abs/1706.02515.")
+        .SetDefault(1.6732632423543772848170429916717);
+    // The raw string below is the operator comment registered for this op.
+    AddComment(R"DOC(
+Selu Operator.
+The equation is:
+$$
+f(x) =\lambda*
+\begin{cases}
+ \quad \quad   x,  \quad \quad \quad \text{if} \ x > 0 \\
+ \alpha * e^x - \alpha,  \qquad  \text{if} \ x <= 0
+\end{cases}
+$$
+The input `X` can carry the LoD (Level of Details) information,
+or not. And the output shares the LoD information with input `X`.
+)DOC");
+  }
+};
+// Builds the description of the selu_grad operator from the forward selu op.
+class SeluGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  // Returns a selu_grad OpDesc whose inputs are Out and Out@GRAD, whose
+  // output is X@GRAD, and whose attributes are copied from the forward op.
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    // Take ownership at allocation time so the OpDesc is released if any of
+    // the setters below throws.
+    std::unique_ptr<framework::OpDesc> grad_op(new framework::OpDesc());
+    grad_op->SetType("selu_grad");
+    grad_op->SetInput("Out", Output("Out"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(this->Attrs());
+    return grad_op;
+  }
+};
+// Backward selu operator. It consumes the forward output Out and its gradient
+// Out@GRAD, and produces X@GRAD.
+class SeluGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Verifies that Out@GRAD and Out are present, then gives X@GRAD the same
+  // dimensions as Out.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null");
+    auto x_grad_name = framework::GradVarName("X");
+    ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("Out"));
+  }
+ protected:
+  // The kernel type is built from the data type of variable Out and the
+  // place of the execution context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::GetDataTypeOfVar(ctx.InputVar("Out")), ctx.GetPlace());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the forward op together with its proto maker, var-type inference
+// and grad-op maker, then the backward op on its own.
+REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType,
+                  ops::SeluGradMaker);
+REGISTER_OPERATOR(selu_grad, ops::SeluGradOp);
+// CPU kernels for selu and selu_grad, instantiated for float and double.
+REGISTER_OP_CPU_KERNEL(
+    selu, ops::SeluKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SeluKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    selu_grad, ops::SeluGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SeluGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/selu_op.cu
+++ b/paddle/fluid/operators/selu_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/selu_op.h"
+namespace ops = paddle::operators;
+// CUDA kernels for selu and selu_grad, instantiated for float and double.
+REGISTER_OP_CUDA_KERNEL(
+    selu, ops::SeluKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SeluKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    selu_grad, ops::SeluGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SeluGradKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/fluid/operators/selu_op.h
+++ b/paddle/fluid/operators/selu_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/for_range.h"
+namespace paddle {
+namespace operators {
+// Overloads that select the exponential matching the argument's precision.
+static HOSTDEVICE float real_exp(float x) { return expf(x); }
+// Returns double: a float return type here would round the double-precision
+// result of exp() to single precision.
+static HOSTDEVICE double real_exp(double x) { return exp(x); }
+// Element-wise selu forward functor. For the element at idx it writes
+// scale * (alpha * exp(x) - alpha) when x <= 0, and scale * x otherwise.
+template <typename T>
+struct SeluFunctor {
+  SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr)
+      : x_data_ptr_(x_data_ptr),
+        alpha_(alpha),
+        scale_(scale),
+        y_data_ptr_(y_data_ptr) {}
+  HOSTDEVICE void operator()(size_t idx) const {
+    const T input = x_data_ptr_[idx];
+    // The exponential branch is stored as T before the final scaling.
+    const T activated =
+        input <= 0 ? static_cast<T>(alpha_ * real_exp(input) - alpha_) : input;
+    y_data_ptr_[idx] = scale_ * activated;
+  }
+  const T* x_data_ptr_;
+  const float alpha_;
+  const float scale_;
+  T* y_data_ptr_;
+};
+// Element-wise selu backward functor, computed from the forward output y.
+// For y > 0 the local derivative is scale. Otherwise the forward pass gave
+// y = scale * (alpha * exp(x) - alpha), so the derivative
+// scale * alpha * exp(x) equals y + alpha * scale (stored as la_).
+template <typename T>
+struct SeluGradFunctor {
+  SeluGradFunctor(const T* y_data_ptr, const T* dy_data_ptr, float alpha,
+                  float scale, T* dx_data_ptr)
+      : y_data_ptr_(y_data_ptr),
+        dy_data_ptr_(dy_data_ptr),
+        alpha_(alpha),
+        scale_(scale),
+        la_(alpha * scale),
+        dx_data_ptr_(dx_data_ptr) {}
+  HOSTDEVICE void operator()(size_t idx) const {
+    T y_ele = y_data_ptr_[idx];
+    T dy_ele = dy_data_ptr_[idx];
+    // Keep the local derivative in T: a float temporary would round
+    // y_ele + la_ to single precision when T is double.
+    T tmp = scale_;
+    if (y_ele <= 0) {
+      tmp = y_ele + la_;
+    }
+    dx_data_ptr_[idx] = dy_ele * tmp;
+  }
+  const T* y_data_ptr_;
+  const T* dy_data_ptr_;
+  const float alpha_;
+  const float scale_;
+  const float la_;
+  T* dx_data_ptr_;
+};
+// Forward selu kernel: applies SeluFunctor to every element of X and writes
+// the results into Out.
+template <typename DeviceContext, typename T>
+class SeluKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* input = context.Input<framework::Tensor>("X");
+    auto* output = context.Output<framework::Tensor>("Out");
+    const float alpha = context.Attr<float>("alpha");
+    const float scale = context.Attr<float>("scale");
+    // Allocate the output buffer before reading the input pointer, in the
+    // same order as before.
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    SeluFunctor<T> selu(input->data<T>(), alpha, scale, output_data);
+    const size_t num_elements = static_cast<size_t>(input->numel());
+    platform::ForRange<DeviceContext> run_over_elements(
+        context.template device_context<DeviceContext>(), num_elements);
+    run_over_elements(selu);
+  }
+};
+// Backward selu kernel: applies SeluGradFunctor to every element of Out and
+// Out@GRAD and writes the results into X@GRAD.
+template <typename DeviceContext, typename T>
+class SeluGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* forward_out = context.Input<framework::Tensor>("Out");
+    const auto* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    const float alpha = context.Attr<float>("alpha");
+    const float scale = context.Attr<float>("scale");
+    // Allocate the gradient buffer before reading the input pointers, in the
+    // same order as before.
+    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
+    SeluGradFunctor<T> selu_grad(forward_out->data<T>(), out_grad->data<T>(),
+                                 alpha, scale, x_grad_data);
+    const size_t num_elements = static_cast<size_t>(forward_out->numel());
+    platform::ForRange<DeviceContext> run_over_elements(
+        context.template device_context<DeviceContext>(), num_elements);
+    run_over_elements(selu_grad);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
@@ -47,7 +47,10 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor<int>) This tensor is used for the sequence max-pooling "
              "to record the max indexes.")
        .AsIntermediate();
-    AddAttr<bool>("is_test", "").SetDefault(false);
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
    AddAttr<std::string>(
        "pooltype",
        "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")

--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -96,20 +96,21 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(bool, default false) Only used in mkldnn kernel")
        .SetDefault(false);
    AddAttr<bool>("is_test",
-                  "Disable epsilon adding to softmax results. Used by MKLDNN.")
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
        .SetDefault(false);
    AddComment(R"DOC(
 Softmax Operator.
-The input of the softmax operator is a tensor of any rank. The output tensor 
+The input of the softmax operator is a tensor of any rank. The output tensor
 has the same shape as the input.
-The input tensor will first be logically flattened to a 2-D matrix. The matrix's 
+The input tensor will first be logically flattened to a 2-D matrix. The matrix's
-second dimension(row length) is as same as the last dimension of the input 
+second dimension(row length) is as same as the last dimension of the input
-tensor, and the first dimension(column length) is the product of all other 
+tensor, and the first dimension(column length) is the product of all other
-dimensions of the input tensor. For each row of the matrix, the softmax operator 
+dimensions of the input tensor. For each row of the matrix, the softmax operator
-squashes the K-dimensional(K is the width of the matrix, which is also the size 
+squashes the K-dimensional(K is the width of the matrix, which is also the size
-of the input tensor's last dimension) vector of arbitrary real values to a 
+of the input tensor's last dimension) vector of arbitrary real values to a
 K-dimensional vector of real values in the range [0, 1] that add up to 1.
 It computes the exponential of the given dimension and the sum of exponential
 values of all the other dimensions in the K-dimensional vector input.

--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -92,7 +92,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
              "variables generated in the i'th step.");
    AddAttr<framework::BlockDesc *>(kStepBlock,
                                    "The step block inside WhileOp");
-    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
    AddComment(R"DOC(
 )DOC");
  }

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -110,6 +110,7 @@ __all__ = [
    'random_crop',
    'mean_iou',
    'relu',
+    'selu',
    'log',
    'crop',
    'rank_loss',
@@ -6188,6 +6189,47 @@ def relu(x, name=None):
    return out
+@templatedoc()
+def selu(x, scale=None, alpha=None, name=None):
+    """
+    ${comment}
+    Args:
+        x (Variable): The input tensor.
+        scale (float|None): Outer multiplier of the activation. When it is
+            None the operator default, 1.0507009873554804934193349852946,
+            is used. For more information about this value, please refer
+            to: https://arxiv.org/abs/1706.02515.
+        alpha (float|None): Coefficient of the exponential branch. When it
+            is None the operator default,
+            1.6732632423543772848170429916717, is used. For more
+            information about this value, please refer
+            to: https://arxiv.org/abs/1706.02515.
+        name (str|None, default None): A name for this layer. If set to
+            None, the layer will be named automatically.
+    Returns:
+        Variable: The output tensor with the same shape as input.
+    Examples:
+        .. code-block:: python
+            output = fluid.layers.selu(x)
+    """
+    helper = LayerHelper('selu', **locals())
+    out = helper.create_variable_for_type_inference(
+        helper.input_dtype(input_param_name='x'))
+    # Forward only the coefficients given explicitly, so the operator falls
+    # back to its own defaults for the others.
+    attrs = {
+        key: value
+        for key, value in (("scale", scale), ("alpha", alpha))
+        if value is not None
+    }
+    helper.append_op(
+        type="selu", inputs={"X": x}, outputs={"Out": out}, attrs=attrs)
+    return out
 def mean_iou(input, label, num_classes):
    """
    Mean Intersection-Over-Union is a common evaluation metric for

--- a/python/paddle/fluid/tests/unittests/test_selu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_selu_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+import numpy as np
+import six
+from op_test import OpTest
+# Checks the output and the gradient of the selu operator against a NumPy
+# reference computed element by element.
+class SeluTest(OpTest):
+    def setUp(self):
+        self.op_type = "selu"
+        self.x_shape = [3, 5, 5, 10]
+        self.dtype = np.float32
+        # Hooks that may replace the default shape and dtype set above.
+        self.init_x_shape()
+        self.init_dtype()
+        alpha = 1.6732632423543772848170429916717
+        scale = 1.0507009873554804934193349852946
+        x = np.random.normal(size=self.x_shape).astype(self.dtype)
+        # selu is not differentiable at zero, so move every sample that is
+        # too close to zero to a fixed positive value.
+        x[np.abs(x) < 0.005] = 0.02
+        # flatten() returns a copy, so the reference output is built without
+        # modifying the input x.
+        x_flat = x.flatten()
+        for i in range(x_flat.size):
+            if x_flat[i] < 0:
+                x_flat[i] = alpha * np.exp(x_flat[i]) - alpha
+            x_flat[i] = scale * x_flat[i]
+        out_np = x_flat.reshape(self.x_shape)
+        self.inputs = {'X': x}
+        self.outputs = {'Out': out_np}
+        self.attrs = {
+            'alpha': alpha,
+            'scale': scale,
+        }
+    # No-op by default; called from setUp to adjust self.x_shape.
+    def init_x_shape(self):
+        pass
+    # No-op by default; called from setUp to adjust self.dtype.
+    def init_dtype(self):
+        pass
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -73,6 +73,38 @@ class InferenceTranspiler(object):
                program)  # ResNet residual block merging
            self._fuse_bn_relu_mkldnn(program)
+        self._is_test_pass(program)
+    def _is_test_pass(self, program):
+        '''
+        Set is_test = True on the operators of block 0 of the program.
+        Every operator that already has an is_test attribute is switched to
+        True, and the pooling and activation operators listed below get the
+        attribute set even when they do not have it yet.
+        As a result some operators might run faster
+        :param program: program to transpile
+        :type program: Program
+        '''
+        # Operator types that receive is_test even without having it already.
+        forced_op_types = [
+            "pool2d", "sigmoid", "logsigmoid", "softshrink", "exp", "brelu",
+            "pow", "leaky_relu", "stanh", "relu", "tanh", "tanh_shrink",
+            "sqrt", "abs", "ceil", "elu", "floor", "cos", "sin", "round",
+            "reciprocal", "hard_shrink", "hard_sigmoid", "relu6", "soft_relu",
+            "swish", "thresholded_relu", "log", "square", "softplus",
+            "softsign"
+        ]
+        self.block = program.block(0)
+        for current_op in self.block.ops:
+            if (current_op.has_attr("is_test") or
+                    current_op.type in forced_op_types):
+                current_op._set_attr("is_test", True)
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
    def _depthwise_conv_mkldnn(self, program):
        '''
        Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program.

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -142,7 +142,7 @@ if os.name == 'nt':
 if '${WITH_FLUID_ONLY}'== 'OFF':
    package_data['paddle.v2.master']=['libpaddle_master' + ext_name]
-    package_data['py_paddle']=['*.py','_swig_paddle' +  + ext_name]
+    package_data['py_paddle']=['*.py','_swig_paddle' + ext_name]
 package_dir={
    '': '${PADDLE_BINARY_DIR}/python',