Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into async_executor

21d2a05c · wangguibao · d8cb787d · cdf2579d · 21d2a05c · 21d2a05c
26 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,6 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
-option(WITH_INFERENCE    "Compile fluid inference library"              ON)
 option(ON_INFER         "Turn on inference optimization."               OFF)
 option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -24,6 +24,7 @@ if(NOT WITH_FLUID_ONLY)
 endif()
 add_subdirectory(testing)
+set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
 if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
  add_subdirectory(fluid)
 endif()
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -64,7 +64,7 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', '
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
 paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
-paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
@@ -177,6 +177,8 @@ paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, k
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
 paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
+paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
+paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)

--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -9,8 +9,6 @@ add_subdirectory(pybind)
 add_subdirectory(recordio)
 endif(NOT WIN32)
-if(WITH_INFERENCE)
+# NOTE: please add subdirectory inference at last.
-  # NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
-  add_subdirectory(inference)
+add_subdirectory(train)
-  add_subdirectory(train)
-endif()
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -358,7 +358,7 @@ static bool VarIsTensor(const Variable* var) {
  return var->IsType<LoDTensor>() || var->IsType<SelectedRows>();
 }
-static const Tensor* GetTensorFromVar(Variable* var) {
+const Tensor* GetTensorFromVar(Variable* var) {
  if (var->IsType<LoDTensor>()) {
    return var->GetMutable<LoDTensor>();
  } else if (var->IsType<SelectedRows>()) {

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -63,6 +63,7 @@ inline std::string GradVarName(const std::string& var_name) {
 }
 proto::VarType::Type GetDataTypeOfVar(const Variable* var);
+const Tensor* GetTensorFromVar(Variable* var);
 class OperatorBase;
 class ExecutionContext;

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -303,10 +303,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
 }
 ParallelExecutor::~ParallelExecutor() {
-  const auto dev_ctxs =
+  for (auto &p : member_->places_) {
-      platform::DeviceContextPool::Instance().GetAllDeviceContexts();
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  for (auto &dev_ctx : dev_ctxs) {
-    dev_ctx->Wait();
  }
  if (member_->own_local_scope_) {

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -61,8 +61,6 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_api_impl SRC api_impl_tester.cc
                    ARGS test_word2vec test_image_classification)
-set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
 cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
        ARGS --dirname=${PYTHON_TESTS_DIR}/book)

--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -22,9 +22,9 @@ limitations under the License. */
 #include "paddle/fluid/inference/tests/test_helper.h"
 #ifdef __clang__
-#define ACC_DIFF 4e-3
+#define ACC_DIFF 4e-2
 #else
-#define ACC_DIFF 1e-3
+#define ACC_DIFF 1e-2
 #endif
 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -187,7 +187,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_jobs; ++tid) {
    threads.emplace_back([&, tid]() {
-      auto predictor = main_predictor->Clone();
+      auto predictor = CreatePaddlePredictor(config);
      auto& local_inputs = paddle_tensor_feeds[tid];
      std::vector<PaddleTensor> local_outputs;
      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
@@ -245,7 +245,7 @@ void MainThreadsImageClassification(bool use_gpu) {
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_jobs; ++tid) {
    threads.emplace_back([&, tid]() {
-      auto predictor = main_predictor->Clone();
+      auto predictor = CreatePaddlePredictor(config);
      auto& local_inputs = paddle_tensor_feeds[tid];
      std::vector<PaddleTensor> local_outputs;
      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));

--- a/paddle/fluid/operators/add_position_encoding_op.cc
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/add_position_encoding_op.h"
+namespace paddle {
+namespace operators {
+class AddPositionEncodingOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "X(Input) of add_position_encoding_op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Out(Output) of add_position_encoding_op should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Out@GRAD must not be null.");
+    auto out_dims = ctx->GetInputDim("Out");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
+    }
+  }
+};
+class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of AddPositionEncoding operator");
+    AddOutput("Out", "Output of AddPositionEncoding operator");
+    AddAttr<float>("alpha", "The scale of Original Embedding.")
+        .SetDefault(1.0f)
+        .AddCustomChecker([](const float& alpha) {
+          PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0.");
+        });
+    AddAttr<float>("beta", "The scale of Position Embedding.")
+        .SetDefault(1.0f)
+        .AddCustomChecker([](const float& beta) {
+          PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0.");
+        });
+    AddComment(R"DOC(
+    Add Position Encoding Operator.
+    The add position encoding calculates the output based on the input, alpha, beta.
+    The size of each dimension of the parameters checked in the infer-shape.
+  )DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plt = paddle::platform;
+REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
+                  ops::AddPositionEncodingOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    add_position_encoding,
+    ops::AddPositionEncodingKernel<plt::CPUDeviceContext, float>,
+    ops::AddPositionEncodingKernel<plt::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    add_position_encoding_grad,
+    ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, float>,
+    ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/add_position_encoding_op.h
+++ b/paddle/fluid/operators/add_position_encoding_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class AddPositionEncodingKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::LoDTensor>("X");
+    auto& x_lod = X->lod();
+    auto* src_ptr = X->data<T>();
+    auto* Out = context.Output<framework::LoDTensor>("Out");
+    auto* dst_ptr = Out->mutable_data<T>(context.GetPlace());
+    float alpha = context.Attr<float>("alpha");
+    float beta = context.Attr<float>("beta");
+    auto x_dim = X->dims();
+    int batch_size = 0;
+    int max_seq_len = 0;
+    int enc_size = 0;
+    if (x_lod.empty()) {
+      PADDLE_ENFORCE(
+          x_dim.size() == 3UL,
+          "The input X of Add Position Encoding should be 3-D Tensor!");
+      batch_size = x_dim[0];
+      max_seq_len = x_dim[1];
+      enc_size = x_dim[2];
+    } else {
+      PADDLE_ENFORCE(
+          x_dim.size() == 2UL,
+          "The input X of Add Position Encoding should be 2-D LoDTensor!");
+      PADDLE_ENFORCE(
+          x_lod.size() == 1UL,
+          "The Add Position Encoding Op only supports lod_level == 1!");
+      batch_size = x_lod[0].size() - 1;
+      max_seq_len = -1;
+      enc_size = x_dim[1];
+    }
+    PADDLE_ENFORCE(enc_size % 2 == 0, "Only support even encode size!");
+    const int half_size = enc_size / 2;
+    for (int i = 0; i < batch_size; ++i) {
+      const int max_length =
+          x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
+      for (int j = 0; j < max_length; ++j) {
+        for (int k = 0; k < half_size; ++k) {
+          const double val = (half_size > 1)
+                                 ? j / pow(10000.0, double(k) / (half_size - 1))
+                                 : j / 10000.0;
+          dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
+          dst_ptr[half_size + k] =
+              src_ptr[half_size + k] * alpha + cos(val) * beta;
+        }
+        src_ptr += enc_size;
+        dst_ptr += enc_size;
+      }
+    }
+  }
+};
+template <typename DeviceContext, typename T>
+class AddPositionEncodingGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dOut =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto dout = framework::EigenVector<T>::Flatten(*dOut);
+    auto* dX =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    dX->mutable_data<T>(context.GetPlace());
+    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    float alpha = context.Attr<float>("alpha");
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+    dx.device(*place) = dout * static_cast<T>(alpha);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -31,7 +31,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename T>
+template <typename T, bool is_test>
 class MaxSeqPoolFunctor {
 public:
  void operator()(const platform::CPUDeviceContext& context,
@@ -70,7 +70,41 @@ class MaxSeqPoolFunctor {
    }
  }
 };
+// Instantisation of Max Sequence Pooling for test phase eg. no need to fill
+// index buffer
+template <typename T>
+class MaxSeqPoolFunctor<T, true> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), 1);
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim],
+                  dim * sizeof(T));
+      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
+        for (int64_t k = 0; k < dim; ++k) {
+          if (in_data[j * dim + k] > out_data[i * dim + k]) {
+            out_data[i * dim + k] = in_data[j * dim + k];
+          }
+        }
+      }
+    }
+  }
+};
 template <typename T>
 class MaxSeqPoolGradFunctor {
 public:
@@ -188,11 +222,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
  /* max pool has index output */
  void operator()(const platform::CPUDeviceContext& context,
                  const std::string pooltype, const framework::LoDTensor& input,
-                  framework::Tensor* output,
+                  framework::Tensor* output, bool is_test,
                  framework::Tensor* index = nullptr) {
    if (pooltype == "MAX") {
-      math::MaxSeqPoolFunctor<T> max_pool;
+      if (is_test) {
-      max_pool(context, input, output, index);
+        math::MaxSeqPoolFunctor<T, true> max_pool;
+        max_pool(context, input, output, index);
+      } else {
+        math::MaxSeqPoolFunctor<T, false> max_pool;
+        max_pool(context, input, output, index);
+      }
      return;
    }
    if (pooltype == "LAST") {
@@ -200,6 +239,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
      last_pool(context, input, output);
      return;
    }
    if (pooltype == "FIRST") {
      math::FirstSeqPoolFunctor<T> first_pool;
      first_pool(context, input, output);

--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -133,7 +133,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const std::string pooltype, const framework::LoDTensor& input,
-                  framework::Tensor* output,
+                  framework::Tensor* output, bool is_test,
                  framework::Tensor* index = nullptr) {
    auto& lod = input.lod()[0];
    const size_t item_dim = output->numel() / output->dims()[0];

--- a/paddle/fluid/operators/math/sequence_pooling.h
+++ b/paddle/fluid/operators/math/sequence_pooling.h
@@ -28,7 +28,7 @@ class SequencePoolFunctor {
  /* max pool has index output */
  void operator()(const DeviceContext& context, const std::string pooltype,
                  const framework::LoDTensor& input, framework::Tensor* output,
-                  framework::Tensor* index = nullptr);
+                  bool is_test = false, framework::Tensor* index = nullptr);
 };
 template <typename DeviceContext, typename T>

--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
@@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor<int>) This tensor is used for the sequence max-pooling "
              "to record the max indexes.")
        .AsIntermediate();
+    AddAttr<bool>("is_test", "").SetDefault(false);
    AddAttr<std::string>(
        "pooltype",
        "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")

--- a/paddle/fluid/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
@@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
    auto* in = context.Input<LoDTensor>("X");
    auto* out = context.Output<Tensor>("Out");
    std::string pooltype = context.Attr<std::string>("pooltype");
-    Tensor* index = nullptr;
-    if (pooltype == "MAX") {
-      index = context.Output<Tensor>("MaxIndex");
-    }
    auto dims = in->dims();
    auto lod = in->lod();
@@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel<T> {
    dims[0] = lod[0].size() - 1;
    out->Resize({dims});
    out->mutable_data<T>(context.GetPlace());
-    if (pooltype == "MAX") {
+    Tensor* index = nullptr;
+    const bool is_test = context.Attr<bool>("is_test");
+    // Do not create index buffer for inference (is_test) mode
+    // TODO(jczaja): Skip index buffer creation for other devices eg. GPU
+    if (pooltype == "MAX" &&
+        (is_test == false ||
+         platform::is_cpu_place(context.GetPlace()) == false)) {
+      index = context.Output<Tensor>("MaxIndex");
      index->Resize({dims});
      index->mutable_data<int>(context.GetPlace());
    }
    math::SequencePoolFunctor<DeviceContext, T> pool;
    pool(context.template device_context<DeviceContext>(), pooltype, *in, out,
-         index);
+         is_test, index);
  }
 };

--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -82,14 +82,16 @@ class SumOp : public framework::OperatorWithKernel {
    if (x_vars[0]->IsType<framework::LoDTensor>()) {
      int dtype = -1;
      for (auto& x_var : x_vars) {
-        auto& lod_tensor = x_var->Get<framework::LoDTensor>();
+        // FIXME(zcd): The input x_var may be SelectedRows or LoDTensor.
-        if (lod_tensor.numel() == 0) {
+        auto tensor = framework::GetTensorFromVar(
+            const_cast<framework::Variable*>(x_var));
+        if (tensor->numel() == 0) {
          continue;
        }
        if (dtype == -1) {
-          dtype = framework::ToDataType(lod_tensor.type());
+          dtype = framework::ToDataType(tensor->type());
        } else {
-          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type()));
+          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(tensor->type()));
        }
      }
      PADDLE_ENFORCE_NE(dtype, -1,

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -32,23 +32,25 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
        "'Place' is not supported, Please re-compile with WITH_GPU "
        "option");
  }
-  return it->second.get();
+  return it->second.get().get();
 }
-const std::vector<const DeviceContext*>
+template <typename DevCtx, typename PlaceType>
-DeviceContextPool::GetAllDeviceContexts() const {
+inline void EmplaceDeviceContext(
-  std::vector<const DeviceContext*> all_device_ctx;
+    std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
-  all_device_ctx.reserve(device_contexts_.size());
+        map_ptr,
-  for (auto& dev_ctx : device_contexts_) {
+    platform::Place p) {
-    all_device_ctx.emplace_back(dev_ctx.second.get());
+  using PtrType = std::unique_ptr<DeviceContext>;
-  }
+  map_ptr->emplace(p, std::async(std::launch::deferred, [=] {
-  return all_device_ctx;
+                     // lazy evaluation. i.e., only create device context at
+                     // first `Get`
+                     return PtrType(new DevCtx(boost::get<PlaceType>(p)));
+                   }));
 }
 DeviceContextPool::DeviceContextPool(
    const std::vector<platform::Place>& places) {
  PADDLE_ENFORCE_GT(places.size(), 0);
-  using PtrType = std::unique_ptr<DeviceContext>;
  std::set<Place> set;
  for (auto& p : places) {
    set.insert(p);
@@ -57,16 +59,13 @@ DeviceContextPool::DeviceContextPool(
  for (auto& p : set) {
    if (platform::is_cpu_place(p)) {
 #ifdef PADDLE_WITH_MKLDNN
-      device_contexts_.emplace(
+      EmplaceDeviceContext<MKLDNNDeviceContext, CPUPlace>(&device_contexts_, p);
-          p, PtrType(new MKLDNNDeviceContext(boost::get<CPUPlace>(p))));
 #else
-      device_contexts_.emplace(
+      EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
-          p, PtrType(new CPUDeviceContext(boost::get<CPUPlace>(p))));
 #endif
    } else if (platform::is_gpu_place(p)) {
 #ifdef PADDLE_WITH_CUDA
-      device_contexts_.emplace(
+      EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
-          p, PtrType(new CUDADeviceContext(boost::get<CUDAPlace>(p))));
 #else
      PADDLE_THROW(
          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
@@ -74,9 +73,8 @@ DeviceContextPool::DeviceContextPool(
 #endif
    } else if (platform::is_cuda_pinned_place(p)) {
 #ifdef PADDLE_WITH_CUDA
-      device_contexts_.emplace(
+      EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
-          p,
+          &device_contexts_, p);
-          PtrType(new CUDAPinnedDeviceContext(boost::get<CUDAPinnedPlace>(p))));
 #else
      PADDLE_THROW(
          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <future>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
@@ -223,9 +224,6 @@ class DeviceContextPool {
  /*! \brief  Return handle of single device context. */
  platform::DeviceContext* Get(const platform::Place& place);
-  /*! \brief  Return all the device contexts. */
-  const std::vector<const DeviceContext*> GetAllDeviceContexts() const;
  template <typename Place>
  const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
      const Place& place) {
@@ -237,7 +235,8 @@ class DeviceContextPool {
 private:
  static DeviceContextPool* pool;
-  std::map<Place, std::unique_ptr<DeviceContext>> device_contexts_;
+  std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
+      device_contexts_;
  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
 };

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -153,7 +153,6 @@ function cmake_gen() {
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-        -DWITH_INFERENCE=${WITH_INFERENCE:-ON}
        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
@@ -186,7 +185,6 @@ EOF
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-        -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
@@ -653,7 +651,7 @@ function gen_capi_package() {
 function gen_fluid_lib() {
    mkdir -p ${PADDLE_ROOT}/build
    cd ${PADDLE_ROOT}/build
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
        cat <<EOF
    ========================================
    Generating fluid library for train and inference ...
@@ -666,7 +664,7 @@ EOF
 }
 function tar_fluid_lib() {
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
        cat <<EOF
    ========================================
    Taring fluid library for train and inference ...
@@ -681,7 +679,7 @@ EOF
 }
 function test_fluid_lib() {
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
        cat <<EOF
    ========================================
    Testing fluid library for inference ...

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -157,6 +157,8 @@ __all__ = [
    'sequence_reverse',
    'affine_channel',
    'hash',
+    'log_loss',
+    'add_position_encoding',
 ]
@@ -1823,7 +1825,7 @@ def conv3d(input,
    return helper.append_activation(pre_act)
-def sequence_pool(input, pool_type):
+def sequence_pool(input, pool_type, is_test=False):
    """
    This function add the operator for sequence pooling.
    It pools features of all time-steps of each instance, and is applied
@@ -1860,6 +1862,7 @@ def sequence_pool(input, pool_type):
        input(variable): The input variable which is a LoDTensor.
        pool_type (string): The pooling type of sequence_pool.
            It supports average, sum, sqrt and max.
+        is_test(bool, Default False): Used distinguish training from scoring mode.
    Returns:
        The sequence pooling variable which is a Tensor.
@@ -1887,7 +1890,8 @@ def sequence_pool(input, pool_type):
        inputs={"X": input},
        outputs={"Out": pool_out,
                 "MaxIndex": max_index},
-        attrs={"pooltype": pool_type.upper()})
+        attrs={"pooltype": pool_type.upper(),
+               "is_test": is_test})
    # when pool_type is max, variable max_index is initialized,
    # so we stop the gradient explicitly here
@@ -7580,3 +7584,99 @@ def hash(input, hash_size, num_hash=1, name=None):
        attrs={'num_hash': num_hash,
               'mod_by': hash_size})
    return out
+def log_loss(input, label, epsilon=1e-4, name=None):
+    """
+    **Negative Log Loss Layer**
+    This layer accepts input predictions and target label and returns the
+    negative log loss.
+    .. math::
+        Out = -label * \\log{(input + \\epsilon)}
+              - (1 - label) * \\log{(1 - input + \\epsilon)}
+    Args:
+        input (Variable|list):  a 2-D tensor with shape [N x 1], where N is the
+                                batch size. This input is a probability computed
+                                by the previous operator.
+        label (Variable|list):  the ground truth which is a 2-D tensor with
+                                shape [N x 1], where N is the batch size.
+        epsilon (float): epsilon
+        name (string): the name of log_loss
+    Returns:
+        Variable: A 2-D tensor with shape [N x 1], the negative log loss.
+    Examples:
+        .. code-block:: python
+          prob = fluid.layers.sigmoid(net)
+          cost = fluid.layers.log_loss(input=prob, label=label)
+    """
+    helper = LayerHelper('log_loss', **locals())
+    if name is None:
+        loss = helper.create_variable_for_type_inference(dtype=input.dtype)
+    else:
+        loss = helper.create_variable(
+            name=name, dtype=input.dtype, persistable=False)
+    helper.append_op(
+        type='log_loss',
+        inputs={'Predicted': [input],
+                'Labels': [label]},
+        outputs={'Loss': [loss]},
+        attrs={'epsilon': epsilon})
+    return loss
+def add_position_encoding(input, alpha, beta, name=None):
+    """
+    **Add Position Encoding Layer**
+    This layer accepts an input 3D-Tensor of shape [N x M x P], and return an
+    output Tensor of shape [N x M x P] with positional encoding value.
+    Refer to `Attention Is All You Need<http://arxiv.org/pdf/1706.03762.pdf>`_ .
+    .. math::
+        PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})}   \\\\
+        PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})}  \\\\
+        Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
+    Where:
+    * PE(pos, 2i): the increment for the number at even position
+    * PE(pos, 2i + 1): the increment for the number at odd position
+    Args:
+        input (Variable): 3-D input tensor with shape [N x M x P]
+        alpha (float): multiple of Input Tensor
+        beta (float): multiple of Positional Encoding Tensor
+        name (string): the name of position encoding layer
+    Returns:
+        Variable: A 3-D Tensor of shape [N x M x P] with positional encoding.
+    Examples:
+        .. code-block:: python
+          position_tensor = fluid.layers.add_position_encoding(input=tensor)
+    """
+    helper = LayerHelper('add_position_encoding', **locals())
+    dtype = helper.input_dtype()
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=dtype)
+    else:
+        out = helper.create_variable(name=name, dtype=dtype, persistable=False)
+    helper.append_op(
+        type="add_position_encoding",
+        inputs={"X": input},
+        outputs={"Out": out},
+        attrs={"alpha": alpha,
+               "beta": beta})
+    return out
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -194,7 +194,7 @@ class CompositeMetric(MetricBase):
                               or soft-label, should custom the corresponding update rule.
        """
        for m in self._metrics:
-            ans.append(m.update(preds, labels))
+            m.update(preds, labels)
    def eval(self):
        """

--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
-set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -55,6 +55,7 @@ function(py_test_modules TARGET_NAME)
    if (py_test_modules_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
    endif()
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
  endif()
 endfunction()
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)

--- a/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import math
+import paddle.fluid.core as core
+from op_test import OpTest
+class TestAddPositionEncodingTensorOp(OpTest):
+    """
+    This class is to test the AddPositionEncodingOp
+    """
+    def setUp(self):
+        """
+        the prepared section for add position encoding op
+        """
+        self.op_type = "add_position_encoding"
+        self.dtype = np.float32
+        self.init_input_output()
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x), }
+        self.outputs = {'Out': self.out}
+        self.attrs = {'alpha': self.alpha, 'beta': self.beta}
+    def test_check_output(self):
+        """
+        check the correctness of output
+        """
+        self.check_output()
+    def test_check_grad(self):
+        """
+        check the correctness of grad
+        """
+        self.check_grad(['X'], 'Out', max_relative_error=0.005)
+    def init_input_output(self):
+        """
+        init the input and output for test cases
+        """
+        self.alpha = 0.6
+        self.beta = 0.5
+        self.x = np.random.uniform(0.1, 1, [2, 4, 4]).astype(self.dtype)
+        self.out = np.copy(self.x)
+        batch_size = self.x.shape[0]
+        max_length = self.x.shape[1]
+        enc_size = self.x.shape[2]
+        half_shape = int(enc_size / 2)
+        for i in range(batch_size):
+            for j in range(max_length):
+                for k in range(half_shape):
+                    val = j / pow(10000.0, k / (
+                        half_shape - 1)) if half_shape > 1 else j / 10000.0
+                    self.out[i, j, k] = \
+                        self.x[i, j, k] * self.alpha + math.sin(val) * self.beta
+                    self.out[i, j, half_shape + k] = \
+                        self.x[i, j, half_shape + k] * self.alpha + math.cos(val) * self.beta
+class TestAddPositionEncodingLoDTensorOp(OpTest):
+    """
+    This class is to test the AddPositionEncodingLoDTensorOp
+    """
+    def setUp(self):
+        """
+        the prepared section for add position encoding LoDTensor op
+        """
+        self.op_type = "add_position_encoding"
+        self.dtype = np.float32
+        self.init_input_output()
+        self.inputs = {'X': (self.x, self.lod), }
+        self.outputs = {'Out': (self.out, self.lod)}
+        self.attrs = {'alpha': self.alpha, 'beta': self.beta}
+    def test_check_output(self):
+        """
+        check the correctness of output
+        """
+        self.check_output()
+    def test_check_grad(self):
+        """
+        check the correctness of grad
+        """
+        self.check_grad(['X'], 'Out', max_relative_error=0.005)
+    def init_input_output(self):
+        """
+        init the input and output for test cases
+        """
+        self.alpha = 0.6
+        self.beta = 0.5
+        self.x = np.random.uniform(0.1, 1, [10, 4]).astype(self.dtype)
+        self.lod = [[3, 7]]
+        self.out = np.copy(self.x)
+        batch_size = len(self.lod[0])
+        enc_size = self.x.shape[1]
+        start = 0
+        half_shape = int(enc_size / 2)
+        for i in range(batch_size):
+            max_length = self.lod[0][i]
+            for j in range(max_length):
+                for k in range(half_shape):
+                    val = j / pow(10000.0, k / (
+                        half_shape - 1)) if half_shape > 1 else j / 10000.0
+                    pos = start + j
+                    self.out[pos, k] = \
+                        self.x[pos, k] * self.alpha + math.sin(val) * self.beta
+                    self.out[pos, half_shape + k] = \
+                        self.x[pos, half_shape + k] * self.alpha + math.cos(val) * self.beta
+            start += max_length
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
+class TestSeqMaxPool2DInference(TestSeqMaxPool2D):
+    def compute(self, x, offset, out):
+        self.attrs = {'pooltype': "MAX", 'is_test': True}
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 11))
+            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
+    def test_check_grad(self):
+        """Grad computation does not apply to Sequence MAX 
+            Pool executed when is_test is true """
+        return
 class TestSeqLastPool2D(TestSeqAvgPool2D):
    def compute(self, x, offset, out):
        self.attrs = {'pooltype': "LAST"}