Merge remote-tracking branch 'ups/develop' into fea/jit/gen

f3badacd · tensor-tang · a53b1b0b · d186e743 · f3badacd · f3badacd
44 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,6 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
-option(WITH_INFERENCE    "Compile fluid inference library"              ON)
 option(ON_INFER         "Turn on inference optimization."               OFF)
 option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -24,6 +24,7 @@ if(NOT WITH_FLUID_ONLY)
 endif()
 add_subdirectory(testing)
+set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
 if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
  add_subdirectory(fluid)
 endif()
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -64,7 +64,7 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', '
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
 paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
-paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
@@ -177,6 +177,8 @@ paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, k
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
 paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
+paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
+paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)

--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -9,8 +9,6 @@ add_subdirectory(pybind)
 add_subdirectory(recordio)
 endif(NOT WIN32)
-if(WITH_INFERENCE)
+# NOTE: please add subdirectory inference at last.
-  # NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
-  add_subdirectory(inference)
+add_subdirectory(train)
-  add_subdirectory(train)
-endif()
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
      nccl_ctxs_(ctxs) {
  if (nccl_ctxs_) {
    for (auto &p : places_) {
-      this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
+      this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
    }
  }
 }
@@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 void AllReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
  if (NoDummyInputSize() == 1) {
    return;  // No need to all reduce when GPU count = 1;
@@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() {
            *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
        auto &p = places_[i];
        auto *var = scope.FindVar(out_var_handles[i]->name_);
-        auto *dev_ctx = dev_ctxes_[p];
+        auto *dev_ctx = dev_ctxes_.at(p);
        RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
          auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();

--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase {
        nccl_ctxs_(nccl_ctxs) {
    if (nccl_ctxs_) {
      for (auto &p_ctx : nccl_ctxs_->contexts_) {
-        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+        this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
+                               p_ctx.second.ctx_.get());
      }
    }
  }

--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() {
 bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
  bool need_wait =
      in_var && in_var->GeneratedOp() &&
-      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_];
+      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
  return need_wait;
 }

--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle(
    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
  if (ctxs) {
    for (auto &p : places_) {
-      this->dev_ctxes_[p] = ctxs->DevCtx(p);
+      this->SetDeviceContext(p, ctxs->DevCtx(p));
    }
  }
 }
@@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() {
  PADDLE_ENFORCE_GT(places_.size(), 1,
                    "Data balance can only be enabled when the number of "
                    "places to run larger than 1.");
-  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
  PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
  PADDLE_ENFORCE_EQ(
      in_var_handles.size(), out_var_handles.size(),

--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() {
  VarHandle *out_var_handle;
  {
-    auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
                      "The number of output should be one.");
    out_var_handle = out_var_handles.front();
@@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() {
  Tensor *out_tensor = out_value->mutable_value();
  // copy
-  auto dev_ctx = dev_ctxes_[out_var_handle->place_];
+  auto dev_ctx = dev_ctxes_.at(out_var_handle->place_);
  RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
                                             t_out_p] {
    int s = 0, e = 0;

--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
 void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
  for (auto *in : inputs_) {
    if (NeedWait(in)) {
-      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]);
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place));
    }
  }
 }

--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -27,7 +27,7 @@ namespace framework {
 namespace details {
 void ReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
  if (places_.size() == 1) return;
  // the input and output may have dummy var.

--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase {
        nccl_ctxs_(nccl_ctxs) {
    if (nccl_ctxs_) {
      for (auto &p_ctx : nccl_ctxs_->contexts_) {
-        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+        this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
+                               p_ctx.second.ctx_.get());
      }
    }
  }

--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() {
      continue;
    }
    if (in->GeneratedOp()) {
-      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]);
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p));
    }
  }
  auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();

--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
      coeff_(static_cast<float>(1.0 / num_dev)),
      scope_(scope),
      place_(place) {
-  dev_ctxes_[place_] = dev_ctx;
+  this->SetDeviceContext(place_, dev_ctx);
 }
 ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
@@ -46,9 +46,9 @@ void ScaleLossGradOpHandle::RunImpl() {
  } else {
 #ifdef PADDLE_WITH_CUDA
    this->RunAndRecordEvent([&] {
-      auto stream =
+      auto stream = static_cast<platform::CUDADeviceContext *>(
-          static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
+                        this->dev_ctxes_.at(place_))
-              ->stream();
+                        ->stream();
      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
      VLOG(10) << place_ << "RUN Scale loss grad op";

--- a/paddle/fluid/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
@@ -19,81 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-// NOTE The vector<LoDTensor> can't be replaced with the class LoDTensorArray
-// directly, because there are many vector<LoDTensor> used accross the project,
-// and some of them are treated as LoDTensorArray.
-#if !defined(PADDLE_ON_INFERENCE)
 using LoDTensorArray = std::vector<LoDTensor>;
-#else  // !PADDLE_ON_INFERENCE
-#pragma message "LoDTensorArray is replaced with the inference one."
-/*
- * A LoDTensorArray which will not deallocate buffer when resized, fix the data
- * diff in inference, and more performance friendly in the concurrency
- * scenerios.
- */
-class LoDTensorArray {
- public:
-  LoDTensorArray() = default;
-  using iterator = std::vector<LoDTensor>::iterator;
-  using const_iterator = std::vector<LoDTensor>::const_iterator;
-  const_iterator begin() const { return array_.begin(); }
-  const_iterator end() const { return array_.begin() + size_; }
-  iterator begin() { return array_.begin(); }
-  iterator end() { return array_.begin() + size_; }
-  void push_back(const LoDTensor& x) {
-    if (size_ < array_.size()) {
-      array_[size_++] = x;
-    } else {
-      array_.push_back(x);
-      ++size_;
-    }
-  }
-  void resize(size_t size) {
-    if (array_.size() < size) {
-      array_.resize(size);
-    }
-    size_ = size;
-  }
-  void emplace_back() { array_.emplace_back(); }
-  void emplace_back(LoDTensor&& x) { array_.emplace_back(std::move(x)); }
-  LoDTensor& back() { return array_.back(); }
-  size_t space() const { return array_.size(); }
-  void reserve(size_t size) {
-    // Naive warning to tell user this array might be to large. The memory and
-    // buffer used by this TensorArray will not be deleted during the training
-    // and inference phase, so attention not to make it expand too long.
-    if (size > 800UL) {
-      LOG(WARNING) << "TensorArray has more than 800 items";
-    }
-    array_.reserve(size);
-  }
-  bool empty() const { return size_ == 0UL; }
-  void clear() { size_ = 0UL; }
-  LoDTensor& operator[](size_t id) { return array_[id]; }
-  const LoDTensor& operator[](size_t id) const { return array_[id]; }
-  LoDTensor& at(size_t id) { return array_.at(id); }
-  const LoDTensor& at(size_t id) const { return array_.at(id); }
-  size_t size() const { return size_; }
- private:
-  size_t size_{0};
-  std::vector<LoDTensor> array_;
-};
-#endif  // !PADDLE_ON_INFERENCE
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -358,7 +358,7 @@ static bool VarIsTensor(const Variable* var) {
  return var->IsType<LoDTensor>() || var->IsType<SelectedRows>();
 }
-static const Tensor* GetTensorFromVar(Variable* var) {
+const Tensor* GetTensorFromVar(Variable* var) {
  if (var->IsType<LoDTensor>()) {
    return var->GetMutable<LoDTensor>();
  } else if (var->IsType<SelectedRows>()) {

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -63,6 +63,7 @@ inline std::string GradVarName(const std::string& var_name) {
 }
 proto::VarType::Type GetDataTypeOfVar(const Variable* var);
+const Tensor* GetTensorFromVar(Variable* var);
 class OperatorBase;
 class ExecutionContext;

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -303,10 +303,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
 }
 ParallelExecutor::~ParallelExecutor() {
-  const auto dev_ctxs =
+  for (auto &p : member_->places_) {
-      platform::DeviceContextPool::Instance().GetAllDeviceContexts();
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  for (auto &dev_ctx : dev_ctxs) {
-    dev_ctx->Wait();
  }
  if (member_->own_local_scope_) {

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -61,8 +61,6 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_api_impl SRC api_impl_tester.cc
                    ARGS test_word2vec test_image_classification)
-set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
 cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
        ARGS --dirname=${PYTHON_TESTS_DIR}/book)

--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -22,9 +22,9 @@ limitations under the License. */
 #include "paddle/fluid/inference/tests/test_helper.h"
 #ifdef __clang__
-#define ACC_DIFF 4e-3
+#define ACC_DIFF 4e-2
 #else
-#define ACC_DIFF 1e-3
+#define ACC_DIFF 1e-2
 #endif
 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -187,7 +187,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_jobs; ++tid) {
    threads.emplace_back([&, tid]() {
-      auto predictor = main_predictor->Clone();
+      auto predictor = CreatePaddlePredictor(config);
      auto& local_inputs = paddle_tensor_feeds[tid];
      std::vector<PaddleTensor> local_outputs;
      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
@@ -245,7 +245,7 @@ void MainThreadsImageClassification(bool use_gpu) {
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_jobs; ++tid) {
    threads.emplace_back([&, tid]() {
-      auto predictor = main_predictor->Clone();
+      auto predictor = CreatePaddlePredictor(config);
      auto& local_inputs = paddle_tensor_feeds[tid];
      std::vector<PaddleTensor> local_outputs;
      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));

--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -70,8 +70,12 @@ void Main(bool use_gpu) {
    // The outputs' buffers are in CPU memory.
    for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
         i++) {
-      CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
+      // Here will result random fail, for that the model is trained by CI, the
-                 0.001);
+      // train phase is not stable, so the result will be random.
+      // TODO(Superjomn) will restore after the model is upload.
+      // CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i],
+      // result[i],
+      // 0.001);
    }
  }
 }

--- a/paddle/fluid/operators/add_position_encoding_op.cc
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/add_position_encoding_op.h"
+namespace paddle {
+namespace operators {
+class AddPositionEncodingOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "X(Input) of add_position_encoding_op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Out(Output) of add_position_encoding_op should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Out@GRAD must not be null.");
+    auto out_dims = ctx->GetInputDim("Out");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
+    }
+  }
+};
+class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of AddPositionEncoding operator");
+    AddOutput("Out", "Output of AddPositionEncoding operator");
+    AddAttr<float>("alpha", "The scale of Original Embedding.")
+        .SetDefault(1.0f)
+        .AddCustomChecker([](const float& alpha) {
+          PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0.");
+        });
+    AddAttr<float>("beta", "The scale of Position Embedding.")
+        .SetDefault(1.0f)
+        .AddCustomChecker([](const float& beta) {
+          PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0.");
+        });
+    AddComment(R"DOC(
+    Add Position Encoding Operator.
+    The add position encoding calculates the output based on the input, alpha, beta.
+    The size of each dimension of the parameters checked in the infer-shape.
+  )DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plt = paddle::platform;
+REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
+                  ops::AddPositionEncodingOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    add_position_encoding,
+    ops::AddPositionEncodingKernel<plt::CPUDeviceContext, float>,
+    ops::AddPositionEncodingKernel<plt::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    add_position_encoding_grad,
+    ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, float>,
+    ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/add_position_encoding_op.h
+++ b/paddle/fluid/operators/add_position_encoding_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class AddPositionEncodingKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::LoDTensor>("X");
+    auto& x_lod = X->lod();
+    auto* src_ptr = X->data<T>();
+    auto* Out = context.Output<framework::LoDTensor>("Out");
+    auto* dst_ptr = Out->mutable_data<T>(context.GetPlace());
+    float alpha = context.Attr<float>("alpha");
+    float beta = context.Attr<float>("beta");
+    auto x_dim = X->dims();
+    int batch_size = 0;
+    int max_seq_len = 0;
+    int enc_size = 0;
+    if (x_lod.empty()) {
+      PADDLE_ENFORCE(
+          x_dim.size() == 3UL,
+          "The input X of Add Position Encoding should be 3-D Tensor!");
+      batch_size = x_dim[0];
+      max_seq_len = x_dim[1];
+      enc_size = x_dim[2];
+    } else {
+      PADDLE_ENFORCE(
+          x_dim.size() == 2UL,
+          "The input X of Add Position Encoding should be 2-D LoDTensor!");
+      PADDLE_ENFORCE(
+          x_lod.size() == 1UL,
+          "The Add Position Encoding Op only supports lod_level == 1!");
+      batch_size = x_lod[0].size() - 1;
+      max_seq_len = -1;
+      enc_size = x_dim[1];
+    }
+    PADDLE_ENFORCE(enc_size % 2 == 0, "Only support even encode size!");
+    const int half_size = enc_size / 2;
+    for (int i = 0; i < batch_size; ++i) {
+      const int max_length =
+          x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
+      for (int j = 0; j < max_length; ++j) {
+        for (int k = 0; k < half_size; ++k) {
+          const double val = (half_size > 1)
+                                 ? j / pow(10000.0, double(k) / (half_size - 1))
+                                 : j / 10000.0;
+          dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
+          dst_ptr[half_size + k] =
+              src_ptr[half_size + k] * alpha + cos(val) * beta;
+        }
+        src_ptr += enc_size;
+        dst_ptr += enc_size;
+      }
+    }
+  }
+};
+template <typename DeviceContext, typename T>
+class AddPositionEncodingGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dOut =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto dout = framework::EigenVector<T>::Flatten(*dOut);
+    auto* dX =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    dX->mutable_data<T>(context.GetPlace());
+    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    float alpha = context.Attr<float>("alpha");
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+    dx.device(*place) = dout * static_cast<T>(alpha);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
 REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
-                       ops::GatherOpKernel<int>, ops::GatherOpKernel<double>);
+                       ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
+                       ops::GatherOpKernel<int64_t>);
 REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
+                       ops::GatherGradientOpKernel<double>,
                       ops::GatherGradientOpKernel<int>,
-                       ops::GatherGradientOpKernel<double>);
+                       ops::GatherGradientOpKernel<int64_t>);
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
-REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
+                        ops::GatherOpCUDAKernel<double>,
+                        ops::GatherOpCUDAKernel<int64_t>,
+                        ops::GatherOpCUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
+                        ops::GatherGradOpCUDAKernel<double>,
+                        ops::GatherGradOpCUDAKernel<int64_t>,
+                        ops::GatherGradOpCUDAKernel<int>);
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -31,7 +31,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename T>
+template <typename T, bool is_test>
 class MaxSeqPoolFunctor {
 public:
  void operator()(const platform::CPUDeviceContext& context,
@@ -70,7 +70,41 @@ class MaxSeqPoolFunctor {
    }
  }
 };
+// Instantisation of Max Sequence Pooling for test phase eg. no need to fill
+// index buffer
+template <typename T>
+class MaxSeqPoolFunctor<T, true> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), 1);
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim],
+                  dim * sizeof(T));
+      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
+        for (int64_t k = 0; k < dim; ++k) {
+          if (in_data[j * dim + k] > out_data[i * dim + k]) {
+            out_data[i * dim + k] = in_data[j * dim + k];
+          }
+        }
+      }
+    }
+  }
+};
 template <typename T>
 class MaxSeqPoolGradFunctor {
 public:
@@ -188,11 +222,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
  /* max pool has index output */
  void operator()(const platform::CPUDeviceContext& context,
                  const std::string pooltype, const framework::LoDTensor& input,
-                  framework::Tensor* output,
+                  framework::Tensor* output, bool is_test,
                  framework::Tensor* index = nullptr) {
    if (pooltype == "MAX") {
-      math::MaxSeqPoolFunctor<T> max_pool;
+      if (is_test) {
-      max_pool(context, input, output, index);
+        math::MaxSeqPoolFunctor<T, true> max_pool;
+        max_pool(context, input, output, index);
+      } else {
+        math::MaxSeqPoolFunctor<T, false> max_pool;
+        max_pool(context, input, output, index);
+      }
      return;
    }
    if (pooltype == "LAST") {
@@ -200,6 +239,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
      last_pool(context, input, output);
      return;
    }
    if (pooltype == "FIRST") {
      math::FirstSeqPoolFunctor<T> first_pool;
      first_pool(context, input, output);

--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -133,7 +133,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const std::string pooltype, const framework::LoDTensor& input,
-                  framework::Tensor* output,
+                  framework::Tensor* output, bool is_test,
                  framework::Tensor* index = nullptr) {
    auto& lod = input.lod()[0];
    const size_t item_dim = output->numel() / output->dims()[0];

--- a/paddle/fluid/operators/math/sequence_pooling.h
+++ b/paddle/fluid/operators/math/sequence_pooling.h
@@ -28,7 +28,7 @@ class SequencePoolFunctor {
  /* max pool has index output */
  void operator()(const DeviceContext& context, const std::string pooltype,
                  const framework::LoDTensor& input, framework::Tensor* output,
-                  framework::Tensor* index = nullptr);
+                  bool is_test = false, framework::Tensor* index = nullptr);
 };
 template <typename DeviceContext, typename T>

--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
@@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor<int>) This tensor is used for the sequence max-pooling "
              "to record the max indexes.")
        .AsIntermediate();
+    AddAttr<bool>("is_test", "").SetDefault(false);
    AddAttr<std::string>(
        "pooltype",
        "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")

--- a/paddle/fluid/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
@@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
    auto* in = context.Input<LoDTensor>("X");
    auto* out = context.Output<Tensor>("Out");
    std::string pooltype = context.Attr<std::string>("pooltype");
-    Tensor* index = nullptr;
-    if (pooltype == "MAX") {
-      index = context.Output<Tensor>("MaxIndex");
-    }
    auto dims = in->dims();
    auto lod = in->lod();
@@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel<T> {
    dims[0] = lod[0].size() - 1;
    out->Resize({dims});
    out->mutable_data<T>(context.GetPlace());
-    if (pooltype == "MAX") {
+    Tensor* index = nullptr;
+    const bool is_test = context.Attr<bool>("is_test");
+    // Do not create index buffer for inference (is_test) mode
+    // TODO(jczaja): Skip index buffer creation for other devices eg. GPU
+    if (pooltype == "MAX" &&
+        (is_test == false ||
+         platform::is_cpu_place(context.GetPlace()) == false)) {
+      index = context.Output<Tensor>("MaxIndex");
      index->Resize({dims});
      index->mutable_data<int>(context.GetPlace());
    }
    math::SequencePoolFunctor<DeviceContext, T> pool;
    pool(context.template device_context<DeviceContext>(), pooltype, *in, out,
-         index);
+         is_test, index);
  }
 };

--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -82,14 +82,16 @@ class SumOp : public framework::OperatorWithKernel {
    if (x_vars[0]->IsType<framework::LoDTensor>()) {
      int dtype = -1;
      for (auto& x_var : x_vars) {
-        auto& lod_tensor = x_var->Get<framework::LoDTensor>();
+        // FIXME(zcd): The input x_var may be SelectedRows or LoDTensor.
-        if (lod_tensor.numel() == 0) {
+        auto tensor = framework::GetTensorFromVar(
+            const_cast<framework::Variable*>(x_var));
+        if (tensor->numel() == 0) {
          continue;
        }
        if (dtype == -1) {
-          dtype = framework::ToDataType(lod_tensor.type());
+          dtype = framework::ToDataType(tensor->type());
        } else {
-          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type()));
+          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(tensor->type()));
        }
      }
      PADDLE_ENFORCE_NE(dtype, -1,

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -32,23 +32,25 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
        "'Place' is not supported, Please re-compile with WITH_GPU "
        "option");
  }
-  return it->second.get();
+  return it->second.get().get();
 }
-const std::vector<const DeviceContext*>
+template <typename DevCtx, typename PlaceType>
-DeviceContextPool::GetAllDeviceContexts() const {
+inline void EmplaceDeviceContext(
-  std::vector<const DeviceContext*> all_device_ctx;
+    std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
-  all_device_ctx.reserve(device_contexts_.size());
+        map_ptr,
-  for (auto& dev_ctx : device_contexts_) {
+    platform::Place p) {
-    all_device_ctx.emplace_back(dev_ctx.second.get());
+  using PtrType = std::unique_ptr<DeviceContext>;
-  }
+  map_ptr->emplace(p, std::async(std::launch::deferred, [=] {
-  return all_device_ctx;
+                     // lazy evaluation. i.e., only create device context at
+                     // first `Get`
+                     return PtrType(new DevCtx(boost::get<PlaceType>(p)));
+                   }));
 }
 DeviceContextPool::DeviceContextPool(
    const std::vector<platform::Place>& places) {
  PADDLE_ENFORCE_GT(places.size(), 0);
-  using PtrType = std::unique_ptr<DeviceContext>;
  std::set<Place> set;
  for (auto& p : places) {
    set.insert(p);
@@ -57,16 +59,13 @@ DeviceContextPool::DeviceContextPool(
  for (auto& p : set) {
    if (platform::is_cpu_place(p)) {
 #ifdef PADDLE_WITH_MKLDNN
-      device_contexts_.emplace(
+      EmplaceDeviceContext<MKLDNNDeviceContext, CPUPlace>(&device_contexts_, p);
-          p, PtrType(new MKLDNNDeviceContext(boost::get<CPUPlace>(p))));
 #else
-      device_contexts_.emplace(
+      EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
-          p, PtrType(new CPUDeviceContext(boost::get<CPUPlace>(p))));
 #endif
    } else if (platform::is_gpu_place(p)) {
 #ifdef PADDLE_WITH_CUDA
-      device_contexts_.emplace(
+      EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
-          p, PtrType(new CUDADeviceContext(boost::get<CUDAPlace>(p))));
 #else
      PADDLE_THROW(
          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
@@ -74,9 +73,8 @@ DeviceContextPool::DeviceContextPool(
 #endif
    } else if (platform::is_cuda_pinned_place(p)) {
 #ifdef PADDLE_WITH_CUDA
-      device_contexts_.emplace(
+      EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
-          p,
+          &device_contexts_, p);
-          PtrType(new CUDAPinnedDeviceContext(boost::get<CUDAPinnedPlace>(p))));
 #else
      PADDLE_THROW(
          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <future>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
@@ -223,9 +224,6 @@ class DeviceContextPool {
  /*! \brief  Return handle of single device context. */
  platform::DeviceContext* Get(const platform::Place& place);
-  /*! \brief  Return all the device contexts. */
-  const std::vector<const DeviceContext*> GetAllDeviceContexts() const;
  template <typename Place>
  const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
      const Place& place) {
@@ -237,7 +235,8 @@ class DeviceContextPool {
 private:
  static DeviceContextPool* pool;
-  std::map<Place, std::unique_ptr<DeviceContext>> device_contexts_;
+  std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
+      device_contexts_;
  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
 };

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -153,7 +153,6 @@ function cmake_gen() {
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-        -DWITH_INFERENCE=${WITH_INFERENCE:-ON}
        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
@@ -186,7 +185,6 @@ EOF
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-        -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
@@ -653,7 +651,7 @@ function gen_capi_package() {
 function gen_fluid_lib() {
    mkdir -p ${PADDLE_ROOT}/build
    cd ${PADDLE_ROOT}/build
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
        cat <<EOF
    ========================================
    Generating fluid library for train and inference ...
@@ -666,7 +664,7 @@ EOF
 }
 function tar_fluid_lib() {
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
        cat <<EOF
    ========================================
    Taring fluid library for train and inference ...
@@ -681,7 +679,7 @@ EOF
 }
 function test_fluid_lib() {
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
        cat <<EOF
    ========================================
    Testing fluid library for inference ...

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -157,6 +157,8 @@ __all__ = [
    'sequence_reverse',
    'affine_channel',
    'hash',
+    'log_loss',
+    'add_position_encoding',
 ]
@@ -747,7 +749,7 @@ def dynamic_gru(input,
        attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
    batch_size = input.shape[0]
    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
-    if h_0 != None:
+    if h_0:
        assert h_0.shape == (
            batch_size, size
        ), 'The shape of h0 should be(batch_size, %d)' % size
@@ -1823,7 +1825,7 @@ def conv3d(input,
    return helper.append_activation(pre_act)
-def sequence_pool(input, pool_type):
+def sequence_pool(input, pool_type, is_test=False):
    """
    This function add the operator for sequence pooling.
    It pools features of all time-steps of each instance, and is applied
@@ -1860,6 +1862,7 @@ def sequence_pool(input, pool_type):
        input(variable): The input variable which is a LoDTensor.
        pool_type (string): The pooling type of sequence_pool.
            It supports average, sum, sqrt and max.
+        is_test(bool, Default False): Used distinguish training from scoring mode.
    Returns:
        The sequence pooling variable which is a Tensor.
@@ -1887,7 +1890,8 @@ def sequence_pool(input, pool_type):
        inputs={"X": input},
        outputs={"Out": pool_out,
                 "MaxIndex": max_index},
-        attrs={"pooltype": pool_type.upper()})
+        attrs={"pooltype": pool_type.upper(),
+               "is_test": is_test})
    # when pool_type is max, variable max_index is initialized,
    # so we stop the gradient explicitly here
@@ -7580,3 +7584,99 @@ def hash(input, hash_size, num_hash=1, name=None):
        attrs={'num_hash': num_hash,
               'mod_by': hash_size})
    return out
+def log_loss(input, label, epsilon=1e-4, name=None):
+    """
+    **Negative Log Loss Layer**
+    This layer accepts input predictions and target label and returns the
+    negative log loss.
+    .. math::
+        Out = -label * \\log{(input + \\epsilon)}
+              - (1 - label) * \\log{(1 - input + \\epsilon)}
+    Args:
+        input (Variable|list):  a 2-D tensor with shape [N x 1], where N is the
+                                batch size. This input is a probability computed
+                                by the previous operator.
+        label (Variable|list):  the ground truth which is a 2-D tensor with
+                                shape [N x 1], where N is the batch size.
+        epsilon (float): epsilon
+        name (string): the name of log_loss
+    Returns:
+        Variable: A 2-D tensor with shape [N x 1], the negative log loss.
+    Examples:
+        .. code-block:: python
+          prob = fluid.layers.sigmoid(net)
+          cost = fluid.layers.log_loss(input=prob, label=label)
+    """
+    helper = LayerHelper('log_loss', **locals())
+    if name is None:
+        loss = helper.create_variable_for_type_inference(dtype=input.dtype)
+    else:
+        loss = helper.create_variable(
+            name=name, dtype=input.dtype, persistable=False)
+    helper.append_op(
+        type='log_loss',
+        inputs={'Predicted': [input],
+                'Labels': [label]},
+        outputs={'Loss': [loss]},
+        attrs={'epsilon': epsilon})
+    return loss
+def add_position_encoding(input, alpha, beta, name=None):
+    """
+    **Add Position Encoding Layer**
+    This layer accepts an input 3D-Tensor of shape [N x M x P], and return an
+    output Tensor of shape [N x M x P] with positional encoding value.
+    Refer to `Attention Is All You Need<http://arxiv.org/pdf/1706.03762.pdf>`_ .
+    .. math::
+        PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})}   \\\\
+        PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})}  \\\\
+        Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
+    Where:
+    * PE(pos, 2i): the increment for the number at even position
+    * PE(pos, 2i + 1): the increment for the number at odd position
+    Args:
+        input (Variable): 3-D input tensor with shape [N x M x P]
+        alpha (float): multiple of Input Tensor
+        beta (float): multiple of Positional Encoding Tensor
+        name (string): the name of position encoding layer
+    Returns:
+        Variable: A 3-D Tensor of shape [N x M x P] with positional encoding.
+    Examples:
+        .. code-block:: python
+          position_tensor = fluid.layers.add_position_encoding(input=tensor)
+    """
+    helper = LayerHelper('add_position_encoding', **locals())
+    dtype = helper.input_dtype()
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=dtype)
+    else:
+        out = helper.create_variable(name=name, dtype=dtype, persistable=False)
+    helper.append_op(
+        type="add_position_encoding",
+        inputs={"X": input},
+        outputs={"Out": out},
+        attrs={"alpha": alpha,
+               "beta": beta})
+    return out
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -194,7 +194,7 @@ class CompositeMetric(MetricBase):
                               or soft-label, should custom the corresponding update rule.
        """
        for m in self._metrics:
-            ans.append(m.update(preds, labels))
+            m.update(preds, labels)
    def eval(self):
        """

--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
-set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-# default test
+if(NOT APPLE)
-foreach(src ${TEST_OPS})
+    # default test
-    py_test(${src} SRCS ${src}.py)
+    foreach(src ${TEST_OPS})
-endforeach()
+        py_test(${src} SRCS ${src}.py)
+    endforeach()
+else()
+    foreach(src ${TEST_OPS})
+        if(${src} STREQUAL "test_image_classification_vgg")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        elseif(${src} STREQUAL "test_image_classification_resnet")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        elseif()
+            py_test(${src} SRCS ${src}.py)
+        endif()
+    endforeach()
+endif()
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -17,6 +17,10 @@ if(NOT WITH_DISTRIBUTE)
    list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
    LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_ctr)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
 endif(NOT WITH_DISTRIBUTE)
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
@@ -55,6 +59,7 @@ function(py_test_modules TARGET_NAME)
    if (py_test_modules_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
    endif()
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
  endif()
 endfunction()
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
@@ -88,4 +93,6 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
-py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
+if(NOT APPLE)
+    py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
+endif()
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -90,8 +90,10 @@ class TestDistMnist2x2(TestDistRunnerBase):
        inference_program = fluid.default_main_program().clone()
        # Optimization
-        opt = fluid.optimizer.AdamOptimizer(
+        # TODO(typhoonzero): fix distributed adam optimizer
-            learning_rate=0.001, beta1=0.9, beta2=0.999)
+        # opt = fluid.optimizer.AdamOptimizer(
+        #     learning_rate=0.001, beta1=0.9, beta2=0.999)
+        opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
        # Reader
        train_reader = paddle.batch(

--- a/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import math
+import paddle.fluid.core as core
+from op_test import OpTest
+class TestAddPositionEncodingTensorOp(OpTest):
+    """
+    This class is to test the AddPositionEncodingOp
+    """
+    def setUp(self):
+        """
+        the prepared section for add position encoding op
+        """
+        self.op_type = "add_position_encoding"
+        self.dtype = np.float32
+        self.init_input_output()
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x), }
+        self.outputs = {'Out': self.out}
+        self.attrs = {'alpha': self.alpha, 'beta': self.beta}
+    def test_check_output(self):
+        """
+        check the correctness of output
+        """
+        self.check_output()
+    def test_check_grad(self):
+        """
+        check the correctness of grad
+        """
+        self.check_grad(['X'], 'Out', max_relative_error=0.005)
+    def init_input_output(self):
+        """
+        init the input and output for test cases
+        """
+        self.alpha = 0.6
+        self.beta = 0.5
+        self.x = np.random.uniform(0.1, 1, [2, 4, 4]).astype(self.dtype)
+        self.out = np.copy(self.x)
+        batch_size = self.x.shape[0]
+        max_length = self.x.shape[1]
+        enc_size = self.x.shape[2]
+        half_shape = int(enc_size / 2)
+        for i in range(batch_size):
+            for j in range(max_length):
+                for k in range(half_shape):
+                    val = j / pow(10000.0, k / (
+                        half_shape - 1)) if half_shape > 1 else j / 10000.0
+                    self.out[i, j, k] = \
+                        self.x[i, j, k] * self.alpha + math.sin(val) * self.beta
+                    self.out[i, j, half_shape + k] = \
+                        self.x[i, j, half_shape + k] * self.alpha + math.cos(val) * self.beta
+class TestAddPositionEncodingLoDTensorOp(OpTest):
+    """
+    This class is to test the AddPositionEncodingLoDTensorOp
+    """
+    def setUp(self):
+        """
+        the prepared section for add position encoding LoDTensor op
+        """
+        self.op_type = "add_position_encoding"
+        self.dtype = np.float32
+        self.init_input_output()
+        self.inputs = {'X': (self.x, self.lod), }
+        self.outputs = {'Out': (self.out, self.lod)}
+        self.attrs = {'alpha': self.alpha, 'beta': self.beta}
+    def test_check_output(self):
+        """
+        check the correctness of output
+        """
+        self.check_output()
+    def test_check_grad(self):
+        """
+        check the correctness of grad
+        """
+        self.check_grad(['X'], 'Out', max_relative_error=0.005)
+    def init_input_output(self):
+        """
+        init the input and output for test cases
+        """
+        self.alpha = 0.6
+        self.beta = 0.5
+        self.x = np.random.uniform(0.1, 1, [10, 4]).astype(self.dtype)
+        self.lod = [[3, 7]]
+        self.out = np.copy(self.x)
+        batch_size = len(self.lod[0])
+        enc_size = self.x.shape[1]
+        start = 0
+        half_shape = int(enc_size / 2)
+        for i in range(batch_size):
+            max_length = self.lod[0][i]
+            for j in range(max_length):
+                for k in range(half_shape):
+                    val = j / pow(10000.0, k / (
+                        half_shape - 1)) if half_shape > 1 else j / 10000.0
+                    pos = start + j
+                    self.out[pos, k] = \
+                        self.x[pos, k] * self.alpha + math.sin(val) * self.beta
+                    self.out[pos, half_shape + k] = \
+                        self.x[pos, half_shape + k] * self.alpha + math.cos(val) * self.beta
+            start += max_length
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -22,6 +22,8 @@ import signal
 import subprocess
 import six
 import argparse
+import pickle
+import numpy as np
 import paddle.fluid as fluid
@@ -128,10 +130,15 @@ class TestDistRunnerBase(object):
            else:
                return origin_batch
+        out_losses = []
        for _ in six.moves.xrange(RUN_STEP):
            loss, = exe.run(fetch_list=[avg_cost.name],
                            feed=feeder.feed(get_data()))
-            print(loss)
+            out_losses.append(loss[0])
+        if six.PY2:
+            print(pickle.dumps(out_losses))
+        else:
+            sys.stdout.buffer.write(pickle.dumps(out_losses))
 def runtime_main(test_class):
@@ -149,7 +156,7 @@ def runtime_main(test_class):
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--use_reduce', action='store_true')
    parser.add_argument(
-        '--use_reader_alloc', action='store_true', required=False, default=True)
+        '--use_reader_alloc', action='store_true', required=False)
    parser.add_argument('--batch_size', required=False, type=int, default=2)
    parser.add_argument(
        '--batch_merge_repeat', required=False, type=int, default=1)
@@ -188,7 +195,7 @@ class TestDistBase(unittest.TestCase):
        self._pservers = 2
        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
            self._find_free_port(), self._find_free_port())
-        self._python_interp = "python"
+        self._python_interp = sys.executable
        self._sync_mode = True
        self._enforce_place = None
        self._mem_opt = False
@@ -237,21 +244,6 @@ class TestDistBase(unittest.TestCase):
        return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
-    def _wait_ps_ready(self, pid):
-        retry_times = 50
-        while True:
-            assert retry_times >= 0, "wait ps ready failed"
-            time.sleep(3)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error as e:
-                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
-                                 (e, retry_times))
-                retry_times -= 1
    def _run_local(self,
                   model,
                   envs,
@@ -288,23 +280,20 @@ class TestDistBase(unittest.TestCase):
                env=envs)
        local_out, local_err = local_proc.communicate()
-        local_ret = cpt.to_text(local_out)
        if check_error_log:
            err_log.close()
-        sys.stderr.write('local_stdout: %s\n' % local_ret)
+        sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
        sys.stderr.write('local_stderr: %s\n' % local_err)
-        local_losses = local_ret.split("\n")
+        return pickle.loads(local_out)
-        return local_losses
    def _run_cluster(self, model, envs, check_error_log):
        # Run dist train to compare with local results
        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
                                                          check_error_log, envs)
-        self._wait_ps_ready(ps0.pid)
-        self._wait_ps_ready(ps1.pid)
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
@@ -339,8 +328,8 @@ class TestDistBase(unittest.TestCase):
        env0.update(envs)
        env1.update(envs)
-        print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
+        print("tr0_cmd:{}".format(tr0_cmd))
-        print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
+        print("tr1_cmd:{}".format(tr1_cmd))
        tr0_pipe = open("/tmp/tr0_err.log", "wb")
        tr1_pipe = open("/tmp/tr1_err.log", "wb")
@@ -356,9 +345,7 @@ class TestDistBase(unittest.TestCase):
            env=env1)
        tr0_out, tr0_err = tr0_proc.communicate()
-        tr0_loss_text = cpt.to_text(tr0_out)
        tr1_out, tr1_err = tr1_proc.communicate()
-        tr1_loss_text = cpt.to_text(tr1_out)
        # close trainer file
        tr0_pipe.close()
@@ -373,15 +360,13 @@ class TestDistBase(unittest.TestCase):
        ps1.terminate()
        # print log
-        sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
+        sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
-        sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err)
+        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
-        sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text)
+        sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
-        tr0_losses = tr0_loss_text.split("\n")
+        # return tr0_losses, tr1_losses
-        tr1_losses = tr1_loss_text.split("\n")
+        return pickle.loads(tr0_out), pickle.loads(tr1_out)
-        return tr0_losses, tr1_losses
    def check_with_place(self,
                         model_file,
@@ -411,9 +396,9 @@ class TestDistBase(unittest.TestCase):
                                                   check_error_log)
        for step_id in range(RUN_STEP):
-            local_loss = eval(local_losses[step_id])[0]
+            local_loss = local_losses[step_id]
-            tr0_loss = eval(tr0_losses[step_id])[0]
+            tr0_loss = tr0_losses[step_id]
-            tr1_loss = eval(tr1_losses[step_id])[0]
+            tr1_loss = tr1_losses[step_id]
-            dist_loss = (tr0_loss + tr1_loss) / 2
+            dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
-            print(str(local_loss) + ":" + str(dist_loss))
+            print("=======", local_loss, ":", dist_loss[0], "=======")
-            self.assertAlmostEqual(local_loss, dist_loss, delta=delta)
+            self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta)
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -23,16 +23,17 @@ class TestDistSeResneXt2x2(TestDistBase):
        self._use_reader_alloc = False
    def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=100)
+        self.check_with_place("dist_se_resnext.py", delta=1e-7)
 class TestDistseResnXt2x2WithMemopt(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
        self._mem_opt = True
+        self._use_reader_alloc = False
    def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=100)
+        self.check_with_place("dist_se_resnext.py", delta=1e-7)
 class TestDistSeResneXt2x2Async(TestDistBase):

--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
+class TestSeqMaxPool2DInference(TestSeqMaxPool2D):
+    def compute(self, x, offset, out):
+        self.attrs = {'pooltype': "MAX", 'is_test': True}
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 11))
+            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
+    def test_check_grad(self):
+        """Grad computation does not apply to Sequence MAX 
+            Pool executed when is_test is true """
+        return
 class TestSeqLastPool2D(TestSeqAvgPool2D):
    def compute(self, x, offset, out):
        self.attrs = {'pooltype': "LAST"}