Commit ed6241cd, authored by Yancey1989

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into auto_grwon_sparse_table

......@@ -39,6 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_TENSORRT "Compile PaddlePaddle with TensorRT support." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
......@@ -181,6 +182,11 @@ if(WITH_GPU)
include(cuda)
endif(WITH_GPU)
# TensorRT depends on GPU.
if (NOT WITH_GPU)
set(WITH_TENSORRT OFF)
endif()
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
......
......@@ -45,6 +45,13 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
# install glide
RUN curl -s -q https://glide.sh/get | sh
# Install TensorRT
# The unnecessary files have been removed to keep the library small.
RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr
# git credential to skip password typing
RUN git config --global credential.helper store
......@@ -57,7 +64,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# Pin sphinx to version 1.5.6 and remove the -U option from [pip install -U
# sphinx-rtd-theme], since -U would upgrade sphinx to the newest version
# (1.7.1 at the moment), which breaks the documentation build.
RUN pip install --upgrade pip && \
RUN pip install --upgrade pip==9.0.3 && \
pip install -U wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \
pip install sphinx-rtd-theme==0.1.9 recommonmark
......
......@@ -33,7 +33,7 @@ ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
GIT_REPOSITORY "https://github.com/grpc/grpc.git"
GIT_TAG "v1.11.x"
GIT_TAG "v1.10.x"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
......
......@@ -3,7 +3,9 @@ add_custom_target(paddle_apis ALL
add_custom_target(paddle_docs ALL
DEPENDS paddle_v2_docs paddle_v2_docs_cn
paddle_fluid_docs paddle_fluid_docs_cn)
paddle_fluid_docs paddle_fluid_docs_cn
paddle_mobile_docs paddle_mobile_docs_cn)
add_subdirectory(v2)
add_subdirectory(fluid)
add_subdirectory(mobile)
......@@ -473,6 +473,12 @@ multiplex
.. autofunction:: paddle.fluid.layers.multiplex
:noindex:
label_smooth
------------
.. autofunction:: paddle.fluid.layers.label_smooth
:noindex:
ops
===
......
......@@ -4,6 +4,7 @@
.. toctree::
:maxdepth: 1
api_doc_std_cn.md
new_op_cn.md
new_op_kernel.md
use_eigen_cn.md
......
......@@ -4,6 +4,7 @@ Development
.. toctree::
:maxdepth: 1
api_doc_std_en.md
new_op_en.md
new_op_kernel.md
use_eigen_en.md
......
if(NOT DEFINED SPHINX_THEME)
set(SPHINX_THEME default)
endif()
if(NOT DEFINED SPHINX_THEME_DIR)
set(SPHINX_THEME_DIR)
endif()
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output directory
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY)
sphinx_add_target(paddle_mobile_docs
html
${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_mobile_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
# HTML output directory
set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
"${BINARY_BUILD_DIR_CN}/conf.py"
@ONLY)
sphinx_add_target(paddle_mobile_docs_cn
html
${BINARY_BUILD_DIR_CN}
${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
add_dependencies(paddle_mobile_docs_cn gen_proto_py paddle_python)
移动端
=====
.. toctree::
:maxdepth: 1
cross_compiling_for_android_cn.md
cross_compiling_for_ios_cn.md
cross_compiling_for_raspberry_cn.md
\ No newline at end of file
Mobile
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_en.md
cross_compiling_for_ios_en.md
cross_compiling_for_raspberry_en.md
cc_library(var_handle SRCS var_handle.cc DEPS place)
cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
......@@ -20,3 +20,11 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context gather_op_handle)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
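// Returns the underlying Tensor of a variable that holds either a LoDTensor
// or a SelectedRows.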
Tensor *GetTensorFromVar(Variable *in_var) {
if (in_var->IsType<LoDTensor>()) {
return in_var->GetMutable<LoDTensor>();
} else if (in_var->IsType<SelectedRows>()) {
return in_var->GetMutable<SelectedRows>()->mutable_value();
} else {
PADDLE_THROW("Var should be LoDTensor or SelectedRows");
}
return nullptr;
}
BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
void BroadcastOpHandle::RunImpl() {
// The inputs may contain dummy vars; keep only the real VarHandles.
std::vector<VarHandle *> in_var_handle;
for (auto *in : inputs_) {
auto *out_handle = dynamic_cast<VarHandle *>(in);
if (out_handle) {
in_var_handle.push_back(out_handle);
}
}
PADDLE_ENFORCE_EQ(in_var_handle.size(), 1,
"The number of input should be one.");
// The outputs may contain dummy vars; keep only the real VarHandles.
std::vector<VarHandle *> out_var_handles;
for (auto *out : outputs_) {
auto *out_handle = dynamic_cast<VarHandle *>(out);
if (out_handle) {
out_var_handles.push_back(out_handle);
}
}
PADDLE_ENFORCE_EQ(
out_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
// Wait until the input is generated; this Wait is an asynchronous operation.
auto &in_place = in_var_handle[0]->place_;
if (in_var_handle[0]->generated_op_) {
for (auto *out : out_var_handles) {
auto &out_p = out->place_;
in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]);
}
}
//
auto in_scope_idx = in_var_handle[0]->scope_idx_;
auto in_var =
local_scopes_.at(in_scope_idx)->FindVar(in_var_handle[0]->name_);
Tensor *in_tensor = GetTensorFromVar(in_var);
for (auto *out : out_var_handles) {
auto &out_p = out->place_;
auto out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_);
PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(),
"Places must be all on CPU or all on CUDA.");
if (in_var->IsType<framework::SelectedRows>()) {
auto &in_sr = in_var->Get<framework::SelectedRows>();
auto out_sr = out_var->GetMutable<framework::SelectedRows>();
if (&in_sr == out_sr) continue;
out_sr->set_height(in_sr.height());
out_sr->set_rows(in_sr.rows());
out_sr->mutable_value()->Resize(in_sr.value().dims());
out_sr->mutable_value()->mutable_data(out_p, in_sr.value().type());
} else if (in_var->IsType<framework::LoDTensor>()) {
auto in_lod = in_var->Get<framework::LoDTensor>();
auto out_lod = out_var->GetMutable<framework::LoDTensor>();
if (&in_lod == out_lod) continue;
out_lod->set_lod(in_lod.lod());
out_lod->Resize(in_lod.dims());
out_lod->mutable_data(out_p, in_lod.type());
} else {
PADDLE_THROW("Var should be LoDTensor or SelectedRows.");
}
Tensor *out_tensor = GetTensorFromVar(out_var);
paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]),
out_tensor);
}
}
std::string BroadcastOpHandle::Name() const { return "broadcast"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct BroadcastOpHandle : public OpHandleBase {
const std::vector<Scope *> &local_scopes_;
const std::vector<platform::Place> &places_;
BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
namespace f = paddle::framework;
namespace p = paddle::platform;
// test data amount
const f::DDim kDims = {20, 20};
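// Test harness: creates one scope and one device context per place, wires up a
// BroadcastOpHandle with a single real input and one output per place, runs it,
// and verifies that every output matches the input.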
struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> gpu_list_;
void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
}
void InitCtxOnGpu(bool use_gpu) {
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
int count = 8;
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
}
}
void InitBroadcastOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
}
local_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
vars_.emplace_back(new VarHandle());
VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
in_var_handle->place_ = gpu_list_[input_scope_idx];
in_var_handle->name_ = "input";
in_var_handle->version_ = 1;
in_var_handle->scope_idx_ = input_scope_idx;
in_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(in_var_handle);
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
dummy_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(dummy_var_handle);
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
vars_.emplace_back(new VarHandle());
VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
out_var_handle->place_ = gpu_list_[j];
out_var_handle->name_ = "out";
out_var_handle->version_ = 2;
out_var_handle->scope_idx_ = j;
op_handle_->AddOutput(out_var_handle);
}
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* out_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
out_dummy_var_handle->generated_op_ = nullptr;
op_handle_->AddOutput(out_dummy_var_handle);
}
void TestBroadcastLodTensor(size_t input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
f::LoD lod{{0, 10, 20}};
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
in_lod_tensor->set_lod(lod);
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scopes_[j]->Var("out");
auto out_tensor = out_var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
f::Tensor result_tensor;
f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
}
void TestBroadcastSelectedRows(size_t input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
int height = static_cast<int>(kDims[0]) * 2;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
in_selected_rows->set_height(height);
in_selected_rows->set_rows(rows);
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), value);
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scopes_[j]->Var("out");
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
"height is not equal.");
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
}
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
}
};
TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastLodTensor(input_scope_idx);
}
TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastSelectedRows(input_scope_idx);
}
#ifdef PADDLE_WITH_CUDA
TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastLodTensor(input_scope_idx);
}
TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastSelectedRows(input_scope_idx);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -35,7 +35,9 @@ void ComputationOpHandle::RunImpl() {
}
}
this->RunAndRecordEvent([this] {
op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
});
}
std::string ComputationOpHandle::Name() const { return op_->Type(); }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
void GatherOpHandle::RunImpl() {
// The inputs may contain dummy vars; keep only the real VarHandles.
std::vector<VarHandle *> in_var_handles;
for (auto *in : inputs_) {
auto *in_handle = dynamic_cast<VarHandle *>(in);
if (in_handle) {
in_var_handles.push_back(in_handle);
}
}
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
// The outputs may contain dummy vars; keep only the real VarHandles.
std::vector<VarHandle *> out_var_handles;
for (auto *out : outputs_) {
auto *out_handle = dynamic_cast<VarHandle *>(out);
if (out_handle) {
out_var_handles.push_back(out_handle);
}
}
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
auto in_0_handle = static_cast<VarHandle *>(in_var_handles[0]);
auto pre_in_var =
local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
auto pre_place = in_0_handle->place_;
PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
"Currently, gather_op only can gather SelectedRows.");
PADDLE_ENFORCE_EQ(out_var_handles[0]->place_.which(), pre_place.which(),
"The place of input and output should be the same.");
// Wait until the inputs are generated; this Wait is an asynchronous operation.
for (auto *in : in_var_handles) {
if (in->generated_op_) {
in->generated_op_->Wait(dev_ctxes_[in->place_]);
}
}
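// Gathering concatenates the SelectedRows from every place: the row indices
// are appended into out_rows and the value tensors are later copied slice by
// slice into the single output.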
std::vector<int64_t> out_rows;
std::vector<Tensor> in_tensors;
std::vector<platform::Place> in_places;
auto &pre_in = pre_in_var->Get<framework::SelectedRows>();
// gather the inputs
for (auto *in : in_var_handles) {
auto in_handle = static_cast<VarHandle *>(in);
auto in_p = in_handle->place_;
in_places.push_back(in_p);
PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
"Places must be all on CPU or all on CUDA.");
auto in_var =
local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
auto &in_sr = in_var->Get<framework::SelectedRows>();
PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
"The type of input is not consistent.");
PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(),
"The height of inputs is not consistent.");
PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(),
"The dims of the inputs are not consistent.");
auto in_sr_rows = in_sr.rows();
out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
in_tensors.emplace_back(in_sr.value());
}
// write the output
auto &out_place = out_var_handles[0]->place_;
auto out_scope_idx = out_var_handles[0]->scope_idx_;
auto out_var =
local_scopes_[out_scope_idx]->FindVar(out_var_handles[0]->name_);
auto out = out_var->GetMutable<framework::SelectedRows>();
out->set_height(pre_in.height());
out->set_rows(out_rows);
size_t rows = out_rows.size();
DDim out_dim = pre_in.GetCompleteDims();
out_dim[0] = static_cast<int64_t>(rows);
out->mutable_value()->Resize(out_dim);
out->mutable_value()->mutable_data(out_place, pre_in.value().type());
Tensor *out_tensor = out->mutable_value();
// copy
int s = 0, e = 0;
for (size_t j = 0; j < in_tensors.size(); ++j) {
e += in_tensors[j].dims()[0];
auto sub_out = out_tensor->Slice(s, e);
paddle::framework::TensorCopy(in_tensors[j], out_place,
*(dev_ctxes_[in_places[j]]), &sub_out);
s = e;
}
}
std::string GatherOpHandle::Name() const { return "gather"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct GatherOpHandle : public OpHandleBase {
const std::vector<Scope *> &local_scopes_;
const std::vector<platform::Place> &places_;
GatherOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
namespace f = paddle::framework;
namespace p = paddle::platform;
// test data amount
const f::DDim kDims = {20, 20};
struct TestGatherOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> gpu_list_;
void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
}
void InitCtxOnGpu(bool use_gpu) {
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
int count = 8;
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
}
}
void InitGatherOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
}
local_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
// add input
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
vars_.emplace_back(new VarHandle());
VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
in_var_handle->place_ = gpu_list_[j];
in_var_handle->name_ = "input";
in_var_handle->version_ = 1;
in_var_handle->scope_idx_ = j;
in_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(in_var_handle);
}
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* in_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
in_dummy_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(in_dummy_var_handle);
// add output
vars_.emplace_back(new VarHandle());
VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
out_var_handle->place_ = gpu_list_[input_scope_idx];
out_var_handle->name_ = "out";
out_var_handle->version_ = 2;
out_var_handle->scope_idx_ = input_scope_idx;
op_handle_->AddOutput(out_var_handle);
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
op_handle_->AddOutput(dummy_var_handle);
}
void TestGatherSelectedRows(size_t output_scope_idx) {
int height = kDims[0] * 2;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
std::vector<float> send_vector(f::product(kDims));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_selected_rows->set_height(height);
in_selected_rows->set_rows(rows);
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), value);
value->Resize(kDims);
}
auto out_var = local_scopes_[output_scope_idx]->Var("out");
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = local_scopes_[output_scope_idx]->Var("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
out_selected_rows->mutable_value()->ShareDataWith(
in_selected_rows->value());
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
}
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
}
}
};
TEST(GatherTester, TestCPUGatherTestSelectedRows) {
TestGatherOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitGatherOp(input_scope_idx);
test_op.TestGatherSelectedRows(input_scope_idx);
}
#ifdef PADDLE_WITH_CUDA
TEST(GatherTester, TestGPUGatherTestSelectedRows) {
TestGatherOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitGatherOp(input_scope_idx);
test_op.TestGatherSelectedRows(input_scope_idx);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -55,21 +55,21 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
}
}
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op,
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
const OpDesc &op,
const platform::Place &p,
const size_t &i) const {
auto *op_handle = result->ops_.back().get();
op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
platform::DeviceContextPool::Instance().Get(p));
op_handle->dev_ctxes_[p] = platform::DeviceContextPool::Instance().Get(p);
auto var_names = op->InputArgumentNames();
auto var_names = op.InputArgumentNames();
for (auto &each_var_name : var_names) {
VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
op_handle->AddInput(var);
}
var_names = op->OutputArgumentNames();
var_names = op.OutputArgumentNames();
for (auto &each_var_name : var_names) {
CreateOpOutput(result, op_handle, each_var_name, p, i);
......@@ -107,7 +107,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
result.ops_.emplace_back(new SendOpHandle(*op, s, p));
// Create inputs for output on original place and no ssa output
// is created for send op.
CreateOpHandleIOs(&result, op, p, 0);
CreateOpHandleIOs(&result, *op, p, 0);
continue;
}
......@@ -117,7 +117,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
auto *op_handle = result.ops_.back().get();
CreateOpHandleIOs(&result, op, p, i);
CreateOpHandleIOs(&result, *op, p, i);
auto var_names = op->OutputArgumentNames();
......
......@@ -45,8 +45,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
private:
void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p,
const size_t &i) const;
void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
const platform::Place &p, const size_t &i) const;
private:
std::string loss_var_name_;
......
......@@ -14,6 +14,8 @@
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#include <algorithm>
namespace paddle {
namespace framework {
namespace details {
......@@ -27,6 +29,32 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
}
}
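// Functor used with VisitDataType: element-wise sums a list of equally-shaped
// LoDTensors into a destination tensor allocated on the CPU.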
struct ReduceLoDTensor {
const std::vector<LoDTensor> &src_tensors_;
LoDTensor &dst_tensor_;
ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
: src_tensors_(src), dst_tensor_(*dst) {}
template <typename T>
void operator()() const {
PADDLE_ENFORCE(!src_tensors_.empty());
auto &t0 = src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0);
dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
for (size_t i = 1; i < src_tensors_.size(); ++i) {
auto &t = src_tensors_[i];
PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
PADDLE_ENFORCE_EQ(t.type(), t0.type());
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
[](T a, T b) -> T { return a + b; });
}
}
};
void NCCLAllReduceOpHandle::RunImpl() {
if (inputs_.size() == 1) {
return; // No need to all reduce when GPU count = 1;
......@@ -41,14 +69,20 @@ void NCCLAllReduceOpHandle::RunImpl() {
int dtype = -1;
size_t numel = 0;
std::vector<std::function<void()>> all_reduce_calls;
std::vector<LoDTensor> lod_tensors;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
auto *s = local_scopes_[i];
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
lod_tensors.emplace_back(lod_tensor);
}
if (platform::is_gpu_place(lod_tensors[0].place())) {
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
auto &lod_tensor = lod_tensors[i];
void *buffer = const_cast<void *>(lod_tensor.data<void>());
if (dtype == -1) {
......@@ -59,20 +93,43 @@ void NCCLAllReduceOpHandle::RunImpl() {
numel = static_cast<size_t>(lod_tensor.numel());
}
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_.at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
ncclSum, comm, stream));
});
}
this->RunAndRecordEvent([&] {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
});
} else { // Special handling for the gradients of CPU-only operators, e.g. CRF.
auto &trg =
*this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func);
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &scope = local_scopes_[i];
auto &p = places_[i];
auto *var = scope->FindVar(var_name);
auto *dev_ctx = dev_ctxes_[p];
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
auto &tensor_cpu = trg;
TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
});
}
}
}
}
......
......@@ -54,17 +54,6 @@ void OpHandleBase::Run(bool use_event) {
#endif
RunImpl();
#ifdef PADDLE_WITH_CUDA
if (use_event) {
for (auto &p : dev_ctxes_) {
int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
auto stream =
static_cast<platform::CUDADeviceContext *>(p.second)->stream();
PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream));
}
}
#endif
}
void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
......@@ -97,6 +86,43 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
out->generated_op_ = this;
}
void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA
if (!events_.empty()) { // Use event
std::function<void()> method = callback;
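// Compose one RecordEvent wrapper per device context around the callback;
// invoking the outermost wrapper runs the original callback once and records
// this op's event on each context's stream.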
for (auto &p : dev_ctxes_) {
method = [method, p, this]() {
static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
events_.at(boost::get<platform::CUDAPlace>(p.first).device),
method);
};
}
method();
} else {
#endif
callback();
#ifdef PADDLE_WITH_CUDA
}
#endif
}
void OpHandleBase::RunAndRecordEvent(platform::Place p,
const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_cpu_place(p) || events_.empty()) {
callback();
} else {
auto *ctx = dev_ctxes_.at(p);
auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
cuda_ctx->RecordEvent(events_.at(boost::get<platform::CUDAPlace>(p).device),
callback);
}
#else
callback();
#endif
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -62,6 +62,11 @@ class OpHandleBase {
virtual bool IsMultiDeviceTransfer() { return false; }
protected:
void RunAndRecordEvent(const std::function<void()> &callback);
void RunAndRecordEvent(platform::Place p,
const std::function<void()> &callback);
virtual void RunImpl() = 0;
};
......
......@@ -14,6 +14,8 @@
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include <string>
namespace paddle {
namespace framework {
namespace details {
......@@ -37,11 +39,13 @@ void ScaleLossGradOpHandle::RunImpl() {
*tmp = coeff_;
} else {
#ifdef PADDLE_WITH_CUDA
this->RunAndRecordEvent([&] {
auto stream =
static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream);
});
#endif
}
}
......
......@@ -34,7 +34,7 @@ void SendOpHandle::RunImpl() {
}
in->generated_op_->Wait(dev_ctxes_[p]);
}
op_->Run(*local_scope_, place_);
this->RunAndRecordEvent([&] { op_->Run(*local_scope_, place_); });
}
std::string SendOpHandle::Name() const { return "send"; }
......
......@@ -196,10 +196,12 @@ void ThreadedSSAGraphExecutor::RunOp(
BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
auto op_run = [ready_var_q, op, this] {
try {
VLOG(10) << op->Name() << " : " << op->DebugString();
VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
op->Run(use_event_);
VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--;
ready_var_q->Extend(op->outputs_);
VLOG(10) << op << " " << op->Name() << "Signal posted";
} catch (platform::EnforceNotMet ex) {
exception_.reset(new platform::EnforceNotMet(ex));
} catch (...) {
......
......@@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase {
// The version field is currently unused; it is stored only to make debugging
// easier.
size_t version_;
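// Index of the local scope (one per place) in which this variable lives.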
size_t scope_idx_;
std::string name_;
platform::Place place_;
};
......
......@@ -66,7 +66,7 @@ TEST(ProgramDesc, copy_ctor) {
for (size_t i = 0; i < global_block->OpSize(); ++i) {
auto op_origin = global_block->Op(i);
auto op_copy = global_block->Op(i);
auto op_copy = global_block_copy->Op(i);
ASSERT_EQ(op_origin->Type(), op_copy->Type());
ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
......@@ -131,7 +131,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
for (size_t i = 0; i < global_block->OpSize(); ++i) {
auto op_origin = global_block->Op(i);
auto op_restored = global_block->Op(i);
auto op_restored = global_block_restored->Op(i);
ASSERT_EQ(op_origin->Type(), op_restored->Type());
ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs());
......
......@@ -11,8 +11,10 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <limits>
#include <vector>
namespace paddle {
namespace framework {
......@@ -65,8 +67,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
......
......@@ -21,4 +21,7 @@ endif()
if(WITH_TESTING)
add_subdirectory(tests/book)
if (WITH_TENSORRT)
add_subdirectory(tensorrt)
endif()
endif()
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "NvInfer.h"
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "paddle/fluid/platform/dynload/tensorrt.h"
namespace dy = paddle::platform::dynload;
class Logger : public nvinfer1::ILogger {
public:
void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
switch (severity) {
case Severity::kINFO:
LOG(INFO) << msg;
break;
case Severity::kWARNING:
LOG(WARNING) << msg;
break;
case Severity::kINTERNAL_ERROR:
case Severity::kERROR:
LOG(ERROR) << msg;
break;
default:
break;
}
}
};
class ScopedWeights {
public:
ScopedWeights(float value) : value_(value) {
w.type = nvinfer1::DataType::kFLOAT;
w.values = &value_;
w.count = 1;
}
const nvinfer1::Weights& get() { return w; }
private:
float value_;
nvinfer1::Weights w;
};
// The following two APIs are implemented in TensorRT's header file and cannot
// be loaded from the dynamic library, so we provide our own implementations
// that call the corresponding symbols in the dynamic library directly.
nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
return static_cast<nvinfer1::IBuilder*>(
dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
}
nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
return static_cast<nvinfer1::IRuntime*>(
dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
}
const char* kInputTensor = "input";
const char* kOutputTensor = "output";
// Creates a network to compute y = 2x + 3
nvinfer1::IHostMemory* CreateNetwork() {
Logger logger;
// Create the engine.
nvinfer1::IBuilder* builder = createInferBuilder(logger);
ScopedWeights weights(2.);
ScopedWeights bias(3.);
nvinfer1::INetworkDefinition* network = builder->createNetwork();
// Add the input
auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{1, 1, 1});
EXPECT_NE(input, nullptr);
// Add the hidden layer.
auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get());
EXPECT_NE(layer, nullptr);
// Mark the output.
auto output = layer->getOutput(0);
output->setName(kOutputTensor);
network->markOutput(*output);
// Build the engine.
builder->setMaxBatchSize(1);
builder->setMaxWorkspaceSize(1 << 10);
auto engine = builder->buildCudaEngine(*network);
EXPECT_NE(engine, nullptr);
// Serialize the engine to create a model, then close.
nvinfer1::IHostMemory* model = engine->serialize();
network->destroy();
engine->destroy();
builder->destroy();
return model;
}
void Execute(nvinfer1::IExecutionContext& context, const float* input,
float* output) {
const nvinfer1::ICudaEngine& engine = context.getEngine();
// Two bindings: input and output.
ASSERT_EQ(engine.getNbBindings(), 2);
const int input_index = engine.getBindingIndex(kInputTensor);
const int output_index = engine.getBindingIndex(kOutputTensor);
// Create GPU buffers and a stream
void* buffers[2];
ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float)));
ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float)));
cudaStream_t stream;
ASSERT_EQ(0, cudaStreamCreate(&stream));
// Copy the input to the GPU, execute the network, and copy the output back.
ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
cudaMemcpyHostToDevice, stream));
context.enqueue(1, buffers, stream, nullptr);
ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// Release the stream and the buffers
cudaStreamDestroy(stream);
ASSERT_EQ(0, cudaFree(buffers[input_index]));
ASSERT_EQ(0, cudaFree(buffers[output_index]));
}
TEST(TensorrtTest, BasicFunction) {
// Create the network serialized model.
nvinfer1::IHostMemory* model = CreateNetwork();
// Use the model to create an engine and an execution context.
Logger logger;
nvinfer1::IRuntime* runtime = createInferRuntime(logger);
nvinfer1::ICudaEngine* engine =
runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
model->destroy();
nvinfer1::IExecutionContext* context = engine->createExecutionContext();
// Execute the network.
float input = 1234;
float output;
Execute(*context, &input, &output);
EXPECT_EQ(output, input * 2 + 3);
// Destroy the engine.
context->destroy();
engine->destroy();
runtime->destroy();
}
......@@ -25,12 +25,14 @@ void GetAccumulators<paddle::platform::CUDADeviceContext>(
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
auto stream = ctx.cuda_device_context().stream();
memory::Copy(platform::CPUPlace(), old_num_accumulates_,
platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(),
auto cuda_place =
boost::get<platform::CUDAPlace>(in_old_num_accumulates->place());
memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place,
in_old_num_accumulates->data<int64_t>(), sizeof(int64_t),
stream);
memory::Copy(platform::CPUPlace(), num_accumulates_, cuda_place,
in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(),
memory::Copy(platform::CPUPlace(), num_updates_, cuda_place,
in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
}
......@@ -42,14 +44,16 @@ void SetAccumulators<paddle::platform::CUDADeviceContext>(
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
auto cuda_place =
boost::get<platform::CUDAPlace>(out_old_num_accumulates->place());
memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(),
memory::Copy(cuda_place, out_old_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
stream);
memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(),
memory::Copy(cuda_place, out_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
stream);
memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(),
memory::Copy(cuda_place, out_num_updates->data<int64_t>(),
platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
}
......
......@@ -268,6 +268,7 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C, const int batchCount, const int strideA, const int strideB) {
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -289,7 +290,6 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
......@@ -304,6 +304,7 @@ void batched_gemm<platform::CUDADeviceContext, float>(
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float alpha, const float* A, const float* B, const float beta,
float* C, const int batchCount, const int strideA, const int strideB) {
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -315,7 +316,6 @@ void batched_gemm<platform::CUDADeviceContext, float>(
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int strideC = M * N;
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
......@@ -330,6 +330,7 @@ void batched_gemm<platform::CUDADeviceContext, double>(
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const double alpha, const double* A, const double* B, const double beta,
double* C, const int batchCount, const int strideA, const int strideB) {
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -341,7 +342,6 @@ void batched_gemm<platform::CUDADeviceContext, double>(
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int strideC = M * N;
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
......
......@@ -147,6 +147,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
if (!inplace) {
out->mutable_data<T>(ctx.GetPlace());
framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
ctx.device_context().Wait();
// TensorCopy will resize to in_dims.
out->Resize(out_dims);
} else {
......@@ -169,6 +170,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
auto in_dims = d_x->dims();
if (!inplace) {
framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
ctx.device_context().Wait();
d_x->Resize(in_dims);
} else {
d_x->ShareDataWith(*d_out);
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include <string>
namespace paddle {
......
......@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gather.cu.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/gather_op.h"
#include "scatter.cu.h"
#include "paddle/fluid/operators/scatter.cu.h"
#include "paddle/fluid/operators/scatter_op.h"
namespace paddle {
namespace operators {
......
......@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "gather.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "scatter.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/scatter.h"
namespace paddle {
namespace operators {
......
......@@ -13,44 +13,48 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scatter.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"
#include <gtest/gtest.h>
#include <iostream>
#include <string>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"
TEST(scatter, ScatterUpdate) {
using namespace paddle::framework;
using namespace paddle::platform;
using namespace paddle::operators;
// using namespace paddle::framework;
// using namespace paddle::platform;
// using namespace paddle::operators;
Tensor* src = new Tensor();
Tensor* index = new Tensor();
Tensor* output = new Tensor();
paddle::framework::Tensor* src = new paddle::framework::Tensor();
paddle::framework::Tensor* index = new paddle::framework::Tensor();
paddle::framework::Tensor* output = new paddle::framework::Tensor();
float* p_src = nullptr;
int* p_index = nullptr;
p_src = src->mutable_data<float>(make_ddim({1, 4}), CPUPlace());
p_index = index->mutable_data<int>(make_ddim({1}), CPUPlace());
p_src = src->mutable_data<float>(paddle::framework::make_ddim({1, 4}),
paddle::platform::CPUPlace());
p_index = index->mutable_data<int>(paddle::framework::make_ddim({1}),
paddle::platform::CPUPlace());
for (size_t i = 0; i < 4; ++i) p_src[i] = float(i);
for (size_t i = 0; i < 4; ++i) p_src[i] = static_cast<float>(i);
p_index[0] = 1;
float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace());
float* p_output = output->mutable_data<float>(
paddle::framework::make_ddim({4, 4}), paddle::platform::CPUPlace());
auto* cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext ctx(*cpu_place);
ScatterAssign<float>(ctx, *src, *index, output);
paddle::operators::ScatterAssign<float>(ctx, *src, *index, output);
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], float(i - 4));
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f);
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f);
for (size_t i = 4; i < 8; ++i) {
EXPECT_EQ(p_output[i], static_cast<float>(i - 4));
}
for (size_t i = 4; i < 8; ++i)
EXPECT_EQ(output->data<float>()[i], float(i - 4));
for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0));
for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
EXPECT_EQ(output->data<float>()[i], static_cast<float>(i - 4));
for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], 0.0f);
for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f);
delete src;
delete index;
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
......@@ -19,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle {
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future>
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
......
......@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
namespace paddle {
namespace operators {
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/strided_memcpy.h"
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/context_project.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_erase_op.h"
#include <vector>
namespace paddle {
namespace operators {
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
......
......@@ -84,13 +84,12 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
}
}
out_dims[0] = out_first_dim;
ctx->SetOutputDim("Out", out_dims);
} else {
out_dims[0] = -1;
}
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
}
};
class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
......
......@@ -12,8 +12,135 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include <algorithm>
#include "paddle/fluid/operators/sequence_expand_op.h"
#include "paddle/fluid/platform/cuda_helper.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
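// Forward kernel: each CUDA block handles one source sequence; the z, y, and x
// thread dimensions tile over the repeat count, the items within the sequence,
// and the elements within an item, respectively.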
template <typename T>
__global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
const size_t* ref_lod,
const size_t* offset,
const size_t lod_size,
/* default=1,
the instance length*/
const int x_item_length, T* out_data) {
int bid = blockIdx.x;
if (bid >= lod_size - 1) return;
int x_item_count = x_lod[bid + 1] - x_lod[bid];
int repeats = ref_lod[bid + 1] - ref_lod[bid];
int out_offset = static_cast<int>(offset[bid]);
int x_offset = x_lod[bid];
for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < x_item_length;
tid_x += blockDim.x) {
out_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length +
tid_x] = x_data[(x_offset + tid_y) * x_item_length + tid_x];
}
}
}
}
template <typename T>
__global__ void sequence_expand_grad_kernel(
const T* dout_data, const size_t* ref_lod, const size_t* dx_lod,
const size_t* offset, const size_t lod_size,
/* default=1,
the instance length*/
const int x_item_length, T* dx_data) {
int bid = blockIdx.x;
if (bid >= lod_size - 1) return;
int x_item_count = dx_lod[bid + 1] - dx_lod[bid];
int repeats = ref_lod[bid + 1] - ref_lod[bid];
int out_offset = static_cast<int>(offset[bid]);
int x_offset = dx_lod[bid];
for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < x_item_length;
tid_x += blockDim.x) {
platform::CudaAtomicAdd(
&dx_data[(x_offset + tid_y) * x_item_length + tid_x],
dout_data[(out_offset + tid_z * x_item_count + tid_y) *
x_item_length +
tid_x]);
}
}
}
}
void GetOutputOffset(const framework::Vector<size_t>& x_lod,
const framework::Vector<size_t>& ref_lod,
framework::Vector<size_t>* out_offset) {
size_t offset = 0;
int lod_size = static_cast<int>(x_lod.size());
for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
(*out_offset)[i] = offset;
if (i < lod_size - 1) {
offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
}
}
}
template <typename T>
struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
void operator()(
const platform::CUDADeviceContext& context, const LoDTensor& x,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out) {
int x_item_length = x.numel() / x.dims()[0];
framework::Vector<size_t> out_offset(x_lod.size());
GetOutputOffset(x_lod, ref_lod, &out_offset);
int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
int thread_y = 16;
int thread_z = 1024 / thread_x / thread_y;
int block_x = static_cast<int>(ref_lod.size());
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);
sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), x_lod.CUDAData(context.GetPlace()),
ref_lod.CUDAData(context.GetPlace()),
out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
out->mutable_data<T>(context.GetPlace()));
}
};
template <typename T>
struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const LoDTensor& dout,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand based lod*/
LoDTensor* dx) {
int x_item_length = framework::product(dx->dims()) / dx->dims()[0];
framework::Vector<size_t> out_offset(x_lod.size());
GetOutputOffset(x_lod, ref_lod, &out_offset);
int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
int thread_y = 16;
int thread_z = 1024 / thread_x / thread_y;
int block_x = static_cast<int>(ref_lod.size());
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);
sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
dout.data<T>(), ref_lod.CUDAData(context.GetPlace()),
x_lod.CUDAData(context.GetPlace()),
out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
dx->mutable_data<T>(context.GetPlace()));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <numeric> // std::iota
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
......@@ -26,6 +27,57 @@ template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
struct SequenceExpandFunctor {
void operator()(
const DeviceContext& ctx, const LoDTensor& x,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out);
};
template <typename DeviceContext, typename T>
struct SequenceExpandGradFunctor {
void operator()(
const DeviceContext& ctx, const LoDTensor& dout,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* dx);
};
template <typename T>
struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
void operator()(
const platform::CPUDeviceContext& context, const LoDTensor& x,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out) {
int out_offset = 0;
auto& eigen_place = *context.eigen_device();
for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1];
int x_start = x_lod[i - 1];
int x_end = x_lod[i];
int x_seq_len = x_end - x_start;
if (repeat_num > 0) {
auto x_sub_tensor = x.Slice(x_start, x_end);
x_sub_tensor.Resize({1, x_sub_tensor.numel()});
int out_start = out_offset;
if (out->lod().size() == 1) {
out_start = out->lod()[0][out_offset];
}
auto out_sub_tensor =
out->Slice(out_start, out_start + x_seq_len * repeat_num);
out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
EigenMatrix<T>::From(x_sub_tensor)
.broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
}
out_offset += repeat_num;
}
}
};
template <typename DeviceContext, typename T>
class SequenceExpandKernel : public framework::OpKernel<T> {
public:
......@@ -47,45 +99,36 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
return;
}
auto& out_lod = *out->mutable_lod();
// x lod level is at most 1.
framework::Vector<size_t> out_lod;
if (x_lod.size() == 1) {
out_lod.resize(1);
out_lod[0] = {0};
}
out_lod.push_back(0);
int out_offset = 0;
auto& eigen_place =
*context.template device_context<DeviceContext>().eigen_device();
for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
int x_start = i - 1;
int x_end = i;
if (x_lod.size() == 1) {
x_start = x_lod[0][i - 1];
x_end = x_lod[0][i];
}
int x_start = x_lod[0][i - 1];
int x_end = x_lod[0][i];
int x_seq_len = x_end - x_start;
if (repeat_num > 0) {
auto x_sub_tensor = x->Slice(x_start, x_end);
x_sub_tensor.Resize({1, x_sub_tensor.numel()});
int out_start = out_offset;
if (x_lod.size() == 1) {
out_start = out_lod[0][out_offset];
}
auto out_sub_tensor =
out->Slice(out_start, out_start + x_seq_len * repeat_num);
out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
EigenMatrix<T>::From(x_sub_tensor)
.broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
}
for (int j = 0; j < repeat_num; ++j) {
if (x_lod.size() == 1) {
out_lod[0].push_back(out_lod[0].back() + x_seq_len);
}
out_lod.push_back(out_lod.back() + x_seq_len);
out_offset++;
}
}
// write lod to out if x has lod
auto& ref_lod = *out->mutable_lod();
ref_lod[0] = out_lod;
}
framework::Vector<size_t> ref_x_lod;
if (x->lod().size() == 1) {
ref_x_lod = x->lod()[0];
} else {
// x doesn't have lod, use a fake x lod, level = 0
ref_x_lod.resize(x->dims()[0] + 1);
std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
}
SequenceExpandFunctor<DeviceContext, T> functor;
functor(context.template device_context<DeviceContext>(), *x, ref_x_lod,
y_lod[ref_level], out);
}
};
......@@ -101,6 +144,36 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
* Grad(X).lod = Input(X).lod
*
* */
template <typename T>
struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
void operator()(
const platform::CPUDeviceContext& context, const LoDTensor& dout,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* dx) {
math::SetConstant<platform::CPUDeviceContext, T> set_zero;
set_zero(context, dx, static_cast<T>(0));
int dout_offset = 0;
for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1];
if (repeat_num > 0) {
int x_start = x_lod[i - 1];
int x_end = x_lod[i];
int x_seq_len = x_end - x_start;
auto dx_sub = dx->Slice(x_start, x_end);
dx_sub.Resize(flatten_to_1d(dx_sub.dims()));
int dout_end = dout_offset + repeat_num * x_seq_len;
auto dout_sub = dout.Slice(dout_offset, dout_end);
dout_sub.Resize({repeat_num, dx_sub.dims()[0]});
math::ColwiseSum<platform::CPUDeviceContext, T> col_sum;
col_sum(context, dout_sub, &dx_sub);
dout_offset += repeat_num * x_seq_len;
}
}
}
};
template <typename DeviceContext, typename T>
class SequenceExpandGradKernel : public framework::OpKernel<T> {
public:
......@@ -114,43 +187,26 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> {
g_x->mutable_data<T>(context.GetPlace());
g_x->set_lod(x->lod());
auto& x_lod = x->lod();
auto& y_lod = y->lod();
if (ref_level == -1) ref_level = y_lod.size() - 1;
// just copy the gradient
if (y_lod[ref_level].size() <= 1) {
framework::TensorCopy(*g_out, context.GetPlace(), g_x);
return;
}
auto& dev_ctx = context.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, g_x, static_cast<T>(0));
int g_out_offset = 0;
for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
if (repeat_num > 0) {
int x_start = i - 1;
int x_end = i;
if (x_lod.size() == 1) {
x_start = x_lod[0][i - 1];
x_end = x_lod[0][i];
}
int x_seq_len = x_end - x_start;
auto g_x_sub = g_x->Slice(x_start, x_end);
g_x_sub.Resize(flatten_to_1d(g_x_sub.dims()));
int g_out_end = g_out_offset + repeat_num * x_seq_len;
auto g_out_sub = g_out->Slice(g_out_offset, g_out_end);
g_out_sub.Resize({repeat_num, g_x_sub.dims()[0]});
math::ColwiseSum<DeviceContext, T> col_sum;
col_sum(dev_ctx, g_out_sub, &g_x_sub);
g_out_offset += repeat_num * x_seq_len;
}
framework::Vector<size_t> ref_x_lod;
framework::Vector<size_t> ref_lod = y_lod[ref_level];
if (x->lod().size() == 1) {
ref_x_lod = x->lod()[0];
} else {
// x doesn't have lod, use a fake x lod, level = 0
ref_x_lod.resize(x->dims()[0] + 1);
std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
}
SequenceExpandGradFunctor<DeviceContext, T> functor;
functor(context.template device_context<DeviceContext>(), *g_out, ref_x_lod,
ref_lod, g_x);
}
};
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_pool_op.h"
#include <string>
namespace paddle {
namespace operators {
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_softmax_op.h"
#include <string>
namespace paddle {
namespace operators {
......
......@@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "mkldnn.hpp"
#include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include <iostream>
namespace paddle {
namespace operators {
......@@ -63,9 +62,11 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
softmax_md, 1 /*dim: C*/);
// create memory primitives
auto softmax_src_memory =
memory({softmax_md, mkldnn_engine}, (void*)input_data);
memory({softmax_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(input_data)));
auto softmax_dst_memory =
memory({softmax_md, mkldnn_engine}, (void*)output_data);
memory({softmax_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(output_data)));
auto softmax_prim_desc =
softmax_forward::primitive_desc(softmax_desc, mkldnn_engine);
auto softmax = softmax_forward(softmax_prim_desc, softmax_src_memory,
......
......@@ -60,7 +60,9 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
} else if (ids_var->IsType<framework::SelectedRows>()) {
const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
auto &ids_dims = ids_selected_rows->value().dims();
PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), "");
PADDLE_ENFORCE_EQ(ids_dims[0],
static_cast<int64_t>(ids_selected_rows->rows().size()),
"");
const T *ids = ids_selected_rows->value().data<T>();
const auto &ids_rows = ids_selected_rows->rows();
auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
......@@ -77,7 +79,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
framework::DDim ddim = framework::make_ddim(
{static_cast<int64_t>(out->rows().size()), row_width});
T *output = out->mutable_value()->mutable_data<T>(ddim, place);
for (size_t i = 0; i < ddim[0]; ++i) {
for (int64_t i = 0; i < ddim[0]; ++i) {
memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
row_width * sizeof(T));
}
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#include <chrono>
#include <chrono> // NOLINT
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/strided_memcpy.h"
......
......@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
......
......@@ -18,16 +18,22 @@ limitations under the License. */
#error device_ptr_cast must be include by .cu file
#endif
#include <thrust/device_ptr.h>
#include <type_traits> // For std::remove_pointer and std::is_pointer.
#include "thrust/device_ptr.h"
namespace paddle {
namespace platform {
namespace details {
// PointerToThrustDevicePtr has two specializations, one casts a (CUDA
// device) pointer into thrust::device_ptr, the other leaves all other
// types unchanged.
template <typename T, bool is_ptr>
struct DevicePtrCast;
struct PointerToThrustDevicePtr;
template <typename T>
struct DevicePtrCast<T, true> {
struct PointerToThrustDevicePtr<T, true> {
using ELEM = typename std::remove_pointer<T>::type;
using RTYPE = thrust::device_ptr<ELEM>;
......@@ -37,17 +43,26 @@ struct DevicePtrCast<T, true> {
};
template <typename T>
struct DevicePtrCast<T, false> {
struct PointerToThrustDevicePtr<T, false> {
using RTYPE = T;
inline RTYPE operator()(RTYPE it) const { return it; }
};
// Cast T to thrust::device_ptr if T is a pointer.
// Otherwise, e.g., T is a iterator, return T itself.
// CastToCUDATransformIterator casts a pointer to thrust::device_ptr
// so it can be used as the iterator of thrust::transform. It
// doesn't cast other types.
//
// We need CastToCUDATransformIterator because we often want to use
// device memory pointers as transform iterators, e.g., to
// transform a block of float32 to float16. In this case, we want
// CastToCUDATransformIterator to cast float16/32 pointers to
// thrust::device_ptr, otherwise they cannot work as the iterator
// required by thrust::transform. At the same time, we don't want to
// cast thrust::device_ptr to thrust::device_ptr repeatedly.
template <typename T>
auto DevPtrCast(T t) ->
typename DevicePtrCast<T, std::is_pointer<T>::value>::RTYPE {
DevicePtrCast<T, std::is_pointer<T>::value> cast;
auto CastToCUDATransformIterator(T t) ->
typename PointerToThrustDevicePtr<T, std::is_pointer<T>::value>::RTYPE {
PointerToThrustDevicePtr<T, std::is_pointer<T>::value> cast;
return cast(t);
}
......
......@@ -175,7 +175,7 @@ CUDADeviceContext::~CUDADeviceContext() {
Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const {
std::lock_guard<std::mutex> guard(mutex_);
std::lock_guard<std::recursive_mutex> guard(mutex_);
PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
PADDLE_ENFORCE(cudaGetLastError());
}
......
......@@ -98,13 +98,20 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return cuda stream in the device context. */
cudaStream_t stream() const;
template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) {
std::lock_guard<std::recursive_mutex> guard(mutex_);
callback();
PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
}
private:
CUDAPlace place_;
std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
mutable std::mutex mutex_;
mutable std::recursive_mutex mutex_;
cudaStream_t stream_;
cudnnHandle_t cudnn_handle_;
cublasHandle_t cublas_handle_;
......
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
if (WITH_TENSORRT)
list(APPEND CUDA_SRCS tensorrt.cc)
endif()
configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
......
......@@ -45,6 +45,10 @@ DEFINE_string(nccl_dir, "",
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
DEFINE_string(
tensorrt_dir, "",
"Specify path for loading tensorrt library, such as libnvinfer.so.");
namespace paddle {
namespace platform {
namespace dynload {
......@@ -194,6 +198,14 @@ void* GetNCCLDsoHandle() {
#endif
}
void* GetTensorRtDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
#endif
}
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -25,6 +25,7 @@ void* GetCurandDsoHandle();
void* GetWarpCTCDsoHandle();
void* GetLapackDsoHandle();
void* GetNCCLDsoHandle();
void* GetTensorRtDsoHandle();
} // namespace dynload
} // namespace platform
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/tensorrt.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag tensorrt_dso_flag;
void *tensorrt_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <NvInfer.h>
#include <dlfcn.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag tensorrt_dso_flag;
extern void* tensorrt_dso_handle;
#ifdef PADDLE_USE_DSO
#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using tensorrt_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = \
paddle::platform::dynload::GetTensorRtDsoHandle(); \
PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
}); \
void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
PADDLE_ENFORCE(p_##__name, "load %s failed", #__name); \
return reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#else
#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
tensorrtResult_t operator()(Args... args) { \
return __name(args...); \
} \
}; \
extern DynLoad__##__name __name
#endif
#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL);
TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -873,6 +873,11 @@ HOSTDEVICE inline bool(isfinite)(const float16& a) {
return !((isnan)(a)) && !((isinf)(a));
}
inline std::ostream& operator<<(std::ostream& os, const float16& a) {
os << static_cast<float>(a);
return os;
}
} // namespace platform
} // namespace paddle
......
......@@ -141,5 +141,10 @@ TEST(float16, lod_tensor_cpu) {
}
}
TEST(float16, print) {
float16 a = float16(1.0f);
std::cout << a << std::endl;
}
} // namespace platform
} // namespace paddle
......@@ -14,29 +14,44 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <type_traits>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/place.h"
#include <algorithm>
#include <type_traits>
#ifdef __NVCC__
#include <thrust/execution_policy.h>
#include <thrust/transform.h>
#include "paddle/fluid/platform/details/device_ptr_cast.h"
#include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h"
#endif
namespace paddle {
namespace platform {
// Transform on host or device. It provides the same API as the std library.
// Transform applies a unary or a binary functor on each element in a
// range defined by a pair of iterators.
//
// - The specialization for CPU calls std::transform.
// - The specialization for CUDA calls thrust::transform.
//
// NOTE: We need InputIter and OutputIter defined as
// different types, because the InputIter points to op's inputs and
// OutputIter points to op's outputs.
//
// NOTE: We don't assume that InputIter is const InputType* and
// OutputIter is OutputType*, because we might use an iterator
// class, e.g., paddle::fluid::operators::RowwiseTransformIterator.
template <typename DeviceContext>
struct Transform {
// The unary version.
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op);
// The binary version.
template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation>
void operator()(const DeviceContext& context, InputIter1 first1,
......@@ -70,8 +85,9 @@ struct Transform<platform::CUDADeviceContext> {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
thrust::transform(thrust::cuda::par.on(context.stream()),
details::DevPtrCast(first), details::DevPtrCast(last),
details::DevPtrCast(result), op);
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result), op);
}
template <typename InputIter1, typename InputIter2, typename OutputIter,
......@@ -82,9 +98,10 @@ struct Transform<platform::CUDADeviceContext> {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
thrust::transform(thrust::cuda::par.on(context.stream()),
details::DevPtrCast(first1), details::DevPtrCast(last1),
details::DevPtrCast(first2), details::DevPtrCast(result),
op);
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result), op);
}
};
#endif
......
......@@ -18,11 +18,12 @@ limitations under the License. */
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/transform.h"
namespace {
template <typename T>
class Scale {
public:
explicit Scale(const T& scale) : scale_(scale) {}
HOSTDEVICE T operator()(const T& a) const { return a * scale_; }
private:
......@@ -35,11 +36,23 @@ class Multiply {
HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
};
} // namespace
using paddle::memory::Alloc;
using paddle::memory::Free;
using paddle::memory::Copy;
using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CUDADeviceContext;
using paddle::platform::Transform;
TEST(Transform, CPUUnary) {
using namespace paddle::platform;
CPUDeviceContext ctx;
float buf[4] = {0.1, 0.2, 0.3, 0.4};
Transform<paddle::platform::CPUDeviceContext> trans;
Transform<CPUDeviceContext> trans;
trans(ctx, buf, buf + 4, buf, Scale<float>(10));
for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
......@@ -47,14 +60,12 @@ TEST(Transform, CPUUnary) {
}
TEST(Transform, GPUUnary) {
using namespace paddle::platform;
using namespace paddle::memory;
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
Transform<paddle::platform::CUDADeviceContext> trans;
Transform<CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx.Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
......@@ -65,10 +76,8 @@ TEST(Transform, GPUUnary) {
}
TEST(Transform, CPUBinary) {
using namespace paddle::platform;
using namespace paddle::memory;
int buf[4] = {1, 2, 3, 4};
Transform<paddle::platform::CPUDeviceContext> trans;
Transform<CPUDeviceContext> trans;
CPUDeviceContext ctx;
trans(ctx, buf, buf + 4, buf, buf, Multiply<int>());
for (int i = 0; i < 4; ++i) {
......@@ -77,14 +86,12 @@ TEST(Transform, CPUBinary) {
}
TEST(Transform, GPUBinary) {
using namespace paddle::platform;
using namespace paddle::memory;
int buf[4] = {1, 2, 3, 4};
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
Transform<paddle::platform::CUDADeviceContext> trans;
Transform<CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx.Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
......
......@@ -14,29 +14,25 @@ limitations under the License. */
#pragma once
#ifdef __CUDACC__
#ifdef __CUDACC_VER_MAJOR__
// CUDA 9 define `__CUDACC_VER__` as a warning message, manually define
// __CUDACC_VER__ instead.
// Boost 1.41.0 requires __CUDACC_VER__, but in CUDA 9 __CUDACC_VER__
// is removed, so we have to manually define __CUDACC_VER__ instead.
// For details, please refer to
// https://github.com/PaddlePaddle/Paddle/issues/6626
#if defined(__CUDACC__) && defined(__CUDACC_VER_MAJOR__)
#undef __CUDACC_VER__
#define __CUDACC_VER__ \
(__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \
__CUDACC_VER_BUILD__)
#endif
__CUDACC_VER_BUILD__ + __CUDACC_VER_MAJOR__ * 10000 + \
__CUDACC_VER_MINOR__ * 100
#endif
#include <boost/config.hpp>
#include "boost/config.hpp"
#ifdef PADDLE_WITH_CUDA
// Because boost's variadic templates has bug on nvcc, boost will disable
// variadic template support when GPU enabled on nvcc.
// Define BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
// function symbols.
//
// Because Boost 1.41.0's variadic templates has bug on nvcc, boost
// will disable variadic template support in NVCC mode. Define
// BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
// function symbols. For details,
// https://github.com/PaddlePaddle/Paddle/issues/3386
#ifdef PADDLE_WITH_CUDA
#ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
#define BOOST_NO_CXX11_VARIADIC_TEMPLATES
#endif
......
......@@ -43,5 +43,5 @@ TEST(Chunk, Compressor) {
ch.Clear();
ch.Parse(ss);
ASSERT_EQ(ch.NumBytes(), 18);
ASSERT_EQ(ch.NumBytes(), 18ul);
}
......@@ -32,6 +32,8 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
DEFINE_string(tensorrt_dir, "", "Specify path for loading libnvinfer.so.");
static inline std::string join(const std::string& part1,
const std::string& part2) {
// directory separator
......@@ -157,3 +159,12 @@ void GetLapackDsoHandle(void** dso_handle) {
GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
#endif
}
void GetTensorRtDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(
FLAGS_tensorrt_dir, "libnvinfer.dylib", dso_handle);
#else
GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so", dso_handle);
#endif
}
......@@ -58,3 +58,11 @@ void GetWarpCTCDsoHandle(void** dso_handle);
*
*/
void GetLapackDsoHandle(void** dso_handle);
/**
* @brief load the DSO of tensorrt
*
* @param **dso_handle dso handler
*
*/
void GetTensorRtDsoHandle(void** dso_handle);
file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py)
file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/*.py)
set(PY_FILES paddle/__init__.py
${UTILS_PY_FILES}
${FLUID_PY_FILES})
......@@ -7,7 +7,7 @@ set(PY_FILES paddle/__init__.py
if(NOT WITH_FLUID_ONLY)
file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/*.py)
set(PY_FILES ${PY_FILES}
${TRAINER_PY_FILES}
${HELPERS_PY_FILES}
......@@ -55,7 +55,7 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_BINARY_DIR}/python/pad
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND touch stub.cc
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle
COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
......
......@@ -1115,4 +1115,6 @@ class DistributeTranspiler:
for op2 in find_ops:
if ufind.is_connected(op1, op2):
lr_ops.append(op1)
# we only need to append the op once
break
return lr_ops
......@@ -77,6 +77,7 @@ __all__ = [
'lod_reset',
'lrn',
'pad',
'label_smooth',
]
......@@ -3678,3 +3679,68 @@ def pad(x, paddings, pad_value=0., name=None):
attrs={'paddings': paddings,
'pad_value': float(pad_value)})
return out
def label_smooth(label,
prior_dist=None,
epsilon=0.1,
dtype="float32",
name=None):
"""
Label smoothing is a mechanism to regularize the classifier layer and is
called label-smoothing regularization (LSR).
Label smoothing is proposed to encourage the model to be less confident,
since optimizing the log-likelihood of the correct label directly may
cause overfitting and reduce the ability of the model to adapt. Label
smoothing replaces the ground-truth label :math:`y` with the weighted sum
of itself and some fixed distribution :math:`\mu`. For class :math:`k`,
i.e.
.. math::
\\tilde{y_k} = (1 - \epsilon) * y_k + \epsilon * \mu_k,
where :math:`1 - \epsilon` and :math:`\epsilon` are the weights
respectively, and :math:`\\tilde{y}_k` is the smoothed label. Usually
a uniform distribution is used for :math:`\mu`.
See more details about label smoothing in https://arxiv.org/abs/1512.00567.
Args:
label(Variable): The input variable containing the label data. The
label data should use one-hot representation.
prior_dist(Variable): The prior distribution to be used to smooth
labels. If not provided, a uniform distribution
is used. The shape of :attr:`prior_dist` should
be :math:`(1, class\_num)`.
epsilon(float): The weight used to mix up the original ground-truth
distribution and the fixed distribution.
dtype(np.dtype|core.VarDesc.VarType|str): The type of data: float32,
float64, int, etc.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The tensor variable containing the smoothed labels.
Examples:
.. code-block:: python
label = layers.data(name="label", shape=[1], dtype="float32")
one_hot_label = layers.one_hot(input=label, depth=10)
smooth_label = layers.label_smooth(
label=one_hot_label, epsilon=0.1, dtype="float32")
"""
if epsilon > 1. or epsilon < 0.:
raise ValueError("The value of epsilon must be between 0 and 1.")
helper = LayerHelper("label_smooth", **locals())
label.stop_gradient = True
smooth_label = helper.create_tmp_variable(dtype)
helper.append_op(
type="label_smooth",
inputs={"X": label,
"PriorDist": prior_dist} if prior_dist else {"X": label},
outputs={"Out": smooth_label},
attrs={"epsilon": float(epsilon)})
return smooth_label
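For intuition, here is a quick NumPy check of the smoothing formula in the docstring above (illustrative only, not part of the patch), using a uniform :math:`\mu` over 4 classes and :math:`\epsilon = 0.1`:

```python
import numpy as np

# one-hot label for class 2 out of 4 classes
y = np.array([0., 0., 1., 0.])
epsilon = 0.1
mu = np.full_like(y, 1.0 / y.size)           # uniform prior distribution
y_smooth = (1 - epsilon) * y + epsilon * mu  # the label_smooth formula
print(y_smooth)                              # [0.025 0.025 0.925 0.025]
```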
......@@ -29,17 +29,20 @@ dtype_to_size = {
core.VarDesc.VarType.BOOL: 1
}
sub_block_ops = [
SUB_BLOCK_OPS = [
"while", "while_grad", "parallel_do", "parallel_do_grad",
"conditional_block", "conditional_block_grad"
]
SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
("conditional_block", "conditional_block_grad")]
PRINT_LOG = False
class ControlFlowGraph(object):
def __init__(self, Program, ops, forward_num, skip_opt):
self._program = Program
def __init__(self, program, ops, forward_num, skip_opt):
self._program = program
self._ops = ops
self._forward_num = forward_num
self._successors = defaultdict(set)
......@@ -51,6 +54,7 @@ class ControlFlowGraph(object):
self._skip_opt = skip_opt
def _add_connections(self, connections):
"""Populates _successors and _presuccessors for two neighbor nodes."""
for node1, node2 in connections:
self._add(node1, node2)
......@@ -58,7 +62,11 @@ class ControlFlowGraph(object):
self._successors[node1].add(node2)
self._presuccessors[node2].add(node1)
# TODO(panyx0718): We need to have a unified way of building intermediate
# representation.
def _build_graph(self):
"""Build a graph based on op sequence.
"""
self.op_size = len(self._ops)
op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
self._add_connections(op_node_connections)
......@@ -82,15 +90,14 @@ class ControlFlowGraph(object):
self._live_out[i].add(new_name)
def _reach_fixed_point(self, live_in, live_out):
"""Check if the liveness set has stablized."""
if len(live_in) != len(self._live_in):
return False
if len(live_out) != len(self._live_out):
return False
for i in range(self.op_size):
if live_in[i] != self._live_in[i]:
return False
for i in range(self.op_size):
if live_out[i] != self._live_out[i]:
if (live_in[i] != self._live_in[i] or
live_out[i] != self._live_out[i]):
return False
return True
......@@ -98,6 +105,8 @@ class ControlFlowGraph(object):
self._build_graph()
live_in = defaultdict(set)
live_out = defaultdict(set)
# Repeatedly apply liveness updates until the algorithm stabilizes
# on a complete set of live input vars and live output vars.
while True:
for i in range(self.op_size, 0, -1):
live_in[i] = set(self._live_in[i])
......@@ -141,6 +150,8 @@ class ControlFlowGraph(object):
return False
return True
# TODO(panyx0718): This needs to be less hacky. It seems memory optimization
# doesn't consider vars copied between cpu and gpu.
def _update_skip_opt_set(self):
for i in range(self.op_size):
op = self._ops[i]
......@@ -154,7 +165,7 @@ class ControlFlowGraph(object):
bwd_id = 0
for i in range(self.op_size):
op = self._ops[i]
if op.type() in sub_block_ops:
if op.type() in SUB_BLOCK_OPS:
continue
block_desc = op.block()
is_forward = i < self._forward_num
......@@ -177,13 +188,15 @@ class ControlFlowGraph(object):
def compare_shape(x_shape, cache_shape, opt_level):
if opt_level == 0:
return x_shape == cache_shape
if opt_level == 1:
elif opt_level == 1:
if (x_shape[0] == -1) ^ (cache_shape[0] == -1):
return False
x_size = abs(reduce(lambda x, y: x * y, x_shape))
cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
if x_size <= cache_size:
return True
else:
raise ValueError("only support opt_level 0 or 1.")
return False
self._dataflow_analyze()
......@@ -191,10 +204,9 @@ class ControlFlowGraph(object):
self.pool = []
for i in range(self.op_size):
op = self._ops[i]
if op.type() in sub_block_ops:
if op.type() in SUB_BLOCK_OPS:
continue
block_desc = op.block()
self.current_block_desc = block_desc
is_forward = i < self._forward_num
if self.pool:
defs_can_optimize = filter(
......@@ -211,37 +223,40 @@ class ControlFlowGraph(object):
for index, cache_pair in enumerate(self.pool):
cache_var = cache_pair[0]
cache_shape = cache_pair[1]
if compare_shape(x_shape, cache_shape, level):
if self._has_var(block_desc, cache_var, is_forward):
if not compare_shape(x_shape, cache_shape, level):
continue
if not self._has_var(block_desc, cache_var, is_forward):
continue
x_dtype = self._find_var(block_desc, x,
is_forward).dtype()
cache_dtype = self._find_var(
block_desc, cache_var, is_forward).dtype()
# TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
# and dtype_to_size[cache_dtype]
if x_dtype == cache_dtype:
cache_dtype = self._find_var(block_desc, cache_var,
is_forward).dtype()
# TODO(qijun): actually, we should compare
# dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
if x_dtype != cache_dtype:
continue
if PRINT_LOG:
print(
("Hit Cache !!!! cache pool index "
print(("Hit Cache !!!! cache pool index "
"is %d, var name is %s, "
"cached var name is %s, "
"var shape is %s ") %
(index, x, cache_var,
"var shape is %s ") % (index, x, cache_var,
str(cache_shape)))
self.pool.pop(index)
if x == cache_var:
break
_rename_arg_(
self._ops, x, cache_var, begin_idx=i)
self._program.block(block_desc.id).var(
str(x)).desc = self._find_var(
block_desc, cache_var, is_forward)
self._update_graph(
x, cache_var, begin_idx=i)
# Rename the var to the cache var already with
# memory allocated in order to reuse the memory.
_rename_arg_(self._ops, x, cache_var, begin_idx=i)
self._program.block(block_desc.id).var(str(
x)).desc = self._find_var(block_desc, cache_var,
is_forward)
self._update_graph(x, cache_var, begin_idx=i)
break
in_diff, out_diff = self._get_diff(self._live_in[i],
self._live_out[i])
in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
can_optimize = filter(
lambda x: self._check_var_validity(block_desc, x, is_forward),
in_diff)
......@@ -252,6 +267,19 @@ class ControlFlowGraph(object):
def _process_sub_block_pair(pdesc, sub_block_pair):
"""Creates a list of tuple each of which tracks info of a subblock.
Note: this function doesn't handle nested subblocks yet.
TODO(panyx0718): assert in case nested subblocks happen.
:param pdesc: ProgramDesc.
:param sub_block_pair: A list of op pairs. Each op pair is the forward
op and its backward op. The ops in the list are special in that they contain
a subblock of ops.
:return: A list of tuples, each tuple is (all ops in a subblock pair
including forward and backward, number of forward ops,
all output args names of the ops in the subblock pairs).
"""
ops_list = []
block_desc = pdesc.block(0)
op_size = block_desc.op_size()
......@@ -308,6 +336,11 @@ def _process_sub_block_pair(pdesc, sub_block_pair):
def _get_cfgs(input_program):
"""Process each block and create ControlFlowGraph for each of them.
:param input_program: Program object.
:return: A list of ControlFlowGraph, each corresponds to a block.
"""
ops_list = []
pdesc = input_program.get_desc()
block_desc = pdesc.block(0)
......@@ -316,11 +349,8 @@ def _get_cfgs(input_program):
ops_list.append(
([block_desc.op(i) for i in range(op_size)], op_size, set()))
sub_block_pair = [("while", "while_grad"), ("parallel_do",
"parallel_do_grad"),
("conditional_block", "conditional_block_grad")]
ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair))
# Only process one level of nested subblock.
ops_list.extend(_process_sub_block_pair(pdesc, SUB_BLOCK_PAIR))
cfgs = [
ControlFlowGraph(input_program, ops, forward_num, skip_opt)
......@@ -330,6 +360,17 @@ def _get_cfgs(input_program):
def memory_optimize(input_program, print_log=False, level=0):
"""Optimize memory by reusing var memory.
Note: it does not support a subblock nested in a subblock.
:param input_program: Input Program
:param print_log: whether to print debug log.
:param level: If level=0, reuse a var only if its shape is completely equal
to the cached var's shape; if level=1, reuse it if its size is no larger
than the cached var's size (see compare_shape above).
:return:
"""
if level != 0 and level != 1:
raise ValueError("only support opt_level 0 or 1.")
global PRINT_LOG
PRINT_LOG = print_log
cfgs = _get_cfgs(input_program)
......
......@@ -169,7 +169,7 @@ class Accuracy(MetricBase):
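For context, a minimal usage sketch of the `memory_optimize` pass documented above, assuming it is exported at the `fluid` namespace as `fluid.memory_optimize`; it is called on the main program after the network and optimizer are built and before the executor runs:

```python
import paddle.fluid as fluid

# ... build the network and the optimizer on fluid.default_main_program() ...
fluid.memory_optimize(fluid.default_main_program(), print_log=False, level=0)
```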
return self.value / self.weight
class ChunkEvalutor(MetricBase):
class ChunkEvaluator(MetricBase):
"""
Accumulate counter numbers output by chunk_eval from mini-batches and
compute the precision recall and F1-score using the accumulated counter
......@@ -177,7 +177,7 @@ class ChunkEvalutor(MetricBase):
"""
def __init__(self, name=None):
super(ChunkEvalutor, self).__init__(name)
super(ChunkEvaluator, self).__init__(name)
self.num_infer_chunks = 0
self.num_label_chunks = 0
self.num_correct_chunks = 0
......
......@@ -12,17 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import math
import numpy as np
import os
import time
import unittest
import paddle
import paddle.dataset.conll05 as conll05
import paddle.fluid as fluid
from paddle.fluid.initializer import init_on_cpu
import contextlib
import time
import unittest
import os
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
......
......@@ -34,6 +34,8 @@ function(py_test_modules TARGET_NAME)
endif()
endfunction()
list(REMOVE_ITEM TEST_OPS test_sequence_expand)
# test time-consuming OPs in a separate process to exploit parallelism
list(REMOVE_ITEM TEST_OPS test_parallel_executor)
list(REMOVE_ITEM TEST_OPS test_warpctc_op)
......@@ -70,6 +72,8 @@ else()
endforeach(TEST_OP)
endif(WITH_FAST_BUNDLE_TEST)
#
py_test_modules(test_sequence_expand MODULES test_sequence_expand)
# tests with high overhead
py_test_modules(test_parallel_executor MODULES test_parallel_executor)
py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
......
......@@ -340,6 +340,16 @@ class TestBook(unittest.TestCase):
print(layers.lod_reset(x=x, y=y))
print(str(program))
def test_label_smooth(self):
program = Program()
with program_guard(program):
label = layers.data(name="label", shape=[1], dtype="float32")
one_hot_label = layers.one_hot(input=label, depth=10)
smooth_label = layers.label_smooth(
label=one_hot_label, epsilon=0.1, dtype="float32")
self.assertIsNotNone(smooth_label)
print(str(program))
if __name__ == '__main__':
unittest.main()
......@@ -505,3 +505,148 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
train_loss, test_loss, atol=1e-8),
"Train loss: " + str(train_loss) + "\n Test loss:" +
str(test_loss))
import paddle.dataset.conll05 as conll05
import paddle.fluid as fluid
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_dict_len = len(verb_dict)
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 1e-3
embedding_name = 'emb'
def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
**ignored):
# 8 features
predicate_embedding = fluid.layers.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
input=mark, size=[mark_dict_len, mark_dim], dtype='float32')
word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [
fluid.layers.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(
name=embedding_name, trainable=False)) for x in word_input
]
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
hidden_0_layers = [
fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
for emb in emb_layers
]
hidden_0 = fluid.layers.sums(input=hidden_0_layers)
lstm_0 = fluid.layers.dynamic_lstm(
input=hidden_0,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid')
# stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
for i in range(1, depth):
mix_hidden = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
])
lstm = fluid.layers.dynamic_lstm(
input=mix_hidden,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid',
is_reverse=((i % 2) == 1))
input_tmp = [mix_hidden, lstm]
feature_out = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
])
return feature_out
class TestCRFModel(unittest.TestCase):
def test_all(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
feature_out = db_lstm(**locals())
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
crf_cost = fluid.layers.linear_chain_crf(
input=feature_out,
label=target,
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=1e-1))
avg_cost = fluid.layers.mean(crf_cost)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.01,
decay_steps=100000,
decay_rate=0.5,
staircase=True))
sgd_optimizer.minimize(avg_cost)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192),
batch_size=16)
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
pe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
feeder = fluid.DataFeeder(
feed_list=[
word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
mark, target
],
place=fluid.CPUPlace())
data = train_data()
for i in xrange(10):
cur_batch = next(data)
print map(numpy.array,
pe.run(feed_dict=feeder.feed(cur_batch),
fetch_list=[avg_cost.name]))[0]
......@@ -47,8 +47,10 @@ class TestSequenceExpand(OpTest):
x_len = x_idx[i] - x_idx[i - 1]
if repeat_num > 0:
x_sub = x_data[x_idx[i - 1]:x_idx[i], :]
x_sub = np.repeat(x_sub, repeat_num, axis=0)
out = np.vstack((out, x_sub))
stacked_x_sub = x_sub
for r in range(repeat_num - 1):
stacked_x_sub = np.vstack((stacked_x_sub, x_sub))
out = np.vstack((out, stacked_x_sub))
if x_lod is not None:
for j in xrange(repeat_num):
out_lod[0].append(out_lod[0][-1] + x_len)
......@@ -101,11 +103,11 @@ class TestSequenceExpandCase3(TestSequenceExpand):
class TestSequenceExpandCase4(TestSequenceExpand):
def set_data(self):
data = [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]
data = np.random.uniform(0.1, 1, [5 * 2, 1])
x_data = np.array(data).reshape([5, 2]).astype('float32')
x_lod = [[0, 2, 5]]
y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
y_lod = [[0, 1, 2], [0, 1, 2]]
y_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
y_lod = [[0, 1, 3], [0, 1, 3]]
self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
......
# AWS benchmark testing tool
This is an automation tool for deploying PaddlePaddle benchmark testing to AWS.
## Features
- Subnet creation to fit just the number of EC2 instances required
- Pserver and trainer EC2 instance allocation, and instance state verification
- nvidia-docker ready for GPU training
- Instance and network element garbage collection when a task is accomplished or an error occurs
- Test logs collected in real time
- Web service for checking logs or tearing down the testing setup
- No testing code change needed
- Lots of optional configuration options
## Usages
### Prerequisites
- You have a working AWS account
- You have [AWS Command Line Interface](https://aws.amazon.com/cli/) installed
- Your AWS CLI is bound to an account with the `AmazonEC2FullAccess` permission, and it is set as the default credential.
- You have a key pair created and its pem file downloaded.
- You have a default VPC in the region you want to run the test.
- You have a Security Group created for the VPC mentioned above, which allows port 22 and the port on which you want to expose your control web service (5436 by default).
- If your test is supposed to run on a GPU machine, especially a multi-card GPU machine (p2, p3 series), you might need to contact Amazon to raise the limit, which by default allows no more than 1 GPU instance at a time.
### Start a benchmark test
#### Create training image
*What to expect in this step:*
*You will have your training logic packed together with the Paddle runtime in a Docker image that can be picked up by AWS instances for training.*
The training Python script and the PaddlePaddle runtime are supposed to be packed into one Docker image. Use a PaddlePaddle production image as the base image and create the training image with a Dockerfile like the following:
```Dockerfile
FROM paddlepaddle/paddle:latest-gpu
ENV HOME /root
COPY ./ /root/
WORKDIR /root
RUN pip install -r /root/requirements.txt
ENTRYPOINT ["python", "my_training.py"]
```
***Please Note***
Training nodes will run your `ENTRYPOINT` script with the following environment variables (a sketch of how a training script might read them follows this list):
- `TASK_NAME`: unique name to identify this training process.
- `TRAINING_ROLE`: current node's role in this training process, either "PSERVER" or "TRAINER"
- `PSERVER_HOSTS`: comma-separated list of pserver endpoints, e.g. "192.168.1.2:5436,192.168.1.3:5436"
- `PSERVERS`: same as above
- `TRAINERS`: trainer count
- `SERVER_ENDPOINT`: current server endpoint if the node role is a pserver
- `TRAINER_INDEX`: an integer identifying the index of the current trainer if the node role is a trainer.
- `PADDLE_INIT_TRAINER_ID`: same as above
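For reference, here is a minimal sketch of how `my_training.py` could read these variables. The variable names come straight from the list above; the branching and the print statements are purely illustrative and not part of this tool:

```python
import os

task_name = os.getenv("TASK_NAME")
role = os.getenv("TRAINING_ROLE")                # "PSERVER" or "TRAINER"
pserver_hosts = os.getenv("PSERVER_HOSTS", "")   # "192.168.1.2:5436,192.168.1.3:5436"
trainer_count = int(os.getenv("TRAINERS", "1"))

if role == "PSERVER":
    # only meaningful on pserver nodes
    current_endpoint = os.getenv("SERVER_ENDPOINT")
    print("task %s: starting pserver at %s" % (task_name, current_endpoint))
else:
    # only meaningful on trainer nodes
    trainer_id = int(os.getenv("TRAINER_INDEX", "0"))
    print("task %s: starting trainer %d of %d, pservers: %s" %
          (task_name, trainer_id, trainer_count, pserver_hosts))
```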
Once you have a working distributed training script that takes advantage of these node environment variables, use the Dockerfile above to generate the training image. Run the following command:
```bash
docker build -t myreponame/paddle_benchmark .
```
Now that you have the image built and tagged as `myreponame/paddle_benchmark`, let's push it to Docker Hub so that it can be picked up by our AWS instances.
```bash
docker push myreponame/paddle_benchmark
```
#### Create instances and start training
*What to expect in this step*
*You will be asked to provide some basic settings to configure your training, and this tool will start and monitor the training for you.*
Now let's start the training process:
```bash
docker run -i -v $HOME/.aws:/root/.aws -v <full path to your pem file>:/root/<key pair name>.pem \
putcn/paddle_aws_client \
--action create \
--key_name <your key pair name> \
--security_group_id <your security group id> \
--docker_image myreponame/paddle_benchmark \
--pserver_count 2 \
--trainer_count 2
```
Now just wait until you see this:
```
master server finished init process, visit http://XXX:XXX/status to check master log
```
That means you can turn off your laptop: your cluster will create instances, start the training process, collect logs, and eventually shut all pservers and trainers down when training is finished.
#### Post creation operations
To access the master log:
```bash
docker run -i -v $HOME/.aws:/root/.aws \
putcn/paddle_aws_client \
--action status \
--master_server_public_ip <master ip> \
--master_server_port <master port>
```
To tear down the training setup:
```bash
docker run -i -v $HOME/.aws:/root/.aws \
putcn/paddle_aws_client \
--action cleanup \
--master_server_public_ip <master ip> \
--master_server_port <master port>
```
To retrieve training logs
TBD
### Tech details
*What to expect in this step*
*You will understand what is happening behind the scenes: how to check the training log, how to tear down the training on the fly, etc.*
Let's understand what is happening under the hood when you run the above command on your laptop.
![alt](diagram.png)
There are 4 roles in the figure above:
- client: your laptop
- master: asks the AWS API server to create/tear down instances, and monitors the training process
- AWS API server: the one that actually creates and manages instances
- pservers and trainers: training instances
When you run the `docker run` command above, it asks the AWS API service to create a subnet (step 1) and a master instance (step 2), and passes along all the parameters the client collected or generated (step 3). The master is kept at a minimal hardware configuration to keep the running cost low.
Then, when the master is up and running, it asks the AWS API server to create the heavy-lifting training instances, which are expensive to run (step 4). The master starts the training process as soon as they are done initializing (step 5).
Meanwhile, the master exposes a web service so the client can check the training log or even tear the training setup down with a web service call.
If you are creating the training with the client Docker container while also monitoring your AWS dashboard, you will initially see an instance tagged with `ROLE=MASTER` and `TASK_NAME=<your task name>_master` start, then several instances tagged with `ROLE=PSERVER` and `ROLE=TRAINER` start.
When the training is finished, pservers and trainers will be terminated. All their logs are kept in the master node's Docker environment.
The master exposes 4 major services (see the Python sketch after this list):
- GET `/status`: return master log
- GET `/logs`: return list of log file names
- GET `/log/<logfile name>`: return a particular log by log file name
- POST `/cleanup`: tear down the whole setup
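Equivalent to the `--action status` and `--action cleanup` commands above, these endpoints can also be called directly. A small sketch using Python `requests`; the `<master ip>`/`<master port>` placeholders are the values printed at creation time, and the log file name is hypothetical:

```python
import requests

master = "http://<master ip>:<master port>"  # fill in the values printed at creation time

print(requests.get(master + "/status").text)             # master log
print(requests.get(master + "/logs").text)               # list of log file names
print(requests.get(master + "/log/pserver_0.log").text)  # a particular log (hypothetical name)
requests.post(master + "/cleanup")                       # tear the whole setup down
```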
### Parameters
TBD, please refer to client/cluster_launcher.py for now
### Troubleshooting
TBD
FROM python:2.7.14-stretch
ENV HOME /root
COPY ./ /root/
WORKDIR /root
RUN pip install -r /root/requirements.txt
ENTRYPOINT ["python", "cluster_launcher.py"]
\ No newline at end of file
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time
import math
import logging
import copy
import netaddr
import boto3
import namesgenerator
import paramiko
from scp import SCPClient
import requests
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--key_name', type=str, default="", help="required, key pair name")
parser.add_argument(
'--security_group_id',
type=str,
default="",
help="required, the security group id associated with your VPC")
parser.add_argument(
'--vpc_id',
type=str,
default="",
help="The VPC in which you wish to run test")
parser.add_argument(
'--subnet_id',
type=str,
default="",
help="The Subnet_id in which you wish to run test")
parser.add_argument(
'--pserver_instance_type',
type=str,
default="c5.2xlarge",
help="your pserver instance type, c5.2xlarge by default")
parser.add_argument(
'--trainer_instance_type',
type=str,
default="p2.8xlarge",
help="your trainer instance type, p2.8xlarge by default")
parser.add_argument(
'--task_name',
type=str,
default="",
help="the name you want to identify your job")
parser.add_argument(
'--pserver_image_id',
type=str,
default="ami-da2c1cbf",
help="ami id for system image, default one has nvidia-docker ready, \
use ami-1ae93962 for us-east-2")
parser.add_argument(
'--pserver_command', type=str, default="", help="pserver start command")
parser.add_argument(
'--trainer_image_id',
type=str,
default="ami-da2c1cbf",
help="ami id for system image, default one has nvidia-docker ready, \
use ami-1ae93962 for us-west-2")
parser.add_argument(
'--trainer_command', type=str, default="", help="trainer start command")
parser.add_argument(
'--availability_zone',
type=str,
default="us-east-2a",
help="aws zone id to place ec2 instances")
parser.add_argument(
'--trainer_count', type=int, default=1, help="Trainer count")
parser.add_argument(
'--pserver_count', type=int, default=1, help="Pserver count")
parser.add_argument(
'--action', type=str, default="create", help="create|cleanup|status")
parser.add_argument('--pem_path', type=str, help="private key file")
parser.add_argument(
'--pserver_port', type=str, default="5436", help="pserver port")
parser.add_argument(
'--docker_image', type=str, default="busybox", help="training docker image")
parser.add_argument(
'--master_server_port', type=int, default=5436, help="master server port")
parser.add_argument(
'--master_server_public_ip', type=str, help="master server public ip")
parser.add_argument(
'--master_docker_image',
type=str,
default="putcn/paddle_aws_master:latest",
help="master docker image id")
parser.add_argument(
'--no_clean_up',
type=str2bool,
default=False,
help="whether to clean up after training")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
ec2client = boto3.client('ec2')
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
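# Pick (or discover) a VPC and carve out a new subnet in it that is large
# enough for all pservers and trainers, tagging it with the task name.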
def create_subnet():
# if no vpc id provided, list vpcs
logging.info("start creating subnet")
if not args.vpc_id:
logging.info("no vpc provided, trying to find the default one")
vpcs_desc = ec2client.describe_vpcs(
Filters=[{
"Name": "isDefault",
"Values": ["true", ]
}], )
if len(vpcs_desc["Vpcs"]) == 0:
raise ValueError('No default VPC')
args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"]
vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
logging.info("default vpc fount with id %s and CidrBlock %s" %
(args.vpc_id, vpc_cidrBlock))
if not vpc_cidrBlock:
logging.info("trying to find cidrblock for vpc")
vpcs_desc = ec2client.describe_vpcs(
Filters=[{
"Name": "vpc-id",
"Values": [args.vpc_id, ],
}], )
if len(vpcs_desc["Vpcs"]) == 0:
raise ValueError('No VPC found')
vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
logging.info("cidrblock for vpc is %s" % vpc_cidrBlock)
# list subnets in vpc in order to create a new one
logging.info("trying to find ip blocks for new subnet")
subnets_desc = ec2client.describe_subnets(
Filters=[{
"Name": "vpc-id",
"Values": [args.vpc_id, ],
}], )
ips_taken = []
for subnet_dec in subnets_desc["Subnets"]:
ips_taken.append(subnet_dec["CidrBlock"])
    ip_blocks_available = netaddr.IPSet(
        [vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken)
    # adding 10 addresses as buffer
    cidr_prefix = 32 - math.ceil(
        math.log(args.pserver_count + args.trainer_count + 10, 2))
    if cidr_prefix <= 16:
        raise ValueError('Too many nodes to fit in current VPC')
    # make sure subnet_cidr is defined even if no free block is large enough
    subnet_cidr = None
    for ipnetwork in ip_blocks_available.iter_cidrs():
        try:
            subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next()
            logging.info("subnet ip block found %s" % (subnet_cidr))
            break
        except Exception:
            pass
    if not subnet_cidr:
        raise ValueError(
            'No available subnet to fit required nodes in current VPC')
logging.info("trying to create subnet")
subnet_desc = ec2client.create_subnet(
CidrBlock=str(subnet_cidr),
VpcId=args.vpc_id,
AvailabilityZone=args.availability_zone)
subnet_id = subnet_desc["Subnet"]["SubnetId"]
subnet_waiter = ec2client.get_waiter('subnet_available')
# sleep for 1s before checking its state
time.sleep(1)
subnet_waiter.wait(SubnetIds=[subnet_id, ])
logging.info("subnet created")
logging.info("adding tags to newly created subnet")
ec2client.create_tags(
Resources=[subnet_id, ],
Tags=[{
"Key": "Task_name",
'Value': args.task_name
}])
return subnet_id
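# Launch EC2 instances via the AWS API and block until they pass status
# checks; returns their descriptions. In this client it is only used to
# start the master node.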
def run_instances(image_id, instance_type, count=1, role="MASTER", cmd=""):
response = ec2client.run_instances(
ImageId=image_id,
InstanceType=instance_type,
MaxCount=count,
MinCount=count,
UserData=cmd,
DryRun=False,
InstanceInitiatedShutdownBehavior="stop",
KeyName=args.key_name,
Placement={'AvailabilityZone': args.availability_zone},
NetworkInterfaces=[{
'DeviceIndex': 0,
'SubnetId': args.subnet_id,
"AssociatePublicIpAddress": True,
'Groups': args.security_group_ids
}],
TagSpecifications=[{
'ResourceType': "instance",
'Tags': [{
"Key": 'Task_name',
"Value": args.task_name + "_master"
}, {
"Key": 'Role',
"Value": role
}]
}])
instance_ids = []
for instance in response["Instances"]:
instance_ids.append(instance["InstanceId"])
if len(instance_ids) > 0:
logging.info(str(len(instance_ids)) + " instance(s) created")
else:
logging.info("no instance created")
#create waiter to make sure it's running
logging.info("waiting for instance to become accessible")
waiter = ec2client.get_waiter('instance_status_ok')
waiter.wait(
Filters=[{
"Name": "instance-status.status",
"Values": ["ok"]
}, {
"Name": "instance-status.reachability",
"Values": ["passed"]
}, {
"Name": "instance-state-name",
"Values": ["running"]
}],
InstanceIds=instance_ids)
instances_response = ec2client.describe_instances(InstanceIds=instance_ids)
return instances_response["Reservations"][0]["Instances"]
def generate_task_name():
return namesgenerator.get_random_name()
def init_args():
if not args.task_name:
args.task_name = generate_task_name()
logging.info("task name generated %s" % (args.task_name))
if not args.pem_path:
args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem"
if args.security_group_id:
args.security_group_ids = (args.security_group_id, )
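# Full create flow: ensure a subnet exists, start the master instance, copy
# AWS credentials and the pem file to it over SSH/SCP, then kick off the
# master docker container, which in turn launches pservers and trainers.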
def create():
init_args()
# create subnet
if not args.subnet_id:
args.subnet_id = create_subnet()
# create master node
master_instance_response = run_instances(
image_id="ami-7a05351f", instance_type="t2.nano")
logging.info("master server started")
args.master_server_public_ip = master_instance_response[0][
"PublicIpAddress"]
args.master_server_ip = master_instance_response[0]["PrivateIpAddress"]
logging.info("master server started, master_ip=%s, task_name=%s" %
(args.master_server_public_ip, args.task_name))
# cp config file and pems to master node
ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
ssh_client = paramiko.SSHClient()
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh_client.connect(
hostname=args.master_server_public_ip, username="ubuntu", pkey=ssh_key)
with SCPClient(ssh_client.get_transport()) as scp:
scp.put(os.path.expanduser("~") + "/" + ".aws",
recursive=True,
remote_path='/home/ubuntu/')
scp.put(args.pem_path,
remote_path='/home/ubuntu/' + args.key_name + ".pem")
logging.info("credentials and pem copied to master")
# set arguments and start docker
kick_off_cmd = "docker run -d -v /home/ubuntu/.aws:/root/.aws/"
kick_off_cmd += " -v /home/ubuntu/" + args.key_name + ".pem:/root/" + args.key_name + ".pem"
kick_off_cmd += " -v /home/ubuntu/logs/:/root/logs/"
kick_off_cmd += " -p " + str(args.master_server_port) + ":" + str(
args.master_server_port)
kick_off_cmd += " " + args.master_docker_image
args_to_pass = copy.copy(args)
args_to_pass.action = "serve"
del args_to_pass.pem_path
del args_to_pass.security_group_ids
del args_to_pass.master_docker_image
del args_to_pass.master_server_public_ip
for arg, value in sorted(vars(args_to_pass).iteritems()):
if value:
kick_off_cmd += ' --%s %s' % (arg, value)
logging.info(kick_off_cmd)
stdin, stdout, stderr = ssh_client.exec_command(command=kick_off_cmd)
return_code = stdout.channel.recv_exit_status()
logging.info(return_code)
if return_code != 0:
raise Exception("Error while kicking off master")
logging.info(
"master server finished init process, visit %s to check master log" %
(get_master_web_url("/status")))
def cleanup():
print requests.post(get_master_web_url("/cleanup")).text
def status():
print requests.post(get_master_web_url("/status")).text
def get_master_web_url(path):
return "http://" + args.master_server_public_ip + ":" + str(
args.master_server_port) + path
if __name__ == "__main__":
print_arguments()
if args.action == "create":
if not args.key_name or not args.security_group_id:
raise ValueError("key_name and security_group_id are required")
create()
elif args.action == "cleanup":
if not args.master_server_public_ip:
raise ValueError("master_server_public_ip is required")
cleanup()
elif args.action == "status":
if not args.master_server_public_ip:
raise ValueError("master_server_public_ip is required")
status()
netaddr==0.7.19
boto3==1.6.21
namesgenerator==0.3
paramiko==2.4.1
scp
requests
FROM python:2.7.14-stretch
ENV HOME /root
COPY ./ /root/
WORKDIR /root
RUN pip install -r /root/requirements.txt
ENTRYPOINT ["python", "cluster_master.py"]
#!/bin/bash
docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device CPU
netaddr==0.7.19
boto3==1.6.21
namesgenerator==0.3
paramiko==2.4.1
#!/bin/bash
nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device GPU