Merge branch 'develop' into core_add_inference_unittest

f6f7102b · Liu Yiqun · 250206d1 · b4565172 · f6f7102b · f6f7102b
107 changed file
--- a/.copyright.hook
+++ b/.copyright.hook
@@ -9,7 +9,7 @@ import subprocess
 import platform

 COPYRIGHT = '''
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,9 +31,6 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
-if(NOT ANDROID AND NOT IOS)
-    find_package(Boost QUIET)
-endif()

 include(simd)

@@ -140,6 +137,7 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
+include(external/boost)     # download, build, install boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
@@ -164,7 +162,6 @@ include_directories("${PADDLE_SOURCE_DIR}")
 include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
-include_directories(${Boost_INCLUDE_DIRS})

 set(EXTERNAL_LIBS
    ${GFLAGS_LIBRARIES}

--- a/Dockerfile
+++ b/Dockerfile
@@ -27,7 +27,7 @@ RUN apt-get update && \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
    python-matplotlib gcc-4.8 g++-4.8 \
    automake locales clang-format swig doxygen cmake  \
-    liblapack-dev liblapacke-dev libboost-dev \
+    liblapack-dev liblapacke-dev \
    clang-3.8 llvm-3.8 libclang-3.8-dev \
    net-tools libtool && \
    apt-get clean -y

--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+set(BOOST_PROJECT       "extern_boost")
+set(BOOST_VER           "1.66.0")
+set(BOOST_TAR           "boost_1_66_0")
+set(BOOST_URL           "https://dl.bintray.com/boostorg/release/${BOOST_VER}/source/${BOOST_TAR}.tar.gz")
+set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
+set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
+set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+
+include_directories(${BOOST_INCLUDE_DIR})
+
+ExternalProject_Add(
+    ${BOOST_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
+                          && tar zxf ${BOOST_TAR}.tar.gz
+    DOWNLOAD_NO_PROGRESS  1
+    PREFIX                ${BOOST_SOURCES_DIR}
+    CONFIGURE_COMMAND     ""
+    BUILD_COMMAND         ""
+    INSTALL_COMMAND       ""
+    UPDATE_COMMAND        ""
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+    add_library(boost STATIC ${dummyfile})
+else()
+    add_library(boost INTERFACE)
+endif()
+
+add_dependencies(boost ${BOOST_PROJECT})
+list(APPEND external_project_dependencies boost)
+set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -505,6 +505,11 @@ swish
 ..  autofunction:: paddle.v2.fluid.layers.swish
    :noindex:

+im2sequence
+------
+..  autofunction:: paddle.v2.fluid.layers.im2sequence
+    :noindex:
+
 edit_distance
 ---------------
 ..  autofunction:: paddle.v2.fluid.layers.edit_distance_error
@@ -524,3 +529,8 @@ sequence_reshape
 ----------------
 ..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
    :noindex:
+
+row_conv
+--------
+..  autofunction:: paddle.v2.fluid.layers.row_conv
+    :noindex:
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -26,8 +26,8 @@ glu
    :noindex:


-dot_product_attention
---------------------
-..  autofunction:: paddle.v2.fluid.nets.dot_product_attention
+scaled_dot_product_attention
+----------------------------
+..  autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
    :noindex:

--- a/doc/design/csp.md
+++ b/doc/design/csp.md
+# Design Doc: CSP in PaddlePaddle Fluid
+
+## Motivation
+
+Concurrent programming is important for deep learning.  Few example applications are:
+
+1.  The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
+2.  The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
+
+Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
+
+## Concurrent Programming Models
+
+There were many concurrent programming models, implemented in various forms:
+
+| concurrent programming model | implementation |
+|-----|-----|
+| mutex | types and functions in standard libraries |
+| semaphore | types and functions in standard libraries |
+| communicating sequential processes (CSP) | Go programming language |
+| actor model | Erlang programming language |
+| message passing | MPI |
+| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+
+Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
+
+### CSP v.s. Actor Model
+
+A well-known implementation of Actor Model is the Erlang programming language.  In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs.  We can find the three ingredients, process with ID, send, and recv, in MPI too.  Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code.  Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv.
+
+## CSP in Fluid
+
+Fluid has two fundamental control-flows: *if-else* and *while*.  If we are to implement CSP, we need the following:
+
+1. a new data type: *channel* and operators *send* and *recv*,
+1. *goroutine* or thread, and
+1. a new control-flow: select.
+
+We also need Python wrappers for the above components.
+
+The type *channel* is conceptually the blocking queue.  In Go, its implemented is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
+
+The `select` operation has been in OS kernels long before Go language.  All Unix kernels implement system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call *kqueue*.  Go's Linux implementation uses epoll.
+
+It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
+
+### Type Channel
+
+Fluid supports many data types:
+
+1. Tensor,
+1. Row-sparse Tensor
+1. LoD Tensor,
+1. Tensor array, etc
+
+Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value.  To add a new type channel, we need to add a new type enum.
+
+To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file.  [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example how we expose C++ class LoDTensor.
+
+## Syntax Design
+
+### Create Channel
+
+In Go, we create a channel by specifying the element type and buffer size:
+
+```go
+ch  := make(chan int)       // a channel without buffer
+ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch  = fluid.make_chan(dtype=INT)
+ch1 = fluid.make_chan(dtype=INT, 100)
+```
+
+In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
+
+```python
+ch = fluid.make_chan(dtype=Tensor, etype=float16)
+```
+
+or Tensors of Tensors of float16 etc.
+
+The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<...<float16>...> >`.
+
+### Send and Recv
+
+### Select
+
+## Example Programs
+
+### 1. RPC between Trainers and Parameter Servers
+
+### 2. Concurrent Minibatch Loading
--- a/doc/design/dist_refactor/distributed_architecture.md
+++ b/doc/design/dist_refactor/distributed_architecture.md
@@ -152,12 +152,12 @@ for data in train_reader():
 `JobDesc` object describe the distributed job resource specification to run on
 Cluster environment.

-<img src="src/remote_executor.png"/>
+<img src="src/remote_executor.png" width="500" align="center" />

 `RemoteExecutor.run` sends the `ProgramDesc` and
 [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
 to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
-to start the final Kubernetes Jobs to run the different role of `ProgramDesc`.
+to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`.


 ### Placement Algorithm

--- a/doc/design/dist_refactor/parameter_server.md
+++ b/doc/design/dist_refactor/parameter_server.md
@@ -9,16 +9,16 @@ different purposes.

 ## Background

-The previous implementations of the parameter server does not run a
+The previous implementations of the parameter server do not run a
 fluid sub-program. Parameter initialization, optimizer computation, network
 communication and checkpointing are implemented twice on both the
-trainer and the parameter server.
+trainer as well as the parameter server.

-It would be great if we can write code once and use them on both the
-trainer and the parameter server: reduces code duplication and
-improves extensibility. Given that after the current refactor, we are
-representing everything as a computing graph on the
-trainer. Representing everything as a computing graph on the parameter
+It would be great if we can write code once and use them on both: the
+trainer and the parameter server, since this reduces code duplication and
+improves extensibility. Given that after the current refactoring, we are
+representing everything as a computation graph on the
+trainer. Representing everything as a computation graph on the parameter
 server becomes a natural extension.

 ## Design
@@ -30,9 +30,9 @@ into sub-programs to be scheduled on different nodes with the following
 steps:

 1. OP placement: the OPs will be placed on different nodes according
-   to heuristic that minimizes estimated total computation
+   to a heuristic that minimizes the estimated total computation
   time. Currently we will use a simple heuristic that puts parameter
-   varable on parameter server workers and everything else on trainer
+   variable on parameter server workers and everything else on trainer
   workers.
 1. Add communication OPs to enable the communication between nodes.

@@ -47,22 +47,22 @@ After converting:

 <img src="src/dist-graph.png" width="700"/>

-1. The parameter variable W and it's optimizer program are placed on the parameter server.
+1. The parameter variable W and its optimizer program are placed on the parameter server.
 1. Operators are added to the program.
   - *Send* sends data to the connected *Recv* operator.  The
 	 scheduler on the receive node will only schedule *Recv* operator
 	 to run when the *Send* operator has ran (the *Send* OP will mark
 	 the *Recv* OP runnable automatically).
-   - *Enueue* enqueues the input variable, it can block until space
+   - *Enqueue* enqueues the input variable, it can block until space
     become available in the queue.
   - *Dequeue* outputs configurable numbers of tensors from the
-     queue. It will block until the queue have the required number of
+     queue. It will block until the queue has the required number of
     tensors.


 ### Benefits

- Model parallelism become easier to implement: it's an extension to
+- Model parallelism becomes easier to implement: it is an extension to
  the trainer - parameter server approach. We can have several "Transpilers"
  to achieve different goals.
 - User-defined optimizer is easier to add - user can now express it as
@@ -72,22 +72,22 @@ After converting:

 ### Challenges

- It's important to balance the parameter shards of on multiple
-  parameter server. If a single parameter is very big (some
+- It is important to balance the parameter shards on multiple
+  parameter servers. If a single parameter is very big (for example: some
  word-embedding, fully connected, softmax layer), we need to
  automatically partition the single parameter onto different
  parameter servers when possible (only element-wise optimizer depends
  on the parameter variable).
- In the "Aync SGD" figure, the "W" variable on the parameter server
-  could be read and wrote concurrently. See
+- In the "Async SGD" figure, the "W" variable on the parameter server
+  could be read and written concurrently. See
  [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
-  details about concurrent program in fluid.
+  details about concurrent program in Fluid.

 ### Discussion

 - Can the Enqueue OP be implemented under our current tensor design
-  (puts the input tensor into the queue tensor)?
- *Dequeue* OP will have variable numbers of output (depends on the
+  (put the input tensor into the queue tensor)?
+- *Dequeue* OP will have variable numbers of output (depending on the
  `min_count` attribute), does our current design support it? (similar
  question for the *Add* OP)


--- a/doc/design/dist_refactor/src/remote_executor.graffle
+++ b/doc/design/dist_refactor/src/remote_executor.graffle
--- a/doc/design/dist_refactor/src/remote_executor.png
+++ b/doc/design/dist_refactor/src/remote_executor.png
--- a/doc/design/ops/sequence_decoder.md
+++ b/doc/design/ops/sequence_decoder.md
@@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences
 The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
 let's call this format the **absolute-offset LoD** for clarity.

-The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
+The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
 ```python
 [[0, 3, 9]
 [0, 2, 3, 3, 3, 9]]
@@ -119,7 +119,7 @@ def generate():
        encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
        decoder_input = pd.fc(
            act=pd.activation.Linear(),
-            input=[target_word, encoder_ctx],
+            input=[target_word, encoder_ctx_expanded],
            size=3 * decoder_dim)
        gru_out, cur_mem = pd.gru_step(
            decoder_input, mem=decoder_mem, size=decoder_dim)

--- a/doc/howto/optimization/cpu_profiling.md
+++ b/doc/howto/optimization/cpu_profiling.md
@@ -60,8 +60,7 @@ each column is as follows:
 | column | meaning |
 | --- | --- |
 | ncalls | the number of calls into a function |
-| tottime | the total execution time of the function, not including the
- execution time of other functions called by the function |
+| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
 | percall | tottime divided by ncalls |
 | cumtime | the total execution time of the function, including the execution time of other functions being called |
 | percall | cumtime divided by ncalls |

--- a/doc/howto/usage/cluster/fluid_cluster_train_en.md
+++ b/doc/howto/usage/cluster/fluid_cluster_train_en.md
@@ -16,6 +16,12 @@ PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes

 PaddlePaddle build and installation guide can be found  [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html).

+In addition to above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to on. An example bare minimum `cmake` command would look as follows:
+
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
 ### Update the training script

 #### Non-cluster training script
@@ -119,7 +125,14 @@ for pass_id in range(100):

 ### E2E demo

-Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). In parameter server node run the following in the command line:
+Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
+First `cd` into the folder that contains the `python` files. In this case:
+
+```bash
+cd /paddle/python/paddle/v2/fluid/tests/book_distribute
+```
+
+In parameter server node run the following in the command line:

 ``` bash
 PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -18,7 +18,7 @@ else()
    add_subdirectory(capi)
  endif()

-  if(Boost_FOUND)
+  if(NOT ANDROID AND NOT IOS)
    add_subdirectory(memory)
    add_subdirectory(platform)
    add_subdirectory(framework)

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)

-cc_library(ddim SRCS ddim.cc DEPS eigen3)
+cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)

@@ -45,7 +45,7 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_
 cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
        framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)

-cc_library(attribute SRCS attribute.cc DEPS framework_proto)
+cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
 device_context)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
@@ -74,7 +74,8 @@ cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)

-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table)
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
+framework_proto backward glog lod_rank_table profiler)

 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@@ -61,6 +61,9 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
      }
      return val;
    }
+    case proto::AttrType::LONG: {
+      return attr_desc.l();
+    }
    default:
      PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
  }

--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -168,6 +168,32 @@ struct ExtractAttribute<bool> {
  const std::string& attr_name_;
 };

+template <>
+struct ExtractAttribute<int64_t> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  int64_t* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(int)) {  // NOLINT
+      int val = boost::get<int>(attr);
+      attr = static_cast<int64_t>(val);
+    } else if (attr.type() == typeid(float)) {  // NOLINT
+      int val = boost::get<float>(attr);
+      attr = static_cast<int64_t>(val);
+    }
+    int64_t* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<int64_t>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
+                   attr_name_, attr.type().name());
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
 // check whether a certain attribute fit its limits
 // an attribute can have more than one limits
 template <typename T>

--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -75,7 +75,7 @@ std::vector<VarDesc *> BlockDesc::AllVars() const {

 OpDesc *BlockDesc::AppendOp() {
  need_update_ = true;
-  ops_.emplace_back(new OpDesc());
+  ops_.emplace_back(new OpDesc(this));
  return ops_.back().get();
 }

@@ -86,7 +86,7 @@ void BlockDesc::AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {

 OpDesc *BlockDesc::PrependOp() {
  need_update_ = true;
-  ops_.emplace_front(new OpDesc());
+  ops_.emplace_front(new OpDesc(this));
  return ops_.front().get();
 }

@@ -153,7 +153,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
    vars_[var_desc.name()].reset(new VarDesc(var_desc));
  }
  for (const proto::OpDesc &op_desc : desc_->ops()) {
-    ops_.emplace_back(new OpDesc(op_desc, prog));
+    ops_.emplace_back(new OpDesc(op_desc, prog, this));
  }
 }

@@ -162,7 +162,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
    : prog_(prog), desc_(desc) {
  need_update_ = true;
  for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDesc(*op));
+    ops_.emplace_back(new OpDesc(*op, this));
  }

  for (auto &it : other.vars_) {

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/platform/place.h"
+#include "paddle/platform/profiler.h"

 DECLARE_bool(do_memory_benchmark);
 DEFINE_bool(check_nan_inf, false,
@@ -116,8 +117,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,

  for (auto& op_desc : block.AllOps()) {
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
-    VLOG(3) << op->DebugStringEx(local_scope);
+    VLOG(4) << op->DebugStringEx(local_scope);
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(op->Type(), pool.Get(place_));
+
    op->Run(*local_scope, place_);
+    VLOG(3) << op->DebugStringEx(local_scope);
    if (FLAGS_do_memory_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
              << memory::memory_usage(place_);

--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -26,6 +26,7 @@ enum AttrType {
  BOOLEAN = 6;
  BOOLEANS = 7;
  BLOCK = 8;
+  LONG = 9;
 }

 // OpDesc describes an instance of a C++ framework::OperatorBase
@@ -44,6 +45,7 @@ message OpDesc {
    optional bool b = 10;
    repeated bool bools = 11;
    optional int32 block_idx = 12;
+    optional int64 l = 13;
  };

  message Var {

--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -107,9 +107,10 @@ LoD ToAbsOffset(const LoD &in) {
  // the lowest level stores relative offsets
  if (in.empty() || in.size() == 1) return in;
  LoD result = in;
-  for (int level = result.size() - 2; level >= 0; level--) {
-    for (auto &ele : result[level]) {
-      ele = result[level + 1][ele];
+  for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
+    for (size_t i = 0; i < in[level].size(); ++i) {
+      size_t index = in[level][i];
+      result[level][i] = result[level + 1][index];
    }
  }
  return result;

--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -97,7 +97,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) {
  need_update_ = true;
 }

-OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
+OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
    : desc_(desc), need_update_(false) {
  // restore inputs_
  int input_size = desc_.inputs_size();
@@ -131,6 +131,7 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
      attrs_[attr_name] = prog->MutableBlock(bid);
    }
  }
+  this->block_ = block;
 }

 proto::OpDesc *OpDesc::Proto() {
@@ -282,6 +283,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
    VectorToRepeated(v, attr_->mutable_bools());
  }
  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
+  void operator()(int64_t v) const { attr_->set_l(v); }
  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
 };


--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -25,7 +25,6 @@ namespace framework {

 class BlockDesc;
 class ProgramDesc;
-
 class OpDesc {
 public:
  OpDesc() {}
@@ -33,7 +32,14 @@ class OpDesc {
  OpDesc(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const AttributeMap &attrs);

-  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);
+  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block);
+
+  explicit OpDesc(BlockDesc *block) : block_(block) {}
+
+  OpDesc(const OpDesc &other, BlockDesc *block) {
+    *this = other;
+    block_ = block;
+  }

  void CopyFrom(const OpDesc &op_desc);

@@ -117,6 +123,10 @@ class OpDesc {

  void Flush();

+  BlockDesc *Block() { return this->block_; }
+
+  void SetBlock(BlockDesc *block) { this->block_ = block; }
+
 private:
  template <typename MapType>
  static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
@@ -129,6 +139,7 @@ class OpDesc {
  }

  proto::OpDesc desc_;
+  BlockDesc *block_;  // not_own
  // input arg name => input variable names
  VariableNameMap inputs_;
  // output arg name => output variable names

--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
@@ -35,7 +35,7 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 using Attribute =
    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                   std::vector<float>, std::vector<std::string>, bool,
-                   std::vector<bool>, BlockDesc*>;
+                   std::vector<bool>, BlockDesc*, int64_t>;

 using AttributeMap = std::unordered_map<std::string, Attribute>;


--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -66,6 +66,8 @@ class VarDesc {

  std::string Name() const { return desc_.name(); }

+  void SetName(std::string name) { desc_.set_name(name); }
+
  void SetShape(const std::vector<int64_t> &dims);

  void SetDataType(proto::DataType data_type);

--- a/paddle/gserver/layers/PriorBox.cpp
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -69,7 +69,7 @@ bool PriorBoxLayer::init(const LayerMap& layerMap,
  if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());

  // flip aspect ratios
-  for (int index = 0; index < tmp.size(); index++) {
+  for (unsigned index = 0; index < tmp.size(); index++) {
    real ar = tmp[index];
    if (fabs(ar - 1.) < 1e-6) continue;
    aspectRatio_.push_back(ar);

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -991,8 +991,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
                   "seqlastins",
                   "non-seq",
                   -1);  // hasSubseq seqlastins to non-seq
-  testDegradeLayer(
-      true, "seqlastins", "seq", -1);  // hasSubseq seqlastins to seq
+  testDegradeLayer(true,
+                   "seqlastins",
+                   "seq",
+                   -1);  // hasSubseq seqlastins to seq
 }

 TEST(Layer, AverageLayer) {
@@ -1001,8 +1003,10 @@ TEST(Layer, AverageLayer) {
                   "average",
                   "non-seq",
                   5);  // seq average to a shorten seq, stride window = 5
-  testDegradeLayer(
-      true, "average", "non-seq", -1);           // hasSubseq average to non-seq
+  testDegradeLayer(true,
+                   "average",
+                   "non-seq",
+                   -1);                          // hasSubseq average to non-seq
  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
 }

@@ -1287,8 +1291,9 @@ TEST(Layer, PoolLayer) {
  testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2(
-      "cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2("cudnn-avg-incl-pad-pool",
+                 /* trans= */ false,
+                 /* useGpu= */ true);
  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
@@ -2431,18 +2436,21 @@ TEST(Layer, test3DDeConvLayer) {
 }

 TEST(Layer, ScaleShiftLayer) {
-  const size_t batchSize = 16;
-  const size_t size = 32;
-  TestConfig config;
-  config.layerConfig.set_type("scale_shift");
-  config.layerConfig.set_size(size);
-  config.biasSize = 1;
-  config.inputDefs.push_back(
-      {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
-  }
+  // FIXME: Disable ScaleShiftLayer because it is not stable.
+  // https://github.com/PaddlePaddle/Paddle/issues/7781
+  return;
+  //  const size_t batchSize = 16;
+  //  const size_t size = 32;
+  //  TestConfig config;
+  //  config.layerConfig.set_type("scale_shift");
+  //  config.layerConfig.set_size(size);
+  //  config.biasSize = 1;
+  //  config.inputDefs.push_back(
+  //      {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
+  //  config.layerConfig.add_inputs();
+  //  for (auto useGpu : {false, true}) {
+  //    testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
+  //  }
 }

 TEST(Layer, ScaleSubRegionLayer) {

--- a/paddle/inference/inference.cc
+++ b/paddle/inference/inference.cc
@@ -19,14 +19,10 @@ limitations under the License. */
 #include "paddle/framework/init.h"
 #include "paddle/framework/scope.h"

-#ifdef PADDLE_USE_PTOOLS
-#include "chooseser.h"
-#endif
-
 namespace paddle {

 void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
-  std::string model_filename = dirname + "/__model__.dat";
+  std::string model_filename = dirname + "/__model__";
  LOG(INFO) << "loading model from " << model_filename;
  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
  std::string program_desc_str;
@@ -52,39 +48,15 @@ void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
  }
 }

-void InferenceEngine::LoadInferenceModel(
-    const std::string& dirname,
-    const std::vector<std::string>& feed_var_names,
-    const std::vector<std::string>& fetch_var_names) {
-  std::string model_filename = dirname + "/__model__.dat";
-  LOG(INFO) << "loading model from " << model_filename;
-  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
-  std::string program_desc_str;
-  inputfs.seekg(0, std::ios::end);
-  program_desc_str.resize(inputfs.tellg());
-  inputfs.seekg(0, std::ios::beg);
-  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
-  inputfs.read(&program_desc_str[0], program_desc_str.size());
-  inputfs.close();
-
-  program_ = new framework::ProgramDesc(program_desc_str);
-  GenerateLoadProgram(dirname);
-
-  if (feed_var_names.empty() || fetch_var_names.empty()) {
-    LOG(FATAL) << "Please specify the feed_var_names and fetch_var_names.";
-  }
-  feed_var_names_ = feed_var_names;
-  fetch_var_names_ = fetch_var_names;
-  PrependFeedOp();
-  AppendFetchOp();
-}
-
 bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
-  if (var->Persistable() && var->Name() != "feed" && var->Name() != "fetch") {
+  if (var->Persistable()) {
    // There are many unreachable variables in the program
    for (size_t i = 0; i < program_->Size(); ++i) {
      const framework::BlockDesc& block = program_->Block(i);
      for (auto* op : block.AllOps()) {
+        if (op->Type() == "feed") {
+          continue;
+        }
        for (auto input_argument_name : op->InputArgumentNames()) {
          if (input_argument_name == var->Name()) {
            return true;

--- a/paddle/inference/inference.h
+++ b/paddle/inference/inference.h
@@ -29,9 +29,6 @@ public:
  }

  void LoadInferenceModel(const std::string& dirname);
-  void LoadInferenceModel(const std::string& dirname,
-                          const std::vector<std::string>& feed_var_names,
-                          const std::vector<std::string>& fetch_var_names);
  void Execute(const std::vector<framework::LoDTensor>& feeds,
               std::vector<framework::LoDTensor>& fetchs);


--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
 add_subdirectory(detail)

 cc_library(memory SRCS memory.cc DEPS place enforce)
-cc_library(memcpy SRCS memcpy.cc)
+cc_library(memcpy SRCS memcpy.cc DEPS place)

 cc_library(paddle_memory
    DEPS

--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/operators/beam_search_op.cc
@@ -24,8 +24,18 @@ namespace operators {
 void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
                            framework::LoDTensor *selected_ids,
                            framework::LoDTensor *selected_scores) {
+  auto abs_lod = framework::ToAbsOffset(ids_->lod());
+  auto &high_level = abs_lod[lod_level_];
+
  auto items = SelectTopBeamSizeItems();
-  auto selected_items = ToMap(items);
+  auto selected_items = ToMap(items, high_level.back());
+  VLOG(3) << "selected_items:";
+  for (size_t i = 0; i < selected_items.size(); ++i) {
+    VLOG(3) << "offset:" << i;
+    for (auto &item : selected_items[i]) {
+      VLOG(3) << ItemToString(item);
+    }
+  }
  PruneEndidCandidates(pre_ids, &selected_items);
  // calculate the output tensor's height
  size_t num_instances = std::accumulate(
@@ -63,11 +73,12 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
  low_level.push_back(low_offset);

  // fill lod
-  auto abs_lod = framework::ToAbsOffset(ids_->lod());
-  auto &high_level = abs_lod[lod_level_];
  framework::LoD lod(2);
  lod[0].assign(high_level.begin(), high_level.end());
  lod[1].assign(low_level.begin(), low_level.end());
+  if (!framework::CheckLoD(lod)) {
+    PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
+  }
  selected_ids->set_lod(lod);
  selected_scores->set_lod(lod);
 }
@@ -90,13 +101,11 @@ int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
 }

 std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
-    const std::vector<std::vector<Item>> &items) {
+    const std::vector<std::vector<Item>> &items, size_t element_num) {
  std::vector<std::vector<Item>> result;
+  result.resize(element_num);
  for (auto &entries : items) {
    for (const auto &item : entries) {
-      if (item.offset >= result.size()) {
-        result.resize(item.offset + 1);
-      }
      result[item.offset].push_back(item);
    }
  }
@@ -122,6 +131,14 @@ BeamSearch::SelectTopBeamSizeItems() {
    }
    result.emplace_back(items);
  }
+  VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
+  for (auto &items : result) {
+    VLOG(3) << "item set:";
+    for (auto &item : items) {
+      VLOG(3) << ItemToString(item);
+    }
+  }
+
  return result;
 }

@@ -159,6 +176,22 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
  return true;
 }

+std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) {
+  os << "{";
+  os << "offset: " << item.offset << ", ";
+  os << "id: " << item.id << ", ";
+  os << "score: " << item.score << "";
+  os << "}";
+
+  return os;
+}
+
+std::string ItemToString(const BeamSearch::Item &item) {
+  std::ostringstream stream;
+  stream << item;
+  return stream.str();
+}
+
 class BeamSearchProtoAndCheckerMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
@@ -186,8 +219,40 @@ class BeamSearchProtoAndCheckerMaker
  }
 };

+class BeamSearchInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    for (const std::string &arg :
+         std::vector<std::string>({"pre_ids", "ids", "scores"})) {
+      PADDLE_ENFORCE(context->HasInput(arg),
+                     "BeamSearch need input argument '%s'", arg);
+    }
+    for (const std::string &arg :
+         std::vector<std::string>({"selected_ids", "selected_scores"})) {
+      PADDLE_ENFORCE(context->HasOutput(arg),
+                     "BeamSearch need output argument '%s'", arg);
+    }
+  }
+};
+
+class BeamSearchInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &o : op_desc.Output("selected_ids")) {
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
+    }
+    for (auto &o : op_desc.Output("selected_scores")) {
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OP_WITHOUT_GRADIENT(beam_search, paddle::operators::BeamSearchOp,
-                             paddle::operators::BeamSearchProtoAndCheckerMaker);
+REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp,
+                  paddle::operators::BeamSearchProtoAndCheckerMaker,
+                  paddle::operators::BeamSearchInferShape,
+                  paddle::operators::BeamSearchInferVarType,
+                  paddle::framework::EmptyGradOpMaker);
--- a/paddle/operators/beam_search_op.h
+++ b/paddle/operators/beam_search_op.h
@@ -136,8 +136,6 @@ class BeamSearch {
  void operator()(const framework::LoDTensor& pre_ids,
                  framework::LoDTensor* selected_ids,
                  framework::LoDTensor* selected_scores);
-
- protected:
  /*
   * The basic items help to sort.
   */
@@ -155,6 +153,7 @@ class BeamSearch {
    score_t score;
  };

+ protected:
  /*
   * Delete all the records that follows the end token.
   */
@@ -166,7 +165,7 @@ class BeamSearch {
   * NOTE low performance
   */
  std::vector<std::vector<Item>> ToMap(
-      const std::vector<std::vector<Item>>& inputs);
+      const std::vector<std::vector<Item>>& inputs, size_t element_num);

  /*
   * For each source, select top beam_size records.
@@ -187,6 +186,10 @@ class BeamSearch {
  int end_id_{0};
 };

+std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
+
+std::string ItemToString(const BeamSearch::Item& item);
+
 class BeamSearchOp : public framework::OperatorBase {
 public:
  BeamSearchOp(const std::string& type,
@@ -203,7 +206,6 @@ class BeamSearchOp : public framework::OperatorBase {

  void Run(const framework::Scope& scope,
           const platform::Place& dev_place) const override {
-    LOG(INFO) << "run beam search op";
    auto ids_var = scope.FindVar(Input("ids"));
    auto scores_var = scope.FindVar(Input("scores"));
    auto pre_ids_var = scope.FindVar(Input("pre_ids"));
@@ -217,10 +219,8 @@ class BeamSearchOp : public framework::OperatorBase {
    size_t level = Attr<int>("level");
    size_t beam_size = Attr<int>("beam_size");
    int end_id = Attr<int>("end_id");
-    LOG(INFO) << "init beam search";
    BeamSearch alg(ids, scores, level, beam_size, end_id);

-    LOG(INFO) << "after beam search";
    auto selected_ids_var = scope.FindVar(Output("selected_ids"));
    auto selected_scores_var = scope.FindVar(Output("selected_scores"));
    PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
@@ -229,9 +229,7 @@ class BeamSearchOp : public framework::OperatorBase {
        *selected_ids_var->GetMutable<framework::LoDTensor>();
    auto& selected_scores_tensor =
        *selected_scores_var->GetMutable<framework::LoDTensor>();
-    LOG(INFO) << "run beam search";
    alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
-    LOG(INFO) << "finish beam search";
  }
 };


--- a/paddle/operators/ctc_align_op.h
+++ b/paddle/operators/ctc_align_op.h
@@ -51,7 +51,7 @@ class CTCAlignKernel : public framework::OpKernel<T> {
      T prev_token = -1;
      for (size_t i = input_lod[level][seq_idx];
           i < input_lod[level][seq_idx + 1]; ++i) {
-        if (input_data[i] != blank &&
+        if ((unsigned)input_data[i] != blank &&
            !(merge_repeated && input_data[i] == prev_token)) {
          output_data[output_idx] = input_data[i];
          ++output_idx;

--- a/paddle/operators/detail/grpc_client.cc
+++ b/paddle/operators/detail/grpc_client.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "grpc_client.h"
+#include "paddle/framework/threadpool.h"
 namespace paddle {
 namespace operators {
 namespace detail {
@@ -22,25 +23,32 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
                                  const framework::Scope& scope,
                                  const std::string& var_name,
                                  int64_t time_out) {
-  sendrecv::VariableMessage req;
-  auto* var = scope.FindVar(var_name);
-  SerializeToMessage(var_name, var, ctx, &req);
-
-  // varhandle
-  VarHandle var_h;
-  var_h.ep = ep;
-  var_h.scope = &scope;
-  var_h.name = var_name;
-  var_h.ctx = &ctx;
-
-  // stub context
-  auto ch = GetChannel(ep);
-  SendProcessor* s = new SendProcessor(ch);
-  s->Prepare(var_h, time_out);
-  s->response_call_back_ = NULL;
-
-  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
+    auto* var = p_scope->FindVar(var_name_val);
+    sendrecv::VariableMessage req;
+    SerializeToMessage(var_name_val, var, *p_ctx, &req);
+
+    // varhandle
+    VarHandle var_h;
+    var_h.ep = ep_val;
+    var_h.scope = p_scope;
+    var_h.name = var_name_val;
+    var_h.ctx = p_ctx;
+
+    // stub context
+    SendProcessor* s = new SendProcessor(ch);
+    s->Prepare(var_h, time_out);
+    s->response_call_back_ = NULL;
+
+    auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+    rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  });

  req_count_++;

@@ -50,8 +58,6 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
 void ProcGetResponse(const VarHandle& var_h,
                     const sendrecv::VariableMessage& ret_msg) {
  auto* outvar = var_h.scope->FindVar(var_h.name);
-
-  std::istringstream iss(ret_msg.serialized());
  DeserializeFromMessage(ret_msg, *var_h.ctx, outvar);
 }

@@ -60,24 +66,31 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
                                 const framework::Scope& scope,
                                 const std::string& var_name,
                                 int64_t time_out) {
-  sendrecv::VariableMessage req;
-  req.set_varname(var_name);
-
-  // varhandle
-  VarHandle var_h;
-  var_h.ep = ep;
-  var_h.scope = &scope;
-  var_h.name = var_name;
-  var_h.ctx = &ctx;
-
-  // stub context
-  auto ch = GetChannel(ep);
-  GetProcessor* s = new GetProcessor(ch);
-  s->Prepare(var_h, time_out);
-  s->response_call_back_ = ProcGetResponse;
-
-  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
+    sendrecv::VariableMessage req;
+    req.set_varname(var_name_val);
+
+    // varhandle
+    VarHandle var_h;
+    var_h.ep = ep_val;
+    var_h.scope = p_scope;
+    var_h.name = var_name_val;
+    var_h.ctx = p_ctx;
+
+    // stub context
+    GetProcessor* s = new GetProcessor(ch);
+    s->Prepare(var_h, time_out);
+    s->response_call_back_ = ProcGetResponse;
+
+    auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
+    rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  });

  req_count_++;

@@ -85,19 +98,31 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
 }

 bool RPCClient::Wait() {
-  bool ok = true;
+  if (req_count_ <= 0) {
+    return true;
+  }

-  while (true) {
-    if (req_count_ <= 0) {
-      break;
-    }
+  std::vector<bool> a(req_count_);
+  std::vector<std::future<void>> waits(req_count_);

-    if (!Proceed()) {
+  for (int i = 0; i < req_count_; i++) {
+    waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); });
+  }
+
+  for (int i = 0; i < req_count_; i++) {
+    waits[i].wait();
+  }
+
+  int last_req_count = req_count_;
+  req_count_ = 0;
+
+  for (int i = 0; i < last_req_count; i++) {
+    if (!a[i]) {
      return false;
    }
  }

-  return ok;
+  return true;
 }

 bool RPCClient::Proceed() {
@@ -124,7 +149,6 @@ bool RPCClient::Proceed() {

  c->Process();
  delete c;
-  req_count_--;
  return true;
 }


--- a/paddle/operators/im2sequence_op.h
+++ b/paddle/operators/im2sequence_op.h
@@ -79,7 +79,7 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
    framework::LoD lod(1);
    lod[0].reserve(batch_size + 1);
    for (int i = 0, offset = 0; i < batch_size + 1; ++i) {
-      lod[0][i] = offset;
+      lod[0].push_back(offset);
      offset += output_height * output_width;
    }
    out->set_lod(lod);

--- a/paddle/operators/iou_similarity_op.cc
+++ b/paddle/operators/iou_similarity_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/iou_similarity_op.h"
+
+namespace paddle {
+namespace operators {
+
+class IOUSimilarityOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of IOUSimilarityOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of IOUSimilarityOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2.");
+    PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]");
+
+    ctx->ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]}));
+  }
+};
+
+class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IOUSimilarityOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, "
+             "each box is represented as [xmin, ymin, xmax, ymax], "
+             "the shape of X is [N, 4]. [xmin, ymin] is the left top "
+             "coordinate of the box if the input is image feature map, they "
+             "are close to the origin of the coordinate system. "
+             "[xmax, ymax] is the right bottom coordinate of the box. "
+             "This tensor can contain LoD information to represent a batch "
+             "of inputs. One instance of this batch can contain different "
+             "numbers of entities.");
+    AddInput("Y",
+             "(Tensor, default Tensor<float>) "
+             "Box list Y holds M boxes, each box is represented as "
+             "[xmin, ymin, xmax, ymax], the shape of X is [N, 4]. "
+             "[xmin, ymin] is the left top coordinate of the box if the "
+             "input is image feature map, and [xmax, ymax] is the right "
+             "bottom coordinate of the box.");
+
+    AddOutput("Out",
+              "(LoDTensor, the lod is same as input X) The output of "
+              "iou_similarity op, a tensor with shape [N, M] "
+              "representing pairwise iou scores.");
+
+    AddComment(R"DOC(
+IOU Similarity Operator.
+Computes intersection-over-union (IOU) between two box lists.
+ Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
+ boxes in 'Y' are shared by all instance of the batched inputs of X.
+ Given two boxes A and B, the calculation of IOU is as follows:
+
+$$
+IOU(A, B) = 
+\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)}
+$$
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp,
+                             ops::IOUSimilarityOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    iou_similarity,
+    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/iou_similarity_op.cu
+++ b/paddle/operators/iou_similarity_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/iou_similarity_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    iou_similarity,
+    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/iou_similarity_op.h
+++ b/paddle/operators/iou_similarity_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/for_range.h"
+
+template <typename T>
+inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2,
+                                  T ymin2, T xmax2, T ymax2) {
+  constexpr T zero = static_cast<T>(0);
+  T area1 = (ymax1 - ymin1) * (xmax1 - xmin1);
+  T area2 = (ymax2 - ymin2) * (xmax2 - xmin2);
+  T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1;
+  T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1;
+  T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2;
+  T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2;
+  T inter_height = inter_ymax - inter_ymin;
+  T inter_width = inter_xmax - inter_xmin;
+  inter_height = inter_height > zero ? inter_height : zero;
+  inter_width = inter_width > zero ? inter_width : zero;
+  T inter_area = inter_width * inter_height;
+  T union_area = area1 + area2 - inter_area;
+  T sim_score = inter_area / union_area;
+  return sim_score;
+}
+
+template <typename T>
+struct IOUSimilarityFunctor {
+  IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols)
+      : x_(x), y_(y), z_(z), cols_(static_cast<size_t>(cols)) {}
+
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    T x_min1 = x_[row_id * 4];
+    T y_min1 = x_[row_id * 4 + 1];
+    T x_max1 = x_[row_id * 4 + 2];
+    T y_max1 = x_[row_id * 4 + 3];
+    for (size_t i = 0; i < cols_; ++i) {
+      T x_min2 = y_[i * 4];
+      T y_min2 = y_[i * 4 + 1];
+      T x_max2 = y_[i * 4 + 2];
+      T y_max2 = y_[i * 4 + 3];
+
+      T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2,
+                            x_max2, y_max2);
+
+      z_[row_id * cols_ + i] = sim;
+    }
+  }
+  const T* x_;
+  const T* y_;
+  T* z_;
+  const size_t cols_;
+};
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class IOUSimilarityKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const framework::LoDTensor* in_x = ctx.Input<framework::LoDTensor>("X");
+    const framework::Tensor* in_y = ctx.Input<framework::Tensor>("Y");
+    framework::LoDTensor* out = ctx.Output<framework::LoDTensor>("Out");
+
+    int x_n = in_x->dims()[0];
+    int y_n = in_y->dims()[0];
+    IOUSimilarityFunctor<T> functor(in_x->data<T>(), in_y->data<T>(),
+                                    out->mutable_data<T>(ctx.GetPlace()), y_n);
+
+    platform::ForRange<DeviceContext> for_range(
+        static_cast<const DeviceContext&>(ctx.device_context()), x_n);
+    for_range(functor);
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -66,6 +66,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(boolean, default false) "
                  "Sparse update")
        .SetDefault(false);
+    AddAttr<int64_t>("padding_idx",
+                     "(int64, default -1) "
+                     "If the value is -1, it makes no effect to lookup. "
+                     "Otherwise the given value indicates padding the output "
+                     "with zeros whenever lookup encounters it in Ids.")
+        .SetDefault(-1);
    AddComment(R"DOC(
 Lookup Table Operator.


--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -21,9 +21,11 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX,
+          bool PaddingFlag>
 __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
-                            const int64_t N, const int64_t K, const int64_t D) {
+                            const int64_t N, const int64_t K, const int64_t D,
+                            const int64_t padding_idx) {
  int idx = threadIdx.x;
  int idy = blockIdx.x + threadIdx.y * GridDimX;

@@ -34,7 +36,14 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
    T* out = output + idy * D;
    const T* tab = table + id * D;
    for (int i = idx; i < D; i += BlockDimX) {
-      out[i] = tab[i];
+      if (PaddingFlag) {
+        if (id == padding_idx)
+          out[i] = static_cast<T>(0);
+        else
+          out[i] = tab[i];
+      } else {
+        out[i] = tab[i];
+      }
    }
    idy += BlockDimY * GridDimX;
  }
@@ -67,6 +76,7 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
    auto* table_t = context.Input<LoDTensor>("W");
    auto* ids_t = context.Input<LoDTensor>("Ids");
    auto* output_t = context.Output<LoDTensor>("Out");
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");

    size_t N = table_t->dims()[0];
    size_t D = table_t->dims()[1];
@@ -77,10 +87,17 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {

    dim3 threads(128, 8);
    dim3 grids(8, 1);
-    LookupTable<
-        T, 128, 8,
-        8><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-        output, table, ids, N, K, D);
+
+    if (padding_idx == -1)
+      LookupTable<
+          T, 128, 8, 8,
+          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+          output, table, ids, N, K, D, padding_idx);
+    else
+      LookupTable<
+          T, 128, 8, 8,
+          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+          output, table, ids, N, K, D, padding_idx);
  }
 };

@@ -91,6 +108,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
    auto& dev_ctx =
        context.template device_context<platform::CUDADeviceContext>();
    bool is_sparse = context.Attr<bool>("is_sparse");
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
    if (is_sparse) {
      auto* ids = context.Input<LoDTensor>("Ids");
      auto* table = context.Input<LoDTensor>("W");

--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
@@ -32,16 +32,30 @@ class LookupTableKernel : public framework::OpKernel<T> {
    auto* table_t = context.Input<LoDTensor>("W");      // float tensor
    auto* ids_t = context.Input<LoDTensor>("Ids");      // int tensor
    auto* output_t = context.Output<LoDTensor>("Out");  // float tensor
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");

    int N = table_t->dims()[0];
    int D = table_t->dims()[1];
    auto* ids = ids_t->data<int64_t>();
    auto* table = table_t->data<T>();
    auto* output = output_t->mutable_data<T>(context.GetPlace());
-    for (int64_t i = 0; i < ids_t->numel(); ++i) {
-      PADDLE_ENFORCE_LT(ids[i], N);
-      PADDLE_ENFORCE_GE(ids[i], 0);
-      memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+
+    if (padding_idx == -1) {
+      for (int64_t i = 0; i < ids_t->numel(); ++i) {
+        PADDLE_ENFORCE_LT(ids[i], N);
+        PADDLE_ENFORCE_GE(ids[i], 0);
+        memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+      }
+    } else {
+      for (int64_t i = 0; i < ids_t->numel(); ++i) {
+        if (ids[i] == padding_idx) {
+          memset(output + i * D, 0, D * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_LT(ids[i], N);
+          PADDLE_ENFORCE_GE(ids[i], 0);
+          memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+        }
+      }
    }
  }
 };
@@ -51,6 +65,8 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    bool is_sparse = context.Attr<bool>("is_sparse");
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
    if (is_sparse) {
      auto* ids = context.Input<LoDTensor>("Ids");
      auto* table = context.Input<LoDTensor>("W");

--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -11,7 +11,7 @@ if(WITH_GPU)
    nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
-    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor)
+    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor math_function)
    nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context)
    nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context)
    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
@@ -28,7 +28,7 @@ else()
    cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
    cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
    cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
-    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor)
+    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor math_function)
    cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context)
    cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context)
    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)

--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -241,7 +241,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
 // ncclBcastOp with desc
 TEST_F(NCCLTester, ncclBcastOp) {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
-  const int kRoot = 5;
+  const int kRoot = 0;
  op2->SetType("ncclBcast");
  op2->SetInput("X", {"st"});
  op2->SetInput("Communicator", {"comm"});

--- a/paddle/operators/parallel_do_op.cc
+++ b/paddle/operators/parallel_do_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/framework/executor.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/threadpool.h"
+#include "paddle/operators/detail/safe_ref.h"

 namespace paddle {
 namespace operators {
@@ -31,6 +32,7 @@ static constexpr char kParallelScopes[] = "parallel_scopes";
 static constexpr char kParallelBlock[] = "sub_block";

 using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;

 static void SplitTensorAndMoveTensorToScopes(
    const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes,
@@ -38,8 +40,10 @@ static void SplitTensorAndMoveTensorToScopes(
    const std::vector<std::string> &names) {
  size_t num_sub_scopes = 0;
  for (auto &argu : names) {
-    auto *var = scope.FindVar(argu);
-    const auto &tensor = var->Get<LoDTensor>();
+    const auto &tensor =
+        detail::Ref(scope.FindVar(argu),
+                    "Cannot find variable %s in the parent scope", argu)
+            .Get<LoDTensor>();
    auto lod_tensors = tensor.SplitLoDTensor(places);

    for (auto &lod : lod_tensors) {
@@ -59,11 +63,37 @@ static void SplitTensorAndMoveTensorToScopes(
    }

    for (size_t i = 0; i < lod_tensors.size(); ++i) {
-      *(*sub_scopes)[i]->Var(argu)->GetMutable<LoDTensor>() = lod_tensors[i];
+      *detail::Ref(sub_scopes->at(i)->Var(argu),
+                   "Cannot find variable in the sub-scope", argu)
+           .GetMutable<LoDTensor>() = lod_tensors[i];
    }
  }
 }

+inline void CopyOrShare(const framework::Variable &src,
+                        const platform::Place &dst_place,
+                        framework::Variable *dst) {
+  if (src.IsType<LoDTensor>()) {
+    if (src.Get<LoDTensor>().place() == dst_place) {
+      dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
+    } else {
+      Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
+    }
+  } else if (src.IsType<SelectedRows>()) {
+    auto &src_sr = src.Get<SelectedRows>();
+    auto *dst_sr = dst->GetMutable<SelectedRows>();
+    dst_sr->set_rows(src_sr.rows());
+    dst_sr->set_height(src_sr.height());
+    if (src_sr.value().place() == dst_place) {
+      dst_sr->mutable_value()->ShareDataWith(src_sr.value());
+    } else {
+      Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
+    }
+  } else {
+    PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
+  }
+}
+
 void WaitOnPlace(const platform::Place place) {
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto &dev_ctx = *pool.Get(place);
@@ -210,30 +240,30 @@ class ParallelDoGradOp : public framework::OperatorBase {
    }
    WaitOnPlaces(places);

-    // merge grad
+    AccumulateGrad(scope, place, sub_scopes, places);
+  }
+
+  void AccumulateGrad(const framework::Scope &scope,
+                      const platform::Place &place,
+                      const std::vector<framework::Scope *> &sub_scopes,
+                      const platform::PlaceList &places) const {
    for (auto &s : Outputs(framework::GradVarName(kParameters))) {
-      auto &result = sub_scopes[0]->FindVar(s)->Get<LoDTensor>();
      std::string tmp_name;
-      auto *tmp = sub_scopes[0]->Var(&tmp_name)->GetMutable<LoDTensor>();
+      auto *tmp = sub_scopes[0]->Var(&tmp_name);

      for (size_t i = 1; i < sub_scopes.size(); ++i) {
-        auto &tensor_to_merge = sub_scopes[i]->FindVar(s)->Get<LoDTensor>();
-        if (!(places[i] == places[0])) {
-          framework::Copy(tensor_to_merge, places[0], tmp);
-          WaitOnPlace(places[0]);
-        } else {
-          tmp->ShareDataWith(tensor_to_merge);
-        }
+        CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
+        WaitOnPlace(places[0]);

        auto sum_op = framework::OpRegistry::CreateOp(
            "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
            framework::AttributeMap{});
+        VLOG(3) << sum_op->DebugStringEx(sub_scopes[0]);
        sum_op->Run(*sub_scopes[0], places[0]);
        WaitOnPlace(places[0]);
      }

-      VLOG(3) << result;
-      framework::Copy(result, place, scope.FindVar(s)->GetMutable<LoDTensor>());
+      CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
    }
    WaitOnPlaces(places);
  }
@@ -262,6 +292,17 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
                        this->InputGrad(input_param, false));
      }
    }
+    auto *g_block = this->grad_block_[0];
+
+    // All variable name that needed by gradient operators
+    std::unordered_set<std::string> all_inputs_in_grad_blocks;
+
+    for (size_t i = 0; i < g_block->OpSize(); ++i) {
+      auto *op = g_block->Op(i);
+      for (auto &var_name : op->InputArgumentNames()) {
+        all_inputs_in_grad_blocks.insert(var_name);
+      }
+    }

    for (auto &output_param : this->OutputNames()) {
      if (output_param == kParallelScopes) {
@@ -270,8 +311,17 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
                       this->Output(output_param));
      } else {
        grad->SetInput(output_param, this->Output(output_param));
-        grad->SetInput(framework::GradVarName(output_param),
-                       this->OutputGrad(output_param));
+        std::vector<std::string> og_names;
+        for (auto &og_name : this->OutputGrad(output_param)) {
+          if (all_inputs_in_grad_blocks.count(og_name) != 0) {
+            // there are some gradient operators who need the OG. So make this
+            // OG as an input of parallel.do
+            og_names.push_back(og_name);
+          }
+          // else, there is no operator who need the OG. Do not use this OG as
+          // an input
+        }
+        grad->SetInput(framework::GradVarName(output_param), og_names);
      }
    }
    grad->SetAttrMap(this->Attrs());
@@ -289,7 +339,7 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {

    PADDLE_ENFORCE(ctx->HasInputs(kParameters));
    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
-    PADDLE_ENFORCE(ctx->HasInput(kInputs));
+    PADDLE_ENFORCE(ctx->HasInputs(kInputs));

    for (auto &s : output) {
      PADDLE_ENFORCE(ctx->HasInputs(s));

--- a/paddle/operators/prior_box_op.cc
+++ b/paddle/operators/prior_box_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/prior_box_op.h"
+
+namespace paddle {
+namespace operators {
+
+class PriorBoxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of PriorBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Image"),
+                   "Input(Image) of PriorBoxOp should not be null.");
+
+    auto image_dims = ctx->GetInputDim("Image");
+    auto input_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW.");
+    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
+
+    PADDLE_ENFORCE_LT(input_dims[2], image_dims[2],
+                      "The height of input must smaller than image.");
+
+    PADDLE_ENFORCE_LT(input_dims[3], image_dims[3],
+                      "The width of input must smaller than image.");
+
+    auto min_sizes = ctx->Attrs().Get<std::vector<int>>("min_sizes");
+    auto max_sizes = ctx->Attrs().Get<std::vector<int>>("max_sizes");
+    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
+    auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
+    bool flip = ctx->Attrs().Get<bool>("flip");
+
+    PADDLE_ENFORCE_GT(min_sizes.size(), 0,
+                      "Size of min_sizes must be at least 1.");
+    for (size_t i = 0; i < min_sizes.size(); ++i) {
+      PADDLE_ENFORCE_GT(min_sizes[i], 0, "min_sizes[%d] must be positive.", i);
+    }
+
+    std::vector<float> aspect_ratios_vec;
+    ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
+
+    int num_priors = aspect_ratios_vec.size() * min_sizes.size();
+    if (max_sizes.size() > 0) {
+      PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(),
+                        "The number of min_size and max_size must be equal.");
+      for (size_t i = 0; i < min_sizes.size(); ++i) {
+        PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i],
+                          "max_size[%d] must be greater than min_size[%d].", i,
+                          i);
+        num_priors += 1;
+      }
+    }
+
+    PADDLE_ENFORCE_EQ(variances.size(), 4, "Must and only provide 4 variance.");
+    for (size_t i = 0; i < variances.size(); ++i) {
+      PADDLE_ENFORCE_GT(variances[i], 0.0,
+                        "variance[%d] must be greater than 0.", i);
+    }
+
+    const float step_h = ctx->Attrs().Get<float>("step_h");
+    PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
+    const float step_w = ctx->Attrs().Get<float>("step_w");
+    PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
+
+    std::vector<int64_t> dim_vec(4);
+    dim_vec[0] = input_dims[2];
+    dim_vec[1] = input_dims[3];
+    dim_vec[2] = num_priors;
+    dim_vec[3] = 4;
+    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
+    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
+  }
+};
+
+class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PriorBoxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(Tensor, default Tensor<float>), "
+             "the input feature data of PriorBoxOp, The layout is NCHW.");
+    AddInput("Image",
+             "(Tensor, default Tensor<float>), "
+             "the input image data of PriorBoxOp, The layout is NCHW.");
+    AddOutput("Boxes",
+              "(Tensor, default Tensor<float>), the output prior boxes of "
+              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
+              "H is the height of input, W is the width of input, num_priors "
+              "is the box count of each position.");
+    AddOutput("Variances",
+              "(Tensor, default Tensor<float>), the expanded variances of "
+              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
+              "H is the height of input, W is the width of input, num_priors "
+              "is the box count of each position.");
+    AddAttr<std::vector<int>>("min_sizes", "(vector<int>) ",
+                              "List of min sizes of generated prior boxes.");
+    AddAttr<std::vector<int>>("max_sizes", "(vector<int>) ",
+                              "List of max sizes of generated prior boxes.");
+    AddAttr<std::vector<float>>(
+        "aspect_ratios", "(vector<float>) ",
+        "List of aspect ratios of generated prior boxes.");
+    AddAttr<std::vector<float>>(
+        "variances", "(vector<float>) ",
+        "List of variances to be encoded in prior boxes.");
+    AddAttr<bool>("flip", "(bool) ", "Whether to flip aspect ratios.")
+        .SetDefault(true);
+    AddAttr<bool>("clip", "(bool) ", "Whether to clip out-of-boundary boxes.")
+        .SetDefault(true);
+    AddAttr<float>("step_w",
+                   "Prior boxes step across width, 0 for auto calculation.")
+        .SetDefault(0.0);
+    AddAttr<float>("step_h",
+                   "Prior boxes step across height, 0 for auto calculation.")
+        .SetDefault(0.0);
+    AddAttr<float>("offset",
+                   "(float) "
+                   "Prior boxes center offset.")
+        .SetDefault(0.5);
+    AddComment(R"DOC(
+Prior box operator
+Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
+Each position of the input produce N prior boxes, N is determined by
+ the count of min_sizes, max_sizes and aspect_ratios, The size of the
+ box is in range(min_size, max_size) interval, which is generated in
+ sequence according to the aspect_ratios.
+
+Please get more information from the following papers:
+https://arxiv.org/abs/1512.02325.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>,
+    ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/prior_box_op.h
+++ b/paddle/operators/prior_box_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
+                               bool flip,
+                               std::vector<float>& output_aspect_ratior) {
+  constexpr float epsilon = 1e-6;
+  output_aspect_ratior.clear();
+  output_aspect_ratior.push_back(1.);
+  for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
+    float ar = input_aspect_ratior[i];
+    bool already_exist = false;
+    for (size_t j = 0; j < output_aspect_ratior.size(); ++j) {
+      if (fabs(ar - output_aspect_ratior[j]) < epsilon) {
+        already_exist = true;
+        break;
+      }
+    }
+    if (!already_exist) {
+      output_aspect_ratior.push_back(ar);
+      if (flip) {
+        output_aspect_ratior.push_back(1. / ar);
+      }
+    }
+  }
+}
+
+template <typename T>
+struct ClipFunctor {
+  HOSTDEVICE T operator()(T in) const {
+    return std::min<T>(std::max<T>(in, 0.), 1.);
+  }
+};
+
+template <typename Place, typename T>
+class PriorBoxOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
+    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto min_sizes = ctx.Attr<std::vector<int>>("min_sizes");
+    auto max_sizes = ctx.Attr<std::vector<int>>("max_sizes");
+    auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+    auto flip = ctx.Attr<bool>("flip");
+    auto clip = ctx.Attr<bool>("clip");
+
+    std::vector<float> aspect_ratios;
+    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+
+    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
+    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto img_width = image->dims()[3];
+    auto img_height = image->dims()[2];
+
+    auto feature_width = input->dims()[3];
+    auto feature_height = input->dims()[2];
+
+    T step_width, step_height;
+    if (step_w == 0 || step_h == 0) {
+      step_width = static_cast<T>(img_width) / feature_width;
+      step_height = static_cast<T>(img_height) / feature_height;
+    } else {
+      step_width = step_w;
+      step_height = step_h;
+    }
+
+    int num_priors = aspect_ratios.size() * min_sizes.size();
+    if (max_sizes.size() > 0) {
+      num_priors += max_sizes.size();
+    }
+
+    boxes->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
+    for (int h = 0; h < feature_height; ++h) {
+      for (int w = 0; w < feature_width; ++w) {
+        T center_x = (w + offset) * step_width;
+        T center_y = (h + offset) * step_height;
+        T box_width, box_height;
+        int idx = 0;
+        for (size_t s = 0; s < min_sizes.size(); ++s) {
+          int min_size = min_sizes[s];
+          // first prior: aspect_ratio = 1, size = min_size
+          box_width = box_height = min_size;
+          // xmin
+          e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+          // ymin
+          e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+          // xmax
+          e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+          // ymax
+          e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+
+          idx++;
+          if (max_sizes.size() > 0) {
+            int max_size = max_sizes[s];
+            // second prior: aspect_ratio = 1,
+            // size = sqrt(min_size * max_size)
+            box_width = box_height = sqrt(min_size * max_size);
+            // xmin
+            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            // ymin
+            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            // xmax
+            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            // ymax
+            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            idx++;
+          }
+
+          // rest of priors
+          for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+            float ar = aspect_ratios[r];
+            if (fabs(ar - 1.) < 1e-6) {
+              continue;
+            }
+            box_width = min_size * sqrt(ar);
+            box_height = min_size / sqrt(ar);
+            // xmin
+            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            // ymin
+            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            // xmax
+            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            // ymax
+            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            idx++;
+          }
+        }
+      }
+    }
+
+    if (clip) {
+      platform::Transform<platform::CPUDeviceContext> trans;
+      ClipFunctor<T> clip_func;
+      trans(ctx.template device_context<platform::CPUDeviceContext>(),
+            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
+            boxes->data<T>(), clip_func);
+    }
+
+    framework::Tensor var_t;
+    var_t.mutable_data<T>(
+        framework::make_ddim({1, static_cast<int>(variances.size())}),
+        ctx.GetPlace());
+    auto var_et = framework::EigenTensor<T, 2>::From(var_t);
+    for (size_t i = 0; i < variances.size(); ++i) {
+      var_et(0, i) = variances[i];
+    }
+
+    int box_num = feature_height * feature_width * num_priors;
+    auto var_dim = vars->dims();
+    vars->Resize({box_num, static_cast<int>(variances.size())});
+
+    auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
+    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
+
+    vars->Resize(var_dim);
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -90,14 +90,10 @@ Reshape Operator.
 Reshape Input(X) into the shape specified by Attr(shape).

 An example:
-Given a 2-D tensor X with 2 rows and 2 columns
-
-    [[1, 2], [3, 4]]
+Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]]

 and target shape = [1, 4], the reshape operator will transform
-the tensor X into a 2-D tensor:
-
-    [[1, 2, 3, 4]]
+the tensor X into a 2-D tensor: [[1, 2, 3, 4]]

 One dimension in the target shape can be set -1, representing that its
 size is unknown. In this case, the real dimension will be infered from 

--- a/paddle/operators/sequence_expand_op.h
+++ b/paddle/operators/sequence_expand_op.h
@@ -32,6 +32,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
    const T* x_data = x->data<T>();
    auto x_dims = x->dims();
    auto* y = context.Input<LoDTensor>("Y");
+    PADDLE_ENFORCE(!y->lod().empty(), "y should have lod");
    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims[0]),
                      y->lod().back().size() - 1,
                      "The size of last lod level in Input(Y)"

--- a/paddle/operators/sequence_reshape_op.h
+++ b/paddle/operators/sequence_reshape_op.h
@@ -35,7 +35,7 @@ class SequenceReshapeKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_EQ(in_lod.size(), 1UL,
                      "Only support one level sequence now.");
    PADDLE_ENFORCE_EQ(
-        in_dims[0], in_lod[0].back(),
+        (uint64_t)in_dims[0], in_lod[0].back(),
        "Inconsistent size between X.shape[0] and X.lod()[0].back().");

    auto in_lod_l0 = in_lod[0];

--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
@@ -22,6 +22,7 @@ namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;

 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
@@ -33,9 +34,9 @@ class TopkKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Get the top k elements of each row of input tensor
    // FIXME: only deal with matrix(2d tensor).
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    auto* indices = ctx.Output<Tensor>("Indices");
+    auto* input = ctx.Input<LoDTensor>("X");
+    auto* output = ctx.Output<LoDTensor>("Out");
+    auto* indices = ctx.Output<LoDTensor>("Indices");
    // k is determined by Attr
    const size_t k = static_cast<int>(ctx.Attr<int>("k"));


--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -10,7 +10,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)

 nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)

-cc_library(place SRCS place.cc DEPS enforce)
+cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

 add_subdirectory(dynload)

--- a/paddle/platform/call_once.h
+++ b/paddle/platform/call_once.h
@@ -29,20 +29,25 @@ namespace platform {
 */
 template <typename Callable, typename... Args>
 inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
-  bool good = false;
+  bool good = true;
  std::exception ex;
-  std::call_once(flag,
-                 [&](Args&&... args) {
-                   try {
-                     f(args...);
-                     good = true;
-                   } catch (const std::exception& e) {
-                     ex = e;
-                   } catch (...) {
-                     ex = std::runtime_error("excption caught in call_once");
-                   }
-                 },
-                 args...);
+  try {
+    std::call_once(flag,
+                   [&](Args&&... args) {
+                     try {
+                       f(args...);
+                     } catch (const std::exception& e) {
+                       ex = e;
+                       good = false;
+                     } catch (...) {
+                       ex = std::runtime_error("excption caught in call_once");
+                       good = false;
+                     }
+                   },
+                   args...);
+  } catch (std::system_error& x) {
+    throw std::runtime_error("call once failed");
+  }
  if (!good) {
    throw std::exception(ex);
  }

--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -47,16 +47,16 @@ inline uint64_t GetTimeInNsec() {
 }

 Event::Event(EventKind kind, std::string name, uint32_t thread_id,
-             DeviceContext* dev_ctx)
+             const DeviceContext* dev_ctx)
    : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
 #ifdef PADDLE_WITH_CUDA
-  auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
-  if (cuda_dev_ctx) {
+  has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
+  if (has_cuda_) {
+    auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
    PADDLE_ENFORCE(cudaGetDevice(&device_));
    PADDLE_ENFORCE(cudaEventCreate(&event_));
    auto stream = cuda_dev_ctx->stream();
    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-    has_cuda_ = true;
  }
 #endif
  cpu_ns_ = GetTimeInNsec();
@@ -114,19 +114,20 @@ inline EventList& GetEventList() {
  return *g_event_list;
 }

-void Mark(const std::string& name, DeviceContext* dev_ctx) {
+void Mark(const std::string& name, const DeviceContext* dev_ctx) {
  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
 }

-void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
 }

-void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
 }

-RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
+RecordEvent::RecordEvent(const std::string& name,
+                         const DeviceContext* dev_ctx) {
  if (g_state == ProfilerState::kDisabled) return;
  dev_ctx_ = dev_ctx;
  name_ = name;
@@ -155,6 +156,7 @@ void EnableProfiler(ProfilerState state) {
        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
        Mark("_cuda_startup_", dev_ctx);
        dev_ctx->Wait();
+        delete dev_ctx;
      });
    }
  }
@@ -163,14 +165,17 @@ void EnableProfiler(ProfilerState state) {
  Mark("_start_profiler_", nullptr);
 }

-std::vector<std::vector<Event>> DisableProfiler() {
-  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
-                 "Can't disable profiling, since it's not starting.");
-  // Mark the profiling stop.
-  Mark("_stop_profiler_", nullptr);
-  g_state = ProfilerState::kDisabled;
-  std::vector<std::vector<Event>> result;
+void ResetProfiler() {
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    (*it)->Clear();
+  }
+}
+
+std::vector<std::vector<Event>> GetAllEvents() {
+  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  std::vector<std::vector<Event>> result;
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    result.emplace_back((*it)->Reduce());
@@ -178,6 +183,18 @@ std::vector<std::vector<Event>> DisableProfiler() {
  return result;
 }

+void DisableProfiler(EventSortingKey sorted_key) {
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
+                 "Can't disable profiling, since it's not starting.");
+  // Mark the profiling stop.
+  Mark("_stop_profiler_", nullptr);
+  g_state = ProfilerState::kDisabled;
+
+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, sorted_key);
+  ResetProfiler();
+}
+
 void ParseEvents(std::vector<std::vector<Event>>& events,
                 EventSortingKey sorted_by) {
  if (g_profiler_place == "") return;
@@ -291,12 +308,12 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
  }

  // Print report
-  PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12);
+  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
 }

-void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
-                          std::string& sorted_domain, const size_t name_width,
-                          const size_t data_width) {
+void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
+                   std::string& sorted_domain, const size_t name_width,
+                   const size_t data_width) {
  // Output header information
  std::cout << "\n------------------------->"
            << "     Profiling Report     "

--- a/paddle/platform/profiler.h
+++ b/paddle/platform/profiler.h
@@ -29,7 +29,7 @@ class Event {
  // The DeviceContext is used to get the cuda stream.
  // If CPU profiling mode, can pass nullptr.
  Event(EventKind kind, std::string name, uint32_t thread_id,
-        DeviceContext* dev_ctx);
+        const DeviceContext* dev_ctx);

  std::string kind() const;
  std::string name() const { return name_; }
@@ -84,6 +84,8 @@ struct EventList {
    return result;
  }

+  void Clear() { event_blocks.clear(); }
+
  std::forward_list<std::vector<Event>> event_blocks;
 };

@@ -93,29 +95,26 @@ enum ProfilerState {
  kCUDA,      // GPU profiling state
 };

-void Mark(const std::string& name, DeviceContext* dev_ctx);
+void Mark(const std::string& name, const DeviceContext* dev_ctx);

-void PushEvent(const std::string& name, DeviceContext* dev_ctx);
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx);

-void PopEvent(const std::string& name, DeviceContext* dev_ctx);
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx);

 struct RecordEvent {
-  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
+  explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx);

  ~RecordEvent();

  // The device context is used by Event to get the current cuda stream.
-  DeviceContext* dev_ctx_;
+  const DeviceContext* dev_ctx_;
  // Event name
  std::string name_;
 };

-// Enable the profiling function.
-void EnableProfiler(ProfilerState state);
-
 // Return the event list of all threads. Asummed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
-std::vector<std::vector<Event>> DisableProfiler();
+std::vector<std::vector<Event>> GetAllEvents();

 // The information of each event given in the profiling report
 struct EventItem {
@@ -130,13 +129,22 @@ struct EventItem {
 // Candidate keys to sort the profiling report
 enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };

+// Enable the profiling function.
+void EnableProfiler(ProfilerState state);
+
+// Clear the g_all_event_lists, which is total event lists of all threads.
+void ResetProfiler();
+
+void DisableProfiler(EventSortingKey sorted_key);
+
 // Parse the event list and output the profiling report
 void ParseEvents(std::vector<std::vector<Event>>&,
                 EventSortingKey sorted_by = EventSortingKey::kDefault);

 // Print results
-void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
-                          std::string& sorted_domain, const size_t name_width,
-                          const size_t data_width);
+void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
+                   std::string& sorted_domain, const size_t name_width,
+                   const size_t data_width);
+
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/profiler_test.cc
+++ b/paddle/platform/profiler_test.cc
@@ -103,18 +103,14 @@ TEST(RecordEvent, RecordEvent) {
  // Bad Usage:
  PushEvent("event_without_pop", dev_ctx);
  PopEvent("event_without_push", dev_ctx);
-  std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
-  // Will remove parsing-related code from test later
-  ParseEvents(events, EventSortingKey::kTotal);
+  std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();

  int cuda_startup_count = 0;
  int start_profiler_count = 0;
-  int stop_profiler_count = 0;
  for (size_t i = 0; i < events.size(); ++i) {
    for (size_t j = 0; j < events[i].size(); ++j) {
      if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
      if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
-      if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count;
      if (events[i][j].name() == "push") {
        EXPECT_EQ(events[i][j + 1].name(), "pop");
 #ifdef PADDLE_WITH_CUDA
@@ -127,5 +123,7 @@ TEST(RecordEvent, RecordEvent) {
  }
  EXPECT_EQ(cuda_startup_count % 5, 0);
  EXPECT_EQ(start_profiler_count, 1);
-  EXPECT_EQ(stop_profiler_count, 1);
+
+  // Will remove parsing-related code from test later
+  DisableProfiler(EventSortingKey::kTotal);
 }
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
 if(WITH_PYTHON)
  cc_library(paddle_pybind SHARED
    SRCS pybind.cc exception.cc protobuf.cc const_value.cc
-    DEPS pybind python backward proto_desc paddle_memory executor prune init
+    DEPS pybind python backward proto_desc paddle_memory executor prune init profiler
    ${GLOB_OP_LIB})
  if(NOT APPLE AND NOT ANDROID)
    target_link_libraries(paddle_pybind rt)

--- a/paddle/pybind/print_operators_doc.cc
+++ b/paddle/pybind/print_operators_doc.cc
@@ -64,6 +64,8 @@ std::string AttrType(paddle::framework::proto::AttrType at) {
      return "bool array";
    case paddle::framework::proto::BLOCK:
      return "block id";
+    case paddle::framework::proto::LONG:
+      return "long";
  }
  return "UNKNOWN";  // not possible
 }

--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -212,6 +212,7 @@ void BindVarDsec(py::module &m) {
             return name;
           },
           py::return_value_policy::reference)
+      .def("set_name", &VarDesc::SetName)
      .def("set_shape", &VarDesc::SetShape)
      .def("set_dtype", &VarDesc::SetDataType)
      .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
@@ -280,7 +281,8 @@ void BindOpDesc(py::module &m) {
      .def("check_attrs", &OpDesc::CheckAttrs)
      .def("infer_shape", &OpDesc::InferShape)
      .def("infer_var_type", &OpDesc::InferVarType)
-      .def("serialize_to_string", SerializeMessage<OpDesc>);
+      .def("serialize_to_string", SerializeMessage<OpDesc>)
+      .def("block", &OpDesc::Block, py::return_value_policy::reference);
 }

 }  // namespace pybind

--- a/paddle/pybind/protobuf.h
+++ b/paddle/pybind/protobuf.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <Python.h>
 #include <fstream>
 #include <vector>
+#include "paddle/platform/variant.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/operators/net_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/platform/profiler.h"
 #include "paddle/pybind/const_value.h"
 #include "paddle/pybind/exception.h"
 #include "paddle/pybind/pybind.h"
@@ -52,7 +53,7 @@ static size_t UniqueIntegerGenerator(const std::string &prefix) {
  return generators[prefix].fetch_add(1);
 }

-bool IsCompileGPU() {
+bool IsCompiledWithCUDA() {
 #ifndef PADDLE_WITH_CUDA
  return false;
 #else
@@ -430,7 +431,7 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("init_glog", framework::InitGLOG);
  m.def("init_devices", &framework::InitDevices);

-  m.def("is_compile_gpu", IsCompileGPU);
+  m.def("is_compiled_with_cuda", IsCompiledWithCUDA);

  m.def("set_feed_variable", framework::SetFeedVariable);
  m.def("get_fetch_variable", framework::GetFetchVariable);
@@ -476,6 +477,24 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("nvprof_stop", platform::CudaProfilerStop);
 #endif

+  py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
+      .value("kDisabled", platform::ProfilerState::kDisabled)
+      .value("kCPU", platform::ProfilerState::kCPU)
+      .value("kCUDA", platform::ProfilerState::kCUDA)
+      .export_values();
+
+  py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
+      .value("kDefault", platform::EventSortingKey::kDefault)
+      .value("kCalls", platform::EventSortingKey::kCalls)
+      .value("kTotal", platform::EventSortingKey::kTotal)
+      .value("kMin", platform::EventSortingKey::kMin)
+      .value("kMax", platform::EventSortingKey::kMax)
+      .value("kAve", platform::EventSortingKey::kAve)
+      .export_values();
+
+  m.def("enable_profiler", platform::EnableProfiler);
+  m.def("disable_profiler", platform::DisableProfiler);
+  m.def("reset_profiler", platform::ResetProfiler);
  return m.ptr();
 }
 }  // namespace pybind

--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -89,7 +89,7 @@ def __bootstrap__():
    read_env_flags = [
        'use_pinned_memory', 'check_nan_inf', 'do_memory_benchmark'
    ]
-    if core.is_compile_gpu():
+    if core.is_compiled_with_cuda():
        read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync']
    core.init_gflags([sys.argv[0]] +
                     ["--tryfromenv=" + ",".join(read_env_flags)])

--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -178,7 +178,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
        if _all_in_set_(
                filter(lambda name: name.find(core.grad_var_suffix()) != -1,
                       op_desc.input_arg_names()), no_grad_set):
-            no_grad_set.union(out_arg_names)
+            no_grad_set.update(out_arg_names)
            return True
        return False


--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -33,6 +33,10 @@ class VarBlock:
        return "%s:%d:%d" % (self.varname, self.offset, self.size)


+def same_or_split_var(p_name, var_name):
+    return p_name == var_name or p_name.startswith(var_name + ".block")
+
+
 def split_dense_variable(var_list,
                         pserver_count,
                         min_block_size=1024,
@@ -303,8 +307,8 @@ class DistributeTranspiler:
                return True
            else:
                for n in param_names:
-                    if n.startswith(op.inputs["Param"].name+".block") and \
-                       n != op.inputs["Param"].name:
+                    if same_or_split_var(n, op.inputs[
+                            "Param"].name) and n != op.inputs["Param"].name:
                        return True
                return False
        else:
@@ -335,7 +339,7 @@ class DistributeTranspiler:
            if key == "Grad":
                grad_block = None
                for g in self.param_grad_ep_mapping[endpoint]["grads"]:
-                    if g.name.startswith(var.name):
+                    if same_or_split_var(g.name, var.name):
                        grad_block = g
                        break
                if not grad_block:
@@ -365,7 +369,7 @@ class DistributeTranspiler:
                # param is already created on global program
                param_block = None
                for p in self.param_grad_ep_mapping[endpoint]["params"]:
-                    if p.name.startswith(var.name):
+                    if same_or_split_var(p.name, var.name):
                        param_block = p
                        break
                if not param_block:
@@ -502,7 +506,7 @@ class DistributeTranspiler:
        def _get_splited_name_and_shape(varname):
            for idx, splited_param in enumerate(params):
                pname = splited_param.name
-                if pname.startswith(varname) and varname != pname:
+                if same_or_split_var(pname, varname) and varname != pname:
                    return pname, splited_param.shape
            return "", []


--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -68,6 +68,84 @@ def as_numpy(tensor):
    return ans


+def has_feed_operators(block, feed_targets, feed_holder_name):
+    """ Check whether the block already has feed operators.
+
+    Return false if the block does not have any feed operators.
+    If some feed operators have been prepended to the block, check that
+    the info contained in these feed operators matches the feed_targets
+    and feed_holder_name. Raise exception when any mismatch is found.
+    Return true when the block has feed operators with matching info.
+
+    Args:
+        block: a block instance (typically global block of a program)
+        feed_targets: a dictionary of {feed_target_name: feed_target_data}
+        feed_holder_name: the name of the variable that holds the data of 
+            all feed targets. The type of this feed_holder variable is 
+            FEED_MINIBATCH, which is essentially vector<LoDTensor>.
+
+    Returns:
+        A boolean value that indicates whether a block has feed operators 
+        that match the info contained in feed_targets and feed_holder_name.
+    """
+
+    feed_count = 0
+    for op in block.ops:
+        if op.desc.type() == 'feed':
+            feed_count += 1
+            assert op.desc.input('X')[0] == feed_holder_name
+            feed_target_name = op.desc.output('Out')[0]
+            if feed_target_name not in feed_targets:
+                raise Exception("'feed_targets' does not have {} variable".
+                                format(feed_target_name))
+        else:
+            break
+    if feed_count > 0 and feed_count != len(feed_targets):
+        raise Exception(
+            "Feed operators in program desc do not match 'feed_targets'")
+    return feed_count > 0
+
+
+def has_fetch_operators(block, fetch_targets, fetch_holder_name):
+    """ Check whether the block already has fetch operators.
+    
+    Return false if the block does not have any fetch operators.
+    If some fetch operators have been appended to the block, check that
+    the info contained in these fetch operators matches the fetch_targets
+    and fetch_holder_name. Raise exception when any mismatch is found.
+    Return true when the block has fetch operators with matching info.
+
+    Args:
+        block: a block instance (typically global block of a program)
+        fetch_targets: a dictionary of {fetch_target_name: fetch_target_data}
+        fetch_holder_name: the name of the variable that holds the data of 
+            all fetch targets. The type of this fetch_holder variable is 
+            FETCH_LIST, which is essentially vector<LoDTensor>.    
+
+    Return:    
+        A boolean value that indicates whether a block has fetch operators 
+        that match the info contained in fetch_targets and fetch_holder_name.     
+    """
+
+    fetch_count = 0
+    for op in block.ops:
+        if op.desc.type() == 'fetch':
+            fetch_count += 1
+            assert op.desc.output('Out')[0] == fetch_holder_name
+            fetch_target_name = op.desc.input('X')[0]
+            if fetch_target_name not in [
+                    var.desc.name() for var in fetch_targets
+            ]:
+                raise Exception("'fetch_targets' does not have {} variable".
+                                format(fetch_target_name))
+            idx = op.desc.attr('col')
+            assert fetch_target_name == fetch_targets[idx].desc.name()
+    if fetch_count > 0 and fetch_count != len(fetch_targets):
+        raise Exception(
+            "Fetch operators in program desc do not match 'fetch_targets'")
+    return fetch_count > 0
+
+
 class Executor(object):
    def __init__(self, places):
        if not isinstance(places, list) and not isinstance(places, tuple):
@@ -147,33 +225,50 @@ class Executor(object):

        program = program.clone()
        global_block = program.global_block()
-        feed_var = global_block.create_var(
-            name=feed_var_name,
-            type=core.VarDesc.VarType.FEED_MINIBATCH,
-            persistable=True)
-
-        for i, name in enumerate(feed):
-            out = global_block.var(name)
-            global_block.prepend_op(
-                'feed',
-                inputs={'X': [feed_var]},
-                outputs={'Out': [out]},
-                attrs={'col': i})
-            cur_feed = feed[name]
-            if not isinstance(cur_feed, core.LoDTensor):
-                cur_feed = self.aslodtensor(cur_feed)
-            core.set_feed_variable(scope, cur_feed, feed_var.name, i)
-
-        fetch_var = global_block.create_var(
-            name=fetch_var_name,
-            type=core.VarDesc.VarType.FETCH_LIST,
-            persistable=True)
-        for i, var in enumerate(fetch_list):
-            global_block.append_op(
-                type='fetch',
-                inputs={'X': [var]},
-                outputs={'Out': [fetch_var]},
-                attrs={'col': i})
+
+        if feed_var_name in global_block.vars:
+            feed_var = global_block.var(feed_var_name)
+        else:
+            feed_var = global_block.create_var(
+                name=feed_var_name,
+                type=core.VarDesc.VarType.FEED_MINIBATCH,
+                persistable=True)
+
+        if fetch_var_name in global_block.vars:
+            fetch_var = global_block.var(fetch_var_name)
+        else:
+            fetch_var = global_block.create_var(
+                name=fetch_var_name,
+                type=core.VarDesc.VarType.FETCH_LIST,
+                persistable=True)
+
+        if not has_feed_operators(global_block, feed, feed_var_name):
+            for i, name in enumerate(feed):
+                out = global_block.var(name)
+                global_block.prepend_op(
+                    type='feed',
+                    inputs={'X': [feed_var]},
+                    outputs={'Out': [out]},
+                    attrs={'col': i})
+
+        for op in global_block.ops:
+            if op.desc.type() == 'feed':
+                feed_target_name = op.desc.output('Out')[0]
+                cur_feed = feed[feed_target_name]
+                if not isinstance(cur_feed, core.LoDTensor):
+                    cur_feed = self.aslodtensor(cur_feed)
+                idx = op.desc.attr('col')
+                core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
+            else:
+                break
+
+        if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
+            for i, var in enumerate(fetch_list):
+                global_block.append_op(
+                    type='fetch',
+                    inputs={'X': [var]},
+                    outputs={'Out': [fetch_var]},
+                    attrs={'col': i})

        self.executor.run(program.desc, scope, 0, True, True)
        outs = [

--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -13,8 +13,8 @@
 # limitations under the License.

 import os
-import cPickle as pickle

+from paddle.v2.fluid.evaluator import Evaluator
 from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
 from . import core

@@ -187,18 +187,28 @@ def get_inference_program(target_vars, main_program=None):
        main_program = default_main_program()
    if not isinstance(target_vars, list):
        target_vars = [target_vars]
-
-    pruned_program = main_program.prune(targets=target_vars)
+    vars = []
+    for var in target_vars:
+        if isinstance(var, Evaluator):
+            vars.extend(var.states)
+            vars.extend(var.metrics)
+        else:
+            vars.append(var)
+    pruned_program = main_program.prune(targets=vars)
    inference_program = pruned_program.inference_optimize()
    return inference_program


-def prepend_feed_ops(inference_program, feeded_var_names):
+def prepend_feed_ops(inference_program,
+                     feed_target_names,
+                     feed_holder_name='feed'):
    global_block = inference_program.global_block()
    feed_var = global_block.create_var(
-        name='feed', type=core.VarDesc.VarType.FEED_MINIBATCH, persistable=True)
+        name=feed_holder_name,
+        type=core.VarDesc.VarType.FEED_MINIBATCH,
+        persistable=True)

-    for i, name in enumerate(feeded_var_names):
+    for i, name in enumerate(feed_target_names):
        out = global_block.var(name)
        global_block.prepend_op(
            type='feed',
@@ -207,12 +217,16 @@ def prepend_feed_ops(inference_program, feeded_var_names):
            attrs={'col': i})


-def append_fetch_ops(inference_program, fetch_var_names):
+def append_fetch_ops(inference_program,
+                     fetch_target_names,
+                     fetch_holder_name='fetch'):
    global_block = inference_program.global_block()
    fetch_var = global_block.create_var(
-        name='fetch', type=core.VarDesc.VarType.FETCH_LIST, persistable=True)
+        name=fetch_holder_name,
+        type=core.VarDesc.VarType.FETCH_LIST,
+        persistable=True)

-    for i, name in enumerate(fetch_var_names):
+    for i, name in enumerate(fetch_target_names):
        global_block.append_op(
            type='fetch',
            inputs={'X': [name]},
@@ -262,21 +276,12 @@ def save_inference_model(dirname,
    inference_program = pruned_program.inference_optimize()
    fetch_var_names = [v.name for v in target_vars]

-    model_file_name = dirname + "/__model__"
-    with open(model_file_name, "w") as f:
-        pickle.dump({
-            "program_desc_str": inference_program.desc.serialize_to_string(),
-            "feed_var_names": feeded_var_names,
-            "fetch_var_names": fetch_var_names
-        }, f, -1)
-
    prepend_feed_ops(inference_program, feeded_var_names)
    append_fetch_ops(inference_program, fetch_var_names)

-    # Save only programDesc of inference_program in binary format
-    # in another file: __model__.dat
-    with open(model_file_name + ".dat", "wb") as fp:
-        fp.write(inference_program.desc.serialize_to_string())
+    model_file_name = dirname + "/__model__"
+    with open(model_file_name, "wb") as f:
+        f.write(inference_program.desc.serialize_to_string())

    save_params(executor, dirname, main_program)

@@ -299,6 +304,24 @@ def load_persistables_if_exist(executor, dirname, main_program=None):
        predicate=_is_presistable_and_exist_)


+def get_feed_targets_names(program):
+    feed_targets_names = []
+    global_block = program.global_block()
+    for op in global_block.ops:
+        if op.desc.type() == 'feed':
+            feed_targets_names.insert(0, op.desc.output('Out')[0])
+    return feed_targets_names
+
+
+def get_fetch_targets_names(program):
+    fetch_targets_names = []
+    global_block = program.global_block()
+    for op in global_block.ops:
+        if op.desc.type() == 'fetch':
+            fetch_targets_names.append(op.desc.input('X')[0])
+    return fetch_targets_names
+
+
 def load_inference_model(dirname, executor):
    """
    Load inference model from a directory
@@ -306,24 +329,28 @@ def load_inference_model(dirname, executor):
    :param dirname: directory path
    :param executor: executor that load inference model

-    :return: [program, feed_var_names, fetch_var_names]
+    :return: [program, feed_target_names, fetch_targets]
             program: program especially for inference.
-             feeded_var_names: Names of variables that need to feed data
-             fetch_vars: Variables from which we can get inference results.
+             feed_target_names: Names of variables that need to feed data
+             fetch_targets: Variables from which we can get inference results.
    """
    if not os.path.isdir(dirname):
        raise ValueError("There is no directory named '%s'", dirname)

    model_file_name = dirname + "/__model__"
-    model = pickle.load(open(model_file_name, "r"))
-    program_desc_str = model["program_desc_str"]
-    feed_var_names = model["feed_var_names"]
-    fetch_var_names = model["fetch_var_names"]
+    with open(model_file_name, "rb") as f:
+        program_desc_str = f.read()
+
    program = Program.parse_from_string(program_desc_str)
    load_persistables_if_exist(executor, dirname, program)
-    fetch_vars = [program.global_block().var(name) for name in fetch_var_names]

-    return [program, feed_var_names, fetch_vars]
+    feed_target_names = get_feed_targets_names(program)
+    fetch_target_names = get_fetch_targets_names(program)
+    fetch_targets = [
+        program.global_block().var(name) for name in fetch_target_names
+    ]
+
+    return [program, feed_target_names, fetch_targets]


 def get_parameter_value(para, executor):

--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -100,7 +100,8 @@ class LayerHelper(object):
            if dtype is None:
                dtype = each.dtype
            elif dtype != each.dtype:
-                raise ValueError("Data Type mismatch")
+                raise ValueError("Data Type mismatch: %d to %d" %
+                                 (dtype, each.dtype))
        return dtype

    def create_parameter(self,
@@ -110,6 +111,7 @@ class LayerHelper(object):
                         is_bias=False,
                         default_initializer=None):
        # Deepcopy the attr so that parameters can be shared in program
+        attr = copy.deepcopy(attr)
        assert isinstance(attr, ParamAttr)
        suffix = 'b' if is_bias else 'w'


--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -289,6 +289,7 @@ class ParallelDo(object):
                for in_var_name in op.input(iname):
                    if in_var_name not in local_inputs:
                        params.append(in_var_name)
+        params = list(set(params))

        return [parent_block.var(name) for name in params]

@@ -769,7 +770,7 @@ def topk(input, k):
          array = fluid.layers.topk(x, k)
    """
    helper = LayerHelper('topk', **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.data_type)
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
    topk_indices = helper.create_tmp_variable(dtype='int64')
    helper.append_op(
        type='top_k',

--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
--- a/python/paddle/v2/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
@@ -31,10 +31,12 @@ dtype_to_size = {


 class ControlFlowGraph(object):
-    def __init__(self, Program):
+    def __init__(self, Program, ops, forward_num):
        self._program = Program
-        self._succesors = defaultdict(set)
-        self._presucessors = defaultdict(set)
+        self._ops = ops
+        self._forward_num = forward_num
+        self._successors = defaultdict(set)
+        self._presuccessors = defaultdict(set)
        self._uses = defaultdict(set)
        self._defs = defaultdict(set)
        self._live_in = defaultdict(set)
@@ -45,25 +47,16 @@ class ControlFlowGraph(object):
            self._add(node1, node2)

    def _add(self, node1, node2):
-        self._succesors[node1].add(node2)
-        self._presucessors[node2].add(node1)
+        self._successors[node1].add(node2)
+        self._presuccessors[node2].add(node1)

    def _build_graph(self):
-        program_desc = self._program.get_desc()
-        block_size = program_desc.num_blocks()
-
-        # TODO(qijun) handle Program with if/while operators
-        self.global_block_desc = program_desc.block(0)
-        self.op_size = self.global_block_desc.op_size()
-
+        self.op_size = len(self._ops)
        op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
        self._add_connections(op_node_connections)
-
-        self.ops = [self.global_block_desc.op(i) for i in range(self.op_size)]
-
        for i in range(self.op_size):
-            self._uses[i].update(self.ops[i].input_arg_names())
-            self._defs[i].update(self.ops[i].output_arg_names())
+            self._uses[i].update(self._ops[i].input_arg_names())
+            self._defs[i].update(self._ops[i].output_arg_names())

    def _update_graph(self, old_name, new_name, begin_idx=0):
        for i in range(begin_idx, self.op_size):
@@ -103,7 +96,7 @@ class ControlFlowGraph(object):
                live_out[i] = set(self._live_out[i])
                self._live_in[i] = self._uses[i] | (
                    self._live_out[i] - self._defs[i])
-                for s in self._succesors[i]:
+                for s in self._successors[i]:
                    self._live_out[i] |= self._live_in[s]

            if self._reach_fixed_point(live_in, live_out):
@@ -113,39 +106,76 @@ class ControlFlowGraph(object):
        u = a & b
        return a - u, b - u

+    def _has_var(self, block_desc, var_name, is_forward):
+        if is_forward:
+            return block_desc.has_var(str(var_name))
+        else:
+            return block_desc.has_var_recursive(str(var_name))
+
+    def _find_var(self, block_desc, var_name, is_forward):
+        if is_forward:
+            return block_desc.find_var(str(var_name))
+        else:
+            return block_desc.find_var_recursive(str(var_name))
+
    def memory_optimize(self):
+        def check_var_validity(block_desc, x, is_forward):
+            if str(x) == "@EMPTY@":
+                return False
+            if not self._has_var(block_desc, x, is_forward):
+                return False
+            if self._find_var(block_desc, x, is_forward).persistable():
+                return False
+            if self._find_var(
+                    block_desc, x,
+                    is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
+                return False
+            return True
+
        self._build_graph()
        self._dataflow_analyze()
        self.pool = []
        for i in range(self.op_size):
+            op = self._ops[i]
+            if op.type() == "while" or op.type() == "while_grad":
+                continue
+            block_desc = op.block()
+            is_forward = i < self._forward_num
            if self.pool:
-                out_pair = [(x, self.global_block_desc.var(str(x)).shape())
-                            for x in self._defs[i]]
+                defs_can_optimize = filter(
+                    lambda x: check_var_validity(block_desc, x, is_forward),
+                    self._defs[i])
+                out_pair = [
+                    (x, self._find_var(block_desc, x, is_forward).shape())
+                    for x in defs_can_optimize
+                ]
                for x, x_shape in out_pair:
-                    if not self.global_block_desc.var(str(x)).persistable():
-                        for index, cache_pair in enumerate(self.pool):
-                            cache_var = cache_pair[0]
-                            cache_shape = cache_pair[1]
-                            if x_shape == cache_shape:
-                                x_dtype = self.global_block_desc.var(str(
-                                    x)).dtype()
-                                cache_dtype = self.global_block_desc.var(
-                                    str(cache_var)).dtype()
+                    for index, cache_pair in enumerate(self.pool):
+                        cache_var = cache_pair[0]
+                        cache_shape = cache_pair[1]
+                        if x_shape == cache_shape:
+                            if self._has_var(block_desc, cache_var, is_forward):
+                                x_dtype = self._find_var(block_desc, x,
+                                                         is_forward).dtype()
+                                cache_dtype = self._find_var(
+                                    block_desc, cache_var, is_forward).dtype()
                                # TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
                                # and dtype_to_size[cache_dtype]
                                if x_dtype == cache_dtype:
-                                    print(
-                                        ("Hit Cache !!!! cache pool index "
-                                         "is %d, var name is %s, "
-                                         "cached var name is %s, "
-                                         "var shape is %s ") %
-                                        (index, x, cache_var, str(cache_shape)))
+                                    print(("Hit Cache !!!! cache pool index "
+                                           "is %d, var name is %s, "
+                                           "cached var name is %s, "
+                                           "var shape is %s ") %
+                                          (index, x, cache_var,
+                                           str(cache_shape)))
                                    self.pool.pop(index)
+                                    if x == cache_var:
+                                        break
                                    _rename_arg_(
-                                        self.ops, x, cache_var, begin_idx=i)
-                                    self._program.current_block().var(str(
-                                        x)).desc = self.global_block_desc.var(
-                                            str(cache_var))
+                                        self._ops, x, cache_var, begin_idx=i)
+                                    self._program.block(block_desc.id).var(
+                                        str(x)).desc = self._find_var(
+                                            block_desc, cache_var, is_forward)
                                    self._update_graph(
                                        x, cache_var, begin_idx=i)
                                    break
@@ -153,20 +183,70 @@ class ControlFlowGraph(object):
            in_diff, out_diff = self._get_diff(self._live_in[i],
                                               self._live_out[i])
            can_optimize = filter(
-                lambda x: not self.global_block_desc.var(str(x)).persistable(),
+                lambda x: check_var_validity(block_desc, x, is_forward),
                in_diff)
            if can_optimize:
                for var_name in can_optimize:
-                    self.pool.append(
-                        (var_name,
-                         self.global_block_desc.var(str(var_name)).shape()))
-
-    def get_program(self):
-        return self._program
+                    self.pool.append((var_name, self._find_var(
+                        block_desc, var_name, is_forward).shape()))
+
+
+def get_cfgs(input_program):
+    ops_list = []
+    pdesc = input_program.get_desc()
+    block_desc = pdesc.block(0)
+    op_size = block_desc.op_size()
+    # Get global block ops
+    ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size))
+
+    while_sub_block_ids = []
+    while_grad_sub_block_ids = []
+    while_pair = []
+
+    for i in range(op_size):
+        op = block_desc.op(i)
+        if op.type() == "while":
+            while_sub_block_ids.append(op.attr("sub_block").id)
+        elif op.type() == "while_grad":
+            while_grad_sub_block_ids.append(op.attr("sub_block").id)
+
+    # Find while/while_grad block pair
+    for grad_id in while_grad_sub_block_ids:
+        parent_id = pdesc.block(grad_id).parent
+        if parent_id in while_sub_block_ids:
+            while_pair.append((parent_id, grad_id))
+            while_sub_block_ids.remove(parent_id)
+
+    # Get while/while_grad block ops
+    for parent_id, grad_id in while_pair:
+        while_block_ops = []
+        while_block = pdesc.block(parent_id)
+        while_block_op_size = while_block.op_size()
+        for i in range(while_block_op_size):
+            while_block_ops.append(while_block.op(i))
+
+        while_grad_block = pdesc.block(grad_id)
+        while_grad_block_op_size = while_grad_block.op_size()
+        for i in range(while_grad_block_op_size):
+            while_block_ops.append(while_grad_block.op(i))
+
+        ops_list.append((while_block_ops, while_block_op_size))
+
+    # Process rest while block ops
+    for parent_id in while_sub_block_ids:
+        while_block_ops = []
+        while_block = pdesc.block(parent_id)
+        while_block_op_size = while_block.op_size()
+        for i in range(while_block_op_size):
+            while_block_ops.append(while_block.op(i))
+
+        ops_list.append((while_block_ops, while_block_op_size))
+
+    cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list]
+    return cfgs


 def memory_optimize(input_program):
-    graph = ControlFlowGraph(input_program)
-    graph.memory_optimize()
-    result_program = graph.get_program()
-    return result_program
+    cfgs = get_cfgs(input_program)
+    for cfg in cfgs:
+        cfg.memory_optimize()
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -11,14 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import layers

 __all__ = [
    "simple_img_conv_pool",
    "sequence_conv_pool",
    "glu",
-    "dot_product_attention",
+    "scaled_dot_product_attention",
 ]


@@ -56,7 +55,7 @@ def img_conv_group(input,
                   conv_act=None,
                   param_attr=None,
                   conv_with_batchnorm=False,
-                   conv_batchnorm_drop_rate=None,
+                   conv_batchnorm_drop_rate=0.0,
                   pool_stride=1,
                   pool_type=None,
                   use_cudnn=True):
@@ -127,21 +126,21 @@ def sequence_conv_pool(input,

 def glu(input, dim=-1):
    """
-    The gated linear unit composed by split, sigmoid activation and elementwise 
-    multiplication. Specifically, Split the input into two equal sized parts 
-    :math:`a` and :math:`b` along the given dimension and then compute as 
+    The gated linear unit composed by split, sigmoid activation and elementwise
+    multiplication. Specifically, Split the input into two equal sized parts
+    :math:`a` and :math:`b` along the given dimension and then compute as
    following:

        .. math::

            {GLU}(a, b)= a \otimes \sigma(b)

-    Refer to `Language Modeling with Gated Convolutional Networks 
+    Refer to `Language Modeling with Gated Convolutional Networks
    <https://arxiv.org/pdf/1612.08083.pdf>`_.
-    
+
    Args:
        input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int): The dimension along which to split. If :math:`dim < 0`, the 
+        dim (int): The dimension along which to split. If :math:`dim < 0`, the
            dimension to split along is :math:`rank(input) + dim`.

    Returns:
@@ -160,53 +159,180 @@ def glu(input, dim=-1):
    return out


-def dot_product_attention(querys, keys, values):
+def scaled_dot_product_attention(queries,
+                                 keys,
+                                 values,
+                                 num_heads=1,
+                                 dropout_rate=0.):
    """
    The dot-product attention.

-    Attention mechanism can be seen as mapping a query and a set of key-value 
-    pairs to an output. The output is computed as a weighted sum of the values, 
-    where the weight assigned to each value is computed by a compatibility 
+    Attention mechanism can be seen as mapping a query and a set of key-value
+    pairs to an output. The output is computed as a weighted sum of the values,
+    where the weight assigned to each value is computed by a compatibility
    function (dot-product here) of the query with the corresponding key.
-    
-    The dot-product attention can be implemented through (batch) matrix 
+
+    The dot-product attention can be implemented through (batch) matrix
    multipication as follows:

        .. math::

-            Attention(Q, K, V)= softmax(QK^\mathrm{T})V
+            Attention(Q, K, V)= softmax(QK^\mathrm{T})V

-    Refer to `Attention Is All You Need 
+    Refer to `Attention Is All You Need
    <https://arxiv.org/pdf/1706.03762.pdf>`_.

-    Note that batch data containing sequences with different lengths is not 
-    supported by this because of the (batch) matrix multipication.
-    
    Args:
-        query (Variable): The input variable which is a Tensor or LoDTensor.
-        key (Variable): The input variable which is a Tensor or LoDTensor.
-        value (Variable): The input variable which is a Tensor or LoDTensor.
+
+        queries (Variable): The input variable which should be a 3-D Tensor.
+        keys (Variable): The input variable which should be a 3-D Tensor.
+        values (Variable): The input variable which should be a 3-D Tensor.
+        num_heads (int): Head number to compute the scaled dot product
+                         attention. Default value is 1.
+        dropout_rate (float): The dropout rate to drop the attention weight.
+                              Default value is 0.

    Returns:
-        tuple: The Tensor variables representing the output and attention scores.
+
+        Variable: A 3-D Tensor computed by multi-head scaled dot product
+                  attention.
+
+    Raises:
+
+        ValueError: If input queries, keys, values are not 3-D Tensors.
+
+    NOTE:
+        1. When num_heads > 1, three linear projections are learned respectively
+        to map input queries, keys and values into queries', keys' and values'.
+        queries', keys' and values' have the same shapes with queries, keys
+        and values.
+
+        1. When num_heads == 1, scaled_dot_product_attention has no learnable
+        parameters.

    Examples:
        .. code-block:: python

-            # Suppose q, k, v are tensor variables with the following shape:
+            # Suppose q, k, v are Tensors with the following shape:
            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
-            out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
-            out.shape  # [3, 5, 10]
-            attn_scores.shape  # [3, 5, 6]
+
+            contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
+            contexts.shape  # [3, 5, 10]
    """
-    assert keys.shape[-2] == values.shape[
-        -2], 'The shapes of keys and values mismatch.'
-    assert querys.shape[-1] == keys.shape[
-        -1], 'The shapes of querys and keys mismatch.'
-    product = layers.matmul(x=querys, y=keys, transpose_y=True)
-    attn_scores = layers.reshape(
+    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
+        raise ValueError(
+            "Inputs quries, keys and values should all be 3-D tensors.")
+
+    if queries.shape[-1] != keys.shape[-1]:
+        raise ValueError(
+            "The hidden size of queries and keys should be the same.")
+    if keys.shape[-2] != values.shape[-2]:
+        raise ValueError(
+            "The max sequence length in query batch and in key batch "
+            "should be the same.")
+    if keys.shape[-1] % num_heads != 0:
+        raise ValueError("The hidden size of keys (%d) must be divisible "
+                         "by the number of attention heads (%d)." %
+                         (keys.shape[-1], num_heads))
+    if values.shape[-1] % num_heads != 0:
+        raise ValueError("The hidden size of values (%d) must be divisible "
+                         "by the number of attention heads (%d)." %
+                         (values.shape[-1], num_heads))
+
+    def __compute_qkv(queries, keys, values, num_heads):
+        """
+        Add linear projection to queries, keys, and values.
+
+        Args:
+            queries(Tensor): a 3-D input Tensor.
+            keys(Tensor): a 3-D input Tensor.
+            values(Tensor): a 3-D input Tensor.
+            num_heads(int): The number of heads. Linearly project the inputs
+                            ONLY when num_heads > 1.
+
+        Returns:
+            Tensor: linearly projected output Tensors: queries', keys' and
+                    values'. They have the same shapes with queries, keys and
+                    values.
+        """
+
+        if num_heads == 1:
+            return queries, keys, values
+
+        q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2)
+        k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2)
+        v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2)
+        return q, k, v
+
+    def __split_heads(x, num_heads):
+        """
+        Reshape the last dimension of inpunt tensor x so that it becomes two
+        dimensions.
+
+        Args:
+            x(Tensor): a 3-D input Tensor.
+            num_heads(int): The number of heads.
+
+        Returns:
+            Tensor: a Tensor with shape [..., n, m/num_heads], where m is size
+                    of the last dimension of x.
+        """
+        if num_heads == 1:
+            return x
+
+        hidden_size = x.shape[-1]
+        # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
+        # into a 4-D output:
+        # [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
+        reshaped = layers.reshape(
+            x=x,
+            shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads])
+
+        # permuate the dimensions into:
+        # [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
+        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
+
+    def __combine_heads(x):
+        """
+        Reshape the last two dimensions of inpunt tensor x so that it becomes
+        one dimension.
+
+        Args:
+            x(Tensor): a 4-D input Tensor with shape
+                       [bs, num_heads, max_sequence_length, hidden_dim].
+
+        Returns:
+            Tensor: a Tensor with shape
+                    [bs, max_sequence_length, num_heads * hidden_dim].
+        """
+
+        if len(x.shape) == 3: return x
+        if len(x.shape) != 4:
+            raise ValueError("Input(x) should be a 4-D Tensor.")
+
+        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
+        return layers.reshape(
+            x=trans_x,
+            shape=map(int, [
+                trans_x.shape[0], trans_x.shape[1],
+                trans_x.shape[2] * trans_x.shape[3]
+            ]))
+
+    q, k, v = __compute_qkv(queries, keys, values, num_heads)
+
+    q = __split_heads(q, num_heads)
+    k = __split_heads(k, num_heads)
+    v = __split_heads(v, num_heads)
+
+    key_dim_per_head = keys.shape[-1] // num_heads
+    scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
+    product = layers.matmul(x=k, y=scaled_q, transpose_y=True)
+
+    weights = layers.reshape(
        x=layers.reshape(
-            x=product, shape=[-1, product.shape[-1]], act='softmax'),
+            x=product, shape=[-1, product.shape[-1]], act="softmax"),
        shape=product.shape)
-    out = layers.matmul(attn_scores, values)
-    return out, attn_scores
+    if dropout_rate:
+        weights = layers.dropout(x, dropout_prob=dropout_rate, is_test=False)
+    ctx_multiheads = layers.matmul(weights, v)
+    return __combine_heads(ctx_multiheads)
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
@@ -63,3 +63,58 @@ def cuda_profiler(output_file, output_mode=None, config=None):
    # Disables profiler collection.
    core.nvprof_stop()
    os.remove(config_file)
+
+
+def reset_profiler():
+    """The profiler clear interface.
+    reset_profiler will clear the previous time record.
+    """
+    core.reset_profiler()
+
+
+@contextmanager
+def profiler(state, sorted_key=None):
+    """The profiler interface.
+    Different from cuda_profiler, this profiler can be used to profile both CPU
+    and GPU program. By defalut, it records the CPU and GPU operator kernels,
+    if you want to profile other program, you can refer the profiling tutorial
+    to add more records.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU' or 'GPU',
+            telling the profiler to use CPU timer or GPU timer for profiling.
+            Although users may have already specified the execution place
+            (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
+            would not inherit this place.
+        sorted_key (string) : If None, the profiling results will be printed
+            in the order of first end time of events. Otherwise, the profiling
+            results will be sorted by the this flag. This flag should be one
+            of 'calls', 'total', 'max', 'min' or 'ave'.
+            The `calls` means sorting by the number of calls.
+            The `total` means sorting by the total execution time.
+            The `max` means sorting by the maximum execution time.
+            The `min` means sorting by the minimum execution time.
+            The `ave` means sorting by the average execution time.
+    """
+
+    if state not in ['CPU', 'GPU']:
+        raise ValueError("The state must be 'CPU' or 'GPU'.")
+    prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU
+    core.enable_profiler(prof_state)
+    yield
+
+    if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
+        raise ValueError("The state must be in 'calls', 'total', "
+                         "'max', 'min', 'ave'")
+    sorted_key = 'default' if sorted_key is None else sorted_key
+    key_map = {
+        'default': core.EventSortingKey.kDefault,
+        'calls': core.EventSortingKey.kCalls,
+        'total': core.EventSortingKey.kTotal,
+        'max': core.EventSortingKey.kMax,
+        'min': core.EventSortingKey.kMin,
+        'ave': core.EventSortingKey.kAve,
+    }
+    # TODO(qingqing) : redirect C++ ostream to Python stream.
+    # with core.ostream_redirect(stdout=True, stderr=True):
+    core.disable_profiler(key_map[sorted_key])
--- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

-list(REMOVE_ITEM TEST_OPS test_image_classification_train)
+list(REMOVE_ITEM TEST_OPS test_image_classification_train test_recognize_digits)
 py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
 py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
+py_test(test_recognize_digits_mlp_cpu
+  SRCS test_recognize_digits.py
+  ARGS mlp)
+py_test(test_recognize_digits_mlp_cuda
+  SRCS test_recognize_digits.py
+  ARGS mlp --use_cuda)
+py_test(test_recognize_digits_conv_cpu
+  SRCS test_recognize_digits.py
+  ARGS conv)
+py_test(test_recognize_digits_conv_cuda
+  SRCS test_recognize_digits.py
+  ARGS conv --use_cuda)
+py_test(test_recognize_digits_mlp_cpu_parallel
+  SRCS test_recognize_digits.py
+  ARGS mlp --parallel)
+py_test(test_recognize_digits_mlp_cuda_parallel
+  SRCS test_recognize_digits.py
+  ARGS mlp --use_cuda --parallel)
+py_test(test_recognize_digits_conv_cpu_parallel
+  SRCS test_recognize_digits.py
+  ARGS conv --parallel)
+py_test(test_recognize_digits_conv_cuda_parallel
+  SRCS test_recognize_digits.py
+  ARGS conv --use_cuda --parallel)

 # default test
 foreach(src ${TEST_OPS})

--- a/python/paddle/v2/fluid/tests/book/__init__.py
+++ b/python/paddle/v2/fluid/tests/book/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -49,7 +49,7 @@ for pass_id in range(PASS_NUM):
        avg_loss_value, = exe.run(fluid.default_main_program(),
                                  feed=feeder.feed(data),
                                  fetch_list=[avg_cost])
-
+        print(avg_loss_value)
        if avg_loss_value[0] < 10.0:
            exit(0)  # if avg cost less than 10.0, we think our code is good.
 exit(1)
--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
@@ -17,7 +17,7 @@ import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
-import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.layers as pd
 from paddle.v2.fluid.executor import Executor

 dict_size = 30000
@@ -26,53 +26,136 @@ src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
 hidden_dim = 32
 word_dim = 16
 IS_SPARSE = True
-batch_size = 10
-max_length = 50
+batch_size = 2
+max_length = 8
 topk_size = 50
 trg_dic_size = 10000
+beam_size = 2

 decoder_size = hidden_dim

+place = core.CPUPlace()

-def encoder_decoder():
+
+def encoder():
    # encoder
-    src_word_id = layers.data(
+    src_word_id = pd.data(
        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
-    src_embedding = layers.embedding(
+    src_embedding = pd.embedding(
        input=src_word_id,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr=fluid.ParamAttr(name='vemb'))

-    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
-    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
-    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
+    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
+    return encoder_out
+

+def decoder_train(context):
    # decoder
-    trg_language_word = layers.data(
+    trg_language_word = pd.data(
        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
-    trg_embedding = layers.embedding(
+    trg_embedding = pd.embedding(
        input=trg_language_word,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr=fluid.ParamAttr(name='vemb'))

-    rnn = fluid.layers.DynamicRNN()
+    rnn = pd.DynamicRNN()
    with rnn.block():
        current_word = rnn.step_input(trg_embedding)
-        mem = rnn.memory(init=encoder_out)
-        fc1 = fluid.layers.fc(input=[current_word, mem],
+        pre_state = rnn.memory(init=context)
+        current_state = pd.fc(input=[current_word, pre_state],
                              size=decoder_size,
                              act='tanh')
-        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
-        rnn.update_memory(mem, fc1)
-        rnn.output(out)
+
+        current_score = pd.fc(input=current_state,
+                              size=target_dict_dim,
+                              act='softmax')
+        rnn.update_memory(pre_state, current_state)
+        rnn.output(current_score)

    return rnn()


+def decoder_decode(context):
+    init_state = context
+    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
+    counter = pd.zeros(shape=[1], dtype='int64')
+
+    # fill the first element with init_state
+    state_array = pd.create_array('float32')
+    pd.array_write(init_state, array=state_array, i=counter)
+
+    # ids, scores as memory
+    ids_array = pd.create_array('int64')
+    scores_array = pd.create_array('float32')
+
+    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
+    init_scores = pd.data(
+        name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+    pd.array_write(init_ids, array=ids_array, i=counter)
+    pd.array_write(init_scores, array=scores_array, i=counter)
+
+    cond = pd.less_than(x=counter, y=array_len)
+
+    while_op = pd.While(cond=cond)
+    with while_op.block():
+        pre_ids = pd.array_read(array=ids_array, i=counter)
+        pre_state = pd.array_read(array=state_array, i=counter)
+        pre_score = pd.array_read(array=scores_array, i=counter)
+
+        # expand the lod of pre_state to be the same with pre_score
+        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
+
+        pre_ids_emb = pd.embedding(
+            input=pre_ids,
+            size=[dict_size, word_dim],
+            dtype='float32',
+            is_sparse=IS_SPARSE)
+
+        # use rnn unit to update rnn
+        current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
+                              size=decoder_size,
+                              act='tanh')
+
+        # use score to do beam search
+        current_score = pd.fc(input=current_state,
+                              size=target_dict_dim,
+                              act='softmax')
+        topk_scores, topk_indices = pd.topk(current_score, k=50)
+        selected_ids, selected_scores = pd.beam_search(
+            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
+
+        pd.increment(x=counter, value=1, in_place=True)
+
+        # update the memories
+        pd.array_write(current_state, array=state_array, i=counter)
+        pd.array_write(selected_ids, array=ids_array, i=counter)
+        pd.array_write(selected_scores, array=scores_array, i=counter)
+
+        pd.less_than(x=counter, y=array_len, cond=cond)
+
+    translation_ids, translation_scores = pd.beam_search_decode(
+        ids=ids_array, scores=scores_array)
+
+    # return init_ids, init_scores
+
+    return translation_ids, translation_scores
+
+
+def set_init_lod(data, lod, place):
+    res = core.LoDTensor()
+    res.set(data, place)
+    res.set_lod(lod)
+    return res
+
+
 def to_lodtensor(data, place):
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
@@ -88,12 +171,13 @@ def to_lodtensor(data, place):
    return res


-def main():
-    rnn_out = encoder_decoder()
-    label = layers.data(
+def train_main():
+    context = encoder()
+    rnn_out = decoder_train(context)
+    label = pd.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
-    cost = layers.cross_entropy(input=rnn_out, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
+    cost = pd.cross_entropy(input=rnn_out, label=label)
+    avg_cost = pd.mean(x=cost)

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
    optimizer.minimize(avg_cost)
@@ -103,13 +187,12 @@ def main():
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)

-    place = core.CPUPlace()
    exe = Executor(place)

    exe.run(framework.default_startup_program())

    batch_id = 0
-    for pass_id in xrange(2):
+    for pass_id in xrange(1):
        for data in train_data():
            word_data = to_lodtensor(map(lambda x: x[0], data), place)
            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
@@ -125,9 +208,48 @@ def main():
            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                  " avg_cost=" + str(avg_cost_val))
            if batch_id > 3:
-                exit(0)
+                break
            batch_id += 1


+def decode_main():
+    context = encoder()
+    translation_ids, translation_scores = decoder_decode(context)
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
+    init_scores_data = np.array(
+        [1. for _ in range(batch_size)], dtype='float32')
+    init_ids_data = init_ids_data.reshape((batch_size, 1))
+    init_scores_data = init_scores_data.reshape((batch_size, 1))
+    init_lod = [i for i in range(batch_size)] + [batch_size]
+    init_lod = [init_lod, init_lod]
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+    for _, data in enumerate(train_data()):
+        init_ids = set_init_lod(init_ids_data, init_lod, place)
+        init_scores = set_init_lod(init_scores_data, init_lod, place)
+
+        src_word_data = to_lodtensor(map(lambda x: x[0], data), place)
+
+        result_ids, result_scores = exe.run(
+            framework.default_main_program(),
+            feed={
+                'src_word_id': src_word_data,
+                'init_ids': init_ids,
+                'init_scores': init_scores
+            },
+            fetch_list=[translation_ids, translation_scores],
+            return_numpy=False)
+        print result_ids.lod()
+        break
+
+
 if __name__ == '__main__':
-    main()
+    # train_main()
+    decode_main()
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import paddle.v2.fluid as fluid
+import paddle.v2 as paddle
+import sys
+import numpy
+
+
+def parse_arg():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "nn_type",
+        help="The neural network type, in ['mlp', 'conv']",
+        type=str,
+        choices=['mlp', 'conv'])
+    parser.add_argument(
+        "--parallel",
+        help='Run in parallel or not',
+        default=False,
+        action="store_true")
+    parser.add_argument(
+        "--use_cuda",
+        help="Run the program by using CUDA",
+        default=False,
+        action="store_true")
+    return parser.parse_args()
+
+
+BATCH_SIZE = 64
+
+
+def loss_net(hidden, label):
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    return fluid.layers.mean(x=loss), fluid.layers.accuracy(
+        input=prediction, label=label)
+
+
+def mlp(img, label):
+    hidden = fluid.layers.fc(input=img, size=200, act='tanh')
+    hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
+    return loss_net(hidden, label)
+
+
+def conv_net(img, label):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    return loss_net(conv_pool_2, label)
+
+
+def main():
+    args = parse_arg()
+    print("recognize digits with args: {0}".format(" ".join(sys.argv[1:])))
+
+    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if args.nn_type == 'mlp':
+        net_conf = mlp
+    else:
+        net_conf = conv_net
+
+    if args.parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            img_ = pd.read_input(img)
+            label_ = pd.read_input(label)
+            for o in net_conf(img_, label_):
+                pd.write_output(o)
+
+        avg_loss, acc = pd()
+        # get mean loss and acc through every devices.
+        avg_loss = fluid.layers.mean(x=avg_loss)
+        acc = fluid.layers.mean(x=acc)
+    else:
+        avg_loss, acc = net_conf(img, label)
+
+    test_program = fluid.default_main_program().clone()
+
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+    optimizer.minimize(avg_loss)
+
+    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
+
+    PASS_NUM = 100
+    for pass_id in range(PASS_NUM):
+        for batch_id, data in enumerate(train_reader()):
+            # train a mini-batch, fetch nothing
+            exe.run(feed=feeder.feed(data))
+            if (batch_id + 1) % 10 == 0:
+                acc_set = []
+                avg_loss_set = []
+                for test_data in test_reader():
+                    acc_np, avg_loss_np = exe.run(program=test_program,
+                                                  feed=feeder.feed(test_data),
+                                                  fetch_list=[acc, avg_loss])
+                    acc_set.append(float(acc_np))
+                    avg_loss_set.append(float(avg_loss_np))
+                # get test acc and loss
+                acc_val = numpy.array(acc_set).mean()
+                avg_loss_val = numpy.array(avg_loss_set).mean()
+                if float(acc_val) > 0.85:  # test acc > 85%
+                    exit(0)
+                else:
+                    print(
+                        'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
+                        format(pass_id, batch_id + 1,
+                               float(avg_loss_val), float(acc_val)))
+
+
+if __name__ == '__main__':
+    main()
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-
-images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
-label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-conv_pool_1 = fluid.nets.simple_img_conv_pool(
-    input=images,
-    filter_size=5,
-    num_filters=20,
-    pool_size=2,
-    pool_stride=2,
-    act="relu")
-conv_pool_2 = fluid.nets.simple_img_conv_pool(
-    input=conv_pool_1,
-    filter_size=5,
-    num_filters=50,
-    pool_size=2,
-    pool_stride=2,
-    act="relu")
-
-predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
-cost = fluid.layers.cross_entropy(input=predict, label=label)
-avg_cost = fluid.layers.mean(x=cost)
-optimizer = fluid.optimizer.Adam(learning_rate=0.01)
-optimizer.minimize(avg_cost)
-
-accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-
-BATCH_SIZE = 50
-PASS_NUM = 3
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.mnist.train(), buf_size=500),
-    batch_size=BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
-exe.run(fluid.default_startup_program())
-
-for pass_id in range(PASS_NUM):
-    accuracy.reset(exe)
-    for data in train_reader():
-        loss, acc = exe.run(fluid.default_main_program(),
-                            feed=feeder.feed(data),
-                            fetch_list=[avg_cost] + accuracy.metrics)
-        pass_acc = accuracy.eval(exe)
-        print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" +
-              str(pass_acc))
-        # print loss, acc
-        if loss < 10.0 and pass_acc > 0.9:
-            # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
-            exit(0)
-
-    pass_acc = accuracy.eval(exe)
-    print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
-
-exit(1)
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-
-BATCH_SIZE = 128
-image = fluid.layers.data(name='x', shape=[784], dtype='float32')
-
-regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
-
-hidden1 = fluid.layers.fc(input=image,
-                          size=128,
-                          act='relu',
-                          param_attr=fluid.ParamAttr(
-                              regularizer=regularizer,
-                              gradient_clip=fluid.clip.ClipByValue(10)))
-
-hidden2 = fluid.layers.fc(input=hidden1,
-                          size=64,
-                          act='relu',
-                          param_attr=regularizer)
-
-predict = fluid.layers.fc(input=hidden2,
-                          size=10,
-                          act='softmax',
-                          param_attr=regularizer)
-
-label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-
-cost = fluid.layers.cross_entropy(input=predict, label=label)
-avg_cost = fluid.layers.mean(x=cost)
-
-optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
-opts = optimizer.minimize(avg_cost)
-
-accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-
-inference_program = fluid.default_main_program().clone()
-with fluid.program_guard(inference_program):
-    test_accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-    test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
-    inference_program = fluid.io.get_inference_program(test_target)
-
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.mnist.train(), buf_size=8192),
-    batch_size=BATCH_SIZE)
-
-test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
-exe.run(fluid.default_startup_program())
-
-PASS_NUM = 100
-for pass_id in range(PASS_NUM):
-    accuracy.reset(exe)
-    for data in train_reader():
-        out, acc = exe.run(fluid.default_main_program(),
-                           feed=feeder.feed(data),
-                           fetch_list=[avg_cost] + accuracy.metrics)
-        pass_acc = accuracy.eval(exe)
-
-        test_accuracy.reset(exe)
-        for data in test_reader():
-            out, acc = exe.run(inference_program,
-                               feed=feeder.feed(data),
-                               fetch_list=[avg_cost] + test_accuracy.metrics)
-
-        test_pass_acc = test_accuracy.eval(exe)
-        print("pass_id=" + str(pass_id) + " train_cost=" + str(
-            out) + " train_acc=" + str(acc) + " train_pass_acc=" + str(pass_acc)
-              + " test_acc=" + str(test_pass_acc))
-
-        if test_pass_acc > 0.7:
-            fluid.io.save_inference_model(
-                "./recognize_digits_mlp.inference.model/", ["x"], [predict],
-                exe)
-            exit(0)
-
-exit(1)
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
@@ -68,10 +68,10 @@ else:
        fluid.io.save_persistables(exe, "./fit_a_line.model/")
        fluid.io.load_persistables(exe, "./fit_a_line.model/")
        for data in train_reader():
-            avg_loss_value, = exe.run(trainer_prog,
-                                      feed=feeder.feed(data),
-                                      fetch_list=[avg_cost])
-
+            avg_loss_value = exe.run(trainer_prog,
+                                     feed=feeder.feed(data),
+                                     fetch_list=[avg_cost])
+            print("loss:" + str(avg_loss_value))
            if avg_loss_value[0] < 10.0:
                exit(0)
 exit(1)
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import os
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+word_dim = 16
+IS_SPARSE = True
+batch_size = 10
+max_length = 50
+topk_size = 50
+trg_dic_size = 10000
+
+decoder_size = hidden_dim
+
+
+def encoder_decoder():
+    # encoder
+    src_word_id = layers.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
+
+    # decoder
+    trg_language_word = layers.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        mem = rnn.memory(init=encoder_out)
+        fc1 = fluid.layers.fc(input=[current_word, mem],
+                              size=decoder_size,
+                              act='tanh')
+        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
+        rnn.update_memory(mem, fc1)
+        rnn.output(out)
+
+    return rnn()
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    rnn_out = encoder_decoder()
+    label = layers.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    t = fluid.DistributeTranspiler()
+    # all parameter server endpoints list for spliting parameters
+    pserver_endpoints = os.getenv("PSERVERS")
+    # server endpoint for current node
+    current_endpoint = os.getenv("SERVER_ENDPOINT")
+    # run as trainer or parameter server
+    training_role = os.getenv(
+        "TRAINING_ROLE", "TRAINER")  # get the training role: trainer/pserver
+    t.transpile(
+        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+    if training_role == "PSERVER":
+        if not current_endpoint:
+            print("need env SERVER_ENDPOINT")
+            exit(1)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
+        exe.run(pserver_prog)
+    elif training_role == "TRAINER":
+        trainer_prog = t.get_trainer_program()
+        exe.run(framework.default_startup_program())
+
+        batch_id = 0
+        for pass_id in xrange(2):
+            for data in train_data():
+                word_data = to_lodtensor(map(lambda x: x[0], data), place)
+                trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+                trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+                outs = exe.run(trainer_prog,
+                               feed={
+                                   'src_word_id': word_data,
+                                   'target_language_word': trg_word,
+                                   'target_language_next_word': trg_word_next
+                               },
+                               fetch_list=[avg_cost])
+                avg_cost_val = np.array(outs[0])
+                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                      " avg_cost=" + str(avg_cost_val))
+                if batch_id > 3:
+                    exit(0)
+                batch_id += 1
+    else:
+        print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+
+
+if __name__ == '__main__':
+    main()
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -16,6 +16,11 @@ import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid

+# need to fix random seed and training data to compare the loss
+# value accurately calculated by the default and the memory optimization
+# version.
+fluid.default_startup_program().random_seed = 111
+
 x = fluid.layers.data(name='x', shape=[13], dtype='float32')

 y_predict = fluid.layers.fc(input=x, size=1, act=None)
@@ -28,15 +33,18 @@ avg_cost = fluid.layers.mean(x=cost)
 sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
 sgd_optimizer.minimize(avg_cost)

-# memopt_program = fluid.default_main_program()
-memopt_program = fluid.memory_optimize(fluid.default_main_program())
+fluid.memory_optimize(fluid.default_main_program())

 BATCH_SIZE = 200

+# fix the order of training data
 train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.uci_housing.train(), buf_size=500),
-    batch_size=BATCH_SIZE)
+    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+
+# train_reader = paddle.batch(
+#     paddle.reader.shuffle(
+#         paddle.dataset.uci_housing.train(), buf_size=500),
+#     batch_size=BATCH_SIZE)

 place = fluid.CPUPlace()
 feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
@@ -49,7 +57,7 @@ for pass_id in range(PASS_NUM):
    fluid.io.save_persistables(exe, "./fit_a_line.model/")
    fluid.io.load_persistables(exe, "./fit_a_line.model/")
    for data in train_reader():
-        avg_loss_value, = exe.run(memopt_program,
+        avg_loss_value, = exe.run(fluid.default_main_program(),
                                  feed=feeder.feed(data),
                                  fetch_list=[avg_cost])


--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -19,6 +19,11 @@ import sys
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid

+# need to fix random seed and training data to compare the loss
+# value accurately calculated by the default and the memory optimization
+# version.
+fluid.default_startup_program().random_seed = 111
+

 def resnet_cifar10(input, depth=32):
    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
@@ -117,31 +122,37 @@ opts = optimizer.minimize(avg_cost)

 accuracy = fluid.evaluator.Accuracy(input=predict, label=label)

-# memopt_program = fluid.default_main_program()
-memopt_program = fluid.memory_optimize(fluid.default_main_program())
+fluid.memory_optimize(fluid.default_main_program())

 BATCH_SIZE = 128
 PASS_NUM = 1

+# fix the order of training data
 train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.cifar.train10(), buf_size=128 * 10),
-    batch_size=BATCH_SIZE)
+    paddle.dataset.cifar.train10(), batch_size=BATCH_SIZE)
+
+# train_reader = paddle.batch(
+#     paddle.reader.shuffle(
+#         paddle.dataset.cifar.train10(), buf_size=128 * 10),
+#     batch_size=BATCH_SIZE)

 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
 feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
 exe.run(fluid.default_startup_program())

+i = 0
 for pass_id in range(PASS_NUM):
    accuracy.reset(exe)
    for data in train_reader():
-        loss, acc = exe.run(memopt_program,
+        loss, acc = exe.run(fluid.default_main_program(),
                            feed=feeder.feed(data),
                            fetch_list=[avg_cost] + accuracy.metrics)
        pass_acc = accuracy.eval(exe)
        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
            pass_acc))
        # this model is slow, so if we can train two mini batch, we think it works properly.
-        exit(0)
+        if i > 2:
+            exit(0)
+        i += 1
 exit(1)
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -334,7 +334,7 @@ class OpTest(unittest.TestCase):

    def check_output(self, atol=1e-5):
        places = [core.CPUPlace()]
-        if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
+        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
            places.append(core.CUDAPlace(0))
        for place in places:
            self.check_output_with_place(place, atol)
@@ -367,7 +367,7 @@ class OpTest(unittest.TestCase):
                   max_relative_error=0.005,
                   user_defined_grads=None):
        places = [core.CPUPlace()]
-        if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
+        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
            places.append(core.CUDAPlace(0))
        for place in places:
            self.check_grad_with_place(place, inputs_to_check, output_names,

--- a/python/paddle/v2/fluid/tests/test_adagrad_op.py
+++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py
@@ -180,7 +180,7 @@ class TestSparseAdagradOp(unittest.TestCase):

    def test_sparse_adagrad(self):
        places = [core.CPUPlace()]
-        if core.is_compile_gpu():
+        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        for place in places:
            self.check_with_place(place)

--- a/python/paddle/v2/fluid/tests/test_adam_op.py
+++ b/python/paddle/v2/fluid/tests/test_adam_op.py
--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
--- a/python/paddle/v2/fluid/tests/test_detection_output_op.py
+++ b/python/paddle/v2/fluid/tests/test_detection_output_op.py
--- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
--- a/python/paddle/v2/fluid/tests/test_iou_similarity_op.py
+++ b/python/paddle/v2/fluid/tests/test_iou_similarity_op.py
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
--- a/python/paddle/v2/fluid/tests/test_lookup_table_op.py
+++ b/python/paddle/v2/fluid/tests/test_lookup_table_op.py
--- a/python/paddle/v2/fluid/tests/test_multihead_attention.py
+++ b/python/paddle/v2/fluid/tests/test_multihead_attention.py
--- a/python/paddle/v2/fluid/tests/test_nce.py
+++ b/python/paddle/v2/fluid/tests/test_nce.py
--- a/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
+++ b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
--- a/python/paddle/v2/fluid/tests/test_op_support_gpu.py
+++ b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
--- a/python/paddle/v2/fluid/tests/test_parallel_op.py
+++ b/python/paddle/v2/fluid/tests/test_parallel_op.py
--- a/python/paddle/v2/fluid/tests/test_prior_box_op.py
+++ b/python/paddle/v2/fluid/tests/test_prior_box_op.py
--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
--- a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
--- a/python/paddle/v2/fluid/tests/test_sgd_op.py
+++ b/python/paddle/v2/fluid/tests/test_sgd_op.py
--- a/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
--- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64