delete conflict

5f15037e · chengduoZH · 3cdb419b · 720994b4 · 5f15037e · 5f15037e
104 changed file
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -179,20 +179,24 @@ function(cc_library TARGET_NAME)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (cc_library_SRCS)
-    if (cc_library_SHARED OR cc_library_shared) # build *.so
+  if(cc_library_SRCS)
+    if(cc_library_SHARED OR cc_library_shared) # build *.so
      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
    else()
      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
    endif()
-    if (cc_library_DEPS)
+    if(cc_library_DEPS)
      # Don't need link libwarpctc.so
-      if ("${cc_library_DEPS};" MATCHES "warpctc;")
+      if("${cc_library_DEPS};" MATCHES "warpctc;")
        list(REMOVE_ITEM cc_library_DEPS warpctc)
        add_dependencies(${TARGET_NAME} warpctc)
      endif()
+      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+      target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
+      if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
+        list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
+      endif()
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
-      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
    endif()
    
    # cpplint code style

--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -323,6 +323,12 @@ batch_norm
 ..  autofunction:: paddle.v2.fluid.layers.batch_norm
    :noindex:

+layer_norm
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.layer_norm
+    :noindex:
+
 beam_search_decode
 ------------------


--- a/doc/design/cpp_data_feeding.md
+++ b/doc/design/cpp_data_feeding.md
+# C++ Data Feeding
+
+In training with Paddle V2 API, data feeding wholly dependents on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required. 
+
+In this document we show the fundamental design of C++ data feeding process, which includes the data reading, shuffling and batching.
+
+## Reader
+
+A new concept named 'Reader' is introduced. `Reader` is a series of inherited classes which can be hold by our `Variable` and they are used to read or process file data.
+
+
+### `ReaderBase`
+
+`ReaderBase` is the abstract base class of all readers. It defines the all readers' interfaces.
+
+```cpp
+class ReaderBase {
+ public:
+  explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
+    PADDLE_ENFORCE(!shapes_.empty());
+  }
+  // Read the next batch of data. (A 'batch' can be only one instance)
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  // Show whether the next bacth exists.
+  virtual bool HasNext() const = 0;
+  
+  // Reinitialize the reader and read the file from the begin.
+  virtual void ReInit() = 0;
+  
+  // Get a certain read in data's shape.
+  DDim shape(size_t idx) const;
+  // Get shapes of all read in data.
+  std::vector<DDim> shapes() const { return shapes_; }
+  // Set shapes of read in data.
+  void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
+
+  virtual ~ReaderBase() {}
+
+ protected:
+  std::vector<DDim> shapes_;
+};
+```
+
+### `FileReader` and `DecoratedReader`
+
+These two classes are derived from the `ReaderBase` and will further be derived by respective specific readers. That is to say, in our design, there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format, and yield only one instance of data at a time. e.g. RecordIO reader, jpg reader, .... A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some process on them(shuffling, or batching), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
+
+All the readers share exactly the same interfaces defined in `ReaderBase`. So they can be decorated for more than one time: We can **shuffle** a reader's outputs and then **batch** the shuffle outputs. The interface consistency also allows related ops use readers without knowing what they are exactly.
+
+
+### `ReaderHolder`
+
+Different readers belong to different class types. It leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
+
+```cpp
+var->Get<ReaderBase>("batch_reader");
+```
+
+we have to write:
+
+```cpp
+var->Get<BatchReader>("batch_reader");
+```
+
+This requires each time getting a reader from a variable we must know the reader's type exactly. It is nearly impossible.
+
+To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which erases reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
+
+## Related Operators
+
+To create and invoke readers, some now ops are introduced:
+
+### `CreateReaderOp`
+
+Each reader has its creating op. File readers' creating ops have no input and yield the created file reader as its output. Decorated readers' creating ops take the underlying readers as inputs and then yield new decorated readers.
+
+### `ReadOp`
+
+A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector<LoDTenosr>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables.
--- a/doc/getstarted/quickstart_cn.rst
+++ b/doc/getstarted/quickstart_cn.rst
 快速开始
 ========

+快速安装
+--------
+
 PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
 执行下面的命令完成快速安装，版本为cpu_avx_openblas：

@@ -16,6 +19,9 @@ PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.

 更详细的安装和编译方法参考：:ref:`install_steps` 。

+快速使用
+--------
+
 创建一个 housing.py 并粘贴此Python代码：

  .. code-block:: python

--- a/doc/getstarted/quickstart_en.rst
+++ b/doc/getstarted/quickstart_en.rst
 Quick Start
 ============

+Quick Install
+-------------
+
 You can use pip to install PaddlePaddle with a single command, supports
 CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
 Simply run the following command to install, the version is cpu_avx_openblas:
@@ -17,6 +20,9 @@ If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:

 For more details about installation and build: :ref:`install_steps` .

+Quick Use
+---------
+
 Create a new file called housing.py, and paste this Python
 code:


--- a/doc/howto/cluster/index_cn.rst
+++ b/doc/howto/cluster/index_cn.rst
 分布式训练
 ==========

+本节将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
+
+.. image:: src/ps_cn.png
+   :width: 500
+
+- 数据分片（Data shard): 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
+- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
+- 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
+
+这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
+
+在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
+
 ..  toctree::
  :maxdepth: 1

-  introduction_cn.md
  preparations_cn.md
  cmd_argument_cn.md
  multi_cluster/index_cn.rst
--- a/doc/howto/cluster/index_en.rst
+++ b/doc/howto/cluster/index_en.rst
 Distributed Training
 ====================

+In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
+
+.. image:: src/ps_en.png
+   :width: 500
+
+- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
+- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
+- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
+
+PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
+
+When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
+
 ..  toctree::
  :maxdepth: 1

-  introduction_en.md
  preparations_en.md
  cmd_argument_en.md
  multi_cluster/index_en.rst
--- a/doc/howto/cluster/introduction_cn.md
+++ b/doc/howto/cluster/introduction_cn.md
-## 概述
-
-本节将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
-
-<img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
-
- 数据分片（Data shard): 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
- 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
-
-这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
-
-在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
--- a/doc/howto/cluster/introduction_en.md
+++ b/doc/howto/cluster/introduction_en.md
-## Introduction
-
-In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
-
-<img src="https://user-images.githubusercontent.com/13348433/31772146-41523d84-b511-11e7-8a12-a69fd136c283.png" width="500">
-
- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
-
-PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
-
-When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
--- a/doc/howto/cluster/src/ps_cn.png
+++ b/doc/howto/cluster/src/ps_cn.png
--- a/doc/howto/cluster/src/ps_en.png
+++ b/doc/howto/cluster/src/ps_en.png
--- a/doc/howto/rnn/index_cn.rst
+++ b/doc/howto/rnn/index_cn.rst
-RNN相关模型
+RNN模型
 ===========

 ..  toctree::

--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -8,5 +8,4 @@ PaddlePaddle 文档
  build_and_install/index_cn.rst
  howto/index_cn.rst
  dev/index_cn.rst
-  api/index_cn.rst
  faq/index_cn.rst
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -8,4 +8,3 @@ PaddlePaddle Documentation
  build_and_install/index_en.rst
  howto/index_en.rst
  dev/index_en.rst
-  api/index_en.rst
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -20,6 +20,7 @@ endif()

 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)

--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -162,9 +162,8 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
    : prog_(prog), desc_(desc) {
  need_update_ = true;
  for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDesc(*op, this));
+    ops_.emplace_back(new OpDesc(*op->Proto(), prog, this));
  }
-
  for (auto &it : other.vars_) {
    auto *var = new VarDesc(*it.second);
    vars_[it.first].reset(var);

--- a/paddle/framework/channel.h
+++ b/paddle/framework/channel.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -22,6 +22,28 @@ limitations under the License. */
 using paddle::framework::Channel;
 using paddle::framework::MakeChannel;
 using paddle::framework::CloseChannel;
+using paddle::framework::details::Buffered;
+using paddle::framework::details::UnBuffered;
+
+void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
+  unsigned sum_send = 0;
+  std::thread t([&]() {
+    for (int i = 0; i < 5; i++) {
+      EXPECT_EQ(ch->Send(&i), true);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    int recv;
+    EXPECT_EQ(ch->Receive(&recv), true);
+    EXPECT_EQ(recv, i);
+  }
+
+  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 10U);
+  delete ch;
+}

 TEST(Channel, MakeAndClose) {
  using paddle::framework::details::Buffered;
@@ -60,13 +82,54 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
  delete ch;
 }

-TEST(Channel, SendOnClosedChannelPanics) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  size_t i = 5;
-  EXPECT_EQ(ch->Send(&i), true);  // should not block or panic
+// This tests that a  channel must return false
+// on send and receive performed after closing the channel.
+// Receive will only return false after close when queue is empty.
+// By creating separate threads for sending and receiving, we make this
+// function able to test both buffered and unbuffered channels.
+void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
+  const size_t data = 5;
+  std::thread send_thread{[&]() {
+    size_t i = data;
+    EXPECT_EQ(ch->Send(&i), true);  // should not block
+  }};
+
+  std::thread recv_thread{[&]() {
+    size_t i;
+    EXPECT_EQ(ch->Receive(&i), true);  // should not block
+    EXPECT_EQ(i, data);
+  }};
+
+  send_thread.join();
+  recv_thread.join();
+
+  // After closing send should return false. Receive should
+  // also return false as there is no data in queue.
  CloseChannel(ch);
-  EXPECT_EQ(ch->Send(&i), false);  // should panic
+  send_thread = std::thread{[&]() {
+    size_t i = data;
+    EXPECT_EQ(ch->Send(&i), false);  // should return false
+  }};
+  recv_thread = std::thread{[&]() {
+    size_t i;
+    // should return false because channel is closed and queue is empty
+    EXPECT_EQ(ch->Receive(&i), false);
+  }};
+
+  send_thread.join();
+  recv_thread.join();
+}
+
+TEST(Channel, SendReceiveClosedBufferedChannelPanics) {
+  size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  SendReceiveWithACloseChannelShouldPanic(ch);
+  delete ch;
+}
+
+TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) {
+  auto ch = MakeChannel<size_t>(0);
+  SendReceiveWithACloseChannelShouldPanic(ch);
  delete ch;
 }

@@ -94,9 +157,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {

  for (size_t i = 0; i < buffer_size; ++i) {
    EXPECT_EQ(ch->Receive(&out),
-              false);  // after receiving residual values, return zeros.
-    // Note: we cannot check EXPECT_EQ(out, 0), because C++ doesn't
-    // define zero values like Go does.
+              false);  // receiving on closed channel should return false
  }
  delete ch;
 }
@@ -115,7 +176,7 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
      sum += i;
    }
  });
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.5 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
  EXPECT_EQ(sum, 45U);

  CloseChannel(ch);
@@ -123,31 +184,17 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
  delete ch;
 }

-TEST(Channel, SimpleUnbufferedChannelTest) {
+TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
  auto ch = MakeChannel<int>(0);
-  unsigned sum_send = 0;
-  std::thread t([&]() {
-    for (int i = 0; i < 5; i++) {
-      EXPECT_EQ(ch->Send(&i), true);
-      sum_send += i;
-    }
-  });
-  for (int i = 0; i < 5; i++) {
-    int recv;
-    EXPECT_EQ(ch->Receive(&recv), true);
-    EXPECT_EQ(recv, i);
-  }
+  RecevingOrderEqualToSendingOrder(ch);
+}

-  CloseChannel(ch);
-  t.join();
-  EXPECT_EQ(sum_send, 10U);
-  delete ch;
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) {
+  auto ch = MakeChannel<int>(10);
+  RecevingOrderEqualToSendingOrder(ch);
 }

-// This tests that closing a buffered channel also unblocks
-//  any receivers waiting on the channel
-TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
-  auto ch = MakeChannel<int>(1);
+void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
  size_t num_threads = 5;
  std::thread t[num_threads];
  bool thread_ended[num_threads];
@@ -158,15 +205,14 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
    t[i] = std::thread(
        [&](bool *p) {
          int data;
-          // All reads should return false
          EXPECT_EQ(ch->Receive(&data), false);
          *p = true;
        },
        &thread_ended[i]);
  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec

-  // Verify that all threads are blocked
+  // Verify that all the threads are blocked
  for (size_t i = 0; i < num_threads; i++) {
    EXPECT_EQ(thread_ended[i], false);
  }
@@ -175,7 +221,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
  // This should unblock all receivers
  CloseChannel(ch);

-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec

  // Verify that all threads got unblocked
  for (size_t i = 0; i < num_threads; i++) {
@@ -183,13 +229,12 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
  }

  for (size_t i = 0; i < num_threads; i++) t[i].join();
-  delete ch;
 }

-// This tests that closing a buffered channel also unblocks
-//  any senders waiting for channel to have write space
-TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
-  auto ch = MakeChannel<int>(1);
+void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
+  using paddle::framework::details::Buffered;
+  using paddle::framework::details::UnBuffered;
+
  size_t num_threads = 5;
  std::thread t[num_threads];
  bool thread_ended[num_threads];
@@ -209,34 +254,56 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
  }
  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait

-  // Verify that atleast 4 threads are blocked
-  int ct = 0;
-  for (size_t i = 0; i < num_threads; i++) {
-    if (thread_ended[i] == false) ct++;
+  if (dynamic_cast<Buffered<int> *>(ch)) {
+    // If ch is Buffered, atleast 4 threads must be blocked.
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (!thread_ended[i]) ct++;
+    }
+    EXPECT_GE(ct, 4);
+  } else {
+    // If ch is UnBuffered, all the threads should be blocked.
+    for (size_t i = 0; i < num_threads; i++) {
+      EXPECT_EQ(thread_ended[i], false);
+    }
  }
-  // Atleast 4 threads must be blocked
-  EXPECT_GE(ct, 4);
-
  // Explicitly close the thread
  // This should unblock all senders
  CloseChannel(ch);

-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait

  // Verify that all threads got unblocked
  for (size_t i = 0; i < num_threads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }

-  // Verify that only 1 send was successful
-  ct = 0;
-  for (size_t i = 0; i < num_threads; i++) {
-    if (send_success[i]) ct++;
+  if (dynamic_cast<Buffered<int> *>(ch)) {
+    // Verify that only 1 send was successful
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (send_success[i]) ct++;
+    }
+    // Only 1 send must be successful
+    EXPECT_EQ(ct, 1);
  }
-  // Only 1 send must be successful
-  EXPECT_EQ(ct, 1);

  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that closing a buffered channel also unblocks
+//  any receivers waiting on the channel
+TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(1);
+  ChannelCloseUnblocksReceiversTest(ch);
+  delete ch;
+}
+
+// This tests that closing a buffered channel also unblocks
+//  any senders waiting for channel to have write space
+TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(1);
+  ChannelCloseUnblocksSendersTest(ch);
  delete ch;
 }

@@ -244,40 +311,7 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
 //  unblocks any receivers waiting for senders
 TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
  auto ch = MakeChannel<int>(0);
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-
-  // Launches threads that try to read and are blocked becausew of no writers
-  for (size_t i = 0; i < num_threads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          EXPECT_EQ(ch->Receive(&data), false);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
-
-  // Verify that all the threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-
-  // Explicitly close the thread
-  // This should unblock all receivers
-  CloseChannel(ch);
-
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
-
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  ChannelCloseUnblocksReceiversTest(ch);
  delete ch;
 }

@@ -285,40 +319,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
 //  unblocks any senders waiting for senders
 TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
  auto ch = MakeChannel<int>(0);
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-
-  // Launches threads that try to read and are blocked becausew of no writers
-  for (size_t i = 0; i < num_threads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data = 10;
-          EXPECT_EQ(ch->Send(&data), false);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
-
-  // Verify that all the threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-
-  // Explicitly close the thread
-  // This should unblock all receivers
-  CloseChannel(ch);
-
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
-
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  ChannelCloseUnblocksReceiversTest(ch);
  delete ch;
 }

@@ -381,3 +382,129 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
  EXPECT_EQ(sum_receive, 28U);
  delete ch;
 }
+
+// This tests that destroying a channel unblocks
+//  any senders waiting for channel to have write space
+void ChannelDestroyUnblockSenders(Channel<int> *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  bool is_buffered_channel = false;
+  if (dynamic_cast<Buffered<int> *>(ch)) is_buffered_channel = true;
+
+  if (is_buffered_channel) {
+    // If channel is buffered, verify that atleast 4 threads are blocked
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (thread_ended[i] == false) ct++;
+    }
+    // Atleast 4 threads must be blocked
+    EXPECT_GE(ct, 4);
+  } else {
+    // Verify that all the threads are blocked
+    for (size_t i = 0; i < num_threads; i++) {
+      EXPECT_EQ(thread_ended[i], false);
+    }
+  }
+  // Explicitly destroy the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  // Count number of successfuld sends
+  int ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (send_success[i]) ct++;
+  }
+
+  if (is_buffered_channel) {
+    // Only 1 send must be successful
+    EXPECT_EQ(ct, 1);
+  } else {
+    // In unbuffered channel, no send should be successful
+    EXPECT_EQ(ct, 0);
+  }
+
+  // Join all threads
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that destroying a channel also unblocks
+//  any receivers waiting on the channel
+void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          // All reads should return false
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // delete the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
+  size_t buffer_size = 1;
+  auto ch = MakeChannel<int>(buffer_size);
+  ChannelDestroyUnblockReceivers(ch);
+}
+
+TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
+  size_t buffer_size = 1;
+  auto ch = MakeChannel<int>(buffer_size);
+  ChannelDestroyUnblockSenders(ch);
+}
+
+// This tests that destroying an unbuffered channel also unblocks
+//  unblocks any receivers waiting for senders
+TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelDestroyUnblockReceivers(ch);
+}
+
+TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelDestroyUnblockSenders(ch);
+}
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,6 +25,14 @@ namespace paddle {
 namespace framework {
 namespace details {

+// Four of the properties of Buffered Channel:
+// - A send to a full channel blocks temporarily until a receive from the
+// channel or the channel is closed.
+// - A receive from an empty channel blocks temporarily until a send to the
+// channel or the channel is closed.
+// - A send to a closed channel returns false immediately.
+// - A receive from a closed channel returns false immediately.
+
 template <typename T>
 class Buffered : public paddle::framework::Channel<T> {
  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
@@ -42,8 +50,11 @@ class Buffered : public paddle::framework::Channel<T> {
  std::mutex mu_;
  std::condition_variable empty_cond_var_;
  std::condition_variable full_cond_var_;
+  std::condition_variable destructor_cond_var_;
  std::deque<T> channel_;
  std::atomic<bool> closed_{false};
+  std::atomic<unsigned> send_ctr{0};
+  std::atomic<unsigned> recv_ctr{0};

  Buffered(size_t cap) : cap_(cap), closed_(false) {
    PADDLE_ENFORCE_GT(cap, 0);
@@ -58,6 +69,7 @@ bool Buffered<T>::Send(T* item) {
  if (closed_) {
    return ret;
  }
+  send_ctr++;
  std::unique_lock<std::mutex> lock(mu_);
  full_cond_var_.wait(lock,
                      [this]() { return channel_.size() < cap_ || closed_; });
@@ -67,20 +79,30 @@ bool Buffered<T>::Send(T* item) {
    empty_cond_var_.notify_one();
    ret = true;
  }
+  send_ctr--;
+  destructor_cond_var_.notify_one();
  return ret;
 }

 template <typename T>
 bool Buffered<T>::Receive(T* item) {
+  bool ret = false;
+  // Once the channel has been closed and all data has been consumed,
+  // just return false. Don't even try acquiring the mutex.
+  if (closed_ && channel_.empty()) {
+    return false;
+  }
+  recv_ctr++;
  std::unique_lock<std::mutex> lock(mu_);
  empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
-  bool ret = false;
  if (!channel_.empty()) {
    *item = std::move(channel_.front());
    channel_.pop_front();
    full_cond_var_.notify_one();
    ret = true;
  }
+  recv_ctr--;
+  destructor_cond_var_.notify_one();
  return ret;
 }

@@ -100,6 +122,12 @@ Buffered<T>::~Buffered() {
  closed_ = true;
  channel_.clear();
  NotifyAllParticipants(&lock);
+
+  // The destructor must wait for all readers and writers to complete their task
+  // The channel has been closed, so we will not accept new readers and writers
+  lock.lock();
+  destructor_cond_var_.wait(
+      lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
 }

 template <typename T>

--- a/paddle/framework/details/cow_ptr.h
+++ b/paddle/framework/details/cow_ptr.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.

--- a/paddle/framework/details/cow_ptr_test.cc
+++ b/paddle/framework/details/cow_ptr_test.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.

--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
@@ -23,6 +23,13 @@ namespace paddle {
 namespace framework {
 namespace details {

+// Four of the properties of UnBuffered Channel:
+// - A send to a channel blocks temporarily until a receive from the
+// channel or the channel is closed.
+// - A receive from a channel blocks temporarily until a send to the
+// channel or the channel is closed.
+// - A send to a closed channel returns false immediately.
+// - A receive from a closed channel returns false immediately.
 template <typename T>
 class UnBuffered : public paddle::framework::Channel<T> {
  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
@@ -45,9 +52,11 @@ class UnBuffered : public paddle::framework::Channel<T> {
  // A transaction occurs only when both are true
  std::atomic<bool> reader_found_{false}, writer_found_{false};
  std::condition_variable cv_channel_;
-  std::condition_variable_any cv_reader_, cv_writer_;
+  std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_;
  T* item{nullptr};
  std::atomic<bool> closed_{false};
+  std::atomic<unsigned> send_ctr{0};
+  std::atomic<unsigned> recv_ctr{0};

  UnBuffered() : closed_(false) {}

@@ -62,6 +71,7 @@ bool UnBuffered<T>::Send(T* data) {
  if (closed_) {
    return ret;
  }
+  send_ctr++;
  // Prevent other writers from entering
  std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
  writer_found_ = true;
@@ -81,6 +91,8 @@ bool UnBuffered<T>::Send(T* data) {
    ret = true;
  }
  writer_found_ = false;
+  send_ctr--;
+  cv_destructor_.notify_one();
  return ret;
 }

@@ -88,6 +100,12 @@ bool UnBuffered<T>::Send(T* data) {
 // data that was sent by a writer is read from a reader.
 template <typename T>
 bool UnBuffered<T>::Receive(T* data) {
+  bool ret = false;
+  // If channel is closed, we don't even want any reader to enter.
+  // Unlike a buffered channel, an unbuffered channel does not allow
+  // readers to read after closing because there is no buffer to be consumed.
+  if (closed_) return ret;
+  recv_ctr++;
  // Prevent other readers from entering
  std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
  reader_found_ = true;
@@ -96,7 +114,6 @@ bool UnBuffered<T>::Receive(T* data) {
  cv_reader_.wait(cv_lock,
                  [this]() { return writer_found_ == true || closed_; });
  cv_writer_.notify_one();
-  bool ret = false;
  if (!closed_) {
    std::unique_lock<std::mutex> lock_ch{mu_ch_};
    // Reader should wait for the writer to first write its data
@@ -110,6 +127,8 @@ bool UnBuffered<T>::Receive(T* data) {
    cv_channel_.notify_one();
  }
  reader_found_ = false;
+  recv_ctr--;
+  cv_destructor_.notify_one();
  return ret;
 }

@@ -135,6 +154,9 @@ UnBuffered<T>::~UnBuffered() {
  item = nullptr;
  closed_ = true;
  NotifyAllParticipants(&lock);
+  lock.lock();
+  cv_destructor_.wait(lock,
+                      [this]() { return send_ctr == 0 && recv_ctr == 0; });
 }

 // This function notifies all the readers, writers and

--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -122,6 +122,11 @@ class GradOpDescMakerBase {
    return it->second;
  }

+  template <typename T>
+  inline const T& Attr(const std::string& name) const {
+    return boost::get<T>(GetAttr(name));
+  }
+
  std::string ForwardOpType() const { return this->fwd_op_.Type(); }

 private:

--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -48,12 +48,26 @@ namespace framework {
 */
 struct LoD : public std::vector<Vector<size_t>> {
  using std::vector<Vector<size_t>>::vector;
+  platform::Place place() const {
+    if (this->size() == 0) {
+      // Not Initialze Yet.
+      return platform::CPUPlace();
+    } else {
+      return this->front().place();
+    }
+  }

  void CopyFromCUDA() {
    for (auto it = this->begin(); it != this->end(); ++it) {
      it->CopyFromCUDA();
    }
  }
+
+  void CopyToPeer(platform::Place place) {
+    for (auto it = this->begin(); it != this->end(); ++it) {
+      it->CopyToPeer(place);
+    }
+  }
 };

 std::ostream& operator<<(std::ostream& os, const LoD& lod);

--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -28,28 +28,6 @@ __global__ void test(size_t* a, int size) {
  }
 }

-TEST(Vector, Normal) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::memory;
-
-  paddle::framework::InitDevices();
-
-  paddle::framework::Vector<size_t> vec({1, 2, 3});
-  size_t* ptr = vec.data();
-  for (size_t i = 0; i < vec.size(); ++i) {
-    EXPECT_EQ(vec[i], *(ptr + i));
-  }
-
-  vec.clear();
-  vec.CopyFromCUDA();
-
-  std::vector<size_t> v = {1, 2, 3};
-  for (size_t i = 0; i < v.size(); ++i) {
-    EXPECT_EQ(v[i], vec[i]);
-  }
-}
-
 TEST(LoD, data) {
  paddle::framework::InitDevices();


--- a/paddle/framework/mixed_vector.h
+++ b/paddle/framework/mixed_vector.h
@@ -40,20 +40,21 @@ class Vector : public std::vector<T> {
  Vector() {}
  Vector(const std::vector<T> &v) : std::vector<T>(v) {}  // NOLINT

-  virtual ~Vector() {
-#ifdef PADDLE_WITH_CUDA
-    if (cuda_ptr_ != nullptr) {
-      memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
-    }
-#endif
-  }
+  inline platform::Place place() const { return place_; }

+  /*! Return a pointer to constant memory block. */
+  inline const T *data(platform::Place place) const;
+
+  /*! Return a pointer to mutable memory block. */
+  inline T *mutable_data(platform::Place place);
+
+  // TODO(dzhwinter): below interfaces should be removed
  /* Get device vector */
  T *cuda_data() {
    CopyToCUDA();
    PADDLE_ENFORCE_NOT_NULL(
        cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
-    return static_cast<T *>(cuda_ptr_);
+    return static_cast<T *>(cuda_ptr_.get());
  }

  /* Get host vector */
@@ -76,25 +77,73 @@ class Vector : public std::vector<T> {
  void CopyToPeer(platform::Place);

 private:
-  void *cuda_ptr_ = nullptr;
+  std::shared_ptr<void> cuda_ptr_;
  size_t cuda_size_ = 0;  // device vector numel
  platform::CUDAPlace place_;
 };

 template <typename T>
-void Vector<T>::CopyToCUDA() {
+inline const T *Vector<T>::data(platform::Place place) const {
+  if (platform::is_cpu_place(place)) {
+    return std::vector<T>::data();
+  } else if (platform::is_gpu_place(place)) {
+    if (cuda_ptr_ == nullptr) {
+      return nullptr;
+    }
+    if (boost::get<platform::CUDAPlace>(place) == place_) {
+      return static_cast<const T *>(cuda_ptr_.get());
+    } else {
+      PADDLE_THROW(
+          "Unmatched place. Please use `mutable_data` copy lod to the target "
+          "Place first.");
+    }
+  } else {
+    PADDLE_THROW("Unsupport Place.");
+  }
+}
+
+template <typename T>
+inline T *Vector<T>::mutable_data(platform::Place place) {
+  if (platform::is_cpu_place(place)) {
+    return std::vector<T>::data();
+  } else if (platform::is_gpu_place(place)) {
+    if (boost::get<platform::CUDAPlace>(place) != place_) {
+      place_ = boost::get<platform::CUDAPlace>(place);
+    }
 #ifdef PADDLE_WITH_CUDA
-  if (cuda_size_ < this->size()) {
-    if (cuda_ptr_ != nullptr) {
-      memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
+    if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+      cuda_ptr_.reset(
+          memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
+          memory::PlainDeleter<void, platform::CUDAPlace>(place_));
    }
-    cuda_ptr_ =
-        memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
+    cuda_size_ = this->size();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto *ctx = pool.GetByPlace(place_);
+    memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
+                 static_cast<const void *>(this->data()),
+                 this->size() * sizeof(T), ctx->stream());
+    ctx->Wait();
+    return static_cast<T *>(cuda_ptr_.get());
+#else
+    return nullptr;
+#endif
+  } else {
+    PADDLE_THROW("Unsupport Place.");
+  }
+}
+
+template <typename T>
+void Vector<T>::CopyToCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+    cuda_ptr_.reset(
+        memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
+        memory::PlainDeleter<void, platform::CUDAPlace>(place_));
  }
  cuda_size_ = this->size();
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *ctx = pool.GetByPlace(place_);
-  memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
+  memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
               static_cast<const void *>(this->data()),
               this->size() * sizeof(T), ctx->stream());
  ctx->Wait();
@@ -112,32 +161,32 @@ void Vector<T>::CopyFromCUDA() {
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *ctx = pool.GetByPlace(place_);
  memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
-               static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
-               ctx->stream());
+               static_cast<const void *>(cuda_ptr_.get()),
+               this->size() * sizeof(T), ctx->stream());
  ctx->Wait();
 #endif
 }

 template <typename T>
-void Vector<T>::CopyToPeer(platform::Place peer_place) {
+void Vector<T>::CopyToPeer(platform::Place place) {
 #ifdef PADDLE_WITH_CUDA
-  auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
-  void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(
-      boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
-  memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
-               place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
+  if (boost::get<platform::CUDAPlace>(place) != place_) {
+    place_ = boost::get<platform::CUDAPlace>(place);
+  }
+  if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+    cuda_ptr_.reset(
+        memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
+        memory::PlainDeleter<void, platform::CUDAPlace>(place_));
+  }
+  cuda_size_ = this->size();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *ctx = pool.GetByPlace(place_);
+  memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
+               static_cast<const void *>(this->data()),
+               this->size() * sizeof(T), ctx->stream());
  ctx->Wait();
-
-  memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
-  place_ = boost::get<platform::CUDAPlace>(peer_place);
-  cuda_ptr_ = peer_cuda_ptr;
 #endif
 }

-template class Vector<int>;
-template class Vector<unsigned>;
-template class Vector<size_t>;
-template class Vector<int64_t>;
-
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/mixed_vector_test.cu
+++ b/paddle/framework/mixed_vector_test.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+#include "paddle/framework/mixed_vector.h"
+
+using namespace paddle::framework;
+using namespace paddle::platform;
+using namespace paddle::memory;
+
+template <typename T>
+__global__ void test(T* data, int size) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
+       i += blockDim.x * gridDim.x) {
+    data[i] *= 2;
+  }
+}
+
+TEST(Vector, Normal) {
+  // fill the device context pool.
+  InitDevices();
+
+  Vector<size_t> vec({1, 2, 3});
+  size_t* ptr = vec.data();
+  for (size_t i = 0; i < vec.size(); ++i) {
+    EXPECT_EQ(vec[i], *(ptr + i));
+  }
+
+  vec.clear();
+  vec.CopyFromCUDA();
+
+  std::vector<size_t> v = {1, 2, 3};
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], vec[i]);
+  }
+}
+
+TEST(Vector, MultipleCopy) {
+  InitDevices();
+  Vector<size_t> vec({1, 2, 3});
+  CUDAPlace place(0);
+  vec.mutable_data(place);
+  auto vec2 = Vector<size_t>(vec);
+  {
+    const size_t* ptr = vec2.data(CPUPlace());
+    for (size_t i = 0; i < vec2.size(); ++i) {
+      EXPECT_EQ(*(ptr + i), vec[i]);
+    }
+  }
+  test<size_t><<<3, 3>>>(vec2.mutable_data(place), vec2.size());
+  vec2.CopyFromCUDA();
+  {
+    const size_t* ptr = vec2.data(CPUPlace());
+    for (size_t i = 0; i < vec2.size(); ++i) {
+      EXPECT_EQ(*(ptr + i), vec[i] * 2);
+    }
+  }
+}
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -125,11 +125,10 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
  // restore attrs_
  for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
    std::string attr_name = attr.name();
+    // The sub_block referred to by the BLOCK attr hasn't been added
+    // to ProgramDesc class yet, we skip setting BLOCK attr here.
    if (attr.type() != proto::AttrType::BLOCK) {
      attrs_[attr_name] = GetAttrValue(attr);
-    } else {
-      auto bid = attr.block_idx();
-      attrs_[attr_name] = prog->MutableBlock(bid);
    }
  }
  this->block_ = block;

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -143,7 +143,7 @@ class OpKernelRegistrar : public Registrar {

 /**
 * Macro to register Operator. When the input is duplicable, you should
- * use REGISTER_OP_EX with deop_empty_grad=false instead.
+ * use REGISTER_OP_EX with drop_empty_grad=false instead.
 */
 #define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
                    grad_op_class)                                   \

--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -43,11 +43,20 @@ ProgramDesc::ProgramDesc() {

 ProgramDesc::ProgramDesc(const ProgramDesc &o) {
  desc_ = o.desc_;
-
  for (int i = 0; i < desc_.blocks_size(); ++i) {
    auto *block = desc_.mutable_blocks(i);
    blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
  }
+  for (auto &block : blocks_) {
+    for (auto *op : block->AllOps()) {
+      for (const auto &attr : op->Proto()->attrs()) {
+        if (attr.type() == proto::AttrType::BLOCK) {
+          size_t blk_idx = attr.block_idx();
+          op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
+        }
+      }
+    }
+  }
 }

 ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
@@ -55,6 +64,16 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
  for (auto &block_desc : *desc_.mutable_blocks()) {
    blocks_.emplace_back(new BlockDesc(this, &block_desc));
  }
+  for (auto &block : blocks_) {
+    for (auto *op : block->AllOps()) {
+      for (const auto &attr : op->Proto()->attrs()) {
+        if (attr.type() == proto::AttrType::BLOCK) {
+          size_t blk_idx = attr.block_idx();
+          op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
+        }
+      }
+    }
+  }
 }

 ProgramDesc::ProgramDesc(const std::string &binary_str) {

--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
@@ -49,11 +49,28 @@ bool IsTarget(const proto::OpDesc& op_desc) {
  return false;
 }

-void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
-                int block_id) {
-  // TODO(tonyyang-svail):
-  //    - will change to use multiple blocks for RNN op and Cond Op
+int GetSubBlockIndex(const proto::OpDesc& op_desc) {
+  for (auto& attr : op_desc.attrs()) {
+    if (attr.type() == proto::AttrType::BLOCK) {
+      PADDLE_ENFORCE(attr.has_block_idx());
+      return attr.block_idx();
+    }
+  }
+  return -1;
+}
+
+bool HasSubBlock(const proto::OpDesc& op_desc) {
+  return GetSubBlockIndex(op_desc) > 0;
+}

+// block_id is the idx of the current block in the input desc
+// parent_block_id is the idx of the parent of the current block
+// in the output desc, -1 means the current block is global block
+// dependent_vars is passed recursively from the parent block to
+// the child block to help pruning
+void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
+                int block_id, int parent_block_id,
+                std::set<std::string>& dependent_vars) {
  auto& block = input.blocks(block_id);
  auto& ops = block.ops();

@@ -72,11 +89,9 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
    expect_fetch = (op_desc.type() == kFetchOpType);
  }

-  std::set<std::string> dependent_vars;
  std::vector<bool> should_run;
  for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
    auto& op_desc = *op_iter;
-
    if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) {
      // insert its input to the dependency graph
      for (auto& var : op_desc.inputs()) {
@@ -84,7 +99,6 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
          dependent_vars.insert(argu);
        }
      }
-
      should_run.push_back(true);
    } else {
      should_run.push_back(false);
@@ -95,45 +109,81 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
  // we reverse the should_run vector
  std::reverse(should_run.begin(), should_run.end());

-  *output = input;
-  auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
+  // copy the current block from input to output
+  auto* block_field = output->mutable_blocks();
+  *block_field->Add() = input.blocks(block_id);
+
+  int output_block_id = output->blocks_size() - 1;
+  auto* output_block = output->mutable_blocks(output_block_id);
+  output_block->set_idx(output_block_id);
+  output_block->set_parent_idx(parent_block_id);
+
+  auto* op_field = output_block->mutable_ops();
  op_field->Clear();
  for (size_t i = 0; i < should_run.size(); ++i) {
    if (should_run[i]) {
-      *op_field->Add() = input.blocks(block_id).ops(i);
+      auto* op = op_field->Add();
+      *op = input.blocks(block_id).ops(i);
+      if (HasSubBlock(*op)) {
+        // create sub_block_dependent_vars here to help prune the sub block
+        std::set<std::string> sub_block_dependent_vars;
+        for (auto& var : op->inputs()) {
+          for (auto& argu : var.arguments()) {
+            sub_block_dependent_vars.insert(argu);
+          }
+        }
+        for (auto& var : op->outputs()) {
+          for (auto& argu : var.arguments()) {
+            sub_block_dependent_vars.insert(argu);
+          }
+        }
+        // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc
+        // output_block_id is the idx of the current block in the output desc
+        prune_impl(input, output, GetSubBlockIndex(*op), output_block_id,
+                   sub_block_dependent_vars);
+      }
    }
  }

  // remove the VarDescs in BlockDesc that are not referenced in
  // the pruned OpDescs
  std::unordered_map<std::string, proto::VarDesc> var_map;
-  auto* var_field = output->mutable_blocks(block_id)->mutable_vars();
+  auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars();
  for (const auto& var : *var_field) {
    var_map[var.name()] = var;
  }

-  var_field->Clear();
+  std::set<std::string> var_names;
  for (const auto& op : *op_field) {
-    // add VarDescs of all input arguments for each OpDesc
    auto& input_field = op.inputs();
    for (auto& input_var : input_field) {
      for (auto& arg : input_var.arguments()) {
-        *var_field->Add() = var_map[arg];
+        if (var_map.count(arg) != 0) {
+          var_names.insert(arg);
+        }
      }
    }
-    // add VarDescs of all output arguments for each OpDesc
    auto& output_field = op.outputs();
    for (auto& output_var : output_field) {
      for (auto& arg : output_var.arguments()) {
-        *var_field->Add() = var_map[arg];
+        if (var_map.count(arg) != 0) {
+          var_names.insert(arg);
+        }
      }
    }
  }
+
+  var_field->Clear();
+  for (const auto& name : var_names) {
+    *var_field->Add() = var_map[name];
+  }
 }

 // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
 void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
-  prune_impl(input, output, 0);
+  std::set<std::string> dependent_vars;
+  output->clear_blocks();
+  prune_impl(input, output, 0, -1, dependent_vars);
 }

 void inference_optimize_impl(const proto::ProgramDesc& input,

--- a/paddle/gserver/tests/test_CompareSparse.cpp
+++ b/paddle/gserver/tests/test_CompareSparse.cpp
@@ -212,6 +212,10 @@ TEST(compareSparse, NeuralNetwork) {
 }

 int main(int argc, char** argv) {
+  // FIXME(tonyyang-svail):
+  //   Turn off this test due CI failure:
+  //   https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430
+  return 0;
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
  initPython(argc, argv);

--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
@@ -4,19 +4,14 @@ cc_library(paddle_fluid_api
    SRCS io.cc
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})

-# Merge all modules into a single static library
+# Create static library
 cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})

 # Create shared library
-add_library(paddle_fluid_shared SHARED io.cc)
-
-target_circle_link_libraries(paddle_fluid_shared
-  ARCHIVE_START
-  ${GLOB_OP_LIB}
-  ${FLUID_CORE_MODULES}
-  ARCHIVE_END)
-
-SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
+cc_library(paddle_fluid_shared SHARED
+    SRCS io.cc
+    DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
+set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)

 if(WITH_TESTING)
  add_subdirectory(tests/book)

--- a/paddle/inference/io.cc
+++ b/paddle/inference/io.cc
@@ -21,6 +21,17 @@ limitations under the License. */
 namespace paddle {
 namespace inference {

+void ReadBinaryFile(const std::string& filename, std::string& contents) {
+  VLOG(3) << "loading model from " << filename;
+  std::ifstream inputfs(filename, std::ios::in | std::ios::binary);
+  inputfs.seekg(0, std::ios::end);
+  contents.clear();
+  contents.resize(inputfs.tellg());
+  inputfs.seekg(0, std::ios::beg);
+  inputfs.read(&contents[0], contents.size());
+  inputfs.close();
+}
+
 bool IsParameter(const framework::VarDesc* var,
                 const framework::ProgramDesc& main_program) {
  if (var->Persistable()) {
@@ -44,12 +55,15 @@ bool IsParameter(const framework::VarDesc* var,

 void LoadPersistables(framework::Executor& executor,
                      framework::Scope& scope,
+                      const framework::ProgramDesc& main_program,
                      const std::string& dirname,
-                      const framework::ProgramDesc& main_program) {
+                      const std::string& param_filename) {
  const framework::BlockDesc& global_block = main_program.Block(0);

  framework::ProgramDesc* load_program = new framework::ProgramDesc();
  framework::BlockDesc* load_block = load_program->MutableBlock(0);
+  std::vector<std::string> paramlist;
+
  for (auto* var : global_block.AllVars()) {
    if (IsParameter(var, main_program)) {
      VLOG(3) << "parameter's name: " << var->Name();
@@ -61,15 +75,33 @@ void LoadPersistables(framework::Executor& executor,
      new_var->SetLoDLevel(var->GetLoDLevel());
      new_var->SetPersistable(true);

-      // append_op
-      framework::OpDesc* op = load_block->AppendOp();
-      op->SetType("load");
-      op->SetOutput("Out", {new_var->Name()});
-      op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
-      op->CheckAttrs();
+      if (!param_filename.empty()) {
+        paramlist.push_back(new_var->Name());
+      } else {
+        // append_op
+        framework::OpDesc* op = load_block->AppendOp();
+        op->SetType("load");
+        op->SetOutput("Out", {new_var->Name()});
+        op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
+        op->CheckAttrs();
+      }
    }
  }
+
+  if (!param_filename.empty()) {
+    // sort paramlist to have consistent ordering
+    std::sort(paramlist.begin(), paramlist.end());
+    // append just the load_combine op
+    framework::OpDesc* op = load_block->AppendOp();
+    op->SetType("load_combine");
+    op->SetOutput("Out", paramlist);
+    op->SetAttr("file_path", {param_filename});
+    op->CheckAttrs();
+  }
+
  executor.Run(*load_program, &scope, 0, true, true);
+
+  VLOG(3) << "Ran loading successfully";
  delete load_program;
 }

@@ -77,20 +109,29 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
                                             framework::Scope& scope,
                                             const std::string& dirname) {
  std::string model_filename = dirname + "/__model__";
-  LOG(INFO) << "loading model from " << model_filename;
-  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
  std::string program_desc_str;
-  inputfs.seekg(0, std::ios::end);
-  program_desc_str.resize(inputfs.tellg());
-  inputfs.seekg(0, std::ios::beg);
-  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
-  inputfs.read(&program_desc_str[0], program_desc_str.size());
-  inputfs.close();
+  ReadBinaryFile(model_filename, program_desc_str);
+
+  std::unique_ptr<framework::ProgramDesc> main_program(
+      new framework::ProgramDesc(program_desc_str));
+
+  LoadPersistables(executor, scope, *main_program, dirname, "");
+  return main_program;
+}
+
+std::unique_ptr<framework::ProgramDesc> Load(
+    framework::Executor& executor,
+    framework::Scope& scope,
+    const std::string& prog_filename,
+    const std::string& param_filename) {
+  std::string model_filename = prog_filename;
+  std::string program_desc_str;
+  ReadBinaryFile(model_filename, program_desc_str);

  std::unique_ptr<framework::ProgramDesc> main_program(
      new framework::ProgramDesc(program_desc_str));

-  LoadPersistables(executor, scope, dirname, *main_program);
+  LoadPersistables(executor, scope, *main_program, "", param_filename);
  return main_program;
 }


--- a/paddle/inference/io.h
+++ b/paddle/inference/io.h
@@ -26,12 +26,18 @@ namespace inference {

 void LoadPersistables(framework::Executor& executor,
                      framework::Scope& scope,
+                      const framework::ProgramDesc& main_program,
                      const std::string& dirname,
-                      const framework::ProgramDesc& main_program);
+                      const std::string& param_filename);

 std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
                                             framework::Scope& scope,
                                             const std::string& dirname);

+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
+                                             framework::Scope& scope,
+                                             const std::string& prog_filename,
+                                             const std::string& param_filename);
+
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/inference/tests/book/CMakeLists.txt
+++ b/paddle/inference/tests/book/CMakeLists.txt
-set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
-cc_test(test_inference_recognize_digits_mlp
-    SRCS test_inference_recognize_digits.cc
-    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
-    ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
-cc_test(test_inference_image_classification_vgg
-    SRCS test_inference_image_classification.cc
-    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
-    ARGS --dirname=${PYTHON_TESTS_DIR}/book/image_classification_vgg.inference.model)
-cc_test(test_inference_image_classification_resnet
-    SRCS test_inference_image_classification.cc
-    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
-    ARGS --dirname=${PYTHON_TESTS_DIR}/book/image_classification_resnet.inference.model)
-cc_test(test_inference_label_semantic_roles
-    SRCS test_inference_label_semantic_roles.cc
-    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
-    ARGS --dirname=${PYTHON_TESTS_DIR}/book/label_semantic_roles.inference.model)
-set_tests_properties(test_inference_recognize_digits_mlp
-    PROPERTIES DEPENDS test_recognize_digits)
-set_tests_properties(test_inference_image_classification_vgg
-    PROPERTIES DEPENDS test_image_classification_train)
-set_tests_properties(test_inference_image_classification_resnet
-    PROPERTIES DEPENDS test_image_classification_train)
-set_tests_properties(test_inference_label_semantic_roles
-    PROPERTIES DEPENDS test_label_semantic_roles) 
+function(inference_test TARGET_NAME)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs ARGS)
+  cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
+  set(arg_list "")
+  if(inference_test_ARGS)
+    foreach(arg ${inference_test_ARGS})
+      list(APPEND arg_list "_${arg}")
+    endforeach()
+  else()
+    list(APPEND arg_list "_")
+  endif()
+  foreach(arg ${arg_list})
+    string(REGEX REPLACE "^_$" "" arg "${arg}")
+    cc_test(test_inference_${TARGET_NAME}${arg}
+        SRCS test_inference_${TARGET_NAME}.cc
+        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+        ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
+    set_tests_properties(test_inference_${TARGET_NAME}${arg}
+        PROPERTIES DEPENDS test_${TARGET_NAME})
+  endforeach()
+endfunction(inference_test)
+
+inference_test(fit_a_line)
+inference_test(recognize_digits ARGS mlp)
+inference_test(image_classification ARGS vgg resnet)
+inference_test(label_semantic_roles)
+inference_test(rnn_encoder_decoder)
+inference_test(recommender_system)
--- a/paddle/inference/tests/book/test_helper.h
+++ b/paddle/inference/tests/book/test_helper.h
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <time.h>
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/inference/io.h"

@@ -29,6 +30,15 @@ void SetupTensor(paddle::framework::LoDTensor& input,
  }
 }

+template <typename T>
+void SetupTensor(paddle::framework::LoDTensor& input,
+                 paddle::framework::DDim dims,
+                 std::vector<T>& data) {
+  CHECK_EQ(paddle::framework::product(dims), data.size());
+  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
+  memcpy(input_ptr, data.data(), input.numel() * sizeof(T));
+}
+
 template <typename T>
 void SetupLoDTensor(paddle::framework::LoDTensor& input,
                    paddle::framework::LoD& lod,
@@ -36,7 +46,18 @@ void SetupLoDTensor(paddle::framework::LoDTensor& input,
                    T upper) {
  input.set_lod(lod);
  int dim = lod[0][lod[0].size() - 1];
-  SetupTensor(input, {dim, 1}, lower, upper);
+  SetupTensor<T>(input, {dim, 1}, lower, upper);
+}
+
+template <typename T>
+void SetupLoDTensor(paddle::framework::LoDTensor& input,
+                    paddle::framework::DDim dims,
+                    paddle::framework::LoD lod,
+                    std::vector<T>& data) {
+  const size_t level = lod.size() - 1;
+  CHECK_EQ(dims[0], (lod[level]).back());
+  input.set_lod(lod);
+  SetupTensor<T>(input, dims, data);
 }

 template <typename T>
@@ -66,17 +87,31 @@ void CheckError(paddle::framework::LoDTensor& output1,
  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
 }

-template <typename Place, typename T>
+template <typename Place, bool IsCombined = false>
 void TestInference(const std::string& dirname,
                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
-  // 1. Define place, executor and scope
+  // 1. Define place, executor, scope and inference_program
  auto place = Place();
  auto executor = paddle::framework::Executor(place);
  auto* scope = new paddle::framework::Scope();

-  // 2. Initialize the inference_program and load all parameters from file
-  auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+  // 2. Initialize the inference_program and load parameters
+  std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
+  if (IsCombined) {
+    // All parameters are saved in a single file.
+    // Hard-coding the file names of program and parameters in unittest.
+    // Users are free to specify different filename.
+    std::string prog_filename = "__model_combined__";
+    std::string param_filename = "__params_combined__";
+    inference_program = paddle::inference::Load(executor,
+                                                *scope,
+                                                dirname + "/" + prog_filename,
+                                                dirname + "/" + param_filename);
+  } else {
+    // Parameters are saved in separate files sited in the specified `dirname`.
+    inference_program = paddle::inference::Load(executor, *scope, dirname);
+  }

  // 3. Get the feed_target_names and fetch_target_names
  const std::vector<std::string>& feed_target_names =

--- a/paddle/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/inference/tests/book/test_inference_fit_a_line.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, fit_a_line) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor input;
+  // The second dim of the input tensor should be 13
+  // The input data should be >= 0
+  int64_t batch_size = 10;
+  SetupTensor<float>(
+      input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10));
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
--- a/paddle/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/inference/tests/book/test_inference_image_classification.cc
@@ -13,51 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <gtest/gtest.h>
-#include <time.h>
-#include <sstream>
 #include "gflags/gflags.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/inference/io.h"
+#include "test_helper.h"

 DEFINE_string(dirname, "", "Directory of the inference model.");

-template <typename Place, typename T>
-void TestInference(const std::string& dirname,
-                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
-  // 1. Define place, executor and scope
-  auto place = Place();
-  auto executor = paddle::framework::Executor(place);
-  auto* scope = new paddle::framework::Scope();
-
-  // 2. Initialize the inference_program and load all parameters from file
-  auto inference_program = paddle::inference::Load(executor, *scope, dirname);
-
-  // 3. Get the feed_target_names and fetch_target_names
-  const std::vector<std::string>& feed_target_names =
-      inference_program->GetFeedTargetNames();
-  const std::vector<std::string>& fetch_target_names =
-      inference_program->GetFetchTargetNames();
-
-  // 4. Prepare inputs: set up maps for feed targets
-  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
-  for (size_t i = 0; i < feed_target_names.size(); ++i) {
-    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
-    feed_targets[feed_target_names[i]] = cpu_feeds[i];
-  }
-
-  // 5. Define Tensor to get the outputs: set up maps for fetch targets
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
-    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
-  }
-
-  // 6. Run the inference program
-  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
-
-  delete scope;
-}
-
 TEST(inference, image_classification) {
  if (FLAGS_dirname.empty()) {
    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
@@ -69,13 +29,15 @@ TEST(inference, image_classification) {
  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc

+  int64_t batch_size = 1;
+
  paddle::framework::LoDTensor input;
-  srand(time(0));
-  float* input_ptr =
-      input.mutable_data<float>({1, 3, 32, 32}, paddle::platform::CPUPlace());
-  for (int i = 0; i < 3072; ++i) {
-    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
-  }
+  // Use normilized image pixels as input data,
+  // which should be in the range [0.0, 1.0].
+  SetupTensor<float>(input,
+                     {batch_size, 3, 32, 32},
+                     static_cast<float>(0),
+                     static_cast<float>(1));
  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&input);

@@ -84,8 +46,7 @@ TEST(inference, image_classification) {
  cpu_fetchs1.push_back(&output1);

  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace, float>(
-      dirname, cpu_feeds, cpu_fetchs1);
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
  LOG(INFO) << output1.dims();

 #ifdef PADDLE_WITH_CUDA
@@ -94,20 +55,9 @@ TEST(inference, image_classification) {
  cpu_fetchs2.push_back(&output2);

  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace, float>(
-      dirname, cpu_feeds, cpu_fetchs2);
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
  LOG(INFO) << output2.dims();

-  EXPECT_EQ(output1.dims(), output2.dims());
-  EXPECT_EQ(output1.numel(), output2.numel());
-
-  float err = 1E-3;
-  int count = 0;
-  for (int64_t i = 0; i < output1.numel(); ++i) {
-    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+  CheckError<float>(output1, output2);
 #endif
 }
--- a/paddle/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/inference/tests/book/test_inference_label_semantic_roles.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <gtest/gtest.h>
-#include <time.h>
-#include <sstream>
 #include "gflags/gflags.h"
 #include "test_helper.h"

@@ -60,8 +58,7 @@ TEST(inference, label_semantic_roles) {
  cpu_fetchs1.push_back(&output1);

  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace, float>(
-      dirname, cpu_feeds, cpu_fetchs1);
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
  LOG(INFO) << output1.lod();
  LOG(INFO) << output1.dims();

@@ -71,8 +68,7 @@ TEST(inference, label_semantic_roles) {
  cpu_fetchs2.push_back(&output2);

  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace, float>(
-      dirname, cpu_feeds, cpu_fetchs2);
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
  LOG(INFO) << output2.lod();
  LOG(INFO) << output2.dims();


--- a/paddle/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <gtest/gtest.h>
-#include <time.h>
-#include <sstream>
 #include "gflags/gflags.h"
 #include "test_helper.h"

@@ -31,6 +29,50 @@ TEST(inference, recognize_digits) {
  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc

+  int64_t batch_size = 1;
+
+  paddle::framework::LoDTensor input;
+  // Use normilized image pixels as input data,
+  // which should be in the range [-1.0, 1.0].
+  SetupTensor<float>(input,
+                     {batch_size, 1, 28, 28},
+                     static_cast<float>(-1),
+                     static_cast<float>(1));
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
+
+TEST(inference, recognize_digits_combine) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
  paddle::framework::LoDTensor input;
  // Use normilized image pixels as input data,
  // which should be in the range [-1.0, 1.0].
@@ -44,7 +86,7 @@ TEST(inference, recognize_digits) {
  cpu_fetchs1.push_back(&output1);

  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace, float>(
+  TestInference<paddle::platform::CPUPlace, true>(
      dirname, cpu_feeds, cpu_fetchs1);
  LOG(INFO) << output1.dims();

@@ -54,7 +96,7 @@ TEST(inference, recognize_digits) {
  cpu_fetchs2.push_back(&output2);

  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace, float>(
+  TestInference<paddle::platform::CUDAPlace, true>(
      dirname, cpu_feeds, cpu_fetchs2);
  LOG(INFO) << output2.dims();


--- a/paddle/inference/tests/book/test_inference_recommender_system.cc
+++ b/paddle/inference/tests/book/test_inference_recommender_system.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, recommender_system) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  int64_t batch_size = 1;
+
+  paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id,
+      category_id, movie_title;
+
+  // Use the first data from paddle.dataset.movielens.test() as input
+  std::vector<int64_t> user_id_data = {1};
+  SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data);
+
+  std::vector<int64_t> gender_id_data = {1};
+  SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data);
+
+  std::vector<int64_t> age_id_data = {0};
+  SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data);
+
+  std::vector<int64_t> job_id_data = {10};
+  SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data);
+
+  std::vector<int64_t> movie_id_data = {783};
+  SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data);
+
+  std::vector<int64_t> category_id_data = {10, 8, 9};
+  SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data);
+
+  std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
+  SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data);
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&user_id);
+  cpu_feeds.push_back(&gender_id);
+  cpu_feeds.push_back(&age_id);
+  cpu_feeds.push_back(&job_id);
+  cpu_feeds.push_back(&movie_id);
+  cpu_feeds.push_back(&category_id);
+  cpu_feeds.push_back(&movie_title);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
--- a/paddle/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+++ b/paddle/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, rnn_encoder_decoder) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor word_data, trg_word;
+  paddle::framework::LoD lod{{0, 4, 10}};
+
+  SetupLoDTensor(
+      word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(
+      trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&word_data);
+  cpu_feeds.push_back(&trg_word);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.lod();
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.lod();
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -81,5 +81,23 @@ class PODDeleter {
  Place place_;
 };

+/**
+ * \brief   Free memory block in one place does not meet POD
+ *
+ * \note    In some cases, custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
+class PlainDeleter {
+ public:
+  explicit PlainDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -62,7 +62,7 @@ class CompareOpKernel
    z->mutable_data<T>(context.GetPlace());
    int axis = context.Attr<int>("axis");
    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
-                                                          z);
+                                                          Functor(), z);
  }
 };


--- a/paddle/operators/ctc_align_op.cu
+++ b/paddle/operators/ctc_align_op.cu
@@ -80,6 +80,14 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {

    // resize output dims
    output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
+
+    if (host_out_lod0.back() == 0) {
+      output->Resize({1, 1});
+      output->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<platform::CUDADeviceContext, T> set_constant;
+      set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
+                   output, -1);
+    }
  }
 };


--- a/paddle/operators/ctc_align_op.h
+++ b/paddle/operators/ctc_align_op.h
@@ -16,6 +16,8 @@ limitations under the License. */

 #include <string.h>
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
 namespace paddle {
 namespace operators {

@@ -65,9 +67,14 @@ class CTCAlignKernel : public framework::OpKernel<T> {
    framework::LoD output_lod;
    output_lod.push_back(output_lod0);
    output->set_lod(output_lod);
-
    // resize output dims
    output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
+    // for empty sequence
+    if (output_lod0.back() == 0) {
+      output->Resize({1, 1});
+      output_data = output->mutable_data<T>(ctx.GetPlace());
+      output_data[0] = -1;
+    }
  }
 };


--- a/paddle/operators/cum_op.h
+++ b/paddle/operators/cum_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename Functor>
+class CumKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto& X = detail::Ref(context.Input<framework::Tensor>("X"),
+                          "Cannot get input tensor X, variable name = %s",
+                          context.op().Input("X"));
+
+    auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"),
+                            "Cannot get output tensor Out, variable name = %s",
+                            context.op().Output("Out"));
+    int axis = context.Attr<int>("axis");
+    bool exclusive = context.Attr<bool>("exclusive");
+    bool reverse = context.Attr<bool>("reverse");
+    auto x_dims = X.dims();
+    if (axis == -1) {
+      axis = x_dims.size() - 1;
+    }
+    PADDLE_ENFORCE_LT(
+        axis, x_dims.size(),
+        "axis should be less than the dimensiotn of the input tensor");
+    Out.mutable_data<T>(context.GetPlace());
+
+    int pre = 1;
+    int post = 1;
+    int mid = x_dims[axis];
+    for (int i = 0; i < axis; ++i) {
+      pre *= x_dims[i];
+    }
+    for (int i = axis + 1; i < x_dims.size(); ++i) {
+      post *= x_dims[i];
+    }
+
+    auto x = framework::EigenVector<T>::Flatten(X);
+    auto out = framework::EigenVector<T>::Flatten(Out);
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+
+    using IndexT = Eigen::DenseIndex;
+    if (pre == 1) {
+      if (post == 1) {
+        ComputeImp(*place, Eigen::DSizes<IndexT, 1>(mid), x, out,
+                   /* axis= */ 0, reverse, exclusive);
+      } else {
+        ComputeImp(*place, Eigen::DSizes<IndexT, 2>(mid, post), x, out,
+                   /* axis= */ 0, reverse, exclusive);
+      }
+    } else {
+      if (post == 1) {
+        ComputeImp(*place, Eigen::DSizes<IndexT, 2>(pre, mid), x, out,
+                   /* axis= */ 1, reverse, exclusive);
+      } else {
+        ComputeImp(*place, Eigen::DSizes<IndexT, 3>(pre, mid, post), x, out,
+                   /* axis= */ 1, reverse, exclusive);
+      }
+    }
+  }
+
+ private:
+  template <typename Device, typename Dim, typename X, typename Out>
+  void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis,
+                  bool reverse, bool exclusive) const {
+    if (!reverse) {
+      out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive);
+    } else {
+      std::array<bool, Dim::count> rev;
+      rev.fill(false);
+      rev[axis] = reverse;
+      out.reshape(dims).device(d) =
+          Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev);
+    }
+  }
+};
+
+template <typename T>
+struct CumsumFunctor {
+  using ELEMENT_TYPE = T;
+  template <typename X>
+  const typename X::TensorScanSumOp operator()(X x, int axis,
+                                               bool exclusive) const {
+    return x.cumsum(axis, exclusive);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/cumsum_op.cc
+++ b/paddle/operators/cumsum_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cum_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CumOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CumsumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Cumsum operator");
+    AddOutput("Out", "Output of Cumsum operator");
+    AddAttr<int>("axis",
+                 "(int, default -1). The dimenstion to accumulate along. "
+                 "-1 means the last dimenstion")
+        .SetDefault(-1)
+        .EqualGreaterThan(-1);
+    AddAttr<bool>("exclusive",
+                  "bool, default false). Whether to perform exclusive cumsum")
+        .SetDefault(false);
+    AddAttr<bool>("reverse",
+                  "bool, default false). If true, the cumsum is performed in "
+                  "the reversed direction")
+        .SetDefault(false);
+    AddComment(R"DOC(
+The cumulative sum of the elements along a given axis.
+By default, the first element of the result is the same of the first element of
+the input. If exlusive is true, the first element of the result is 0.
+)DOC");
+  }
+};
+
+class CumsumGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("cumsum");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("axis", Attr<int>("axis"));
+    grad_op->SetAttr("reverse", !Attr<bool>("reverse"));
+    grad_op->SetAttr("exclusive", Attr<bool>("exclusive"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker);
+REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>,
+                       ops::CumKernel<CPU, ops::CumsumFunctor<double>>,
+                       ops::CumKernel<CPU, ops::CumsumFunctor<int>>)
--- a/paddle/operators/cumsum_op.cu
+++ b/paddle/operators/cumsum_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cum_op.h"
+
+namespace ops = paddle::operators;
+using CUDA = paddle::platform::CUDADeviceContext;
+
+REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel<CUDA, ops::CumsumFunctor<float>>,
+                        ops::CumKernel<CUDA, ops::CumsumFunctor<double>>,
+                        ops::CumKernel<CUDA, ops::CumsumFunctor<int>>)
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -35,7 +35,8 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
    auto* z = ctx.Output<Tensor>("Out");
    z->mutable_data<T>(ctx.GetPlace());
    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          AddFunctor<T>(), z);
  }
 };


--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
@@ -35,7 +35,8 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
    auto* z = ctx.Output<Tensor>("Out");
    z->mutable_data<T>(ctx.GetPlace());
    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          DivFunctor<T>(), z);
  }
 };


--- a/paddle/operators/elementwise_max_op.h
+++ b/paddle/operators/elementwise_max_op.h
@@ -35,7 +35,8 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {
    auto* z = ctx.Output<Tensor>("Out");
    z->mutable_data<T>(ctx.GetPlace());
    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          MaxFunctor<T>(), z);
  }
 };


--- a/paddle/operators/elementwise_min_op.h
+++ b/paddle/operators/elementwise_min_op.h
@@ -35,7 +35,8 @@ class ElementwiseMinKernel : public framework::OpKernel<T> {
    auto* z = ctx.Output<Tensor>("Out");
    z->mutable_data<T>(ctx.GetPlace());
    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          MinFunctor<T>(), z);
  }
 };


--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -34,7 +34,8 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
    auto* z = ctx.Output<Tensor>("Out");
    z->mutable_data<T>(ctx.GetPlace());
    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          MulFunctor<T>(), z);
  }
 };


--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -365,10 +365,10 @@ template <typename Functor, typename DeviceContext, typename T,
          typename OutType = T>
 void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
                          const framework::Tensor* x,
-                          const framework::Tensor* y, int axis,
+                          const framework::Tensor* y, int axis, Functor func,
                          framework::Tensor* z) {
  TransformFunctor<Functor, T, DeviceContext, OutType> functor(
-      x, y, z, ctx.template device_context<DeviceContext>(), Functor());
+      x, y, z, ctx.template device_context<DeviceContext>(), func);

  auto x_dims = x->dims();
  auto y_dims = y->dims();

--- a/paddle/operators/elementwise_pow_op.h
+++ b/paddle/operators/elementwise_pow_op.h
@@ -36,7 +36,8 @@ class ElementwisePowKernel : public framework::OpKernel<T> {
    auto* z = ctx.Output<Tensor>("Out");
    z->mutable_data<T>(ctx.GetPlace());
    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          PowFunctor<T>(), z);
  }
 };


--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
@@ -34,7 +34,8 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
    auto* z = ctx.Output<Tensor>("Out");
    z->mutable_data<T>(ctx.GetPlace());
    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          SubFunctor<T>(), z);
  }
 };


--- a/paddle/operators/layer_norm_op.cc
+++ b/paddle/operators/layer_norm_op.cc
@@ -21,13 +21,6 @@ using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 using DataLayout = framework::DataLayout;

-template <typename T>
-using EigenMatrixMapRowMajor = Eigen::Map<
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
-template <typename T>
-using ConstEigenMatrixMapRowMajor = Eigen::Map<
-    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
-
 class LayerNormOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -108,7 +101,6 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {

    AddComment(R"DOC(
 Layer Normalization.
-
 Layer Norm has been implemented as discussed in the paper:
 https://arxiv.org/abs/1607.06450
 ...
@@ -116,75 +108,6 @@ https://arxiv.org/abs/1607.06450
  }
 };

-template <typename T>
-class LayerNormKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const float epsilon = ctx.Attr<float>("epsilon");
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto &x_dims = x->dims();
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-
-    auto *output = ctx.Output<Tensor>("Y");
-    auto *mean = ctx.Output<Tensor>("Mean");
-    auto *var = ctx.Output<Tensor>("Variance");
-    output->mutable_data<T>(ctx.GetPlace());
-    mean->mutable_data<T>(ctx.GetPlace());
-    var->mutable_data<T>(ctx.GetPlace());
-
-    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]);
-    int right = static_cast<int>(matrix_dim[1]);
-
-    auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
-
-    auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
-    auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
-    auto output_map = EigenMatrixMapRowMajor<T>(output->data<T>(), left, right);
-
-    auto squre = [](T ele) { return ele * ele; };
-    auto add_epslion = [epsilon](T ele) { return ele + epsilon; };
-
-    mean_map = input_map.rowwise().mean();
-    var_map = (input_map - mean_map.replicate(1, right))
-                  .unaryExpr(squre)
-                  .rowwise()
-                  .mean()
-                  .unaryExpr(add_epslion);
-
-    auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
-    // TODO(zcd): Some thinking about output_map, is it appropriate that
-    // `output_map` and `input_map` point to the same memory.
-    auto inv_std = var_map.unaryExpr(inv_std_func);
-    if (scale && bias) {
-      auto scale_map =
-          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
-      auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
-      output_map = (input_map - mean_map.replicate(1, right))
-                       .cwiseProduct(inv_std.replicate(1, right))
-                       .cwiseProduct(scale_map.replicate(left, 1)) +
-                   bias_map.replicate(left, 1);
-    } else if (scale) {
-      auto scale_map =
-          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
-      output_map = (input_map - mean_map.replicate(1, right))
-                       .cwiseProduct(inv_std.replicate(1, right))
-                       .cwiseProduct(scale_map.replicate(left, 1));
-    } else if (bias) {
-      auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
-      output_map = (input_map - mean_map.replicate(1, right))
-                       .cwiseProduct(inv_std.replicate(1, right)) +
-                   bias_map.replicate(left, 1);
-    } else {
-      output_map = (input_map - mean_map.replicate(1, right))
-                       .cwiseProduct(inv_std.replicate(1, right));
-    }
-  }
-};
-
 class LayerNormGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -193,8 +116,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
    // check input
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Scale"),
-                   "Input(Scale) of LayerNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Mean"),
                   "Input(Mean) of LayerNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Variance"),
@@ -237,125 +158,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
  }
 };

-template <typename T>
-class LayerNormGradKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto *mean = ctx.Input<Tensor>("Mean");
-    const auto *var = ctx.Input<Tensor>("Variance");
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-
-    const auto &x_dims = x->dims();
-
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]);
-    int right = static_cast<int>(matrix_dim[1]);
-
-    // init output
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
-    auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
-    auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
-    auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
-
-    if (d_bias) {
-      d_bias->mutable_data<T>(ctx.GetPlace());
-      auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
-      d_bias_map = d_y_map.colwise().sum();
-    }
-    if (d_scale) {
-      d_scale->mutable_data<T>(ctx.GetPlace());
-      auto d_scale_map =
-          EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
-      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
-      // There are two equation to compute d_scale. One uses "Y" and the other
-      // does not use "Y"
-      d_scale_map =
-          ((x_map - mean_map.replicate(1, right))
-               .cwiseProduct(
-                   var_map.unaryExpr(inv_std_func).replicate(1, right))
-               .cwiseProduct(d_y_map))
-              .colwise()
-              .sum();
-    }
-
-    if (d_x) {
-      d_x->mutable_data<T>(ctx.GetPlace());
-      auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
-      auto triple_product_func = [](T ele) { return ele * ele * ele; };
-      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
-      // TODO(zcd): these code can be refined
-      if (d_scale) {
-        auto scale_map =
-            ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
-        // dy_dx
-        auto dx_end = var_map.unaryExpr(inv_std_func)
-                          .replicate(1, right)
-                          .cwiseProduct(d_y_map)
-                          .cwiseProduct(scale_map.replicate(left, 1));
-        // dy_dmean_dx
-        auto dx_mean = (T(-1.0) / right) *
-                       var_map.unaryExpr(inv_std_func)
-                           .replicate(1, right)
-                           .cwiseProduct(d_y_map)
-                           .cwiseProduct(scale_map.replicate(left, 1))
-                           .rowwise()
-                           .sum()
-                           .replicate(1, right);
-        // dy_var_dx
-        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
-                                 .cwiseProduct(scale_map.replicate(left, 1))
-                                 .cwiseProduct(d_y_map)
-                                 .rowwise()
-                                 .sum();
-        auto dvar_end = var_map.unaryExpr(inv_std_func)
-                            .unaryExpr(triple_product_func)
-                            .cwiseProduct(dvar_end_part)
-                            .replicate(1, right);
-        auto dx_var =
-            (T(-1.0) / right) *
-            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
-
-        d_x_map = dx_end + dx_mean + dx_var;
-      } else {
-        // dy_dx
-        auto dx_end = var_map.unaryExpr(inv_std_func)
-                          .replicate(1, right)
-                          .cwiseProduct(d_y_map);
-        // dy_dmean_dx
-        auto dx_mean = (T(-1.0) / right) *
-                       var_map.unaryExpr(inv_std_func)
-                           .replicate(1, right)
-                           .cwiseProduct(d_y_map)
-                           .rowwise()
-                           .sum()
-                           .replicate(1, right);
-        // dy_var_dx
-        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
-                                 .cwiseProduct(d_y_map)
-                                 .rowwise()
-                                 .sum();
-        auto dvar_end = var_map.unaryExpr(inv_std_func)
-                            .unaryExpr(triple_product_func)
-                            .cwiseProduct(dvar_end_part)
-                            .replicate(1, right);
-        auto dx_var =
-            (T(-1.0) / right) *
-            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
-
-        d_x_map = dx_end + dx_mean + dx_var;
-      }
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle

@@ -363,8 +165,9 @@ namespace ops = paddle::operators;
 REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
            layer_norm_grad, ops::LayerNormGradOp);
 REGISTER_OP_CPU_KERNEL(
-    layer_norm,
-    ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>);
+    layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
    layer_norm_grad,
-    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/layer_norm_op.cu
+++ b/paddle/operators/layer_norm_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/layer_norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    layer_norm,
+    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    layer_norm_grad,
+    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/layer_norm_op.h
+++ b/paddle/operators/layer_norm_op.h
@@ -16,19 +16,222 @@ limitations under the License. */
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"

+#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/operators/math/math_function.h"
+
 namespace paddle {
 namespace operators {

+template <typename T>
+struct SubAndSquareFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
+};
+
+template <typename T>
+struct DivAndSqrtFunctor {
+  explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; }
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    return a / (sqrt(b + epsilon_));
+  }
+
+ private:
+  T epsilon_;
+};
+
+template <typename T>
+struct MulFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
+};
+
+template <typename T>
+struct AddFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
+};
+
+template <typename T>
+struct SubFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
+};
+
+template <typename T>
+struct MulInvVarFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    return a * std::sqrt(1.0 / b);
+  }
+};
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
 template <typename DeviceContext, typename T>
 class LayerNormKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override;
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto *scale = ctx.Input<Tensor>("Scale");
+    auto *bias = ctx.Input<Tensor>("Bias");
+    auto x = *ctx.Input<Tensor>("X");
+
+    auto *y = ctx.Output<Tensor>("Y");
+    auto *mean = ctx.Output<Tensor>("Mean");
+    auto *var = ctx.Output<Tensor>("Variance");
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+
+    const auto x_dims = x.dims();
+
+    y->mutable_data<T>(ctx.GetPlace());
+    mean->mutable_data<T>(ctx.GetPlace());
+    var->mutable_data<T>(ctx.GetPlace());
+
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+    framework::DDim matrix_shape({left, right});
+
+    x.Resize(matrix_shape);
+    Tensor out;
+    out.ShareDataWith(*y);
+    out.Resize(matrix_shape);
+
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    math::RowwiseMean<DeviceContext, T> row_mean;
+
+    // get mean
+    row_mean(dev_ctx, x, mean);
+
+    // get variance
+    ElementwiseComputeEx<SubAndSquareFunctor<T>, DeviceContext, T>(
+        ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor<T>(), &out);
+    row_mean(dev_ctx, out, var);
+
+    // get x_norm
+    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+        ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &out);
+    ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
+        ctx, &out, var, /*axis*/ 0,
+        DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &out);
+
+    if (scale) {
+      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+          ctx, &out, scale, /*axis*/ 1, MulFunctor<T>(), &out);
+    }
+    if (bias) {
+      ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
+          ctx, &out, bias, /*axis*/ 1, AddFunctor<T>(), &out);
+    }
+  }
 };

 template <typename DeviceContext, typename T>
 class LayerNormGradKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override;
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto x = *ctx.Input<Tensor>("X");
+    auto *y = ctx.Input<Tensor>("Y");
+    auto *mean = ctx.Input<Tensor>("Mean");
+    auto *var = ctx.Input<Tensor>("Variance");
+    auto *scale = ctx.Input<Tensor>("Scale");
+    auto *bias = ctx.Input<Tensor>("Bias");
+    auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    const auto &x_dims = x.dims();
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+    framework::DDim matrix_shape({left, right});
+
+    d_y.Resize(matrix_shape);
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    math::ColwiseSum<DeviceContext, T> colwise_sum;
+
+    Tensor temp;
+    Tensor temp_norm;
+    if (d_scale || d_x) {
+      x.Resize(matrix_shape);
+      temp.mutable_data<T>(matrix_shape, ctx.GetPlace());
+
+      if (!(bias && scale)) {
+        temp_norm.ShareDataWith(*y);
+        temp_norm.Resize(matrix_shape);
+      } else {
+        temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
+        // get x_norm
+        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+            ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
+        ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
+            ctx, &temp_norm, var, /*axis*/ 0,
+            DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
+      }
+    }
+
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      colwise_sum(dev_ctx, d_y, d_bias);
+    }
+    if (d_scale) {
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+          ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor<T>(), &temp);
+      colwise_sum(dev_ctx, temp, d_scale);
+    }
+
+    if (d_x) {
+      framework::DDim vec_shape({left});
+      d_x->mutable_data<T>(ctx.GetPlace());
+      auto dx_dim = d_x->dims();
+      Tensor temp_vec;
+      temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
+
+      math::RowwiseMean<DeviceContext, T> row_mean;
+
+      if (d_scale) {
+        // dy_dx
+        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+            ctx, &d_y, scale, /*axis*/ 1, MulFunctor<T>(), &temp);
+        framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x);
+
+        // dy_dmean_dx
+        row_mean(dev_ctx, temp, &temp_vec);
+        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+            ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
+
+        // dy_var_dx
+        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+            ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
+      } else {
+        // dy_dx
+        framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);
+
+        // dy_dmean_dx
+        row_mean(dev_ctx, d_y, &temp_vec);
+        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+            ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
+
+        // dy_var_dx
+        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+            ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
+      }
+      // dy_var_dx
+      row_mean(dev_ctx, temp, &temp_vec);
+      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+          ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor<T>(), &temp);
+      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+          ctx, d_x, &temp, /*axis*/ 0, SubFunctor<T>(), d_x);
+
+      ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
+          ctx, d_x, var, /*axis*/ 0,
+          DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), d_x);
+      d_x->Resize(dx_dim);
+    }
+  }
 };

 }  // namespace operators

--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -331,6 +331,12 @@ template struct RowwiseAdd<platform::CPUDeviceContext, double>;
 template struct ColwiseSum<platform::CPUDeviceContext, float>;
 template struct ColwiseSum<platform::CPUDeviceContext, double>;

+template struct RowwiseSum<platform::CPUDeviceContext, float>;
+template struct RowwiseSum<platform::CPUDeviceContext, double>;
+
+template struct RowwiseMean<platform::CPUDeviceContext, float>;
+template struct RowwiseMean<platform::CPUDeviceContext, double>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -325,6 +325,31 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
      vector->data<double>());
 }

+template struct RowwiseSum<platform::CUDADeviceContext, float>;
+// template struct RowwiseSum<platform::CUDADeviceContext, double>;
+// TODO(zcd): Following ColwiseSum format, need to confirm.
+// The RowwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
+// and only failed for this case. So reimplemented it.
+template <>
+void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
+    const platform::CUDADeviceContext& context, const framework::Tensor& input,
+    framework::Tensor* vector) {
+  auto in_dims = input.dims();
+  auto size = input.numel() / in_dims[0];
+  PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]);
+  framework::Tensor one;
+  one.mutable_data<double>({size}, context.GetPlace());
+  SetConstant<platform::CUDADeviceContext, double> set;
+  set(context, &one, static_cast<double>(1.0));
+  gemv<platform::CUDADeviceContext, double>(
+      context, true, static_cast<int>(in_dims[1]), static_cast<int>(in_dims[0]),
+      1.0, one.data<double>(), input.data<double>(), 0.0,
+      vector->data<double>());
+}
+
+template struct RowwiseMean<platform::CUDADeviceContext, float>;
+template struct RowwiseMean<platform::CUDADeviceContext, double>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -128,6 +128,18 @@ struct ColwiseSum {
                  framework::Tensor* vec);
 };

+template <typename DeviceContext, typename T>
+struct RowwiseSum {
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* vec);
+};
+
+template <typename DeviceContext, typename T>
+struct RowwiseMean {
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* vec);
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
@@ -87,6 +87,88 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
  }
 };

+template <typename DeviceContext, typename T>
+void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
+                                               const framework::Tensor& input,
+                                               framework::Tensor* out) {
+  auto in_dims = input.dims();
+  PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
+
+  auto in = framework::EigenMatrix<T>::From(input);
+  auto vec = framework::EigenVector<T>::Flatten(*out);
+
+  vec.device(*context.eigen_device()) = in.mean(Eigen::array<int, 1>({{1}}));
+}
+// TODO(zcd): Following ColwiseSum format, need to confirm.
+// Specialize for CPU, since Eigen implement a general reduce. However,
+// rowwise-sum can be easily implemented. General reduce has a huge overhead in
+// CPU
+template <typename T>
+class RowwiseMean<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    auto& in_dims = input.dims();
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+    auto height = in_dims[0];
+    auto size = in_dims[1];
+    PADDLE_ENFORCE_EQ(out->numel(), height);
+    auto inv_size = 1.0 / size;
+    T* out_buf = out->mutable_data<T>(out->place());
+    const T* in_buf = input.data<T>();
+
+    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
+      T sum = 0;
+      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
+        sum += in_buf[i * size + j];
+      }
+      out_buf[i] = sum * inv_size;
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
+                                              const framework::Tensor& input,
+                                              framework::Tensor* out) {
+  auto in_dims = input.dims();
+  PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
+
+  auto in = framework::EigenMatrix<T>::From(input);
+  auto vec = framework::EigenVector<T>::Flatten(*out);
+
+  vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
+}
+// TODO(zcd): Following ColwiseSum format, need to confirm.
+// Specialize for CPU, since Eigen implement a general reduce. However,
+// rowwise-sum can be easily implemented. General reduce has a huge overhead in
+// CPU
+template <typename T>
+class RowwiseSum<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    auto& in_dims = input.dims();
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+    auto height = in_dims[0];
+    auto size = in_dims[1];
+    PADDLE_ENFORCE_EQ(out->numel(), size);
+
+    T* out_buf = out->mutable_data<T>(out->place());
+    const T* in_buf = input.data<T>();
+
+    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
+      T sum = 0;
+      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
+        sum += in_buf[i * size + j];
+      }
+      out_buf[i] = sum;
+    }
+  }
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -287,6 +287,9 @@ TEST_F(NCCLTester, ncclBcastOp) {
 }

 int main(int argc, char **argv) {
+  // FIXME(tonyyang-svail):
+  //   Due to the driver issue on our CI, disable for now
+  return 0;
  const int dev_count = p::GetCUDADeviceCount();
  if (dev_count <= 1) {
    LOG(WARNING)

--- a/paddle/operators/parallel_do_op.cc
+++ b/paddle/operators/parallel_do_op.cc
@@ -76,18 +76,25 @@ inline void CopyOrShare(const framework::Variable &src,
  if (src.IsType<LoDTensor>()) {
    if (src.Get<LoDTensor>().place() == dst_place) {
      dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
+      dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
    } else {
      Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
+      framework::LoD lod(src.Get<LoDTensor>().lod());
+      lod.CopyToPeer(dst_place);
+      dst->GetMutable<LoDTensor>()->set_lod(lod);
    }
  } else if (src.IsType<SelectedRows>()) {
    auto &src_sr = src.Get<SelectedRows>();
    auto *dst_sr = dst->GetMutable<SelectedRows>();
-    dst_sr->set_rows(src_sr.rows());
    dst_sr->set_height(src_sr.height());
    if (src_sr.value().place() == dst_place) {
      dst_sr->mutable_value()->ShareDataWith(src_sr.value());
+      dst_sr->set_rows(src_sr.rows());
    } else {
      Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
+      framework::Vector<int64_t> lod(src_sr.rows());
+      lod.CopyToPeer(dst_place);
+      dst_sr->set_rows(lod);
    }
  } else {
    PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
@@ -145,6 +152,9 @@ class ParallelDoOp : public framework::OperatorBase {
        auto *sub_scope = sub_scopes[i];
        auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
        framework::Copy(src, place, dst);
+        framework::LoD lod(src.lod());
+        lod.CopyToPeer(place);
+        dst->set_lod(lod);
      }
    }
    WaitOnPlaces(places);
@@ -248,17 +258,19 @@ class ParallelDoGradOp : public framework::OperatorBase {
                      const std::vector<framework::Scope *> &sub_scopes,
                      const platform::PlaceList &places) const {
    for (auto &s : Outputs(framework::GradVarName(kParameters))) {
+      VLOG(3) << "Accumulating " << s;
+      if (s == framework::kEmptyVarName) continue;
      std::string tmp_name;
      auto *tmp = sub_scopes[0]->Var(&tmp_name);

      for (size_t i = 1; i < sub_scopes.size(); ++i) {
        CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
-        WaitOnPlace(places[0]);
+        WaitOnPlaces(places);

        auto sum_op = framework::OpRegistry::CreateOp(
            "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
            framework::AttributeMap{});
-        VLOG(3) << sum_op->DebugStringEx(sub_scopes[0]);
+        VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
        sum_op->Run(*sub_scopes[0], places[0]);
        WaitOnPlace(places[0]);
      }
@@ -334,16 +346,9 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
 class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
-    std::vector<std::string> input{kParameters, kInputs};
-    std::vector<std::string> output{kOutputs};
-
    PADDLE_ENFORCE(ctx->HasInputs(kParameters));
-    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
    PADDLE_ENFORCE(ctx->HasInputs(kInputs));
-
-    for (auto &s : output) {
-      PADDLE_ENFORCE(ctx->HasInputs(s));
-    }
+    PADDLE_ENFORCE(ctx->HasInputs(kOutputs));

    ctx->SetOutputsDim(framework::GradVarName(kParameters),
                       ctx->GetInputsDim(kParameters));
@@ -360,10 +365,14 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
      ctx->SetDims({ig_name}, {i_dims[i]});
    }

-    if (ctx->HasInputs(kParameters)) {
-      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
-      ctx->SetOutputsDim(framework::GradVarName(kParameters),
-                         ctx->GetInputsDim(kParameters));
+    auto p_dims = ctx->GetInputsDim(kParameters);
+    auto pg_names = ctx->Outputs(framework::GradVarName(kParameters));
+    for (size_t i = 0; i < pg_names.size(); ++i) {
+      auto &pg_name = pg_names[i];
+      if (pg_name == framework::kEmptyVarName) {
+        continue;
+      }
+      ctx->SetDims({pg_name}, {p_dims[i]});
    }
  }
 };

--- a/paddle/operators/prior_box_op.cc
+++ b/paddle/operators/prior_box_op.cc
@@ -128,13 +128,13 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
                   "Prior boxes step across width, 0 for auto calculation.")
        .SetDefault(0.0)
        .AddCustomChecker([](const float& step_w) {
-          PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
+          PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should be larger than 0.");
        });
    AddAttr<float>("step_h",
                   "Prior boxes step across height, 0 for auto calculation.")
        .SetDefault(0.0)
        .AddCustomChecker([](const float& step_h) {
-          PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
+          PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should be larger than 0.");
        });

    AddAttr<float>("offset",

--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -127,6 +127,9 @@ TEST(NCCL, all_reduce) {
 }  // namespace paddle

 int main(int argc, char** argv) {
+  // FIXME(tonyyang-svail):
+  //   Due to the driver issue on our CI, disable for now
+  return 0;
  dev_count = paddle::platform::GetCUDADeviceCount();
  if (dev_count <= 1) {
    LOG(WARNING)

--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -29,7 +29,7 @@ import optimizer
 import learning_rate_decay
 import backward
 import regularizer
-from param_attr import ParamAttr
+from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace
 from distribute_transpiler import DistributeTranspiler
@@ -41,11 +41,26 @@ import profiler
 Tensor = LoDTensor

 __all__ = framework.__all__ + executor.__all__ + [
-    'io', 'initializer', 'layers', 'nets', 'optimizer', 'learning_rate_decay',
-    'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor',
-    'ParamAttr'
-    'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler',
-    'memory_optimize', 'profiler'
+    'io',
+    'initializer',
+    'layers',
+    'nets',
+    'optimizer',
+    'learning_rate_decay',
+    'backward',
+    'regularizer',
+    'LoDTensor',
+    'CPUPlace',
+    'CUDAPlace',
+    'Tensor',
+    'ParamAttr',
+    'WeightNormParamAttr',
+    'DataFeeder',
+    'clip',
+    'SimpleDistributeTranspiler',
+    'DistributeTranspiler',
+    'memory_optimize',
+    'profiler',
 ]



--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -300,6 +300,9 @@ class DistributeTranspiler:
            pass
        return orig_shape

+    def _op_input_var(self, op, varname):
+        pass
+
    def _is_op_on_pserver(self, endpoint, all_ops, idx):
        """
        Recursively check if the op need to run on current server.
@@ -309,44 +312,51 @@ class DistributeTranspiler:
            p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
        ]
        op = all_ops[idx]
-        if op.inputs.has_key("Param"):
-            if op.inputs["Param"].name in param_names:
+        input_names = set(op.input_names)
+        # TODO(typhoonzero): using Param and Grad input name to identify
+        # that the operator is an optimization operator, need a better way.
+        if "Param" in input_names:
+            if op.input("Param")[0] in param_names:
                return True
            else:
                for n in param_names:
-                    if same_or_split_var(n, op.inputs[
-                            "Param"].name) and n != op.inputs["Param"].name:
+                    if same_or_split_var(n, op.input("Param")[0]) \
+                            and n != op.input("Param")[0]:
                        return True
                return False
        else:
            j = idx - 1
            while j >= 0:
                prev_op = all_ops[j]
-                prev_output_names = [o.name for o in prev_op.outputs.values()]
-                prev_input_names = [o.name for o in prev_op.inputs.values()]
+                # prev_output_names = [o.name for o in prev_op.outputs.values()]
+                # prev_input_names = [o.name for o in prev_op.inputs.values()]
+                # NOTE(typhoonzero): consider list input/output
+                prev_output_names = prev_op.desc.output_arg_names()
+                prev_input_names = prev_op.desc.input_arg_names()
                found1 = False
                found2 = False
-                for _, v in op.inputs.iteritems():
-                    if v.name in prev_output_names:
+                for varname in op.desc.input_arg_names():
+                    if varname in prev_output_names:
                        found1 = self._is_op_on_pserver(endpoint, all_ops, j)
                # later ops may produce output for prev op's next batch use.
-                for _, v in op.outputs.iteritems():
-                    if v.name in prev_input_names:
+                for varname in op.desc.output_arg_names():
+                    if varname in prev_input_names:
                        found2 = self._is_op_on_pserver(endpoint, all_ops, j)
                if found1 or found2:
                    return True
                j -= 1
            return False

-    def _append_pserver_ops(self, program, pserver_program, opt_op, endpoint):
+    def _append_pserver_ops(self, optimize_block, opt_op, endpoint):
+        program = optimize_block.program
        new_inputs = dict()
        # update param/grad shape first, then other inputs like
        # moment can use the updated shape
-        for key, var in opt_op.inputs.iteritems():
+        for key in opt_op.input_names:
            if key == "Grad":
                grad_block = None
                for g in self.param_grad_ep_mapping[endpoint]["grads"]:
-                    if same_or_split_var(g.name, var.name):
+                    if same_or_split_var(g.name, opt_op.input(key)[0]):
                        grad_block = g
                        break
                if not grad_block:
@@ -362,11 +372,11 @@ class DistributeTranspiler:
                if self.trainers > 1:
                    vars2merge = self._create_var_for_trainers(
                        program.global_block(), grad_block, self.trainers)
-                    program.global_block().append_op(
+                    optimize_block.append_op(
                        type="sum",
                        inputs={"X": vars2merge},
                        outputs={"Out": merged_var})
-                    program.global_block().append_op(
+                    optimize_block.append_op(
                        type="scale",
                        inputs={"X": merged_var},
                        outputs={"Out": merged_var},
@@ -376,7 +386,7 @@ class DistributeTranspiler:
                # param is already created on global program
                param_block = None
                for p in self.param_grad_ep_mapping[endpoint]["params"]:
-                    if same_or_split_var(p.name, var.name):
+                    if same_or_split_var(p.name, opt_op.input(key)[0]):
                        param_block = p
                        break
                if not param_block:
@@ -389,11 +399,12 @@ class DistributeTranspiler:

                new_inputs[key] = tmpvar

-        for key, var in opt_op.inputs.iteritems():
+        for key in opt_op.input_names:
            if key in ["Param", "Grad"]:
                continue
            # update accumulator variable shape
            param_shape = new_inputs["Param"].shape
+            var = program.global_block().vars[opt_op.input(key)[0]]
            new_shape = self._get_optimizer_input_shape(opt_op.type, key,
                                                        var.shape, param_shape)
            tmpvar = program.global_block().create_var(
@@ -402,40 +413,41 @@ class DistributeTranspiler:
                dtype=var.dtype,
                shape=new_shape)
            new_inputs[key] = tmpvar
-            # create var in pserver program global block.
-            # TODO(typhoonzero): put blocks in one program to avoid create two
-            # variables.
-            pserver_program.global_block().create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=new_shape)

        # change output's ParamOut variable
-        opt_op.outputs["ParamOut"] = new_inputs["Param"]
-        program.global_block().append_op(
+        outputs = self._get_output_map_from_op(program.global_block(), opt_op)
+        outputs["ParamOut"] = new_inputs["Param"]
+        optimize_block.append_op(
            type=opt_op.type,
            inputs=new_inputs,
-            outputs=opt_op.outputs,
+            outputs=outputs,
            attrs=opt_op.attrs)

-    def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op):
+    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
+        program = optimize_block.program
        # Append the ops for parameters that do not need to be optimized/updated
-        for _, var in opt_op.inputs.iteritems():
-            program.global_block().create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=var.shape)
-            pserver_program.global_block().create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=var.shape)
-        program.global_block().append_op(
+        inputs = self._get_input_map_from_op(self.program.global_block().vars,
+                                             opt_op)
+        for var in inputs.itervalues():
+            if type(var) == list:
+                varlist = var
+            else:
+                varlist = [var]
+            for var in varlist:
+                if not program.global_block().vars.has_key(var.name):
+                    program.global_block().create_var(
+                        name=var.name,
+                        persistable=var.persistable,
+                        dtype=var.dtype,
+                        shape=var.shape)
+
+        outputs = self._get_output_map_from_op(self.program.global_block().vars,
+                                               opt_op)
+
+        optimize_block.append_op(
            type=opt_op.type,
-            inputs=opt_op.inputs,
-            outputs=opt_op.outputs,
+            inputs=inputs,
+            outputs=outputs,
            attrs=opt_op.attrs)

    def get_pserver_program(self, endpoint):
@@ -465,26 +477,25 @@ class DistributeTranspiler:
                    dtype=v.dtype,
                    shape=v.shape)
        # step6
-        optimize_sub_program = Program()
+        optimize_block = pserver_program.create_block(0)
        # Iterate through the ops and append ops as needed
        for idx, opt_op in enumerate(self.optimize_ops):
            is_op_on_pserver = self._is_op_on_pserver(endpoint,
                                                      self.optimize_ops, idx)
            if not is_op_on_pserver:
                continue
-            if opt_op.inputs.has_key("Grad"):
-                self._append_pserver_ops(optimize_sub_program, pserver_program,
-                                         opt_op, endpoint)
+            if "Grad" in opt_op.desc.input_arg_names():
+                self._append_pserver_ops(optimize_block, opt_op, endpoint)
            else:
-                self._append_pserver_non_opt_ops(optimize_sub_program,
-                                                 pserver_program, opt_op)
+                self._append_pserver_non_opt_ops(optimize_block, opt_op)
+
        # Append the listen_and_serv op
        pserver_program.global_block().append_op(
            type="listen_and_serv",
            inputs={},
            outputs={},
            attrs={
-                "OptimizeBlock": optimize_sub_program.global_block(),
+                "OptimizeBlock": optimize_block,
                "endpoint": endpoint,
                "ParamList": [
                    p.name
@@ -499,6 +510,30 @@ class DistributeTranspiler:
        pserver_program.sync_with_cpp()
        return pserver_program

+    def _get_input_map_from_op(self, varmap, op):
+        iomap = dict()
+        for key in op.input_names:
+            vars = []
+            for varname in op.input(key):
+                vars.append(varmap[varname])
+            if len(vars) == 1:
+                iomap[key] = vars[0]
+            else:
+                iomap[key] = vars
+        return iomap
+
+    def _get_output_map_from_op(self, varmap, op):
+        iomap = dict()
+        for key in op.output_names:
+            vars = []
+            for varname in op.output(key):
+                vars.append(varmap[varname])
+            if len(vars) == 1:
+                iomap[key] = vars[0]
+            else:
+                iomap[key] = vars
+        return iomap
+
    def get_startup_program(self, endpoint, pserver_program):
        """
        Get startup program for current parameter server.
@@ -529,17 +564,21 @@ class DistributeTranspiler:

        # 2. rename op outputs
        for op in orig_s_prog.global_block().ops:
+            new_inputs = dict()
            new_outputs = dict()
            # do not append startup op if var is not on this pserver
            op_on_pserver = False
-            for key, var in op.outputs.iteritems():
-                newname, _ = _get_splited_name_and_shape(var.name)
+            for key in op.output_names:
+                newname, _ = _get_splited_name_and_shape(op.output(key)[0])
                if newname:
                    op_on_pserver = True
                    new_outputs[key] = created_var_map[newname]
-                elif var.name in pserver_vars:
+                elif op.output(key)[0] in pserver_vars:
                    op_on_pserver = True
-                    new_outputs[key] = pserver_vars[var.name]
+                    new_outputs[key] = pserver_vars[op.output(key)[0]]
+
+            # most startup program ops have no inputs
+            new_inputs = self._get_input_map_from_op(pserver_vars, op)

            if op_on_pserver:
                if op.type in [
@@ -548,7 +587,7 @@ class DistributeTranspiler:
                    op.attrs["shape"] = new_outputs["Out"].shape
                s_prog.global_block().append_op(
                    type=op.type,
-                    inputs=op.inputs,
+                    inputs=new_inputs,
                    outputs=new_outputs,
                    attrs=op.attrs)
        return s_prog
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -47,27 +47,13 @@ def as_numpy(tensor):
        return [as_numpy(t) for t in tensor]
    assert isinstance(tensor, core.LoDTensor)
    lod = tensor.lod()
-    tensor_data = np.array(tensor)
-    if len(lod) == 0:
-        ans = tensor_data
-    else:
-        raise RuntimeError("LoD Calculate lacks unit tests and buggy")
-    # elif len(lod) == 1:
-    #     ans = []
-    #     idx = 0
-    #     while idx < len(lod) - 1:
-    #         ans.append(tensor_data[lod[idx]:lod[idx + 1]])
-    #         idx += 1
-    # else:
-    #     for l in reversed(lod):
-    #         ans = []
-    #         idx = 0
-    #         while idx < len(l) - 1:
-    #             ans.append(tensor_data[l[idx]:l[idx + 1]])
-    #             idx += 1
-    #         tensor_data = ans
-    #     ans = tensor_data
-    return ans
+    if len(lod) > 0:
+        raise RuntimeError(
+            "Some of your featched tensors hold LoD information. \
+            They can not be completely cast to Python ndarray. \
+            Please set the parameter 'return_numpy' as 'False' to \
+            return LoDTensor itself directly.")
+    return np.array(tensor)


 def has_feed_operators(block, feed_targets, feed_holder_name):
@@ -306,7 +292,6 @@ class Executor(object):
            core.get_fetch_variable(scope, fetch_var_name, i)
            for i in xrange(len(fetch_list))
        ]
-
        if return_numpy:
            outs = as_numpy(outs)
        return outs
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -740,6 +740,9 @@ class Block(object):
            raise e
        self.desc.remove_op(start, end + 1)

+    def slice_ops(self, start, end):
+        return list(self.ops)[start:end]
+
    def prepend_op(self, *args, **kwargs):
        op_desc = self.desc.prepend_op()
        op = Operator(self, op_desc, *args, **kwargs)

--- a/python/paddle/v2/fluid/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
@@ -14,14 +14,37 @@

 import framework
 import numpy as np
+import contextlib

 __all__ = [
-    'Constant',
-    'Uniform',
-    'Normal',
-    'Xavier',
+    'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
+    'init_on_cpu'
 ]

+_force_init_on_cpu_ = False
+
+
+def force_init_on_cpu():
+    return _force_init_on_cpu_
+
+
+@contextlib.contextmanager
+def init_on_cpu():
+    """
+    Switch program with `with` statement
+
+    Examples:
+        >>> with init_on_cpu():
+        >>>   step = layers.create_global_var()
+
+    """
+    global _force_init_on_cpu_
+
+    pre_state = force_init_on_cpu()
+    _force_init_on_cpu_ = True
+    yield
+    _force_init_on_cpu_ = pre_state
+

 class Initializer(object):
    """Base class for variable initializers
@@ -80,7 +103,7 @@ class ConstantInitializer(Initializer):
    """Implements the constant initializer
    """

-    def __init__(self, value=0.0):
+    def __init__(self, value=0.0, force_cpu=False):
        """Constructor for ConstantInitializer

        Args:
@@ -89,6 +112,7 @@ class ConstantInitializer(Initializer):
        assert value is not None
        super(ConstantInitializer, self).__init__()
        self._value = value
+        self._force_cpu = force_cpu

    def __call__(self, var, block):
        """Add constant initialization ops for a variable
@@ -110,7 +134,8 @@ class ConstantInitializer(Initializer):
            attrs={
                "shape": var.shape,
                "dtype": int(var.dtype),
-                "value": self._value
+                "value": float(self._value),
+                'force_cpu': self._force_cpu or force_init_on_cpu()
            })
        var.op = op
        return op

--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -342,7 +342,11 @@ def save_inference_model(dirname,
    prepend_feed_ops(inference_program, feeded_var_names)
    append_fetch_ops(inference_program, fetch_var_names)

-    model_file_name = dirname + "/__model__"
+    if save_file_name == None:
+        model_file_name = dirname + "/__model__"
+    else:
+        model_file_name = dirname + "/__model_combined__"
+
    with open(model_file_name, "wb") as f:
        f.write(inference_program.desc.serialize_to_string())

@@ -384,7 +388,11 @@ def load_inference_model(dirname, executor, load_file_name=None):
    if not os.path.isdir(dirname):
        raise ValueError("There is no directory named '%s'", dirname)

-    model_file_name = dirname + "/__model__"
+    if load_file_name == None:
+        model_file_name = dirname + "/__model__"
+    else:
+        model_file_name = dirname + "/__model_combined__"
+
    with open(model_file_name, "rb") as f:
        program_desc_str = f.read()


--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -38,6 +38,7 @@ __all__ = [
    'array_write',
    'create_array',
    'less_than',
+    'equal',
    'array_read',
    'shrink_memory',
    'array_length',
@@ -276,21 +277,20 @@ class ParallelDo(object):
        parent_block = self.parent_block()

        local_inputs = set()
-
-        for op in current_block.ops:
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    local_inputs.add(out_var_name)
-
+        params = list()
        for var in self.inputs:
            local_inputs.add(var.name)

-        params = list()
        for op in current_block.ops:
            for iname in op.input_names:
                for in_var_name in op.input(iname):
                    if in_var_name not in local_inputs:
                        params.append(in_var_name)
+
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    local_inputs.add(out_var_name)
+
        params = list(set(params))

        return [parent_block.var(name) for name in params]
@@ -975,6 +975,36 @@ def less_than(x, y, cond=None, **ignored):
    return cond


+def equal(x, y, cond=None, **ignored):
+    """
+    **equal**
+
+    This layer returns the truth value of :math:`x == y` elementwise.
+
+    Args:
+        x(Variable): First operand of *equal*
+        y(Variable): Second operand of *equal*
+        cond(Variable|None): Optional output variable to store the result of *equal*
+
+    Returns:
+        Variable: The tensor variable storing the output of *equal*.
+
+    Examples:
+        .. code-block:: python
+
+          less = fluid.layers.equal(x=label, y=limit)
+    """
+    helper = LayerHelper("equal", **locals())
+    if cond is None:
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True
+
+    helper.append_op(
+        type='equal', inputs={'X': [x],
+                              'Y': [y]}, outputs={'Out': [cond]})
+    return cond
+
+
 def array_read(array, i):
    """This function performs the operation to read the data in as an
    LOD_TENSOR_ARRAY.

--- a/python/paddle/v2/fluid/layers/math_op_patch.py
+++ b/python/paddle/v2/fluid/layers/math_op_patch.py
@@ -14,6 +14,7 @@

 from ..framework import Variable, unique_name
 from layer_function_generator import OpProtoHolder
+from ..initializer import force_init_on_cpu

 __all__ = ['monkey_patch_variable']

@@ -36,9 +37,12 @@ def monkey_patch_variable():
        block.append_op(
            type="fill_constant",
            outputs={'Out': [var]},
-            attrs={'dtype': var.dtype,
-                   'shape': shape,
-                   'value': value})
+            attrs={
+                'dtype': var.dtype,
+                'shape': shape,
+                'value': value,
+                'force_cpu': force_init_on_cpu()
+            })
        return var

    def create_scalar(block, value, dtype):

--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -69,8 +69,9 @@ __all__ = [
    'reshape',
    'reshape_with_axis',
    'multiplex',
-    'prior_box'
+    'prior_box',
    'prior_boxes',
+    'layer_norm',
 ]


@@ -98,7 +99,7 @@ def fc(input,

    .. math::

-        Out = Act({\sum_{i=0}^{N-1}W_iX_i + b})
+        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})

    In the above equation:

@@ -190,7 +191,7 @@ def fc(input,
        helper.append_op(
            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
    # add bias
-    pre_activation = helper.append_bias_op(pre_bias)
+    pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
    # add activation
    return helper.append_activation(pre_activation)

@@ -416,12 +417,12 @@ def dynamic_lstmp(input,
    """
    **Dynamic LSTMP Layer**

-    LSTMP (LSTM with recurrent projection) layer has a separate projection 
-    layer after the LSTM layer, projecting the original hidden state to a 
-    lower-dimensional one, which is proposed to reduce the number of total 
-    parameters and furthermore computational complexity for the LSTM, 
-    espeacially for the case that the size of output units is relative 
-    large (https://research.google.com/pubs/archive/43905.pdf). 
+    LSTMP (LSTM with recurrent projection) layer has a separate projection
+    layer after the LSTM layer, projecting the original hidden state to a
+    lower-dimensional one, which is proposed to reduce the number of total
+    parameters and furthermore computational complexity for the LSTM,
+    espeacially for the case that the size of output units is relative
+    large (https://research.google.com/pubs/archive/43905.pdf).

    The formula is as follows:

@@ -447,27 +448,27 @@ def dynamic_lstmp(input,
          the matrix of weights from the input gate to the input).
    * :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \
          matrices for peephole connections. In our implementation, \
-          we use vectors to reprenset these diagonal weight matrices. 
+          we use vectors to reprenset these diagonal weight matrices.
    * :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \
-          bias vector). 
+          bias vector).
    * :math:`\sigma`: The activation, such as logistic sigmoid function.
    * :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \
          gate, and cell activation vectors, respectively, all of which have \
-          the same size as the cell output activation vector :math:`h`. 
+          the same size as the cell output activation vector :math:`h`.
    * :math:`h`: The hidden state.
-    * :math:`r`: The recurrent projection of the hidden state. 
+    * :math:`r`: The recurrent projection of the hidden state.
    * :math:`\\tilde{c_t}`: The candidate hidden state, whose \
          computation is based on the current input and previous hidden state.
-    * :math:`\odot`: The element-wise product of the vectors. 
+    * :math:`\odot`: The element-wise product of the vectors.
    * :math:`act_g` and :math:`act_h`: The cell input and cell output \
-          activation functions and `tanh` is usually used for them. 
+          activation functions and `tanh` is usually used for them.
    * :math:`\overline{act_h}`: The activation function for the projection \
          output, usually using `identity` or same as :math:`act_h`.

    Set `use_peepholes` to `False` to disable peephole connection. The formula
    is omitted here, please refer to the paper
    http://www.bioinf.jku.at/publications/older/2604.pdf for details.
-    
+
    Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
    operations on the input :math:`x_{t}` are NOT included in this operator.
    Users can choose to use fully-connected layer before LSTMP layer.
@@ -485,8 +486,8 @@ def dynamic_lstmp(input,

                               - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
                                                W_{fh}, W_{oh}`}.
-                               - The shape of hidden-hidden weight is (P x 4D), 
-                                 where P is the projection size and D the hidden 
+                               - The shape of hidden-hidden weight is (P x 4D),
+                                 where P is the projection size and D the hidden
                                 size.
                               - Projection weight = {:math:`W_{rh}`}.
                               - The shape of projection weight is (D x P).
@@ -531,9 +532,9 @@ def dynamic_lstmp(input,
            hidden_dim, proj_dim = 512, 256
            fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
                                     act=None, bias_attr=None)
-            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out, 
-                                                     size=hidden_dim * 4, 
-                                                     proj_size=proj_dim, 
+            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
+                                                     size=hidden_dim * 4,
+                                                     proj_size=proj_dim,
                                                     use_peepholes=False,
                                                     is_reverse=True,
                                                     cell_activation="tanh",
@@ -647,8 +648,8 @@ def dynamic_gru(input,
            Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".

    Returns:
-        Variable: The hidden state of GRU. The shape is (T \\times D), and lod \
-            is the same with the input.
+        Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
+            and lod is the same with the input.

    Examples:
        .. code-block:: python
@@ -996,7 +997,7 @@ def square_error_cost(input, label, **kwargs):
       label(Variable): Label tensor, has target labels.

    Returns:
-        Variable: The tensor variable storing the element-wise squared error
+        Variable: The tensor variable storing the element-wise squared error \
                  difference of input and label.

    Examples:
@@ -1220,7 +1221,7 @@ def conv2d(input,
       act(str): Activation type. Default: None

    Returns:
-        Variable: The tensor variable storing the convolution and
+        Variable: The tensor variable storing the convolution and \
                  non-linearity activation result.

    Raises:
@@ -1571,6 +1572,102 @@ def batch_norm(input,
    return helper.append_activation(batch_norm_out)


+def layer_norm(input,
+               scale=True,
+               shift=True,
+               begin_norm_axis=1,
+               epsilon=1e-05,
+               param_attr=None,
+               bias_attr=None,
+               act=None,
+               name=None):
+    """
+    **Layer Normalization**
+
+    Assume feature vectors exist on dimensions 
+    :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
+    along these dimensions for each feature vector :math:`a` with size
+    :math:`H`, then normalize each feature vector using the corresponding
+    statistics. After that, apply learnable gain and bias on the normalized
+    tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
+
+    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
+
+    The formula is as follows:
+
+    .. math::
+
+        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
+
+        \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
+
+        h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
+
+    Args:
+        input(Variable): The input tensor variable.
+        scale(bool): Whether to learn the adaptive gain :math:`g` after 
+            normalization.
+        shift(bool): Whether to learn the adaptive bias :math:`b` after 
+            normalization.
+        begin_norm_axis(bool): The normalization will be performed along 
+            dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
+        epsilon(float): The small value added to the variance to prevent 
+            division by zero.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            gain :math:`g`.
+        bias_attr(ParamAttr|None): The parameter attribute for the learnable
+            bias :math:`b`.
+        act(str): Activation to be applied to the output of layer normalizaiton.
+
+    Returns:
+        Variable: A tensor variable with the same shape as the input.
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+            x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
+    """
+    helper = LayerHelper('layer_norm', **locals())
+    dtype = helper.input_dtype()
+
+    # create intput and parameters
+    inputs = {'X': input}
+    input_shape = input.shape
+    param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:])]
+    if scale:
+        scale = helper.create_parameter(
+            attr=helper.param_attr,
+            shape=param_shape,
+            dtype=dtype,
+            default_initializer=Constant(1.0))
+        inputs['Scale'] = scale
+    if shift:
+        assert bias_attr is not False
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+        inputs['Bias'] = bias
+
+    # create output
+    mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    layer_norm_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="layer_norm",
+        inputs=inputs,
+        outputs={
+            "Y": layer_norm_out,
+            "Mean": mean_out,
+            "Variance": variance_out,
+        },
+        attrs={"epsilon": epsilon,
+               "begin_norm_axis": begin_norm_axis})
+
+    return helper.append_activation(layer_norm_out)
+
+
 def beam_search_decode(ids, scores, name=None):
    helper = LayerHelper('beam_search_decode', **locals())
    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
@@ -2531,7 +2628,8 @@ def ctc_greedy_decoder(input, blank, name=None):
                    interval [0, num_classes + 1).

    Returns:
-        Variable: CTC greedy decode result.
+        Variable: CTC greedy decode result. If all the sequences in result were
+        empty, the result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1].

    Examples:
        .. code-block:: python
@@ -3270,8 +3368,8 @@ def prior_boxes(input_layers,

        box, var = prior_box(input, image, min_size, max_size, aspect_ratio,
                             variance, flip, clip, step_w[i]
-                             if step_w else [], step_h[i]
-                             if step_w else [], offset)
+                             if step_w else 0.0, step_h[i]
+                             if step_w else 0.0, offset)

        box_results.append(box)
        var_results.append(var)

--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -65,6 +65,8 @@ __all__ = [
    'logical_or',
    'logical_xor',
    'logical_not',
+    'uniform_random',
+    'cumsum',
 ] + __activations__

 for _OP in set(__all__):

--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -16,7 +16,7 @@ from ..layer_helper import LayerHelper
 from ..param_attr import ParamAttr
 from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
-from ..initializer import Constant
+from ..initializer import Constant, force_init_on_cpu
 from ..core import DataType
 import numpy

@@ -69,12 +69,30 @@ def create_parameter(shape,
                                   default_initializer)


-def create_global_var(shape, value, dtype, persistable=False, name=None):
+def create_global_var(shape,
+                      value,
+                      dtype,
+                      persistable=False,
+                      force_cpu=False,
+                      name=None):
+    """
+    Create a global variable. such as global_step
+    Args:
+        shape(list[int]): shape of the variable
+        value(float): the value of the variable
+        dtype(string): element type of the parameter
+        persistable(bool): if this variable is persistable
+        force_cpu(bool): force this variable to be on CPU
+
+    Returns:
+        Variable: the created Variable
+    """
    helper = LayerHelper("global_var", **locals())
    var = helper.create_global_variable(
        dtype=dtype, shape=shape, persistable=persistable, name=name)
    helper.set_variable_initializer(
-        var, initializer=Constant(value=float(value)))
+        var, initializer=Constant(
+            value=float(value), force_cpu=force_cpu))
    return var


@@ -221,6 +239,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
        dtype(np.dtype|core.DataType|str): Data type of the output tensor.
        value(float): The constant value used to initialize the output tensor.
        out(Variable): The output tensor.
+        force_cpu(True|False): data should be on CPU if set true.

    Returns:
        Variable: The tensor variable storing the output.
@@ -242,7 +261,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
            'shape': shape,
            'dtype': out.dtype,
            'value': float(value),
-            'force_cpu': force_cpu
+            'force_cpu': force_cpu or force_init_on_cpu()
        })
    out.stop_gradient = True
    return out

--- a/python/paddle/v2/fluid/learning_rate_decay.py
+++ b/python/paddle/v2/fluid/learning_rate_decay.py
@@ -14,8 +14,12 @@

 import layers
 from framework import Variable
+from initializer import init_on_cpu

-__all__ = ['exponential_decay', 'natural_exp_decay', 'inverse_time_decay']
+__all__ = [
+    'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
+    'polynomial_decay', 'piecewise_decay'
+]
 """
 When training a model, it's often useful to decay the
 learning rate during training process, this is called
@@ -51,11 +55,14 @@ def exponential_decay(learning_rate,
    if not isinstance(global_step, Variable):
        raise ValueError("global_step is required for exponential_decay.")

-    # update learning_rate
-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = layers.floor(x=div_res)
-    return learning_rate * (decay_rate**div_res)
+    with init_on_cpu():
+        # update learning_rate
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = layers.floor(x=div_res)
+        decayed_lr = learning_rate * (decay_rate**div_res)
+
+    return decayed_lr


 def natural_exp_decay(learning_rate,
@@ -85,10 +92,13 @@ def natural_exp_decay(learning_rate,
    if not isinstance(global_step, Variable):
        raise ValueError("global_step is required for natural_exp_decay.")

-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = layers.floor(x=div_res)
-    return learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+    with init_on_cpu():
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = layers.floor(x=div_res)
+        decayed_lr = learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+
+    return decayed_lr


 def inverse_time_decay(learning_rate,
@@ -101,7 +111,7 @@ def inverse_time_decay(learning_rate,
    ```python
    if staircase:
      decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
-    else
+    else:
      decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
    ```
    Args:
@@ -118,8 +128,114 @@ def inverse_time_decay(learning_rate,
    if not isinstance(global_step, Variable):
        raise ValueError("global_step is required for inverse_time_decay.")

-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = layers.floor(x=div_res)
+    with init_on_cpu():
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = layers.floor(x=div_res)
+
+        decayed_lr = learning_rate / (1 + decay_rate * div_res)
+
+    return decayed_lr
+
+
+def polynomial_decay(learning_rate,
+                     global_step,
+                     decay_steps,
+                     end_learning_rate=0.0001,
+                     power=1.0,
+                     cycle=False):
+    """Applies polynomial decay to the initial learning rate.
+
+    ```python
+    if cycle:
+        decay_steps = decay_steps * ceil(global_step / decay_steps)
+    else:
+        global_step = min(global_step, decay_steps)
+    decayed_learning_rate = (learning_rate - end_learning_rate) *
+                      (1 - global_step / decay_steps) ^ power +
+                      end_learning_rate
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that record the training step.
+        decay_steps: A Python `int32` number.
+        end_learning_rate: A Python `float` number.
+        power: A Python `float` number
+        cycle: Boolean. If set true, decay the learning rate every decay_steps.

-    return learning_rate / (1 + decay_rate * div_res)
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for inverse_time_decay.")
+
+    with init_on_cpu():
+        if cycle:
+            div_res = layers.ceil(x=(global_step / decay_steps))
+            zero_var = layers.fill_constant(
+                shape=[1], dtype='float32', value=0.0)
+            one_var = layers.fill_constant(
+                shape=[1], dtype='float32', value=1.0)
+
+            with layers.Switch() as switch:
+                with switch.case(layers.equal(x=global_step, y=zero_var)):
+                    layers.assign(input=one_var, output=div_res)
+            decay_steps = decay_steps * div_res
+        else:
+            decay_steps_var = layers.fill_constant(
+                shape=[1], dtype='float32', value=float(decay_steps))
+            global_step = layers.elementwise_min(
+                x=global_step, y=decay_steps_var)
+
+        decayed_lr = (learning_rate - end_learning_rate) * \
+                     ((1 - global_step / decay_steps) ** power) + end_learning_rate
+    return decayed_lr
+
+
+def piecewise_decay(global_step, boundaries, values):
+    """Applies piecewise decay to the initial learning rate.
+
+    ```python
+    boundaries = [10000, 20000]
+    values = [1.0, 0.5, 0.1]
+
+    if step < 10000:
+        learning_rate = 1.0
+    elif step >= 10000 and step < 20000:
+        learning_rate = 0.5
+    else:
+        learning_rate = 0.1
+    ```
+    """
+
+    if len(values) - len(boundaries) != 1:
+        raise ValueError("len(values) - len(boundaries) should be 1")
+
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for piecewise_decay.")
+
+    with init_on_cpu():
+        lr = layers.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="learning_rate")
+
+        with layers.Switch() as switch:
+            for i in range(len(boundaries)):
+                boundary_val = layers.fill_constant(
+                    shape=[1], dtype='float32', value=float(boundaries[i]))
+                value_var = layers.fill_constant(
+                    shape=[1], dtype='float32', value=float(values[i]))
+                with switch.case(layers.less_than(global_step, boundary_val)):
+                    layers.assign(value_var, lr)
+            last_value_var = layers.fill_constant(
+                shape=[1],
+                dtype='float32',
+                value=float(values[len(values) - 1]))
+            with switch.default():
+                layers.assign(last_value_var, lr)
+
+    return lr
--- a/python/paddle/v2/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
@@ -92,14 +92,13 @@ class ControlFlowGraph(object):
        live_in = defaultdict(set)
        live_out = defaultdict(set)
        while True:
-            for i in range(self.op_size):
+            for i in range(self.op_size, 0, -1):
                live_in[i] = set(self._live_in[i])
                live_out[i] = set(self._live_out[i])
-                self._live_in[i] = self._uses[i] | (
-                    self._live_out[i] - self._defs[i])
                for s in self._successors[i]:
                    self._live_out[i] |= self._live_in[s]
-
+                self._live_in[i] = self._uses[i] | (
+                    self._live_out[i] - self._defs[i])
            if self._reach_fixed_point(live_in, live_out):
                break


--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -194,7 +194,7 @@ def scaled_dot_product_attention(queries,

    Returns:

-        Variable: A 3-D Tensor computed by multi-head scaled dot product
+        Variable: A 3-D Tensor computed by multi-head scaled dot product \
                  attention.

    Raises:
@@ -333,6 +333,7 @@ def scaled_dot_product_attention(queries,
            x=product, shape=[-1, product.shape[-1]], act="softmax"),
        shape=product.shape)
    if dropout_rate:
-        weights = layers.dropout(x, dropout_prob=dropout_rate, is_test=False)
+        weights = layers.dropout(
+            weights, dropout_prob=dropout_rate, is_test=False)
    ctx_multiheads = layers.matmul(weights, v)
    return __combine_heads(ctx_multiheads)
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -190,6 +190,8 @@ class Optimizer(object):
        # Create any accumulators
        program = loss.block.program
        with program_guard(program, startup_program):
+            global_block = framework.default_main_program().global_block()
+            start = len(global_block.ops)
            self.helper = LayerHelper(self.__class__.__name__)
            self._create_accumulators(loss.block,
                                      [p[0] for p in parameters_and_grads])
@@ -203,19 +205,14 @@ class Optimizer(object):
                                                           param_and_grad)
                    optimize_ops.append(optimize_op)

-            # Returned list of ops can include more ops in addition
-            # to optimization ops
-            return_ops = optimize_ops
-
            # Get custom finish ops for subclasses
            # FIXME: Need to fix this once we figure out how to handle dependencies
-            finish_ops = self._finish_update(loss.block)
-            if finish_ops is not None:
-                return_ops += finish_ops
+            self._finish_update(loss.block)

            if self._global_step is not None:
-                return_ops.append(self._increment_global_step(loss.block))
-            return return_ops
+                self._increment_global_step(loss.block)
+            end = len(global_block.ops)
+            return global_block.slice_ops(start, end)

    def minimize(self,
                 loss,

--- a/python/paddle/v2/fluid/tests/book/.gitignore
+++ b/python/paddle/v2/fluid/tests/book/.gitignore
-recognize_digits_*.inference.model
+*.inference.model
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -15,13 +15,13 @@
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import contextlib
+import numpy
 import unittest
+import math
+import sys


-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
+def train(use_cuda, save_dirname):
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')

    y_predict = fluid.layers.fc(input=x, size=1, act=None)
@@ -49,19 +49,59 @@ def main(use_cuda):

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
-        fluid.io.save_persistables(exe, "./fit_a_line.model/")
-        fluid.io.load_persistables(exe, "./fit_a_line.model/")
        for data in train_reader():
            avg_loss_value, = exe.run(fluid.default_main_program(),
                                      feed=feeder.feed(data),
                                      fetch_list=[avg_cost])
            print(avg_loss_value)
            if avg_loss_value[0] < 10.0:
+                if save_dirname is not None:
+                    fluid.io.save_inference_model(save_dirname, ['x'],
+                                                  [y_predict], exe)
                return
+            if math.isnan(float(avg_loss_value)):
+                sys.exit("got NaN loss, training failed.")
    raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
        avg_loss_value[0]))


+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    # The input's dimension should be 2-D and the second dim is 13
+    # The input data should be >= 0
+    batch_size = 10
+    tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
+    assert feed_target_names[0] == 'x'
+    results = exe.run(inference_program,
+                      feed={feed_target_names[0]: tensor_x},
+                      fetch_list=fetch_targets)
+    print("infer shape: ", results[0].shape)
+    print("infer results: ", results[0])
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "fit_a_line.inference.model"
+
+    train(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
 class TestFitALine(unittest.TestCase):
    def test_cpu(self):
        with self.program_scope_guard():

--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -17,6 +17,8 @@ from __future__ import print_function
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import contextlib
+import math
+import sys
 import numpy
 import unittest

@@ -145,6 +147,8 @@ def train(net_type, use_cuda, save_dirname):
                    loss_t, acc_t = exe.run(program=test_program,
                                            feed=feeder.feed(test_data),
                                            fetch_list=[avg_cost, acc])
+                    if math.isnan(float(loss_t)):
+                        sys.exit("got NaN loss, training failed.")
                    acc_list.append(float(acc_t))
                    avg_loss_list.append(float(loss_t))
                    break  # Use 1 segment for speeding up CI

--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -18,6 +18,7 @@ import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.dataset.conll05 as conll05
 import paddle.v2.fluid as fluid
+from paddle.v2.fluid.initializer import init_on_cpu
 import contextlib
 import time
 import unittest
@@ -167,7 +168,16 @@ def train(use_cuda, save_dirname=None):

    # TODO(qiao)
    # check other optimizers and check why out will be NAN
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+    global_step = fluid.layers.create_global_var(
+        shape=[1], value=0, dtype='float32', force_cpu=True, persistable=True)
+    sgd_optimizer = fluid.optimizer.SGD(
+        learning_rate=fluid.learning_rate_decay.exponential_decay(
+            learning_rate=0.0001,
+            global_step=global_step,
+            decay_steps=100000,
+            decay_rate=0.5,
+            staircase=True),
+        global_step=global_step)
    sgd_optimizer.minimize(avg_cost)

    # TODO(qiao)

--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
@@ -18,6 +18,8 @@ import paddle.v2 as paddle
 import sys
 import numpy
 import unittest
+import math
+import sys


 def parse_arg():
@@ -65,6 +67,7 @@ def conv_net(img, label):
        pool_size=2,
        pool_stride=2,
        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
@@ -75,7 +78,7 @@ def conv_net(img, label):
    return loss_net(conv_pool_2, label)


-def train(nn_type, use_cuda, parallel, save_dirname):
+def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
@@ -140,18 +143,22 @@ def train(nn_type, use_cuda, parallel, save_dirname):
                avg_loss_val = numpy.array(avg_loss_set).mean()
                if float(acc_val) > 0.85:  # test acc > 85%
                    if save_dirname is not None:
-                        fluid.io.save_inference_model(save_dirname, ["img"],
-                                                      [prediction], exe)
+                        fluid.io.save_inference_model(
+                            save_dirname, ["img"], [prediction],
+                            exe,
+                            save_file_name=save_param_filename)
                    return
                else:
                    print(
                        'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
                        format(pass_id, batch_id + 1,
                               float(avg_loss_val), float(acc_val)))
+                    if math.isnan(float(avg_loss_val)):
+                        sys.exit("got NaN loss, training failed.")
    raise AssertionError("Loss of recognize digits is too large")


-def infer(use_cuda, save_dirname=None):
+def infer(use_cuda, save_dirname=None, param_filename=None):
    if save_dirname is None:
        return

@@ -162,13 +169,14 @@ def infer(use_cuda, save_dirname=None):
    # the feed_target_names (the names of variables that will be feeded 
    # data using feed operators), and the fetch_targets (variables that 
    # we want to obtain data from using fetch operators).
-    [inference_program, feed_target_names,
-     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+    [inference_program, feed_target_names, fetch_targets
+     ] = fluid.io.load_inference_model(save_dirname, exe, param_filename)

    # The input's dimension of conv should be 4-D or 5-D.
    # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0].
+    batch_size = 1
    tensor_img = numpy.random.uniform(-1.0, 1.0,
-                                      [1, 1, 28, 28]).astype("float32")
+                                      [batch_size, 1, 28, 28]).astype("float32")

    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
    # and results will contain a list of data corresponding to fetch_targets.
@@ -178,36 +186,45 @@ def infer(use_cuda, save_dirname=None):
    print("infer results: ", results[0])


-def main(use_cuda, parallel, nn_type):
+def main(use_cuda, parallel, nn_type, combine):
    if not use_cuda and not parallel:
        save_dirname = "recognize_digits_" + nn_type + ".inference.model"
+        save_filename = None
+        if combine == True:
+            save_filename = "__params_combined__"
    else:
        save_dirname = None
+        save_filename = None

    train(
        nn_type=nn_type,
        use_cuda=use_cuda,
        parallel=parallel,
-        save_dirname=save_dirname)
-    infer(use_cuda=use_cuda, save_dirname=save_dirname)
+        save_dirname=save_dirname,
+        save_param_filename=save_filename)
+    infer(
+        use_cuda=use_cuda,
+        save_dirname=save_dirname,
+        param_filename=save_filename)


 class TestRecognizeDigits(unittest.TestCase):
    pass


-def inject_test_method(use_cuda, parallel, nn_type):
+def inject_test_method(use_cuda, parallel, nn_type, combine):
    def __impl__(self):
        prog = fluid.Program()
        startup_prog = fluid.Program()
        scope = fluid.core.Scope()
        with fluid.scope_guard(scope):
            with fluid.program_guard(prog, startup_prog):
-                main(use_cuda, parallel, nn_type)
+                main(use_cuda, parallel, nn_type, combine)

-    fn = 'test_{0}_{1}_{2}'.format(nn_type, 'cuda'
-                                   if use_cuda else 'cpu', 'parallel'
-                                   if parallel else 'normal')
+    fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda'
+                                       if use_cuda else 'cpu', 'parallel'
+                                       if parallel else 'normal', 'combine'
+                                       if combine else 'separate')

    setattr(TestRecognizeDigits, fn, __impl__)

@@ -216,7 +233,10 @@ def inject_all_tests():
    for use_cuda in (False, True):
        for parallel in (False, True):
            for nn_type in ('mlp', 'conv'):
-                inject_test_method(use_cuda, parallel, nn_type)
+                inject_test_method(use_cuda, parallel, nn_type, True)
+
+    # One unit-test for saving parameters as separate files
+    inject_test_method(False, False, 'mlp', False)


 inject_all_tests()

--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import math
+import sys
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.core as core
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
@@ -102,7 +104,8 @@ def get_mov_combined_features():

    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())

-    category_id = layers.data(name='category_id', shape=[1], dtype='int64')
+    category_id = layers.data(
+        name='category_id', shape=[1], dtype='int64', lod_level=1)

    mov_categories_emb = layers.embedding(
        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
@@ -112,7 +115,8 @@ def get_mov_combined_features():

    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())

-    mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64')
+    mov_title_id = layers.data(
+        name='movie_title', shape=[1], dtype='int64', lod_level=1)

    mov_title_emb = layers.embedding(
        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
@@ -142,23 +146,22 @@ def model():
    scale_infer = layers.scale(x=inference, scale=5.0)

    label = layers.data(name='score', shape=[1], dtype='float32')
-
    square_cost = layers.square_error_cost(input=scale_infer, label=label)
-
    avg_cost = layers.mean(x=square_cost)

-    return avg_cost
+    return scale_infer, avg_cost
+

+def train(use_cuda, save_dirname):
+    scale_infer, avg_cost = model()
+
+    # test program
+    test_program = fluid.default_main_program().clone()

-def main():
-    cost = model()
    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
-    opts = sgd_optimizer.minimize(cost)
+    opts = sgd_optimizer.minimize(avg_cost)

-    if USE_GPU:
-        place = core.CUDAPlace(0)
-    else:
-        place = core.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    exe = Executor(place)
    exe.run(framework.default_startup_program())
@@ -167,6 +170,8 @@ def main():
        paddle.reader.shuffle(
            paddle.dataset.movielens.train(), buf_size=8192),
        batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)

    feeding = {
        'user_id': 0,
@@ -182,7 +187,7 @@ def main():
    def func_feed(feeding, data):
        feed_tensors = {}
        for (key, idx) in feeding.iteritems():
-            tensor = core.LoDTensor()
+            tensor = fluid.LoDTensor()
            if key != "category_id" and key != "movie_title":
                if key == "score":
                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
@@ -209,14 +214,117 @@ def main():

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
-        for data in train_reader():
-            outs = exe.run(framework.default_main_program(),
+        for batch_id, data in enumerate(train_reader()):
+            # train a mini-batch
+            outs = exe.run(program=fluid.default_main_program(),
                           feed=func_feed(feeding, data),
-                           fetch_list=[cost])
+                           fetch_list=[avg_cost])
            out = np.array(outs[0])
-            if out[0] < 6.0:
-                # if avg cost less than 6.0, we think our code is good.
-                exit(0)
-
-
-main()
+            if (batch_id + 1) % 10 == 0:
+                avg_cost_set = []
+                for test_data in test_reader():
+                    avg_cost_np = exe.run(program=test_program,
+                                          feed=func_feed(feeding, test_data),
+                                          fetch_list=[avg_cost])
+                    avg_cost_set.append(avg_cost_np[0])
+                    break  # test only 1 segment for speeding up CI
+
+                # get test avg_cost
+                test_avg_cost = np.array(avg_cost_set).mean()
+                if test_avg_cost < 6.0:
+                    # if avg_cost less than 6.0, we think our code is good.
+                    if save_dirname is not None:
+                        fluid.io.save_inference_model(save_dirname, [
+                            "user_id", "gender_id", "age_id", "job_id",
+                            "movie_id", "category_id", "movie_title"
+                        ], [scale_infer], exe)
+                    return
+
+            if math.isnan(float(out[0])):
+                sys.exit("got NaN loss, training failed.")
+
+
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded
+    # data using feed operators), and the fetch_targets (variables that
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    def create_lod_tensor(data, lod=None):
+        tensor = fluid.LoDTensor()
+        if lod is None:
+            # Tensor, the shape is [batch_size, 1]
+            index = 0
+            lod_0 = [index]
+            for l in range(len(data)):
+                index += 1
+                lod_0.append(index)
+            lod = [lod_0]
+        tensor.set_lod(lod)
+
+        flattened_data = np.concatenate(data, axis=0).astype("int64")
+        flattened_data = flattened_data.reshape([len(flattened_data), 1])
+        tensor.set(flattened_data, place)
+        return tensor
+
+    # Use the first data from paddle.dataset.movielens.test() as input
+    assert feed_target_names[0] == "user_id"
+    user_id = create_lod_tensor([[1]])
+
+    assert feed_target_names[1] == "gender_id"
+    gender_id = create_lod_tensor([[1]])
+
+    assert feed_target_names[2] == "age_id"
+    age_id = create_lod_tensor([[0]])
+
+    assert feed_target_names[3] == "job_id"
+    job_id = create_lod_tensor([[10]])
+
+    assert feed_target_names[4] == "movie_id"
+    movie_id = create_lod_tensor([[783]])
+
+    assert feed_target_names[5] == "category_id"
+    category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
+
+    assert feed_target_names[6] == "movie_title"
+    movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
+                                    [[0, 5]])
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    results = exe.run(inference_program,
+                      feed={
+                          feed_target_names[0]: user_id,
+                          feed_target_names[1]: gender_id,
+                          feed_target_names[2]: age_id,
+                          feed_target_names[3]: job_id,
+                          feed_target_names[4]: movie_id,
+                          feed_target_names[5]: category_id,
+                          feed_target_names[6]: movie_title
+                      },
+                      fetch_list=fetch_targets,
+                      return_numpy=False)
+    print("inferred score: ", np.array(results[0]))
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the inference model
+    save_dirname = "recommender_system.inference.model"
+
+    train(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
+if __name__ == '__main__':
+    main(USE_GPU)
--- a/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py
+++ b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -18,6 +18,10 @@ import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
+import contextlib
+import math
+import sys
+import unittest
 from paddle.v2.fluid.executor import Executor

 dict_size = 30000
@@ -145,7 +149,7 @@ def seq_to_seq_net():
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)

-    return avg_cost
+    return avg_cost, prediction


 def to_lodtensor(data, place):
@@ -163,8 +167,16 @@ def to_lodtensor(data, place):
    return res


-def main():
-    avg_cost = seq_to_seq_net()
+def create_random_lodtensor(lod, place, low, high):
+    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
+    res = fluid.LoDTensor()
+    res.set(data, place)
+    res.set_lod([lod])
+    return res
+
+
+def train(use_cuda, save_dirname=None):
+    [avg_cost, prediction] = seq_to_seq_net()

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
    optimizer.minimize(avg_cost)
@@ -174,7 +186,7 @@ def main():
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)

-    place = core.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = Executor(place)

    exe.run(framework.default_startup_program())
@@ -185,6 +197,7 @@ def main():
            word_data = to_lodtensor(map(lambda x: x[0], data), place)
            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+
            outs = exe.run(framework.default_main_program(),
                           feed={
                               'source_sequence': word_data,
@@ -192,13 +205,86 @@ def main():
                               'label_sequence': trg_word_next
                           },
                           fetch_list=[avg_cost])
+
            avg_cost_val = np.array(outs[0])
            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                  " avg_cost=" + str(avg_cost_val))
+            if math.isnan(float(avg_cost_val[0])):
+                sys.exit("got NaN loss, training failed.")
            if batch_id > 3:
-                exit(0)
+                if save_dirname is not None:
+                    fluid.io.save_inference_model(
+                        save_dirname, ['source_sequence',
+                                       'target_sequence'], [prediction], exe)
+                return
+
            batch_id += 1


+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    lod = [0, 4, 10]
+    word_data = create_random_lodtensor(lod, place, low=0, high=1)
+    trg_word = create_random_lodtensor(lod, place, low=0, high=1)
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    assert feed_target_names[0] == 'source_sequence'
+    assert feed_target_names[1] == 'target_sequence'
+    results = exe.run(inference_program,
+                      feed={
+                          feed_target_names[0]: word_data,
+                          feed_target_names[1]: trg_word,
+                      },
+                      fetch_list=fetch_targets,
+                      return_numpy=False)
+    print(results[0].lod())
+    np_data = np.array(results[0])
+    print("Inference shape: ", np_data.shape)
+    print("Inference results: ", np_data)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "rnn_encoder_decoder.inference.model"
+
+    train(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
+class TestRnnEncoderDecoder(unittest.TestCase):
+    def test_cuda(self):
+        with self.scope_prog_guard():
+            main(use_cuda=True)
+
+    def test_cpu(self):
+        with self.scope_prog_guard():
+            main(use_cuda=False)
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
 if __name__ == '__main__':
-    main()
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
@@ -16,6 +16,8 @@ import unittest
 import paddle.v2.fluid as fluid
 import paddle.v2 as paddle
 import contextlib
+import math
+import sys


 def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
@@ -115,6 +117,8 @@ def main(word_dict, net_method, use_cuda):
            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
            if cost_val < 0.4 and acc_val > 0.8:
                return
+            if math.isnan(float(cost_val)):
+                sys.exit("got NaN loss, training failed.")
    raise AssertionError("Cost is too large for {0}".format(
        net_method.__name__))


--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -16,6 +16,8 @@ import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import unittest
 import os
+import math
+import sys


 def main(use_cuda, is_sparse, parallel):
@@ -112,6 +114,9 @@ def main(use_cuda, is_sparse, parallel):
                                  fetch_list=[avg_cost])
            if avg_cost_np[0] < 5.0:
                return
+            if math.isnan(float(avg_cost_np[0])):
+                sys.exit("got NaN loss, training failed.")
+
    raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))



--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -15,6 +15,8 @@
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import math
+import sys

 # need to fix random seed and training data to compare the loss
 # value accurately calculated by the default and the memory optimization
@@ -63,4 +65,6 @@ for pass_id in range(PASS_NUM):

        if avg_loss_value[0] < 10.0:
            exit(0)  # if avg cost less than 10.0, we think our code is good.
+        if math.isnan(float(avg_loss_value)):
+            sys.exit("got NaN loss, training failed.")
 exit(1)
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -18,6 +18,8 @@ import sys

 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import math
+import sys

 # need to fix random seed and training data to compare the loss
 # value accurately calculated by the default and the memory optimization
@@ -152,7 +154,10 @@ for pass_id in range(PASS_NUM):
        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
            pass_acc))
        # this model is slow, so if we can train two mini batch, we think it works properly.
+
        if i > 2:
            exit(0)
+        if math.isnan(float(loss)):
+            sys.exit("got NaN loss, training failed.")
        i += 1
 exit(1)
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -19,6 +19,8 @@ import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
+import math
+import sys

 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
@@ -137,6 +139,8 @@ def main():
                  " avg_cost=" + str(avg_cost_val))
            if batch_id > 2:
                exit(0)
+            if math.isnan(float(avg_cost_val)):
+                sys.exit("got NaN loss, training failed.")
            batch_id += 1



--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
--- a/python/paddle/v2/fluid/tests/test_cpp_reader.py
+++ b/python/paddle/v2/fluid/tests/test_cpp_reader.py
--- a/python/paddle/v2/fluid/tests/test_ctc_align.py
+++ b/python/paddle/v2/fluid/tests/test_ctc_align.py
--- a/python/paddle/v2/fluid/tests/test_cumsum_op.py
+++ b/python/paddle/v2/fluid/tests/test_cumsum_op.py
--- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
--- a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
+++ b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py