diff --git a/.gitignore b/.gitignore index ac56a3320ec85769d2c87c072512f5217eca0c24..59e650bdfe801c7e2ff19b6c0a9d60bed1e1ee10 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +paddle/operators/check_t.save +paddle/operators/check_tensor.ls +paddle/operators/tensor.save +python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/ +python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/ +python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/ *.DS_Store build/ build_doc/ diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 6bea7cf3022242ce48cc882915f7e71810937283..de94bd5008effef1bf0fd3a125d4aed56e1b7f81 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -181,7 +181,8 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "Release") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) + # nvcc 9 does not support -Os. Use Release flags instead + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 33ef6860e1d38f4e87c4431addf43f9f8a655fc2..1cb54ba2164fafbfce9f28a3e894ae5e78a9cd68 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -179,20 +179,24 @@ function(cc_library TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if (cc_library_SRCS) - if (cc_library_SHARED OR cc_library_shared) # build *.so + if(cc_library_SRCS) + if(cc_library_SHARED OR cc_library_shared) # build *.so add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) else() add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) endif() - if (cc_library_DEPS) + if(cc_library_DEPS) # Don't need link libwarpctc.so - if ("${cc_library_DEPS};" MATCHES "warpctc;") + if("${cc_library_DEPS};" MATCHES "warpctc;") list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() + # Support linking flags: --whole-archive (Linux) / -force_load (MacOS) + target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) + if("${cc_library_DEPS}" MATCHES "ARCHIVE_START") + list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END) + endif() add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) - target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) endif() # cpplint code style diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index e24613b94b422b7cdf9c6383c359fa92a4faf6ff..58c493fd7412cf9dbe507c9622d67dae33a5fb25 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -323,6 +323,12 @@ batch_norm .. autofunction:: paddle.v2.fluid.layers.batch_norm :noindex: +layer_norm +---------- + +.. autofunction:: paddle.v2.fluid.layers.layer_norm + :noindex: + beam_search_decode ------------------ diff --git a/doc/howto/rnn/index_cn.rst b/doc/howto/rnn/index_cn.rst index 9ecab5594cff47cde4700b7ce0f58013a960a16e..bcc8c2f46eb662ec3650e829a77992224dbbb8e7 100644 --- a/doc/howto/rnn/index_cn.rst +++ b/doc/howto/rnn/index_cn.rst @@ -1,4 +1,4 @@ -RNN相关模型 +RNN模型 =========== .. 
toctree:: diff --git a/doc/index_cn.rst b/doc/index_cn.rst index 63a78428583477792e309a3b3d26af340caccfca..0f645db6fc5d0f84bbe0cbb335677752e3a355ea 100644 --- a/doc/index_cn.rst +++ b/doc/index_cn.rst @@ -8,5 +8,4 @@ PaddlePaddle 文档 build_and_install/index_cn.rst howto/index_cn.rst dev/index_cn.rst - api/index_cn.rst faq/index_cn.rst diff --git a/doc/index_en.rst b/doc/index_en.rst index 5631381be087017c26b2a6a3984b3c5bdb49f12d..166f56c28f464563a0b36007f58cebb58c286916 100644 --- a/doc/index_en.rst +++ b/doc/index_en.rst @@ -8,4 +8,3 @@ PaddlePaddle Documentation build_and_install/index_en.rst howto/index_en.rst dev/index_en.rst - api/index_en.rst diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index dd2ed87252102aee6d384f37365d19305f19b281..3e344ea3790f57b0f53f36a40263dcdd326e67a9 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -162,9 +162,8 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, : prog_(prog), desc_(desc) { need_update_ = true; for (auto &op : other.ops_) { - ops_.emplace_back(new OpDesc(*op, this)); + ops_.emplace_back(new OpDesc(*op->Proto(), prog, this)); } - for (auto &it : other.vars_) { auto *var = new VarDesc(*it.second); vars_[it.first].reset(var); diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h index b679387b1124e42499df158767b6c7afe1afd0c6..146f0e9e71ea9101a8f6c71e6c023178f131f967 100644 --- a/paddle/framework/channel.h +++ b/paddle/framework/channel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc index a307abb4ed37880bb289a8373adf0d293382c97e..d7140dd10661c7b8582930b47872ab0b330c4d66 100644 --- a/paddle/framework/channel_test.cc +++ b/paddle/framework/channel_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,6 +25,26 @@ using paddle::framework::CloseChannel; using paddle::framework::details::Buffered; using paddle::framework::details::UnBuffered; +void RecevingOrderEqualToSendingOrder(Channel *ch) { + unsigned sum_send = 0; + std::thread t([&]() { + for (int i = 0; i < 5; i++) { + EXPECT_EQ(ch->Send(&i), true); + sum_send += i; + } + }); + for (int i = 0; i < 5; i++) { + int recv; + EXPECT_EQ(ch->Receive(&recv), true); + EXPECT_EQ(recv, i); + } + + CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 10U); + delete ch; +} + TEST(Channel, MakeAndClose) { using paddle::framework::details::Buffered; using paddle::framework::details::UnBuffered; @@ -137,9 +157,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { for (size_t i = 0; i < buffer_size; ++i) { EXPECT_EQ(ch->Receive(&out), - false); // after receiving residual values, return zeros. - // Note: we cannot check EXPECT_EQ(out, 0), because C++ doesn't - // define zero values like Go does. 
+ false); // receiving on closed channel should return false } delete ch; } @@ -158,7 +176,7 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { sum += i; } }); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec EXPECT_EQ(sum, 45U); CloseChannel(ch); @@ -166,31 +184,17 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { delete ch; } -TEST(Channel, SimpleUnbufferedChannelTest) { +TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) { auto ch = MakeChannel(0); - unsigned sum_send = 0; - std::thread t([&]() { - for (int i = 0; i < 5; i++) { - EXPECT_EQ(ch->Send(&i), true); - sum_send += i; - } - }); - for (int i = 0; i < 5; i++) { - int recv; - EXPECT_EQ(ch->Receive(&recv), true); - EXPECT_EQ(recv, i); - } + RecevingOrderEqualToSendingOrder(ch); +} - CloseChannel(ch); - t.join(); - EXPECT_EQ(sum_send, 10U); - delete ch; +TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) { + auto ch = MakeChannel(10); + RecevingOrderEqualToSendingOrder(ch); } -// This tests that closing a buffered channel also unblocks -// any receivers waiting on the channel -TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { - auto ch = MakeChannel(1); +void ChannelCloseUnblocksReceiversTest(Channel *ch) { size_t num_threads = 5; std::thread t[num_threads]; bool thread_ended[num_threads]; @@ -201,15 +205,14 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { t[i] = std::thread( [&](bool *p) { int data; - // All reads should return false EXPECT_EQ(ch->Receive(&data), false); *p = true; }, &thread_ended[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec - // Verify that all threads are blocked + // Verify that all the threads are blocked for (size_t i = 0; i < num_threads; i++) { EXPECT_EQ(thread_ended[i], false); } @@ -218,7 +221,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { // This should unblock all receivers CloseChannel(ch); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { @@ -226,13 +229,12 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { } for (size_t i = 0; i < num_threads; i++) t[i].join(); - delete ch; } -// This tests that closing a buffered channel also unblocks -// any senders waiting for channel to have write space -TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { - auto ch = MakeChannel(1); +void ChannelCloseUnblocksSendersTest(Channel *ch) { + using paddle::framework::details::Buffered; + using paddle::framework::details::UnBuffered; + size_t num_threads = 5; std::thread t[num_threads]; bool thread_ended[num_threads]; @@ -252,34 +254,56 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { } std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait - // Verify that atleast 4 threads are blocked - int ct = 0; - for (size_t i = 0; i < num_threads; i++) { - if (thread_ended[i] == false) ct++; + if (dynamic_cast *>(ch)) { + // If ch is Buffered, atleast 4 threads must be blocked. + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (!thread_ended[i]) ct++; + } + EXPECT_GE(ct, 4); + } else { + // If ch is UnBuffered, all the threads should be blocked. 
+ for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } } - // Atleast 4 threads must be blocked - EXPECT_GE(ct, 4); - // Explicitly close the thread // This should unblock all senders CloseChannel(ch); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { EXPECT_EQ(thread_ended[i], true); } - // Verify that only 1 send was successful - ct = 0; - for (size_t i = 0; i < num_threads; i++) { - if (send_success[i]) ct++; + if (dynamic_cast *>(ch)) { + // Verify that only 1 send was successful + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (send_success[i]) ct++; + } + // Only 1 send must be successful + EXPECT_EQ(ct, 1); } - // Only 1 send must be successful - EXPECT_EQ(ct, 1); for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +// This tests that closing a buffered channel also unblocks +// any receivers waiting on the channel +TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { + auto ch = MakeChannel(1); + ChannelCloseUnblocksReceiversTest(ch); + delete ch; +} + +// This tests that closing a buffered channel also unblocks +// any senders waiting for channel to have write space +TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(1); + ChannelCloseUnblocksSendersTest(ch); delete ch; } @@ -287,40 +311,7 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { // unblocks any receivers waiting for senders TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { auto ch = MakeChannel(0); - size_t num_threads = 5; - std::thread t[num_threads]; - bool thread_ended[num_threads]; - - // Launches threads that try to read and are blocked becausew of no writers - for (size_t i = 0; i < num_threads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec - - // Verify that all the threads are blocked - for (size_t i = 0; i < num_threads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - - // Explicitly close the thread - // This should unblock all receivers - CloseChannel(ch); - - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec - - // Verify that all threads got unblocked - for (size_t i = 0; i < num_threads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < num_threads; i++) t[i].join(); + ChannelCloseUnblocksReceiversTest(ch); delete ch; } @@ -328,40 +319,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { // unblocks any senders waiting for senders TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { auto ch = MakeChannel(0); - size_t num_threads = 5; - std::thread t[num_threads]; - bool thread_ended[num_threads]; - - // Launches threads that try to read and are blocked becausew of no writers - for (size_t i = 0; i < num_threads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data = 10; - EXPECT_EQ(ch->Send(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec - - // Verify that all the threads are blocked - for (size_t i = 0; i < num_threads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - - // Explicitly close the thread - // This should unblock all 
receivers - CloseChannel(ch); - - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec - - // Verify that all threads got unblocked - for (size_t i = 0; i < num_threads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < num_threads; i++) t[i].join(); + ChannelCloseUnblocksReceiversTest(ch); delete ch; } diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h index 77eebc9924954b8adbbadceb4ede57f0a21f05aa..227a4e4811f95441158150396b5b882815fd7844 100644 --- a/paddle/framework/details/buffered_channel.h +++ b/paddle/framework/details/buffered_channel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,6 +25,14 @@ namespace paddle { namespace framework { namespace details { +// Four of the properties of Buffered Channel: +// - A send to a full channel blocks temporarily until a receive from the +// channel or the channel is closed. +// - A receive from an empty channel blocks temporarily until a send to the +// channel or the channel is closed. +// - A send to a closed channel returns false immediately. +// - A receive from a closed channel returns false immediately. + template class Buffered : public paddle::framework::Channel { friend Channel* paddle::framework::MakeChannel(size_t); diff --git a/paddle/framework/details/cow_ptr.h b/paddle/framework/details/cow_ptr.h index 7e308ffb5a49876aa2c1833b3b7e2a2c7eb137aa..69bcea625288eba897e761a1d634f19c41dc0f79 100644 --- a/paddle/framework/details/cow_ptr.h +++ b/paddle/framework/details/cow_ptr.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/framework/details/cow_ptr_test.cc b/paddle/framework/details/cow_ptr_test.cc index 936954a2333e7e5d2a932abad641279db9ef7b9f..1f4a12bca0dcab2d146cc62cd7ce1c2d7abcddf9 100644 --- a/paddle/framework/details/cow_ptr_test.cc +++ b/paddle/framework/details/cow_ptr_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h index 6d50e820b2b625f932768d2ca671d999071f1ca6..31a40bcbcb3905f01aebefe89526f3cfba8cb8c7 100644 --- a/paddle/framework/details/op_registry.h +++ b/paddle/framework/details/op_registry.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h index 92a16b4d22bbb6a8c75157444aa8474f700603fe..6b5c2196cb2991051c48f7da8397d2f479ca4c58 100644 --- a/paddle/framework/details/unbuffered_channel.h +++ b/paddle/framework/details/unbuffered_channel.h @@ -23,6 +23,13 @@ namespace paddle { namespace framework { namespace details { +// Four of the properties of UnBuffered Channel: +// - A send to a channel blocks temporarily until a receive from the +// channel or the channel is closed. +// - A receive from a channel blocks temporarily until a send to the +// channel or the channel is closed. +// - A send to a closed channel returns false immediately. +// - A receive from a closed channel returns false immediately. template class UnBuffered : public paddle::framework::Channel { friend Channel* paddle::framework::MakeChannel(size_t); diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h index 2082f8bb76fb62bc36f033fecbd4eaa76d12d949..f51753453bee255a0592e9c5b0fb2d9aa380e109 100644 --- a/paddle/framework/grad_op_desc_maker.h +++ b/paddle/framework/grad_op_desc_maker.h @@ -122,6 +122,11 @@ class GradOpDescMakerBase { return it->second; } + template + inline const T& Attr(const std::string& name) const { + return boost::get(GetAttr(name)); + } + std::string ForwardOpType() const { return this->fwd_op_.Type(); } private: diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index be2b301619639106ac7b578e5a79cf33f4379e48..9de454428d9fd733aa70601f5012e77b9ceb2022 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -46,29 +46,7 @@ namespace framework { * 0 2 4 7 * 0 2 5 7 10 12 15 20 */ -struct LoD : public std::vector> { - using std::vector>::vector; - platform::Place place() const { - if (this->size() == 0) { - // Not Initialze Yet. 
- return platform::CPUPlace(); - } else { - return this->front().place(); - } - } - - void CopyFromCUDA() { - for (auto it = this->begin(); it != this->end(); ++it) { - it->CopyFromCUDA(); - } - } - - void CopyToPeer(platform::Place place) { - for (auto it = this->begin(); it != this->end(); ++it) { - it->CopyToPeer(place); - } - } -}; +using LoD = std::vector>; std::ostream& operator<<(std::ostream& os, const LoD& lod); std::ostream& operator<<(std::ostream& os, const LoDTensor& t); diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu index adea02e3b3fdcf4873de76ff91116f43ac9fe259..a28b7caf86c689d55808c4e7defecd37a5a03442 100644 --- a/paddle/framework/lod_tensor_test.cu +++ b/paddle/framework/lod_tensor_test.cu @@ -20,6 +20,7 @@ #include "paddle/platform/assert.h" #include +#include __global__ void test(size_t* a, int size) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; @@ -36,10 +37,9 @@ TEST(LoD, data) { lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); auto& v = lod[0]; - test<<<1, 1>>>(v.cuda_data(), v.size()); + paddle::platform::CUDAPlace gpu(0); + test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size()); cudaDeviceSynchronize(); - - v.CopyFromCUDA(); for (size_t i = 0; i < v.size(); ++i) { EXPECT_EQ(v[i], i * 2); } @@ -63,9 +63,8 @@ TEST(LoDTensor, LoDInGPU) { auto lod = lod_tensor.lod(); - test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size()); + test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size()); cudaDeviceSynchronize(); - lod.CopyFromCUDA(); for (size_t i = 0; i < src_lod[0].size(); ++i) { EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h index 5202775515d335ff81bb17e6ce21338c40041ca3..f776f0317a2bed69cf8795c2a12a467c50ba38d3 100644 --- a/paddle/framework/mixed_vector.h +++ b/paddle/framework/mixed_vector.h @@ -17,176 +17,347 @@ #include #include -#include "paddle/memory/memcpy.h" -#include "paddle/memory/memory.h" -#include "paddle/platform/device_context.h" -#include "paddle/platform/enforce.h" -#include "paddle/platform/place.h" +#include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" + +#include "glog/logging.h" namespace paddle { namespace framework { -/** - * @brief Vector support both cpu and gpu. - * host vector lifetime is same with Vector - * device vector is lazily malloc and modified. - */ - +// Vector implements the std::vector interface, and can get Data or +// MutableData from any place. The data will be synced implicitly inside. template -class Vector : public std::vector { +class Vector { public: - using std::vector::vector; + using value_type = T; + + // Default ctor. Create empty Vector + Vector() { InitEmpty(); } + + // Fill vector with value. The vector size is `count`. + explicit Vector(size_t count, const T& value = T()) { + if (count == 0) { + InitEmpty(); + } else { + resize(count); + T* ptr = begin(); + for (size_t i = 0; i < count; ++i) { + ptr[i] = value; + } + } + } + + // Ctor with init_list + Vector(std::initializer_list init) { + if (init.size() == 0) { + InitEmpty(); + } else { + InitByIter(init.size(), init.begin(), init.end()); + } + } + + // implicit cast from std::vector. 
+ template + Vector(const std::vector& dat) { // NOLINT + if (dat.size() == 0) { + InitEmpty(); + } else { + InitByIter(dat.size(), dat.begin(), dat.end()); + } + } + + // Copy ctor + Vector(const Vector& other) { this->operator=(other); } + + // Copy operator + Vector& operator=(const Vector& other) { + if (other.size() != 0) { + this->InitByIter(other.size(), other.begin(), other.end()); + } else { + InitEmpty(); + } + return *this; + } + + // Move ctor + Vector(Vector&& other) { + this->size_ = other.size_; + this->flag_ = other.flag_; + if (other.cuda_vec_.memory_size()) { + this->cuda_vec_.ShareDataWith(other.cuda_vec_); + } + if (other.cpu_vec_.memory_size()) { + this->cpu_vec_.ShareDataWith(other.cpu_vec_); + } + } + + // CPU data access method. Mutable. + T& operator[](size_t i) { + MutableCPU(); + return const_cast(cpu_vec_.data())[i]; + } + + // CPU data access method. Immutable. + const T& operator[](size_t i) const { + ImmutableCPU(); + return cpu_vec_.data()[i]; + } + + // std::vector iterator methods. Based on CPU data access method + size_t size() const { return size_; } + + T* begin() { return &this->operator[](0); } + + T* end() { return &this->operator[](size()); } + + T& front() { return *begin(); } + + T& back() { + auto it = end(); + --it; + return *it; + } + + const T* begin() const { return &this->operator[](0); } + const T* end() const { return &this->operator[](size()); } + + const T& back() const { + auto it = end(); + --it; + return *it; + } + + T* data() { return begin(); } + + const T* data() const { return begin(); } + + const T& front() const { return *begin(); } + // end of std::vector iterator methods + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + InitByIter(end - begin, begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + if (size_ + 1 > capacity()) { + reserve((size_ + 1) << 1); + } + *end() = elem; + ++size_; + } - Vector() {} - Vector(const std::vector &v) : std::vector(v) {} // NOLINT + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + size_t pre_size = size_; + resize(pre_size + (end - begin)); + T* ptr = this->begin() + pre_size; + for (; begin < end; ++begin, ++ptr) { + *ptr = *begin; + } + } - inline platform::Place place() const { return place_; } + // resize the vector + void resize(size_t size) { + if (size + 1 < capacity()) { + size_ = size; + } else { + MutableCPU(); + Tensor cpu_tensor; + platform::Place cpu = platform::CPUPlace(); + T* ptr = cpu_tensor.mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + const T* old_ptr = + cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data(); + if (old_ptr != nullptr) { + std::copy(old_ptr, old_ptr + size_, ptr); + } + size_ = size; + cpu_vec_.ShareDataWith(cpu_tensor); + } + } - /*! Return a pointer to constant memory block. */ - inline const T *data(platform::Place place) const; + // get cuda ptr. immutable + const T* CUDAData(platform::Place place) const { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "CUDA Data must on CUDA place"); + ImmutableCUDA(place); + return cuda_vec_.data(); + } - /*! Return a pointer to mutable memory block. */ - inline T *mutable_data(platform::Place place); + // get cuda ptr. 
mutable + T* CUDAMutableData(platform::Place place) { + const T* ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } - // TODO(dzhwinter): below interfaces should be removed - /* Get device vector */ - T *cuda_data() { - CopyToCUDA(); - PADDLE_ENFORCE_NOT_NULL( - cuda_ptr_, "No data or Insufficient CUDA memory to allocation"); - return static_cast(cuda_ptr_.get()); + // clear + void clear() { + size_ = 0; + flag_ = kDirty | kDataInCPU; } - /* Get host vector */ - T *data() { return std::vector::data(); } - const T *data() const { return std::vector::data(); } + size_t capacity() const { + return cpu_vec_.memory_size() / SizeOfType(typeid(T)); + } + + // reserve data + void reserve(size_t size) { + size_t pre_size = size_; + resize(size); + resize(pre_size); + } - T *data(const platform::Place &place) { - if (platform::is_cpu_place(place)) { + // the unify method to access CPU or CUDA data. immutable. + const T* Data(platform::Place place) const { + if (platform::is_gpu_place(place)) { + return CUDAData(place); + } else { return data(); + } + } + + // the unify method to access CPU or CUDA data. mutable. + T* MutableData(platform::Place place) { + if (platform::is_gpu_place(place)) { + return CUDAMutableData(place); } else { - return cuda_data(); + return data(); } } - /* Synchronize host vector to device vector */ - void CopyToCUDA(); - /* Synchronize device vector to host vector */ - void CopyFromCUDA(); - /* Switch device vector location */ - void CopyToPeer(platform::Place); + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { + std::vector result; + result.resize(size()); + std::copy(begin(), end(), result.begin()); + return result; + } + + bool operator==(const Vector& other) const { + if (size() != other.size()) return false; + for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) { + if (*it1 != *it2) { + return false; + } + } + return true; + } private: - std::shared_ptr cuda_ptr_; - size_t cuda_size_ = 0; // device vector numel - platform::CUDAPlace place_; -}; + void InitEmpty() { + size_ = 0; + flag_ = kDataInCPU; + } -template -inline const T *Vector::data(platform::Place place) const { - if (platform::is_cpu_place(place)) { - return std::vector::data(); - } else if (platform::is_gpu_place(place)) { - if (cuda_ptr_ == nullptr) { - return nullptr; - } - if (boost::get(place) == place_) { - return static_cast(cuda_ptr_.get()); + template + void InitByIter(size_t size, Iter begin, Iter end) { + platform::Place cpu = platform::CPUPlace(); + T* ptr = this->cpu_vec_.template mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + for (size_t i = 0; i < size; ++i) { + *ptr++ = *begin++; + } + flag_ = kDataInCPU | kDirty; + size_ = size; + } + + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. 
+ kDirty = 0x10 + }; + + void CopyToCPU() const { + // COPY GPU Data To CPU + Copy(cuda_vec_, platform::CPUPlace(), &cpu_vec_); + WaitPlace(cuda_vec_.place()); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + + void ImmutableCUDA(platform::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + Copy(cpu_vec_, boost::get(place), &cuda_vec_); + WaitPlace(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && !(place == cuda_vec_.place())) { + framework::Tensor tmp; + Copy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + cuda_vec_.ShareDataWith(tmp); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } } else { - PADDLE_THROW( - "Unmatched place. Please use `mutable_data` copy lod to the target " - "Place first."); + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + Copy(cpu_vec_, boost::get(place), &cuda_vec_); + WaitPlace(place); + SetFlag(kDataInCUDA); + } else if (!(place == cuda_vec_.place())) { + framework::Tensor tmp; + WaitPlace(cuda_vec_.place()); + Copy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + WaitPlace(place); + cuda_vec_.ShareDataWith(tmp); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } } - } else { - PADDLE_THROW("Unsupport Place."); } -} -template -inline T *Vector::mutable_data(platform::Place place) { - if (platform::is_cpu_place(place)) { - return std::vector::data(); - } else if (platform::is_gpu_place(place)) { - if (boost::get(place) != place_) { - place_ = boost::get(place); - } -#ifdef PADDLE_WITH_CUDA - if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) { - cuda_ptr_.reset( - memory::Alloc(place_, this->size() * sizeof(T)), - memory::PlainDeleter(place_)); - } - cuda_size_ = this->size(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *ctx = pool.GetByPlace(place_); - memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(), - static_cast(this->data()), - this->size() * sizeof(T), ctx->stream()); - ctx->Wait(); - return static_cast(cuda_ptr_.get()); -#else - return nullptr; -#endif - } else { - PADDLE_THROW("Unsupport Place."); - } -} + void ImmutableCPU() const { + if (IsDirty() && + !IsInCPU()) { // If data has been changed in CUDA, or CPU has no data. 
+ CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } -template -void Vector::CopyToCUDA() { -#ifdef PADDLE_WITH_CUDA - if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) { - cuda_ptr_.reset( - memory::Alloc(place_, this->size() * sizeof(T)), - memory::PlainDeleter(place_)); - } - cuda_size_ = this->size(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *ctx = pool.GetByPlace(place_); - memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(), - static_cast(this->data()), - this->size() * sizeof(T), ctx->stream()); - ctx->Wait(); -#endif -} + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } -template -void Vector::CopyFromCUDA() { -#ifdef PADDLE_WITH_CUDA - if (cuda_ptr_ == nullptr) { - LOG(WARNING) << "No uncommitted cuda data."; - return; - } - this->resize(cuda_size_); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *ctx = pool.GetByPlace(place_); - memory::Copy(platform::CPUPlace(), static_cast(this->data()), place_, - static_cast(cuda_ptr_.get()), - this->size() * sizeof(T), ctx->stream()); - ctx->Wait(); -#endif -} + bool IsDirty() const { return flag_ & kDirty; } -template -void Vector::CopyToPeer(platform::Place place) { -#ifdef PADDLE_WITH_CUDA - if (boost::get(place) != place_) { - place_ = boost::get(place); - } - if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) { - cuda_ptr_.reset( - memory::Alloc(place_, this->size() * sizeof(T)), - memory::PlainDeleter(place_)); - } - cuda_size_ = this->size(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *ctx = pool.GetByPlace(place_); - memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(), - static_cast(this->data()), - this->size() * sizeof(T), ctx->stream()); - ctx->Wait(); -#endif -} + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + static void WaitPlace(const platform::Place place) { + if (platform::is_gpu_place(place)) { + platform::DeviceContextPool::Instance() + .Get(boost::get(place)) + ->Wait(); + } + } + + mutable int flag_; + mutable Tensor cpu_vec_; + mutable Tensor cuda_vec_; + size_t size_; +}; } // namespace framework } // namespace paddle diff --git a/paddle/framework/mixed_vector_test.cu b/paddle/framework/mixed_vector_test.cu index 7b571788ad1ade50e05dc9a70cba35b83f8db3ea..f02db8f612c498ddd103a6b96e80d265e13c0f52 100644 --- a/paddle/framework/mixed_vector_test.cu +++ b/paddle/framework/mixed_vector_test.cu @@ -11,62 +11,83 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include #include -#include "gtest/gtest.h" -#include "paddle/framework/init.h" +#include "glog/logging.h" +#include "gtest/gtest.h" #include "paddle/framework/mixed_vector.h" - -using namespace paddle::framework; -using namespace paddle::platform; -using namespace paddle::memory; +#include "paddle/platform/gpu_info.h" template -__global__ void test(T* data, int size) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; - i += blockDim.x * gridDim.x) { - data[i] *= 2; +using vec = paddle::framework::Vector; + +TEST(mixed_vector, CPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10); + vec tmp2; + tmp2 = tmp; + ASSERT_EQ(tmp2.size(), 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp2[i], i); + ASSERT_EQ(tmp2[i], tmp[i]); + } + int cnt = 0; + for (auto& t : tmp2) { + ASSERT_EQ(t, cnt); + ++cnt; } } -TEST(Vector, Normal) { - // fill the device context pool. - InitDevices(); +static __global__ void multiply_10(int* ptr) { + for (int i = 0; i < 10; ++i) { + ptr[i] *= 10; + } +} + +cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { + return reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); +} - Vector vec({1, 2, 3}); - size_t* ptr = vec.data(); - for (size_t i = 0; i < vec.size(); ++i) { - EXPECT_EQ(vec[i], *(ptr + i)); +TEST(mixed_vector, GPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); } + ASSERT_EQ(tmp.size(), 10); + paddle::platform::CUDAPlace gpu(0); - vec.clear(); - vec.CopyFromCUDA(); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); - std::vector v = {1, 2, 3}; - for (size_t i = 0; i < v.size(); ++i) { - EXPECT_EQ(v[i], vec[i]); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 10); } } -TEST(Vector, MultipleCopy) { - InitDevices(); - Vector vec({1, 2, 3}); - CUDAPlace place(0); - vec.mutable_data(place); - auto vec2 = Vector(vec); - { - const size_t* ptr = vec2.data(CPUPlace()); - for (size_t i = 0; i < vec2.size(); ++i) { - EXPECT_EQ(*(ptr + i), vec[i]); - } +TEST(mixed_vector, MultiGPU) { + if (paddle::platform::GetCUDADeviceCount() < 2) { + LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " + "GPUs in your machine."; + return; + } + + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); } - test<<<3, 3>>>(vec2.mutable_data(place), vec2.size()); - vec2.CopyFromCUDA(); - { - const size_t* ptr = vec2.data(CPUPlace()); - for (size_t i = 0; i < vec2.size(); ++i) { - EXPECT_EQ(*(ptr + i), vec[i] * 2); - } + ASSERT_EQ(tmp.size(), 10); + paddle::platform::CUDAPlace gpu0(0); + paddle::platform::SetDeviceId(0); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); + paddle::platform::CUDAPlace gpu1(1); + auto* gpu1_ptr = tmp.MutableData(gpu1); + paddle::platform::SetDeviceId(1); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 100); } } diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index ea4028750248ec47f5094a67f736fb217216af6d..b51afe499bbc0e6b727aeeb4334f56e400ea81a5 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -125,11 +125,10 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block) // restore attrs_ for (const proto::OpDesc::Attr &attr : desc_.attrs()) { std::string attr_name = attr.name(); + // The sub_block referred to by the BLOCK attr hasn't been added + // to ProgramDesc class yet, we skip setting 
BLOCK attr here. if (attr.type() != proto::AttrType::BLOCK) { attrs_[attr_name] = GetAttrValue(attr); - } else { - auto bid = attr.block_idx(); - attrs_[attr_name] = prog->MutableBlock(bid); } } this->block_ = block; diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 5de9ae559c435439f30931c7840e54e0d2bb744c..6fb8532b2a807e287de7fab03402f8a290eabcf2 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -143,7 +143,7 @@ class OpKernelRegistrar : public Registrar { /** * Macro to register Operator. When the input is duplicable, you should - * use REGISTER_OP_EX with deop_empty_grad=false instead. + * use REGISTER_OP_EX with drop_empty_grad=false instead. */ #define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ grad_op_class) \ diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc index 15ea4035c6e6193105b621210a900e74d1466941..0e937dda4e185590648962a6d4f827eea21eb620 100644 --- a/paddle/framework/program_desc.cc +++ b/paddle/framework/program_desc.cc @@ -43,11 +43,20 @@ ProgramDesc::ProgramDesc() { ProgramDesc::ProgramDesc(const ProgramDesc &o) { desc_ = o.desc_; - for (int i = 0; i < desc_.blocks_size(); ++i) { auto *block = desc_.mutable_blocks(i); blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + } + } + } + } } ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { @@ -55,6 +64,16 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { for (auto &block_desc : *desc_.mutable_blocks()) { blocks_.emplace_back(new BlockDesc(this, &block_desc)); } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + } + } + } + } } ProgramDesc::ProgramDesc(const std::string &binary_str) { diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc index bff8e0bceaca9749101b2c45edddba526d565624..ddd6b993d40f72cba919fad95318f70409c98bca 100644 --- a/paddle/framework/prune.cc +++ b/paddle/framework/prune.cc @@ -49,11 +49,28 @@ bool IsTarget(const proto::OpDesc& op_desc) { return false; } -void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, - int block_id) { - // TODO(tonyyang-svail): - // - will change to use multiple blocks for RNN op and Cond Op +int GetSubBlockIndex(const proto::OpDesc& op_desc) { + for (auto& attr : op_desc.attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + PADDLE_ENFORCE(attr.has_block_idx()); + return attr.block_idx(); + } + } + return -1; +} + +bool HasSubBlock(const proto::OpDesc& op_desc) { + return GetSubBlockIndex(op_desc) > 0; +} +// block_id is the idx of the current block in the input desc +// parent_block_id is the idx of the parent of the current block +// in the output desc, -1 means the current block is global block +// dependent_vars is passed recursively from the parent block to +// the child block to help pruning +void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, + int block_id, int parent_block_id, + std::set& dependent_vars) { auto& block = input.blocks(block_id); auto& ops = block.ops(); @@ -72,11 
+89,9 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, expect_fetch = (op_desc.type() == kFetchOpType); } - std::set dependent_vars; std::vector should_run; for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { auto& op_desc = *op_iter; - if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) { // insert its input to the dependency graph for (auto& var : op_desc.inputs()) { @@ -84,7 +99,6 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, dependent_vars.insert(argu); } } - should_run.push_back(true); } else { should_run.push_back(false); @@ -95,45 +109,81 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, // we reverse the should_run vector std::reverse(should_run.begin(), should_run.end()); - *output = input; - auto* op_field = output->mutable_blocks(block_id)->mutable_ops(); + // copy the current block from input to output + auto* block_field = output->mutable_blocks(); + *block_field->Add() = input.blocks(block_id); + + int output_block_id = output->blocks_size() - 1; + auto* output_block = output->mutable_blocks(output_block_id); + output_block->set_idx(output_block_id); + output_block->set_parent_idx(parent_block_id); + + auto* op_field = output_block->mutable_ops(); op_field->Clear(); for (size_t i = 0; i < should_run.size(); ++i) { if (should_run[i]) { - *op_field->Add() = input.blocks(block_id).ops(i); + auto* op = op_field->Add(); + *op = input.blocks(block_id).ops(i); + if (HasSubBlock(*op)) { + // create sub_block_dependent_vars here to help prune the sub block + std::set sub_block_dependent_vars; + for (auto& var : op->inputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + for (auto& var : op->outputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc + // output_block_id is the idx of the current block in the output desc + prune_impl(input, output, GetSubBlockIndex(*op), output_block_id, + sub_block_dependent_vars); + } } } // remove the VarDescs in BlockDesc that are not referenced in // the pruned OpDescs std::unordered_map var_map; - auto* var_field = output->mutable_blocks(block_id)->mutable_vars(); + auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars(); for (const auto& var : *var_field) { var_map[var.name()] = var; } - var_field->Clear(); + std::set var_names; for (const auto& op : *op_field) { - // add VarDescs of all input arguments for each OpDesc auto& input_field = op.inputs(); for (auto& input_var : input_field) { for (auto& arg : input_var.arguments()) { - *var_field->Add() = var_map[arg]; + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } } } - // add VarDescs of all output arguments for each OpDesc auto& output_field = op.outputs(); for (auto& output_var : output_field) { for (auto& arg : output_var.arguments()) { - *var_field->Add() = var_map[arg]; + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } } } } + + var_field->Clear(); + for (const auto& name : var_names) { + *var_field->Add() = var_map[name]; + } } // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { - prune_impl(input, output, 0); + std::set dependent_vars; + output->clear_blocks(); + prune_impl(input, output, 0, -1, dependent_vars); } void inference_optimize_impl(const proto::ProgramDesc& 
input, diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index f0ea709a5c37e769e3ffa1b2e9d1e39721979251..be09b7c94507b99b5b4cbfe6f2039c74ec76b4a2 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -120,6 +120,7 @@ class Tensor { return holder_->type(); } + // memory size returns the holding memory size in byte. size_t memory_size() const; inline void check_memory_size() const; diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 1340c5e48520ccdd537e694abf452fd79129df99..652d6b8a90ebb8e761aa7ab01902802d93bfecae 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -52,7 +52,7 @@ struct SizeOfTypeFunctor { }; static inline size_t SizeOfType(std::type_index type) { - SizeOfTypeFunctor functor; + SizeOfTypeFunctor functor; size_t size = functor(type); PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); return size; @@ -61,15 +61,15 @@ static inline size_t SizeOfType(std::type_index type) { inline void Tensor::check_memory_size() const { PADDLE_ENFORCE_NOT_NULL( holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE_GE( - holder_->size(), memory_size() + offset_, + PADDLE_ENFORCE_LE( + numel() * SizeOfType(type()), memory_size(), "Tensor's dims_ is out of bound. Call Tensor::mutable_data " "first to re-allocate memory.\n" "or maybe the required data-type mismatches the data already stored."); } inline size_t Tensor::memory_size() const { - return holder_ == nullptr ? 0UL : numel() * SizeOfType(type()); + return holder_ == nullptr ? 0UL : holder_->size() - offset_; } template diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt index 654a6119bdc85f43b0cae631a9dc8f0ccd758889..bdb147955ca0700dc0854b54c38d961caf8845f3 100644 --- a/paddle/inference/CMakeLists.txt +++ b/paddle/inference/CMakeLists.txt @@ -4,19 +4,14 @@ cc_library(paddle_fluid_api SRCS io.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) -# Merge all modules into a single static library +# Create static library cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) # Create shared library -add_library(paddle_fluid_shared SHARED io.cc) - -target_circle_link_libraries(paddle_fluid_shared - ARCHIVE_START - ${GLOB_OP_LIB} - ${FLUID_CORE_MODULES} - ARCHIVE_END) - -SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) +cc_library(paddle_fluid_shared SHARED + SRCS io.cc + DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END) +set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(WITH_TESTING) add_subdirectory(tests/book) diff --git a/paddle/inference/io.cc b/paddle/inference/io.cc index 1ed14b69c83a7a0fb5a55db9c179df133407440c..784e87970f77857e7f3182df904dc0133c44d6c9 100644 --- a/paddle/inference/io.cc +++ b/paddle/inference/io.cc @@ -21,6 +21,17 @@ limitations under the License. 
*/ namespace paddle { namespace inference { +void ReadBinaryFile(const std::string& filename, std::string& contents) { + VLOG(3) << "loading model from " << filename; + std::ifstream inputfs(filename, std::ios::in | std::ios::binary); + inputfs.seekg(0, std::ios::end); + contents.clear(); + contents.resize(inputfs.tellg()); + inputfs.seekg(0, std::ios::beg); + inputfs.read(&contents[0], contents.size()); + inputfs.close(); +} + bool IsParameter(const framework::VarDesc* var, const framework::ProgramDesc& main_program) { if (var->Persistable()) { @@ -44,12 +55,15 @@ bool IsParameter(const framework::VarDesc* var, void LoadPersistables(framework::Executor& executor, framework::Scope& scope, + const framework::ProgramDesc& main_program, const std::string& dirname, - const framework::ProgramDesc& main_program) { + const std::string& param_filename) { const framework::BlockDesc& global_block = main_program.Block(0); framework::ProgramDesc* load_program = new framework::ProgramDesc(); framework::BlockDesc* load_block = load_program->MutableBlock(0); + std::vector paramlist; + for (auto* var : global_block.AllVars()) { if (IsParameter(var, main_program)) { VLOG(3) << "parameter's name: " << var->Name(); @@ -61,15 +75,33 @@ void LoadPersistables(framework::Executor& executor, new_var->SetLoDLevel(var->GetLoDLevel()); new_var->SetPersistable(true); - // append_op - framework::OpDesc* op = load_block->AppendOp(); - op->SetType("load"); - op->SetOutput("Out", {new_var->Name()}); - op->SetAttr("file_path", {dirname + "/" + new_var->Name()}); - op->CheckAttrs(); + if (!param_filename.empty()) { + paramlist.push_back(new_var->Name()); + } else { + // append_op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load"); + op->SetOutput("Out", {new_var->Name()}); + op->SetAttr("file_path", {dirname + "/" + new_var->Name()}); + op->CheckAttrs(); + } } } + + if (!param_filename.empty()) { + // sort paramlist to have consistent ordering + std::sort(paramlist.begin(), paramlist.end()); + // append just the load_combine op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load_combine"); + op->SetOutput("Out", paramlist); + op->SetAttr("file_path", {param_filename}); + op->CheckAttrs(); + } + executor.Run(*load_program, &scope, 0, true, true); + + VLOG(3) << "Ran loading successfully"; delete load_program; } @@ -77,20 +109,29 @@ std::unique_ptr Load(framework::Executor& executor, framework::Scope& scope, const std::string& dirname) { std::string model_filename = dirname + "/__model__"; - LOG(INFO) << "loading model from " << model_filename; - std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary); std::string program_desc_str; - inputfs.seekg(0, std::ios::end); - program_desc_str.resize(inputfs.tellg()); - inputfs.seekg(0, std::ios::beg); - LOG(INFO) << "program_desc_str's size: " << program_desc_str.size(); - inputfs.read(&program_desc_str[0], program_desc_str.size()); - inputfs.close(); + ReadBinaryFile(model_filename, program_desc_str); + + std::unique_ptr main_program( + new framework::ProgramDesc(program_desc_str)); + + LoadPersistables(executor, scope, *main_program, dirname, ""); + return main_program; +} + +std::unique_ptr Load( + framework::Executor& executor, + framework::Scope& scope, + const std::string& prog_filename, + const std::string& param_filename) { + std::string model_filename = prog_filename; + std::string program_desc_str; + ReadBinaryFile(model_filename, program_desc_str); std::unique_ptr main_program( new 
framework::ProgramDesc(program_desc_str)); - LoadPersistables(executor, scope, dirname, *main_program); + LoadPersistables(executor, scope, *main_program, "", param_filename); return main_program; } diff --git a/paddle/inference/io.h b/paddle/inference/io.h index 962b6c4e20d30de3cc28eae1c8c5c33b3ab5f6ac..a7d7c499690620740d8627e7f5085d728d67f7c3 100644 --- a/paddle/inference/io.h +++ b/paddle/inference/io.h @@ -26,12 +26,18 @@ namespace inference { void LoadPersistables(framework::Executor& executor, framework::Scope& scope, + const framework::ProgramDesc& main_program, const std::string& dirname, - const framework::ProgramDesc& main_program); + const std::string& param_filename); std::unique_ptr Load(framework::Executor& executor, framework::Scope& scope, const std::string& dirname); +std::unique_ptr Load(framework::Executor& executor, + framework::Scope& scope, + const std::string& prog_filename, + const std::string& param_filename); + } // namespace inference } // namespace paddle diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt index 63afeb18aebdf446c01cd4fdac13d238467801e4..9fe76afb582a13b741ab086f0c62d77e86d4e8bb 100644 --- a/paddle/inference/tests/book/CMakeLists.txt +++ b/paddle/inference/tests/book/CMakeLists.txt @@ -5,25 +5,30 @@ function(inference_test TARGET_NAME) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests) + set(arg_list "") if(inference_test_ARGS) foreach(arg ${inference_test_ARGS}) - cc_test(test_inference_${TARGET_NAME}_${arg} - SRCS test_inference_${TARGET_NAME}.cc - DEPS ARCHIVE_START paddle_fluid ARCHIVE_END - ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}_${arg}.inference.model) - set_tests_properties(test_inference_${TARGET_NAME}_${arg} - PROPERTIES DEPENDS test_${TARGET_NAME}) + list(APPEND arg_list "_${arg}") endforeach() else() - cc_test(test_inference_${TARGET_NAME} + list(APPEND arg_list "_") + endif() + foreach(arg ${arg_list}) + string(REGEX REPLACE "^_$" "" arg "${arg}") + cc_test(test_inference_${TARGET_NAME}${arg} SRCS test_inference_${TARGET_NAME}.cc DEPS ARCHIVE_START paddle_fluid ARCHIVE_END - ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}.inference.model) - set_tests_properties(test_inference_${TARGET_NAME} + ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) + set_tests_properties(test_inference_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) - endif() + endforeach() endfunction(inference_test) -inference_test(recognize_digits ARGS mlp) +inference_test(fit_a_line) inference_test(image_classification ARGS vgg resnet) inference_test(label_semantic_roles) +inference_test(recognize_digits ARGS mlp) +inference_test(recommender_system) +inference_test(rnn_encoder_decoder) +inference_test(understand_sentiment) +inference_test(word2vec) diff --git a/paddle/inference/tests/book/test_helper.h b/paddle/inference/tests/book/test_helper.h index 32db643fca2b026b674ea0b1ecd9aad5224e9e68..9774f3fbcb4af2bdf40f28456380f7e96189641c 100644 --- a/paddle/inference/tests/book/test_helper.h +++ b/paddle/inference/tests/book/test_helper.h @@ -30,6 +30,15 @@ void SetupTensor(paddle::framework::LoDTensor& input, } } +template +void SetupTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + std::vector& data) { + CHECK_EQ(paddle::framework::product(dims), static_cast(data.size())); + T* input_ptr = input.mutable_data(dims, 
paddle::platform::CPUPlace()); + memcpy(input_ptr, data.data(), input.numel() * sizeof(T)); +} + template void SetupLoDTensor(paddle::framework::LoDTensor& input, paddle::framework::LoD& lod, @@ -37,7 +46,18 @@ void SetupLoDTensor(paddle::framework::LoDTensor& input, T upper) { input.set_lod(lod); int dim = lod[0][lod[0].size() - 1]; - SetupTensor(input, {dim, 1}, lower, upper); + SetupTensor(input, {dim, 1}, lower, upper); +} + +template +void SetupLoDTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + paddle::framework::LoD lod, + std::vector& data) { + const size_t level = lod.size() - 1; + CHECK_EQ(dims[0], static_cast((lod[level]).back())); + input.set_lod(lod); + SetupTensor(input, dims, data); } template @@ -64,20 +84,35 @@ void CheckError(paddle::framework::LoDTensor& output1, count++; } } - EXPECT_EQ(count, 0) << "There are " << count << " different elements."; + EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; } -template +template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, std::vector& cpu_fetchs) { - // 1. Define place, executor and scope + // 1. Define place, executor, scope auto place = Place(); auto executor = paddle::framework::Executor(place); auto* scope = new paddle::framework::Scope(); - // 2. Initialize the inference_program and load all parameters from file - auto inference_program = paddle::inference::Load(executor, *scope, dirname); + // 2. Initialize the inference_program and load parameters + std::unique_ptr inference_program; + if (IsCombined) { + // All parameters are saved in a single file. + // Hard-coding the file names of program and parameters in unittest. + // Users are free to specify different filename + // (provided: the filenames are changed in the python api as well: io.py) + std::string prog_filename = "__model_combined__"; + std::string param_filename = "__params_combined__"; + inference_program = paddle::inference::Load(executor, + *scope, + dirname + "/" + prog_filename, + dirname + "/" + param_filename); + } else { + // Parameters are saved in separate files sited in the specified `dirname`. + inference_program = paddle::inference::Load(executor, *scope, dirname); + } // 3. Get the feed_target_names and fetch_target_names const std::vector& feed_target_names = diff --git a/paddle/inference/tests/book/test_inference_fit_a_line.cc b/paddle/inference/tests/book/test_inference_fit_a_line.cc new file mode 100644 index 0000000000000000000000000000000000000000..201a2801cd69f1ce43173c0072fb35a85aac54e6 --- /dev/null +++ b/paddle/inference/tests/book/test_inference_fit_a_line.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "gflags/gflags.h" +#include "test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, fit_a_line) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor input; + // The second dim of the input tensor should be 13 + // The input data should be >= 0 + int64_t batch_size = 10; + SetupTensor( + input, {batch_size, 13}, static_cast(0), static_cast(10)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/inference/tests/book/test_inference_image_classification.cc b/paddle/inference/tests/book/test_inference_image_classification.cc index 35ff9431e9734bc3d20e1281f9d5d7f3e98f7524..36ea7c77a75fc0540922eb0f9eb3899733a0afa2 100644 --- a/paddle/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/inference/tests/book/test_inference_image_classification.cc @@ -29,11 +29,15 @@ TEST(inference, image_classification) { // 0. Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + int64_t batch_size = 1; + paddle::framework::LoDTensor input; // Use normilized image pixels as input data, // which should be in the range [0.0, 1.0]. 
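A note on the IsCombined switch exercised by these tests: the sketch below shows how the two Load overloads declared in paddle/inference/io.h are intended to be selected, using the file names hard-coded in test_helper.h. LoadModel is a hypothetical wrapper written only for illustration; it is not part of this patch.

// Sketch only; relies on the Load declarations added in paddle/inference/io.h.
#include <memory>
#include <string>
#include "paddle/framework/executor.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
#include "paddle/inference/io.h"

std::unique_ptr<paddle::framework::ProgramDesc> LoadModel(
    paddle::framework::Executor& executor, paddle::framework::Scope& scope,
    const std::string& dirname, bool is_combined) {
  if (is_combined) {
    // One file for the program and one file for all parameters
    // (the names hard-coded in test_helper.h).
    return paddle::inference::Load(executor, scope,
                                   dirname + "/__model_combined__",
                                   dirname + "/__params_combined__");
  }
  // Program plus one file per parameter, all sited in `dirname`.
  return paddle::inference::Load(executor, scope, dirname);
}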
- SetupTensor( - input, {1, 3, 32, 32}, static_cast(0), static_cast(1)); + SetupTensor(input, + {batch_size, 3, 32, 32}, + static_cast(0), + static_cast(1)); std::vector cpu_feeds; cpu_feeds.push_back(&input); @@ -42,8 +46,7 @@ TEST(inference, image_classification) { cpu_fetchs1.push_back(&output1); // Run inference on CPU - TestInference( - dirname, cpu_feeds, cpu_fetchs1); + TestInference(dirname, cpu_feeds, cpu_fetchs1); LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA @@ -52,8 +55,7 @@ TEST(inference, image_classification) { cpu_fetchs2.push_back(&output2); // Run inference on CUDA GPU - TestInference( - dirname, cpu_feeds, cpu_fetchs2); + TestInference(dirname, cpu_feeds, cpu_fetchs2); LOG(INFO) << output2.dims(); CheckError(output1, output2); diff --git a/paddle/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/inference/tests/book/test_inference_label_semantic_roles.cc index 1eaf4022a1f27235fdd07e77e294eaba37a14249..922dbfd3338433a58632592667307e4da4dac9da 100644 --- a/paddle/inference/tests/book/test_inference_label_semantic_roles.cc +++ b/paddle/inference/tests/book/test_inference_label_semantic_roles.cc @@ -58,8 +58,7 @@ TEST(inference, label_semantic_roles) { cpu_fetchs1.push_back(&output1); // Run inference on CPU - TestInference( - dirname, cpu_feeds, cpu_fetchs1); + TestInference(dirname, cpu_feeds, cpu_fetchs1); LOG(INFO) << output1.lod(); LOG(INFO) << output1.dims(); @@ -69,8 +68,7 @@ TEST(inference, label_semantic_roles) { cpu_fetchs2.push_back(&output2); // Run inference on CUDA GPU - TestInference( - dirname, cpu_feeds, cpu_fetchs2); + TestInference(dirname, cpu_feeds, cpu_fetchs2); LOG(INFO) << output2.lod(); LOG(INFO) << output2.dims(); diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc index 48f887e6bc680087af4cce74b5c5422a4eba3726..af8c2b14c3b1651a3714de10422a1b5dd8e1519f 100644 --- a/paddle/inference/tests/book/test_inference_recognize_digits.cc +++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc @@ -29,6 +29,50 @@ TEST(inference, recognize_digits) { // 0. Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + int64_t batch_size = 1; + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [-1.0, 1.0]. + SetupTensor(input, + {batch_size, 1, 28, 28}, + static_cast(-1), + static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} + +TEST(inference, recognize_digits_combine) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + paddle::framework::LoDTensor input; // Use normilized image pixels as input data, // which should be in the range [-1.0, 1.0]. @@ -42,7 +86,7 @@ TEST(inference, recognize_digits) { cpu_fetchs1.push_back(&output1); // Run inference on CPU - TestInference( + TestInference( dirname, cpu_feeds, cpu_fetchs1); LOG(INFO) << output1.dims(); @@ -52,7 +96,7 @@ TEST(inference, recognize_digits) { cpu_fetchs2.push_back(&output2); // Run inference on CUDA GPU - TestInference( + TestInference( dirname, cpu_feeds, cpu_fetchs2); LOG(INFO) << output2.dims(); diff --git a/paddle/inference/tests/book/test_inference_recommender_system.cc b/paddle/inference/tests/book/test_inference_recommender_system.cc new file mode 100644 index 0000000000000000000000000000000000000000..ec24c7e6ab7d1573b02bed294f43053ee53e4e57 --- /dev/null +++ b/paddle/inference/tests/book/test_inference_recommender_system.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, recommender_system) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id, + category_id, movie_title; + + // Use the first data from paddle.dataset.movielens.test() as input + std::vector user_id_data = {1}; + SetupTensor(user_id, {batch_size, 1}, user_id_data); + + std::vector gender_id_data = {1}; + SetupTensor(gender_id, {batch_size, 1}, gender_id_data); + + std::vector age_id_data = {0}; + SetupTensor(age_id, {batch_size, 1}, age_id_data); + + std::vector job_id_data = {10}; + SetupTensor(job_id, {batch_size, 1}, job_id_data); + + std::vector movie_id_data = {783}; + SetupTensor(movie_id, {batch_size, 1}, movie_id_data); + + std::vector category_id_data = {10, 8, 9}; + SetupLoDTensor(category_id, {3, 1}, {{0, 3}}, category_id_data); + + std::vector movie_title_data = {1069, 4140, 2923, 710, 988}; + SetupLoDTensor(movie_title, {5, 1}, {{0, 5}}, movie_title_data); + + std::vector cpu_feeds; + cpu_feeds.push_back(&user_id); + cpu_feeds.push_back(&gender_id); + cpu_feeds.push_back(&age_id); + cpu_feeds.push_back(&job_id); + cpu_feeds.push_back(&movie_id); + cpu_feeds.push_back(&category_id); + cpu_feeds.push_back(&movie_title); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/inference/tests/book/test_inference_rnn_encoder_decoder.cc new file mode 100644 index 0000000000000000000000000000000000000000..248b9dce217232f1b88d74af9df31648f7779f98 --- /dev/null +++ b/paddle/inference/tests/book/test_inference_rnn_encoder_decoder.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, rnn_encoder_decoder) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor word_data, trg_word; + paddle::framework::LoD lod{{0, 4, 10}}; + + SetupLoDTensor( + word_data, lod, static_cast(0), static_cast(1)); + SetupLoDTensor( + trg_word, lod, static_cast(0), static_cast(1)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&word_data); + cpu_feeds.push_back(&trg_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/inference/tests/book/test_inference_understand_sentiment.cc new file mode 100644 index 0000000000000000000000000000000000000000..1afb644446569fc4fea8d7e9bf806daec016a5b2 --- /dev/null +++ b/paddle/inference/tests/book/test_inference_understand_sentiment.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, understand_sentiment) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor words; + paddle::framework::LoD lod{{0, 4, 10}}; + SetupLoDTensor(words, lod, static_cast(0), static_cast(10)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&words); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/inference/tests/book/test_inference_word2vec.cc b/paddle/inference/tests/book/test_inference_word2vec.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca0c040ff629c55b3392d67628ba93226c04a0ce --- /dev/null +++ b/paddle/inference/tests/book/test_inference_word2vec.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, word2vec) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word; + paddle::framework::LoD lod{{0, 1}}; + int64_t dict_size = 2072; // Hard-coding the size of dictionary + + SetupLoDTensor(first_word, lod, static_cast(0), dict_size); + SetupLoDTensor(second_word, lod, static_cast(0), dict_size); + SetupLoDTensor(third_word, lod, static_cast(0), dict_size); + SetupLoDTensor(fourth_word, lod, static_cast(0), dict_size); + + std::vector cpu_feeds; + cpu_feeds.push_back(&first_word); + cpu_feeds.push_back(&second_word); + cpu_feeds.push_back(&third_word); + cpu_feeds.push_back(&fourth_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 00cb6e9cafb4e79ed3d59cd4a6e40ea132e5efda..9a21e00b12bc2795e1bf1591f7db60c0245bacd3 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -101,9 +101,9 @@ struct SparseAdagradFunctor { SparseAdagradFunctorKernel< T, 256><<(context) - .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr, - param_data, moment_data, grad_width, - epsilon); + .stream()>>>( + grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr, + param_data, moment_data, grad_width, epsilon); } }; diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index bf536687d398b8342e6ae76a07c11e5fe47483e0..af2c3ecd725ed1c916ff3b8a0291794d35a70e8b 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -201,7 +201,7 @@ class AdamOpKernel : public framework::OpKernel { const T* grad_data = grad_tensor.template data(); int64_t* rows = nullptr; if (platform::is_gpu_place(ctx.GetPlace())) { - rows = grad_merge.mutable_rows()->cuda_data(); + rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace()); } else { rows = grad_merge.mutable_rows()->data(); } diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc index 930c295a9cb31238954efeb87ff5ac2d3ca7bdc6..51b5bcb38f9d60b1246f818de62275dba5b087f9 100644 --- a/paddle/operators/compare_op.cc +++ b/paddle/operators/compare_op.cc @@ -58,8 +58,8 @@ class CompareOpInferShape : public framework::InferShapeBase { comment.type); auto dim_x = context->GetInputDim("X"); auto dim_y = context->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y), - "The number of elements in X and Y should be same"); + PADDLE_ENFORCE_GE(dim_x.size(), dim_y.size(), + "The size of dim_y should not be greater than dim_x's."); context->SetOutputDim("Out", context->GetInputDim("X")); context->ShareLoD("X", "Out"); diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h index b275fd75b3512343825170fc38565dd27f7f1c75..79b8c6f59c7ad3d77aa969f6b4f36f8050cfe823 100644 --- a/paddle/operators/compare_op.h +++ b/paddle/operators/compare_op.h @@ -62,7 +62,7 @@ class CompareOpKernel 
z->mutable_data(context.GetPlace()); int axis = context.Attr("axis"); ElementwiseComputeEx(context, x, y, axis, - z); + Functor(), z); } }; diff --git a/paddle/operators/ctc_align_op.cu b/paddle/operators/ctc_align_op.cu index cea595d7c5d461b40198e622abf08248e7ca69e1..6406825d4a5c4538b5e2780efbe5ba86adce5b72 100644 --- a/paddle/operators/ctc_align_op.cu +++ b/paddle/operators/ctc_align_op.cu @@ -69,8 +69,9 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { auto stream = ctx.cuda_device_context().stream(); MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( - num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank, - merge_repeated, dev_out_lod0_ptr, output_data); + num_tokens, tokens, num_seq, + input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated, + dev_out_lod0_ptr, output_data); // set output lod std::vector host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end()); diff --git a/paddle/operators/cum_op.h b/paddle/operators/cum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e3813ac9036169345bd69d1d42db4618c6c45ebf --- /dev/null +++ b/paddle/operators/cum_op.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" +#include "paddle/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +class CumKernel : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + + void Compute(const framework::ExecutionContext& context) const override { + auto& X = detail::Ref(context.Input("X"), + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + + auto& Out = detail::Ref(context.Output("Out"), + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + int axis = context.Attr("axis"); + bool exclusive = context.Attr("exclusive"); + bool reverse = context.Attr("reverse"); + auto x_dims = X.dims(); + if (axis == -1) { + axis = x_dims.size() - 1; + } + PADDLE_ENFORCE_LT( + axis, x_dims.size(), + "axis should be less than the dimensiotn of the input tensor"); + Out.mutable_data(context.GetPlace()); + + int pre = 1; + int post = 1; + int mid = x_dims[axis]; + for (int i = 0; i < axis; ++i) { + pre *= x_dims[i]; + } + for (int i = axis + 1; i < x_dims.size(); ++i) { + post *= x_dims[i]; + } + + auto x = framework::EigenVector::Flatten(X); + auto out = framework::EigenVector::Flatten(Out); + auto* place = + context.template device_context().eigen_device(); + + using IndexT = Eigen::DenseIndex; + if (pre == 1) { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(mid), x, out, + /* axis= */ 0, reverse, exclusive); + } else { + ComputeImp(*place, Eigen::DSizes(mid, post), x, out, + /* axis= */ 0, reverse, exclusive); + } + } else { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(pre, mid), x, out, + /* axis= */ 1, reverse, exclusive); + } else { + 
ComputeImp(*place, Eigen::DSizes(pre, mid, post), x, out, + /* axis= */ 1, reverse, exclusive); + } + } + } + + private: + template + void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis, + bool reverse, bool exclusive) const { + if (!reverse) { + out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive); + } else { + std::array rev; + rev.fill(false); + rev[axis] = reverse; + out.reshape(dims).device(d) = + Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev); + } + } +}; + +template +struct CumsumFunctor { + using ELEMENT_TYPE = T; + template + const typename X::TensorScanSumOp operator()(X x, int axis, + bool exclusive) const { + return x.cumsum(axis, exclusive); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/cumsum_op.cc b/paddle/operators/cumsum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4933cc923d46ad053e16e060d94a39083709bbe1 --- /dev/null +++ b/paddle/operators/cumsum_op.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/cum_op.h" + +namespace paddle { +namespace operators { + +class CumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CumsumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Cumsum operator"); + AddOutput("Out", "Output of Cumsum operator"); + AddAttr("axis", + "(int, default -1). The dimenstion to accumulate along. " + "-1 means the last dimenstion") + .SetDefault(-1) + .EqualGreaterThan(-1); + AddAttr("exclusive", + "bool, default false). Whether to perform exclusive cumsum") + .SetDefault(false); + AddAttr("reverse", + "bool, default false). If true, the cumsum is performed in " + "the reversed direction") + .SetDefault(false); + AddComment(R"DOC( +The cumulative sum of the elements along a given axis. +By default, the first element of the result is the same of the first element of +the input. If exlusive is true, the first element of the result is 0. 
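As an editorial aside (not part of the operator's DOC string): a self-contained reference for the exclusive and reverse attributes described above, on a 1-D input. Cumsum below is a hypothetical helper written only to pin down the semantics; it is not the operator kernel.

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<int> Cumsum(std::vector<int> x, bool exclusive, bool reverse) {
  if (reverse) std::reverse(x.begin(), x.end());
  std::vector<int> out(x.size());
  int running = 0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (exclusive) {
      out[i] = running;   // first element of the result is 0
      running += x[i];
    } else {
      running += x[i];    // first element equals the first input element
      out[i] = running;
    }
  }
  if (reverse) std::reverse(out.begin(), out.end());
  return out;
}
// For x = {1, 2, 3, 4}:
//   Cumsum(x, /*exclusive=*/false, /*reverse=*/false) -> {1, 3, 6, 10}
//   Cumsum(x, /*exclusive=*/true,  /*reverse=*/false) -> {0, 1, 3, 6}
//   Cumsum(x, /*exclusive=*/false, /*reverse=*/true ) -> {10, 9, 7, 4}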
+)DOC"); + } +}; + +class CumsumGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("cumsum"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("axis", Attr("axis")); + grad_op->SetAttr("reverse", !Attr("reverse")); + grad_op->SetAttr("exclusive", Attr("exclusive")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker); +REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>) diff --git a/paddle/operators/cumsum_op.cu b/paddle/operators/cumsum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..90661c4269a9fadaba1446e04b3f661f1227a978 --- /dev/null +++ b/paddle/operators/cumsum_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/cum_op.h" + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>) diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index c32288d6984f126f2374a13973541f4f663b25a4..c24f97a85092ff14e8211ca8bc4bb9b155510a2c 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -35,7 +35,8 @@ class ElementwiseAddKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + AddFunctor(), z); } }; diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h index 07ebade31ff5b3d5c89156e28ff5fa0670a9a842..dc863cc598ec6015067f166b1544a5d20223662a 100644 --- a/paddle/operators/elementwise_div_op.h +++ b/paddle/operators/elementwise_div_op.h @@ -35,7 +35,8 @@ class ElementwiseDivKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + DivFunctor(), z); } }; diff --git a/paddle/operators/elementwise_max_op.h b/paddle/operators/elementwise_max_op.h index 717e45ab31db9b9a6629fb33e17654dbf986d8c5..67efe4e1511e054d54f91b5aa22ce28f222ed20a 100644 --- a/paddle/operators/elementwise_max_op.h +++ b/paddle/operators/elementwise_max_op.h @@ -35,7 +35,8 @@ class ElementwiseMaxKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = 
ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MaxFunctor(), z); } }; diff --git a/paddle/operators/elementwise_min_op.h b/paddle/operators/elementwise_min_op.h index 0de9a91c52b0ab82cd62604de318ce68e56b767d..cf11759404d3342b8a1c0080fa09f6cd57e735db 100644 --- a/paddle/operators/elementwise_min_op.h +++ b/paddle/operators/elementwise_min_op.h @@ -35,7 +35,8 @@ class ElementwiseMinKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MinFunctor(), z); } }; diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index ae7a71e0244dfb8ad3e55683ac081f92bc36bea5..773125f5ca54e7b529df47a2823d56a5ad71e50d 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -34,7 +34,8 @@ class ElementwiseMulKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MulFunctor(), z); } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 213fe1f5a818873e8b666464cb112637261c598c..74abf7c4a58788eb0e53025886f10f5a43021a9e 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -365,10 +365,10 @@ template void ElementwiseComputeEx(const framework::ExecutionContext& ctx, const framework::Tensor* x, - const framework::Tensor* y, int axis, + const framework::Tensor* y, int axis, Functor func, framework::Tensor* z) { TransformFunctor functor( - x, y, z, ctx.template device_context(), Functor()); + x, y, z, ctx.template device_context(), func); auto x_dims = x->dims(); auto y_dims = y->dims(); diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/operators/elementwise_pow_op.h index 874fd3f09f2afaccfbfca75799cc3448f7393b03..0c5dd031ec46ebecaabb701839c0f69c02678eb0 100644 --- a/paddle/operators/elementwise_pow_op.h +++ b/paddle/operators/elementwise_pow_op.h @@ -36,7 +36,8 @@ class ElementwisePowKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + PowFunctor(), z); } }; diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h index c2749a8e6ba689233dab4f3c72de10bf01f39fab..6a88c5f6b4c869f8ab5b4fa3b112ffc264be7145 100644 --- a/paddle/operators/elementwise_sub_op.h +++ b/paddle/operators/elementwise_sub_op.h @@ -34,7 +34,8 @@ class ElementwiseSubKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + SubFunctor(), z); } }; diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index 1c6d2ae4d05becaeed34d66cad398cc90f9d3ece..d9b774272cb7c9d87140bf30d2eabb44f49b2b7c 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -21,13 +21,6 @@ using Tensor = framework::Tensor; 
using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; -template -using EigenMatrixMapRowMajor = Eigen::Map< - Eigen::Matrix>; -template -using ConstEigenMatrixMapRowMajor = Eigen::Map< - const Eigen::Matrix>; - class LayerNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -108,7 +101,6 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Layer Normalization. - Layer Norm has been implemented as discussed in the paper: https://arxiv.org/abs/1607.06450 ... @@ -116,75 +108,6 @@ https://arxiv.org/abs/1607.06450 } }; -template -class LayerNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - auto *output = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); - output->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - - auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - - auto input_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); - - auto mean_map = EigenMatrixMapRowMajor(mean->data(), left, 1); - auto var_map = EigenMatrixMapRowMajor(var->data(), left, 1); - auto output_map = EigenMatrixMapRowMajor(output->data(), left, right); - - auto squre = [](T ele) { return ele * ele; }; - auto add_epslion = [epsilon](T ele) { return ele + epsilon; }; - - mean_map = input_map.rowwise().mean(); - var_map = (input_map - mean_map.replicate(1, right)) - .unaryExpr(squre) - .rowwise() - .mean() - .unaryExpr(add_epslion); - - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // TODO(zcd): Some thinking about output_map, is it appropriate that - // `output_map` and `input_map` point to the same memory. 
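For reference, the computation implemented both by the Eigen-map kernel being removed here and by its elementwise-functor replacement in layer_norm_op.h, written as a minimal per-row sketch. In the operator, scale and bias are per-column vectors of length `right`; they are kept scalar below only for brevity, and LayerNormRow is a hypothetical illustration.

#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> LayerNormRow(const std::vector<float>& x, float epsilon,
                                float scale, float bias) {
  float mean = 0.f, var = 0.f;
  for (float v : x) mean += v;
  mean /= x.size();
  for (float v : x) var += (v - mean) * (v - mean);
  var /= x.size();                       // biased variance, as in the kernel
  std::vector<float> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    y[i] = (x[i] - mean) / std::sqrt(var + epsilon) * scale + bias;
  }
  return y;
}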
- auto inv_std = var_map.unaryExpr(inv_std_func); - if (scale && bias) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)) + - bias_map.replicate(left, 1); - } else if (scale) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)); - } else if (bias) { - auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) + - bias_map.replicate(left, 1); - } else { - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)); - } - } -}; - class LayerNormGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -193,8 +116,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel { // check input PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LayerNormOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Scale"), - "Input(Scale) of LayerNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Mean"), "Input(Mean) of LayerNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Variance"), @@ -237,125 +158,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel { } }; -template -class LayerNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *mean = ctx.Input("Mean"); - const auto *var = ctx.Input("Variance"); - const auto *scale = ctx.Input("Scale"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - - // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - auto x_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); - auto d_y_map = ConstEigenMatrixMapRowMajor(d_y->data(), left, right); - auto mean_map = ConstEigenMatrixMapRowMajor(mean->data(), left, 1); - auto var_map = ConstEigenMatrixMapRowMajor(var->data(), left, 1); - - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - auto d_bias_map = EigenMatrixMapRowMajor(d_bias->data(), 1, right); - d_bias_map = d_y_map.colwise().sum(); - } - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - auto d_scale_map = - EigenMatrixMapRowMajor(d_scale->data(), 1, right); - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // There are two equation to compute d_scale. 
One uses "Y" and the other - // does not use "Y" - d_scale_map = - ((x_map - mean_map.replicate(1, right)) - .cwiseProduct( - var_map.unaryExpr(inv_std_func).replicate(1, right)) - .cwiseProduct(d_y_map)) - .colwise() - .sum(); - } - - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - auto d_x_map = EigenMatrixMapRowMajor(d_x->data(), left, right); - auto triple_product_func = [](T ele) { return ele * ele * ele; }; - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // TODO(zcd): these code can be refined - if (d_scale) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - // dy_dx - auto dx_end = var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .cwiseProduct(scale_map.replicate(left, 1)); - // dy_dmean_dx - auto dx_mean = (T(-1.0) / right) * - var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .cwiseProduct(scale_map.replicate(left, 1)) - .rowwise() - .sum() - .replicate(1, right); - // dy_var_dx - auto dvar_end_part = (x_map - mean_map.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)) - .cwiseProduct(d_y_map) - .rowwise() - .sum(); - auto dvar_end = var_map.unaryExpr(inv_std_func) - .unaryExpr(triple_product_func) - .cwiseProduct(dvar_end_part) - .replicate(1, right); - auto dx_var = - (T(-1.0) / right) * - (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); - - d_x_map = dx_end + dx_mean + dx_var; - } else { - // dy_dx - auto dx_end = var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map); - // dy_dmean_dx - auto dx_mean = (T(-1.0) / right) * - var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .rowwise() - .sum() - .replicate(1, right); - // dy_var_dx - auto dvar_end_part = (x_map - mean_map.replicate(1, right)) - .cwiseProduct(d_y_map) - .rowwise() - .sum(); - auto dvar_end = var_map.unaryExpr(inv_std_func) - .unaryExpr(triple_product_func) - .cwiseProduct(dvar_end_part) - .replicate(1, right); - auto dx_var = - (T(-1.0) / right) * - (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); - - d_x_map = dx_end + dx_mean + dx_var; - } - } - } -}; - } // namespace operators } // namespace paddle @@ -363,8 +165,9 @@ namespace ops = paddle::operators; REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, layer_norm_grad, ops::LayerNormGradOp); REGISTER_OP_CPU_KERNEL( - layer_norm, - ops::LayerNormKernel); + layer_norm, ops::LayerNormKernel, + ops::LayerNormKernel); REGISTER_OP_CPU_KERNEL( layer_norm_grad, - ops::LayerNormGradKernel); + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/paddle/operators/layer_norm_op.cu b/paddle/operators/layer_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..77d13b216f0e8d6d4434742908437f1eb74818c9 --- /dev/null +++ b/paddle/operators/layer_norm_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/layer_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/paddle/operators/layer_norm_op.h b/paddle/operators/layer_norm_op.h index bca35b91e6f52d35dee14aac9d080b52914942e3..3c436b89263758bbc0abcd1bb71cef3e1370d2a5 100644 --- a/paddle/operators/layer_norm_op.h +++ b/paddle/operators/layer_norm_op.h @@ -16,19 +16,222 @@ limitations under the License. */ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/elementwise_op_function.h" +#include "paddle/operators/math/math_function.h" + namespace paddle { namespace operators { +template +struct SubAndSquareFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } +}; + +template +struct DivAndSqrtFunctor { + explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } + inline HOSTDEVICE T operator()(T a, T b) const { + return a / (sqrt(b + epsilon_)); + } + + private: + T epsilon_; +}; + +template +struct MulFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } +}; + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +template +struct MulInvVarFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + return a * std::sqrt(1.0 / b); + } +}; + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + template class LayerNormKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override; + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto x = *ctx.Input("X"); + + auto *y = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + const auto x_dims = x.dims(); + + y->mutable_data(ctx.GetPlace()); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + x.Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + auto &dev_ctx = ctx.template device_context(); + math::RowwiseMean row_mean; + + // get mean + row_mean(dev_ctx, x, mean); + + // get variance + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); + row_mean(dev_ctx, out, var); + + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &out); + + if (scale) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); + } + if (bias) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); + } + } }; template class LayerNormGradKernel : public framework::OpKernel { public: - void Compute(const 
framework::ExecutionContext& ctx) const override; + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto x = *ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *mean = ctx.Input("Mean"); + auto *var = ctx.Input("Variance"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto d_y = *ctx.Input(framework::GradVarName("Y")); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + const auto &x_dims = x.dims(); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + d_y.Resize(matrix_shape); + auto &dev_ctx = ctx.template device_context(); + math::ColwiseSum colwise_sum; + + Tensor temp; + Tensor temp_norm; + if (d_scale || d_x) { + x.Resize(matrix_shape); + temp.mutable_data(matrix_shape, ctx.GetPlace()); + + if (!(bias && scale)) { + temp_norm.ShareDataWith(*y); + temp_norm.Resize(matrix_shape); + } else { + temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); + } + } + + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + colwise_sum(dev_ctx, d_y, d_bias); + } + if (d_scale) { + d_scale->mutable_data(ctx.GetPlace()); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); + colwise_sum(dev_ctx, temp, d_scale); + } + + if (d_x) { + framework::DDim vec_shape({left}); + d_x->mutable_data(ctx.GetPlace()); + auto dx_dim = d_x->dims(); + Tensor temp_vec; + temp_vec.mutable_data(vec_shape, ctx.GetPlace()); + + math::RowwiseMean row_mean; + + if (d_scale) { + // dy_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); + framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, temp, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } else { + // dy_dx + framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, d_y, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } + // dy_var_dx + row_mean(dev_ctx, temp, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); + + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), d_x); + d_x->Resize(dx_dim); + } + } }; } // namespace operators diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 07372808bbf078bd2e9b0bb5782b95a046253f46..9684b6d4612c8e134ccad658840bd028a8508085 
100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -125,7 +125,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { new_rows.resize(ids_dim[0]); auto gpu_place = boost::get(context.GetPlace()); - memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place, + // TODO(yuyang18): Strange code here. + memory::Copy(platform::CPUPlace(), + new_rows.CUDAMutableData(context.GetPlace()), gpu_place, ids_data, ids_dim[0] * sizeof(int64_t), stream); d_table->set_rows(new_rows); diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index dcf4b85e1aadf88e4b1ca70ac7e8b5416fc58cd8..ce0a5f6cff873166e3308a625978ecefaed2aa29 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -331,6 +331,12 @@ template struct RowwiseAdd; template struct ColwiseSum; template struct ColwiseSum; +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index d47a7f818ded61baf31e46ea3b8ae3101324111f..c0a107470a4629506fc06dabc78a4a4716be6649 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -325,6 +325,31 @@ void ColwiseSum::operator()( vector->data()); } +template struct RowwiseSum; +// template struct RowwiseSum; +// TODO(zcd): Following ColwiseSum format, need to confirm. +// The RowwiseSum failed in debug mode, +// and only failed for this case. So reimplemented it. +template <> +void RowwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]); + framework::Tensor one; + one.mutable_data({size}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + gemv( + context, true, static_cast(in_dims[1]), static_cast(in_dims[0]), + 1.0, one.data(), input.data(), 0.0, + vector->data()); +} + +template struct RowwiseMean; +template struct RowwiseMean; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index 8cc03c2ba0facae691a0d2b8a4f2ea768cfa5491..cb14d1e57468564710640773fdabd41896c178e0 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -128,6 +128,18 @@ struct ColwiseSum { framework::Tensor* vec); }; +template +struct RowwiseSum { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + +template +struct RowwiseMean { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index de591626df28e2bc3391b609f909612411398247..af4127788af0aaeb99199f7d6e2138a449b9fe51 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -87,6 +87,88 @@ class ColwiseSum { } }; +template +void RowwiseMean::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + 
PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.mean(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class RowwiseMean { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), height); + auto inv_size = 1.0 / size; + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum * inv_size; + } + } +}; + +template +void RowwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class RowwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum; + } + } +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index 8a1ebb58c26578f076bf243adfbd51d10c682b99..4e15d01a3071995e1412fed2a451e4ad3f171862 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -128,7 +128,7 @@ struct SelectedRowsAddTo { auto* in2_value = input2->mutable_value(); // concat rows - in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); + in2_rows.Extend(in1_rows.begin(), in1_rows.end()); auto in1_place = input1.place(); PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index acdd87cb3550bc5f3891aed6fefd4301a3395f9f..54a41a67d063fdc9e4453cb339d92d4a406cecc2 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -126,7 +126,8 @@ struct SelectedRowsAddTensor { dim3 grid(1, in1_rows.size()); SelectedRowsAddTensorKernel< T, block_size><<>>( - in1_data, in1_rows.cuda_data(), 
out_data, in1_row_numel); + in1_data, in1_rows.CUDAData(context.GetPlace()), out_data, + in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); @@ -153,7 +154,9 @@ struct SelectedRowsAddTo { auto* in2_value = input2->mutable_value(); // concat rows - in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); + if (in1_rows.size()) { + in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + } auto in1_place = input1.place(); PADDLE_ENFORCE(platform::is_gpu_place(in1_place)); @@ -216,7 +219,8 @@ struct SelectedRowsAddToTensor { dim3 grid(1, in1_rows.size()); SelectedRowsAddToTensorKernel< T, block_size><<>>( - in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel); + in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_row_numel); } }; @@ -283,9 +287,10 @@ struct MergeAdd { MergeAddKernel< T, 256><<(context) - .stream()>>>(input_data, input_rows.cuda_data(), out_data, - out.mutable_rows()->cuda_data(), - out.rows().size(), input_width); + .stream()>>>( + input_data, input_rows.CUDAData(context.GetPlace()), out_data, + out.mutable_rows()->CUDAMutableData(context.GetPlace()), + out.rows().size(), input_width); return out; } }; diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu index f27631271a42b4d64abef00d7f119b85e32edda4..eaed2c30a80c75d56aef329f6e6f67b8bac3520a 100644 --- a/paddle/operators/math/sequence2batch.cu +++ b/paddle/operators/math/sequence2batch.cu @@ -45,7 +45,6 @@ class CopyMatrixRowsFunctor { const framework::Tensor& src, framework::Vector index_lod, framework::Tensor& dst, bool is_src_index) { - size_t* index = index_lod.cuda_data(); auto src_dims = src.dims(); auto dst_dims = dst.dims(); PADDLE_ENFORCE_EQ(src_dims.size(), 2, @@ -63,7 +62,8 @@ class CopyMatrixRowsFunctor { dim3 grid(8, 1); auto stream = context.stream(); CopyMatrixRowsKernel<<>>( - src_data, dst_data, index, height, width, is_src_index); + src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height, + width, is_src_index); } }; diff --git a/paddle/operators/math/sequence_padding.cu b/paddle/operators/math/sequence_padding.cu index 65c9cfe4a0ec14d220ad237baa71703a783ed0fa..c2bd56448aa363160f6bf621ec67deff9e369c92 100644 --- a/paddle/operators/math/sequence_padding.cu +++ b/paddle/operators/math/sequence_padding.cu @@ -121,12 +121,12 @@ class PaddingLoDTensorFunctor { if (norm_by_times) { SequencePaddingKernel<<>>( padding_data, const_cast(seq_data), - abs_offset_lod[level].cuda_data(), sequence_width, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } else { SequencePaddingKernel<<>>( padding_data, const_cast(seq_data), - abs_offset_lod[level].cuda_data(), sequence_width, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } } @@ -196,12 +196,12 @@ class UnpaddingLoDTensorFunctor { if (norm_by_times) { SequencePaddingKernel<<>>( const_cast(padding_data), seq_data, - abs_offset_lod[level].cuda_data(), sequence_width, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } else { SequencePaddingKernel<<>>( const_cast(padding_data), seq_data, - abs_offset_lod[level].cuda_data(), sequence_width, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } } diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu index 
f66534a6812a66c737445ea96914a393077d7d65..c69bd3da7e741d3113de63fe08d22c23f772dda4 100644 --- a/paddle/operators/math/sequence_pooling.cu +++ b/paddle/operators/math/sequence_pooling.cu @@ -73,7 +73,8 @@ class MaxSeqPoolFunctor { dim3 grid(num_seq, 1); auto stream = context.stream(); KeMaxSequencePool<<>>( - in_data, starts.cuda_data(), out_data, max_index, num_seq, dim); + in_data, starts.CUDAData(context.GetPlace()), out_data, max_index, + num_seq, dim); } }; diff --git a/paddle/operators/math/sequence_scale.cu b/paddle/operators/math/sequence_scale.cu index fd4e28f6113729cd1fa9dc179bd9b601d29b8a7f..7cb9242db932ba8b2490f528ee08bd3f4b4e8f83 100644 --- a/paddle/operators/math/sequence_scale.cu +++ b/paddle/operators/math/sequence_scale.cu @@ -46,7 +46,8 @@ class ScaleLoDTensorFunctor { SequenceScaleKernel<<< num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( - seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width); + seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), + scales, seq_width); } }; diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc index 89045923f9ff2f33bc112b199c493047440e15c4..edb9de82509f9ee7619f5e90f49022de977a2ea4 100644 --- a/paddle/operators/parallel_do_op.cc +++ b/paddle/operators/parallel_do_op.cc @@ -79,9 +79,6 @@ inline void CopyOrShare(const framework::Variable &src, dst->GetMutable()->set_lod(src.Get().lod()); } else { Copy(src.Get(), dst_place, dst->GetMutable()); - framework::LoD lod(src.Get().lod()); - lod.CopyToPeer(dst_place); - dst->GetMutable()->set_lod(lod); } } else if (src.IsType()) { auto &src_sr = src.Get(); @@ -92,9 +89,6 @@ inline void CopyOrShare(const framework::Variable &src, dst_sr->set_rows(src_sr.rows()); } else { Copy(src_sr.value(), dst_place, dst_sr->mutable_value()); - framework::Vector lod(src_sr.rows()); - lod.CopyToPeer(dst_place); - dst_sr->set_rows(lod); } } else { PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); @@ -152,9 +146,6 @@ class ParallelDoOp : public framework::OperatorBase { auto *sub_scope = sub_scopes[i]; auto *dst = sub_scope->Var(param)->GetMutable(); framework::Copy(src, place, dst); - framework::LoD lod(src.lod()); - lod.CopyToPeer(place); - dst->set_lod(lod); } } WaitOnPlaces(places); diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu index b3825212e1ac41b13a2f4cad2c128da39c5f6e71..d1a6d119d3da605b1d455d38f38a8808234b8ad1 100644 --- a/paddle/operators/row_conv_op.cu +++ b/paddle/operators/row_conv_op.cu @@ -307,7 +307,7 @@ class RowConvKernel int input_dim = X->dims()[1]; int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.cuda_data(); + size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); auto stream = context.cuda_device_context().stream(); if (future_context <= 32) { @@ -345,7 +345,7 @@ class RowConvGradKernel int input_dim = X->dims()[1]; int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.cuda_data(); + size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); auto &device_ctx = context.cuda_device_context(); math::SetConstant zero; diff --git a/paddle/operators/sequence_erase_op.cu b/paddle/operators/sequence_erase_op.cu index a5311f15f0c607c880a6f12c0bef10b2dd8c8a79..4a7217cfd656f4f6b46d5a80a9c8e165c839df1d 100644 --- a/paddle/operators/sequence_erase_op.cu +++ b/paddle/operators/sequence_erase_op.cu @@ -87,8 +87,7 @@ class SequenceEraseOpCUDAKernel : 
public framework::OpKernel { // Copy LoD to GPU auto lod0 = lod[0]; auto lod_len = lod0.size(); - thrust::device_vector dev_in_lod = lod0; - size_t* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data()); + const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace()); // Calc output LoD thrust::device_vector dev_out_lod(lod_len); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 29f5aa3542c26c76a1b80da61ec6752019216131..d27befe4460550f7b7b30aa93a23c8e51aa52da9 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -102,8 +102,8 @@ class SGDOpCUDAKernel : public framework::OpKernel { dim3 grid(1, in_rows.size()); SparseSGDFunctorKernel< T, 256><<>>( - in_data, in_rows.cuda_data(), learning_rate->data(), out_data, - in_row_numel); + in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), + out_data, in_row_numel); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); diff --git a/paddle/operators/target_assign_op.h b/paddle/operators/target_assign_op.h index 82fca5724c0bd9fbfb60a98b91944700bfab9cdf..574919e1ef8d28c2a27b73b97a91d29e89896a6b 100644 --- a/paddle/operators/target_assign_op.h +++ b/paddle/operators/target_assign_op.h @@ -137,8 +137,8 @@ class TargetAssignKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]); } - size_t* gt_lod_data = gt_lod.data(ctx.GetPlace()); - size_t* neg_lod_data = neg_lod.data(ctx.GetPlace()); + size_t* gt_lod_data = gt_lod.MutableData(ctx.GetPlace()); + size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace()); TargetAssignFunctor functor(box_data, label_data, match_idx_data, gt_lod_data, background_label, num, diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index de53fea0dd692167d61fcca552cc834a7916e209..d62f34030894e2fa21925bbc44e24b4e7d738d15 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -7,7 +7,3 @@ if(WITH_PYTHON) target_link_libraries(paddle_pybind rt) endif(NOT APPLE AND NOT ANDROID) endif(WITH_PYTHON) - -if(WITH_DOC) - cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB}) -endif(WITH_DOC) diff --git a/paddle/pybind/print_operators_doc.cc b/paddle/pybind/print_operators_doc.cc deleted file mode 100644 index b55ddee17616ced4de659be8e55acd5e072c66b7..0000000000000000000000000000000000000000 --- a/paddle/pybind/print_operators_doc.cc +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include -#include // std::stringstream -#include - -#include "paddle/framework/op_info.h" -#include "paddle/framework/op_registry.h" -#include "paddle/pybind/pybind.h" - -std::string Escape(const std::string& s) { - std::string r; - for (size_t i = 0; i < s.size(); i++) { - switch (s[i]) { - case '\"': - r += "\\\""; - break; - case '\\': - r += "\\\\"; - break; - case '\n': - r += "\\n"; - break; - case '\t': - r += "\\t"; - case '\r': - break; - default: - r += s[i]; - break; - } - } - return r; -} - -std::string AttrType(paddle::framework::proto::AttrType at) { - switch (at) { - case paddle::framework::proto::INT: - return "int"; - case paddle::framework::proto::FLOAT: - return "float"; - case paddle::framework::proto::STRING: - return "string"; - case paddle::framework::proto::BOOLEAN: - return "bool"; - case paddle::framework::proto::INTS: - return "int array"; - case paddle::framework::proto::FLOATS: - return "float array"; - case paddle::framework::proto::STRINGS: - return "string array"; - case paddle::framework::proto::BOOLEANS: - return "bool array"; - case paddle::framework::proto::BLOCK: - return "block id"; - case paddle::framework::proto::LONG: - return "long"; - } - return "UNKNOWN"; // not possible -} - -void PrintVar(const paddle::framework::proto::OpProto::Var& v, - std::stringstream& ss) { - ss << " { " - << "\n" - << " \"name\" : \"" << Escape(v.name()) << "\",\n" - << " \"comment\" : \"" << Escape(v.comment()) << "\",\n" - << " \"duplicable\" : " << v.duplicable() << ",\n" - << " \"intermediate\" : " << v.intermediate() << "\n" - << " },"; -} - -void PrintAttr(const paddle::framework::proto::OpProto::Attr& a, - std::stringstream& ss) { - ss << " { " - << "\n" - << " \"name\" : \"" << Escape(a.name()) << "\",\n" - << " \"type\" : \"" << AttrType(a.type()) << "\",\n" - << " \"comment\" : \"" << Escape(a.comment()) << "\",\n" - << " \"generated\" : " << a.generated() << "\n" - << " },"; -} - -void PrintOpProto(const std::string& type, - const paddle::framework::OpInfo& opinfo, - std::stringstream& ss) { - std::cerr << "Processing " << type << "\n"; - - const paddle::framework::proto::OpProto* p = opinfo.proto_; - if (p == nullptr) { - return; // It is possible that an operator doesn't have OpProto. 
- } - - ss << "{\n" - << " \"type\" : \"" << Escape(p->type()) << "\",\n" - << " \"comment\" : \"" << Escape(p->comment()) << "\",\n"; - - ss << " \"inputs\" : [ " - << "\n"; - for (int i = 0; i < p->inputs_size(); i++) { - PrintVar(p->inputs(i), ss); - } - ss.seekp(-1, ss.cur); // remove the trailing comma - ss << " ], " - << "\n"; - - ss << " \"outputs\" : [ " - << "\n"; - for (int i = 0; i < p->outputs_size(); i++) { - PrintVar(p->outputs(i), ss); - } - ss.seekp(-1, ss.cur); // remove the trailing comma - ss << " ], " - << "\n"; - - ss << " \"attrs\" : [ " - << "\n"; - for (int i = 0; i < p->attrs_size(); i++) { - PrintAttr(p->attrs(i), ss); - } - ss.seekp(-1, ss.cur); // remove the trailing comma - ss << " ] " - << "\n"; - - ss << "},"; -} - -int main() { - std::stringstream ss; - ss << "[\n"; - for (auto& iter : paddle::framework::OpInfoMap::Instance().map()) { - PrintOpProto(iter.first, iter.second, ss); - } - ss.seekp(-1, ss.cur); // remove the trailing comma - ss << "]\n"; - std::cout << ss.str(); -} diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index ba496db5f834efe767bfe446a46877932faa81a0..2f8dd48efe1d7252f5befd2f682219661a60ba03 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -118,8 +118,6 @@ EOF make -j `nproc` gen_proto_py make -j `nproc` paddle_python make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs - make -j `nproc` print_operators_doc - paddle/pybind/print_operators_doc > doc/en/html/operators.json popd fi diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 4af4ac4f5e43543449ae922d7eb2a5740372f68f..486c094a6ac3c9ec260e231ba2333c718dfc164b 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -10,8 +10,6 @@ cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON make -j `nproc` gen_proto_py make -j `nproc` paddle_python make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs -make -j `nproc` print_operators_doc -paddle/pybind/print_operators_doc > doc/en/html/operators.json # check websites for broken links linkchecker doc/en/html/index.html diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index fd8c4a69da897cc39f31f435036e32c41285fb59..ab84f1c292b97f55d88165e7ef0e32b93d542802 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/memory/memory.h" int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); std::vector new_argv; std::string gflags_env; for (int i = 0; i < argc; ++i) { @@ -35,7 +36,6 @@ int main(int argc, char** argv) { int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); - testing::InitGoogleTest(&argc, argv); paddle::memory::Used(paddle::platform::CPUPlace()); #ifdef PADDLE_WITH_CUDA diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 3ee58393c72c0b6f9bec96be51ad3946752a35dd..73acbf3e009965f9eaaade77d2fe4cf4f99d4379 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -29,7 +29,7 @@ import optimizer import learning_rate_decay import backward import regularizer -from param_attr import ParamAttr +from param_attr import ParamAttr, WeightNormParamAttr from data_feeder import DataFeeder from core import LoDTensor, CPUPlace, CUDAPlace from distribute_transpiler import DistributeTranspiler @@ -41,11 +41,26 @@ import profiler Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ - 'io', 'initializer', 'layers', 'nets', 'optimizer', 'learning_rate_decay', - 'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', - 'ParamAttr' - 'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler', - 'memory_optimize', 'profiler' + 'io', + 'initializer', + 'layers', + 'nets', + 'optimizer', + 'learning_rate_decay', + 'backward', + 'regularizer', + 'LoDTensor', + 'CPUPlace', + 'CUDAPlace', + 'Tensor', + 'ParamAttr', + 'WeightNormParamAttr', + 'DataFeeder', + 'clip', + 'SimpleDistributeTranspiler', + 'DistributeTranspiler', + 'memory_optimize', + 'profiler', ] diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py index 121b407cae41fa477843b7252ebacc9053d5f7aa..cd89dba72db2470c1d5bafd4523c21c49e7e2b53 100644 --- a/python/paddle/v2/fluid/distribute_transpiler.py +++ b/python/paddle/v2/fluid/distribute_transpiler.py @@ -300,6 +300,9 @@ class DistributeTranspiler: pass return orig_shape + def _op_input_var(self, op, varname): + pass + def _is_op_on_pserver(self, endpoint, all_ops, idx): """ Recursively check if the op need to run on current server. @@ -309,44 +312,51 @@ class DistributeTranspiler: p.name for p in self.param_grad_ep_mapping[endpoint]["params"] ] op = all_ops[idx] - if op.inputs.has_key("Param"): - if op.inputs["Param"].name in param_names: + input_names = set(op.input_names) + # TODO(typhoonzero): using Param and Grad input name to identify + # that the operator is an optimization operator, need a better way. 
+ if "Param" in input_names: + if op.input("Param")[0] in param_names: return True else: for n in param_names: - if same_or_split_var(n, op.inputs[ - "Param"].name) and n != op.inputs["Param"].name: + if same_or_split_var(n, op.input("Param")[0]) \ + and n != op.input("Param")[0]: return True return False else: j = idx - 1 while j >= 0: prev_op = all_ops[j] - prev_output_names = [o.name for o in prev_op.outputs.values()] - prev_input_names = [o.name for o in prev_op.inputs.values()] + # prev_output_names = [o.name for o in prev_op.outputs.values()] + # prev_input_names = [o.name for o in prev_op.inputs.values()] + # NOTE(typhoonzero): consider list input/output + prev_output_names = prev_op.desc.output_arg_names() + prev_input_names = prev_op.desc.input_arg_names() found1 = False found2 = False - for _, v in op.inputs.iteritems(): - if v.name in prev_output_names: + for varname in op.desc.input_arg_names(): + if varname in prev_output_names: found1 = self._is_op_on_pserver(endpoint, all_ops, j) # later ops may produce output for prev op's next batch use. - for _, v in op.outputs.iteritems(): - if v.name in prev_input_names: + for varname in op.desc.output_arg_names(): + if varname in prev_input_names: found2 = self._is_op_on_pserver(endpoint, all_ops, j) if found1 or found2: return True j -= 1 return False - def _append_pserver_ops(self, program, pserver_program, opt_op, endpoint): + def _append_pserver_ops(self, optimize_block, opt_op, endpoint): + program = optimize_block.program new_inputs = dict() # update param/grad shape first, then other inputs like # moment can use the updated shape - for key, var in opt_op.inputs.iteritems(): + for key in opt_op.input_names: if key == "Grad": grad_block = None for g in self.param_grad_ep_mapping[endpoint]["grads"]: - if same_or_split_var(g.name, var.name): + if same_or_split_var(g.name, opt_op.input(key)[0]): grad_block = g break if not grad_block: @@ -362,11 +372,11 @@ class DistributeTranspiler: if self.trainers > 1: vars2merge = self._create_var_for_trainers( program.global_block(), grad_block, self.trainers) - program.global_block().append_op( + optimize_block.append_op( type="sum", inputs={"X": vars2merge}, outputs={"Out": merged_var}) - program.global_block().append_op( + optimize_block.append_op( type="scale", inputs={"X": merged_var}, outputs={"Out": merged_var}, @@ -376,7 +386,7 @@ class DistributeTranspiler: # param is already created on global program param_block = None for p in self.param_grad_ep_mapping[endpoint]["params"]: - if same_or_split_var(p.name, var.name): + if same_or_split_var(p.name, opt_op.input(key)[0]): param_block = p break if not param_block: @@ -389,11 +399,12 @@ class DistributeTranspiler: new_inputs[key] = tmpvar - for key, var in opt_op.inputs.iteritems(): + for key in opt_op.input_names: if key in ["Param", "Grad"]: continue # update accumulator variable shape param_shape = new_inputs["Param"].shape + var = program.global_block().vars[opt_op.input(key)[0]] new_shape = self._get_optimizer_input_shape(opt_op.type, key, var.shape, param_shape) tmpvar = program.global_block().create_var( @@ -402,40 +413,41 @@ class DistributeTranspiler: dtype=var.dtype, shape=new_shape) new_inputs[key] = tmpvar - # create var in pserver program global block. - # TODO(typhoonzero): put blocks in one program to avoid create two - # variables. 
- pserver_program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=new_shape) # change output's ParamOut variable - opt_op.outputs["ParamOut"] = new_inputs["Param"] - program.global_block().append_op( + outputs = self._get_output_map_from_op(program.global_block(), opt_op) + outputs["ParamOut"] = new_inputs["Param"] + optimize_block.append_op( type=opt_op.type, inputs=new_inputs, - outputs=opt_op.outputs, + outputs=outputs, attrs=opt_op.attrs) - def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op): + def _append_pserver_non_opt_ops(self, optimize_block, opt_op): + program = optimize_block.program # Append the ops for parameters that do not need to be optimized/updated - for _, var in opt_op.inputs.iteritems(): - program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=var.shape) - pserver_program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=var.shape) - program.global_block().append_op( + inputs = self._get_input_map_from_op(self.program.global_block().vars, + opt_op) + for var in inputs.itervalues(): + if type(var) == list: + varlist = var + else: + varlist = [var] + for var in varlist: + if not program.global_block().vars.has_key(var.name): + program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + + outputs = self._get_output_map_from_op(self.program.global_block().vars, + opt_op) + + optimize_block.append_op( type=opt_op.type, - inputs=opt_op.inputs, - outputs=opt_op.outputs, + inputs=inputs, + outputs=outputs, attrs=opt_op.attrs) def get_pserver_program(self, endpoint): @@ -465,26 +477,25 @@ class DistributeTranspiler: dtype=v.dtype, shape=v.shape) # step6 - optimize_sub_program = Program() + optimize_block = pserver_program.create_block(0) # Iterate through the ops and append ops as needed for idx, opt_op in enumerate(self.optimize_ops): is_op_on_pserver = self._is_op_on_pserver(endpoint, self.optimize_ops, idx) if not is_op_on_pserver: continue - if opt_op.inputs.has_key("Grad"): - self._append_pserver_ops(optimize_sub_program, pserver_program, - opt_op, endpoint) + if "Grad" in opt_op.desc.input_arg_names(): + self._append_pserver_ops(optimize_block, opt_op, endpoint) else: - self._append_pserver_non_opt_ops(optimize_sub_program, - pserver_program, opt_op) + self._append_pserver_non_opt_ops(optimize_block, opt_op) + # Append the listen_and_serv op pserver_program.global_block().append_op( type="listen_and_serv", inputs={}, outputs={}, attrs={ - "OptimizeBlock": optimize_sub_program.global_block(), + "OptimizeBlock": optimize_block, "endpoint": endpoint, "ParamList": [ p.name @@ -499,6 +510,30 @@ class DistributeTranspiler: pserver_program.sync_with_cpp() return pserver_program + def _get_input_map_from_op(self, varmap, op): + iomap = dict() + for key in op.input_names: + vars = [] + for varname in op.input(key): + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + + def _get_output_map_from_op(self, varmap, op): + iomap = dict() + for key in op.output_names: + vars = [] + for varname in op.output(key): + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + def get_startup_program(self, endpoint, pserver_program): """ Get startup program for current parameter server. 
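(Editor's aside, not part of the patch.) The _get_input_map_from_op / _get_output_map_from_op helpers added just above rebuild a {slot name -> Variable} dictionary from an operator's argument names, collapsing a single-argument slot to the bare Variable and keeping duplicable slots as lists. A minimal standalone sketch of that behaviour; the FakeOp stand-in and the variable names are illustrative assumptions, not real Fluid objects:

# Toy stand-ins only; the real helper receives a framework Operator and Variable objects.
class FakeOp(object):
    def __init__(self, inputs):
        self._inputs = inputs              # e.g. {"X": ["x0", "x1"], "Y": ["y0"]}
        self.input_names = list(inputs)    # slot names, like op.input_names

    def input(self, key):
        return self._inputs[key]           # argument (variable) names bound to a slot


def get_input_map_from_op(varmap, op):
    # Same shape of logic as the helper above: one argument -> the Variable itself,
    # several arguments (a duplicable slot) -> a list of Variables.
    iomap = dict()
    for key in op.input_names:
        vars = [varmap[name] for name in op.input(key)]
        iomap[key] = vars[0] if len(vars) == 1 else vars
    return iomap


varmap = {"x0": "Variable(x0)", "x1": "Variable(x1)", "y0": "Variable(y0)"}
op = FakeOp({"X": ["x0", "x1"], "Y": ["y0"]})
print(get_input_map_from_op(varmap, op))
# "X" maps to a two-element list, "Y" maps to the single value "Variable(y0)"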
@@ -529,17 +564,21 @@ class DistributeTranspiler: # 2. rename op outputs for op in orig_s_prog.global_block().ops: + new_inputs = dict() new_outputs = dict() # do not append startup op if var is not on this pserver op_on_pserver = False - for key, var in op.outputs.iteritems(): - newname, _ = _get_splited_name_and_shape(var.name) + for key in op.output_names: + newname, _ = _get_splited_name_and_shape(op.output(key)[0]) if newname: op_on_pserver = True new_outputs[key] = created_var_map[newname] - elif var.name in pserver_vars: + elif op.output(key)[0] in pserver_vars: op_on_pserver = True - new_outputs[key] = pserver_vars[var.name] + new_outputs[key] = pserver_vars[op.output(key)[0]] + + # most startup program ops have no inputs + new_inputs = self._get_input_map_from_op(pserver_vars, op) if op_on_pserver: if op.type in [ @@ -548,7 +587,7 @@ class DistributeTranspiler: op.attrs["shape"] = new_outputs["Out"].shape s_prog.global_block().append_op( type=op.type, - inputs=op.inputs, + inputs=new_inputs, outputs=new_outputs, attrs=op.attrs) return s_prog diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index 0eddcc3a5ab6f71aa5500c3b98b63c0937c7ddfc..01cbdb3ec487d6e2e60890619131de0067d40db9 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -47,27 +47,13 @@ def as_numpy(tensor): return [as_numpy(t) for t in tensor] assert isinstance(tensor, core.LoDTensor) lod = tensor.lod() - tensor_data = np.array(tensor) - if len(lod) == 0: - ans = tensor_data - else: - raise RuntimeError("LoD Calculate lacks unit tests and buggy") - # elif len(lod) == 1: - # ans = [] - # idx = 0 - # while idx < len(lod) - 1: - # ans.append(tensor_data[lod[idx]:lod[idx + 1]]) - # idx += 1 - # else: - # for l in reversed(lod): - # ans = [] - # idx = 0 - # while idx < len(l) - 1: - # ans.append(tensor_data[l[idx]:l[idx + 1]]) - # idx += 1 - # tensor_data = ans - # ans = tensor_data - return ans + if len(lod) > 0: + raise RuntimeError( + "Some of your fetched tensors hold LoD information. \ + They cannot be completely cast to Python ndarray. 
\ + Please set the parameter 'return_numpy' as 'False' to \ + return LoDTensor itself directly.") + return np.array(tensor) def has_feed_operators(block, feed_targets, feed_holder_name): @@ -306,7 +292,6 @@ class Executor(object): core.get_fetch_variable(scope, fetch_var_name, i) for i in xrange(len(fetch_list)) ] - if return_numpy: outs = as_numpy(outs) return outs diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index a12427258e9d3142abcb84249a10dabd8e96b792..a517db68c5886fbcbe19e6981aee5bf3971352e4 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -740,6 +740,9 @@ class Block(object): raise e self.desc.remove_op(start, end + 1) + def slice_ops(self, start, end): + return list(self.ops)[start:end] + def prepend_op(self, *args, **kwargs): op_desc = self.desc.prepend_op() op = Operator(self, op_desc, *args, **kwargs) diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py index b9c0d12ad6cf09e66df6b1a8da09df275c79a3f6..8c70fd90eff6d40c77ea4b74f5b04d6ab62401be 100644 --- a/python/paddle/v2/fluid/initializer.py +++ b/python/paddle/v2/fluid/initializer.py @@ -14,14 +14,37 @@ import framework import numpy as np +import contextlib __all__ = [ - 'Constant', - 'Uniform', - 'Normal', - 'Xavier', + 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu', + 'init_on_cpu' ] +_force_init_on_cpu_ = False + + +def force_init_on_cpu(): + return _force_init_on_cpu_ + + +@contextlib.contextmanager +def init_on_cpu(): + """ + Switch program with `with` statement + + Examples: + >>> with init_on_cpu(): + >>> step = layers.create_global_var() + + """ + global _force_init_on_cpu_ + + pre_state = force_init_on_cpu() + _force_init_on_cpu_ = True + yield + _force_init_on_cpu_ = pre_state + class Initializer(object): """Base class for variable initializers @@ -80,7 +103,7 @@ class ConstantInitializer(Initializer): """Implements the constant initializer """ - def __init__(self, value=0.0): + def __init__(self, value=0.0, force_cpu=False): """Constructor for ConstantInitializer Args: @@ -89,6 +112,7 @@ class ConstantInitializer(Initializer): assert value is not None super(ConstantInitializer, self).__init__() self._value = value + self._force_cpu = force_cpu def __call__(self, var, block): """Add constant initialization ops for a variable @@ -110,7 +134,8 @@ class ConstantInitializer(Initializer): attrs={ "shape": var.shape, "dtype": int(var.dtype), - "value": self._value + "value": float(self._value), + 'force_cpu': self._force_cpu or force_init_on_cpu() }) var.op = op return op diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py index 613dc20b6ea5533d126a73b7ec47796b3f812db5..0f43e46082a8988be4805a2b750227312ba80ff3 100644 --- a/python/paddle/v2/fluid/io.py +++ b/python/paddle/v2/fluid/io.py @@ -342,7 +342,11 @@ def save_inference_model(dirname, prepend_feed_ops(inference_program, feeded_var_names) append_fetch_ops(inference_program, fetch_var_names) - model_file_name = dirname + "/__model__" + if save_file_name == None: + model_file_name = dirname + "/__model__" + else: + model_file_name = dirname + "/__model_combined__" + with open(model_file_name, "wb") as f: f.write(inference_program.desc.serialize_to_string()) @@ -384,7 +388,11 @@ def load_inference_model(dirname, executor, load_file_name=None): if not os.path.isdir(dirname): raise ValueError("There is no directory named '%s'", dirname) - model_file_name = dirname + "/__model__" + if load_file_name == None: + 
model_file_name = dirname + "/__model__" + else: + model_file_name = dirname + "/__model_combined__" + with open(model_file_name, "rb") as f: program_desc_str = f.read() diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py index 79a130a3eb148e6c5a8fa3cdf174780b354c23c9..9b5f22759cf5ccba841919ec158ff58e6505dcf7 100644 --- a/python/paddle/v2/fluid/layers/math_op_patch.py +++ b/python/paddle/v2/fluid/layers/math_op_patch.py @@ -14,6 +14,7 @@ from ..framework import Variable, unique_name from layer_function_generator import OpProtoHolder +from ..initializer import force_init_on_cpu __all__ = ['monkey_patch_variable'] @@ -36,9 +37,12 @@ def monkey_patch_variable(): block.append_op( type="fill_constant", outputs={'Out': [var]}, - attrs={'dtype': var.dtype, - 'shape': shape, - 'value': value}) + attrs={ + 'dtype': var.dtype, + 'shape': shape, + 'value': value, + 'force_cpu': force_init_on_cpu() + }) return var def create_scalar(block, value, dtype): diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 99168ecc228045a0206aff1b7de5fc17c1438fe2..5ebd329fc0285a39111a23b3c58c80944cfe23f6 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -65,6 +65,7 @@ __all__ = [ 'beam_search', 'row_conv', 'multiplex', + 'layer_norm', ] @@ -184,7 +185,7 @@ def fc(input, helper.append_op( type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) # add bias - pre_activation = helper.append_bias_op(pre_bias) + pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) # add activation return helper.append_activation(pre_activation) @@ -641,8 +642,8 @@ def dynamic_gru(input, Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". Returns: - Variable: The hidden state of GRU. The shape is (T \\times D), and lod \ - is the same with the input. + Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \ + and lod is the same with the input. Examples: .. code-block:: python @@ -990,7 +991,7 @@ def square_error_cost(input, label, **kwargs): label(Variable): Label tensor, has target labels. Returns: - Variable: The tensor variable storing the element-wise squared error + Variable: The tensor variable storing the element-wise squared error \ difference of input and label. Examples: @@ -1214,7 +1215,7 @@ def conv2d(input, act(str): Activation type. Default: None Returns: - Variable: The tensor variable storing the convolution and + Variable: The tensor variable storing the convolution and \ non-linearity activation result. Raises: @@ -1565,6 +1566,102 @@ def batch_norm(input, return helper.append_activation(batch_norm_out) +def layer_norm(input, + scale=True, + shift=True, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None, + name=None): + """ + **Layer Normalization** + + Assume feature vectors exist on dimensions + :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics + along these dimensions for each feature vector :math:`a` with size + :math:`H`, then normalize each feature vector using the corresponding + statistics. After that, apply learnable gain and bias on the normalized + tensor to scale and shift if :attr:`scale` and :attr:`shift` are set. + + Refer to `Layer Normalization `_ + + The formula is as follows: + + .. 
math:: + + \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i + + \\sigma & = \\sqrt{\\frac{1}{H}\\sum_{i=1}^{H}(a_i - \\mu)^2} + + h & = f(\\frac{g}{\\sigma}(a - \\mu) + b) + + Args: + input(Variable): The input tensor variable. + scale(bool): Whether to learn the adaptive gain :math:`g` after + normalization. + shift(bool): Whether to learn the adaptive bias :math:`b` after + normalization. + begin_norm_axis(int): The normalization will be performed along + dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`. + epsilon(float): The small value added to the variance to prevent + division by zero. + param_attr(ParamAttr|None): The parameter attribute for the learnable + gain :math:`g`. + bias_attr(ParamAttr|None): The parameter attribute for the learnable + bias :math:`b`. + act(str): Activation to be applied to the output of layer normalization. + + Returns: + Variable: A tensor variable with the same shape as the input. + + Examples: + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) + """ + helper = LayerHelper('layer_norm', **locals()) + dtype = helper.input_dtype() + + # create input and parameters + inputs = {'X': input} + input_shape = input.shape + param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:])] + if scale: + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0)) + inputs['Scale'] = scale + if shift: + assert bias_attr is not False + bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) + inputs['Bias'] = bias + + # create output + mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) + variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) + layer_norm_out = helper.create_tmp_variable(dtype) + + helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": epsilon, + "begin_norm_axis": begin_norm_axis}) + + return helper.append_activation(layer_norm_out) + + def beam_search_decode(ids, scores, name=None): helper = LayerHelper('beam_search_decode', **locals()) sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py index 38dea2892fc18a9c493878d816a246522e9b9886..bb3f71abbb00793a32b8c288ee3ea255cc9c584d 100644 --- a/python/paddle/v2/fluid/layers/ops.py +++ b/python/paddle/v2/fluid/layers/ops.py @@ -65,6 +65,8 @@ __all__ = [ 'logical_or', 'logical_xor', 'logical_not', + 'uniform_random', + 'cumsum', ] + __activations__ for _OP in set(__all__): diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py index 704e040b9f478ef61991cfbe175f1cdeaf102763..2d4e0ab0cc66861d8bd31e196740688ab8fa6262 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -16,7 +16,7 @@ from ..layer_helper import LayerHelper from ..param_attr import ParamAttr from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable -from ..initializer import Constant +from ..initializer import Constant, force_init_on_cpu from ..core import DataType import numpy @@ -69,12 +69,30 @@ def create_parameter(shape, default_initializer) -def create_global_var(shape, value, dtype, persistable=False, name=None): +def create_global_var(shape,
+ value, + dtype, + persistable=False, + force_cpu=False, + name=None): + """ + Create a global variable. such as global_step + Args: + shape(list[int]): shape of the variable + value(float): the value of the variable + dtype(string): element type of the parameter + persistable(bool): if this variable is persistable + force_cpu(bool): force this variable to be on CPU + + Returns: + Variable: the created Variable + """ helper = LayerHelper("global_var", **locals()) var = helper.create_global_variable( dtype=dtype, shape=shape, persistable=persistable, name=name) helper.set_variable_initializer( - var, initializer=Constant(value=float(value))) + var, initializer=Constant( + value=float(value), force_cpu=force_cpu)) return var @@ -221,6 +239,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): dtype(np.dtype|core.DataType|str): Data type of the output tensor. value(float): The constant value used to initialize the output tensor. out(Variable): The output tensor. + force_cpu(True|False): data should be on CPU if set true. Returns: Variable: The tensor variable storing the output. @@ -242,7 +261,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): 'shape': shape, 'dtype': out.dtype, 'value': float(value), - 'force_cpu': force_cpu + 'force_cpu': force_cpu or force_init_on_cpu() }) out.stop_gradient = True return out diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py index 13dc98075f7d32f9dda56a890b98451ef81af363..2a2a29fd9cbedc138dc82ca75ccd78208fd33195 100644 --- a/python/paddle/v2/fluid/learning_rate_decay.py +++ b/python/paddle/v2/fluid/learning_rate_decay.py @@ -14,6 +14,7 @@ import layers from framework import Variable +from initializer import init_on_cpu __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', @@ -54,11 +55,14 @@ def exponential_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for exponential_decay.") - # update learning_rate - div_res = global_step / decay_steps - if staircase: - div_res = layers.floor(x=div_res) - return learning_rate * (decay_rate**div_res) + with init_on_cpu(): + # update learning_rate + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + decayed_lr = learning_rate * (decay_rate**div_res) + + return decayed_lr def natural_exp_decay(learning_rate, @@ -88,10 +92,13 @@ def natural_exp_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for natural_exp_decay.") - div_res = global_step / decay_steps - if staircase: - div_res = layers.floor(x=div_res) - return learning_rate * layers.exp(x=(-1 * decay_rate * div_res)) + with init_on_cpu(): + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + decayed_lr = learning_rate * layers.exp(x=(-1 * decay_rate * div_res)) + + return decayed_lr def inverse_time_decay(learning_rate, @@ -121,11 +128,14 @@ def inverse_time_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for inverse_time_decay.") - div_res = global_step / decay_steps - if staircase: - div_res = layers.floor(x=div_res) + with init_on_cpu(): + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + + decayed_lr = learning_rate / (1 + decay_rate * div_res) - return learning_rate / (1 + decay_rate * div_res) + return decayed_lr def polynomial_decay(learning_rate, @@ -160,22 
+170,27 @@ def polynomial_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for inverse_time_decay.") - if cycle: - div_res = layers.ceil(x=(global_step / decay_steps)) - zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0) - one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0) - - with layers.Switch() as switch: - with switch.case(layers.equal(x=global_step, y=zero_var)): - layers.assign(input=one_var, output=div_res) - decay_steps = decay_steps * div_res - else: - decay_steps_var = layers.fill_constant( - shape=[1], dtype='float32', value=float(decay_steps)) - global_step = layers.elementwise_min(x=global_step, y=decay_steps_var) - - return (learning_rate - end_learning_rate) * \ - ((1 - global_step / decay_steps) ** power) + end_learning_rate + with init_on_cpu(): + if cycle: + div_res = layers.ceil(x=(global_step / decay_steps)) + zero_var = layers.fill_constant( + shape=[1], dtype='float32', value=0.0) + one_var = layers.fill_constant( + shape=[1], dtype='float32', value=1.0) + + with layers.Switch() as switch: + with switch.case(layers.equal(x=global_step, y=zero_var)): + layers.assign(input=one_var, output=div_res) + decay_steps = decay_steps * div_res + else: + decay_steps_var = layers.fill_constant( + shape=[1], dtype='float32', value=float(decay_steps)) + global_step = layers.elementwise_min( + x=global_step, y=decay_steps_var) + + decayed_lr = (learning_rate - end_learning_rate) * \ + ((1 - global_step / decay_steps) ** power) + end_learning_rate + return decayed_lr def piecewise_decay(global_step, boundaries, values): @@ -200,24 +215,27 @@ def piecewise_decay(global_step, boundaries, values): if not isinstance(global_step, Variable): raise ValueError("global_step is required for piecewise_decay.") - lr = layers.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="learning_rate") - - with layers.Switch() as switch: - for i in range(len(boundaries)): - boundary_val = layers.fill_constant( - shape=[1], dtype='float32', value=float(boundaries[i])) - value_var = layers.fill_constant( - shape=[1], dtype='float32', value=float(values[i])) - with switch.case(layers.less_than(global_step, boundary_val)): - layers.assign(value_var, lr) - last_value_var = layers.fill_constant( - shape=[1], dtype='float32', value=float(values[len(values) - 1])) - with switch.default(): - layers.assign(last_value_var, lr) + with init_on_cpu(): + lr = layers.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") + + with layers.Switch() as switch: + for i in range(len(boundaries)): + boundary_val = layers.fill_constant( + shape=[1], dtype='float32', value=float(boundaries[i])) + value_var = layers.fill_constant( + shape=[1], dtype='float32', value=float(values[i])) + with switch.case(layers.less_than(global_step, boundary_val)): + layers.assign(value_var, lr) + last_value_var = layers.fill_constant( + shape=[1], + dtype='float32', + value=float(values[len(values) - 1])) + with switch.default(): + layers.assign(last_value_var, lr) return lr diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py index 8bb8cf7b1a5ddf44427637229bdc31ac0e151e44..53e0991ee8c318e0c95018b57ad48f404ce8beae 100644 --- a/python/paddle/v2/fluid/memory_optimization_transpiler.py +++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py @@ -92,14 +92,13 @@ class 
ControlFlowGraph(object): live_in = defaultdict(set) live_out = defaultdict(set) while True: - for i in range(self.op_size): + for i in range(self.op_size, 0, -1): live_in[i] = set(self._live_in[i]) live_out[i] = set(self._live_out[i]) - self._live_in[i] = self._uses[i] | ( - self._live_out[i] - self._defs[i]) for s in self._successors[i]: self._live_out[i] |= self._live_in[s] - + self._live_in[i] = self._uses[i] | ( + self._live_out[i] - self._defs[i]) if self._reach_fixed_point(live_in, live_out): break diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py index cb63d43709e23ae04c4d23457bbb79e6f7f0ce3c..be7878f869b509fa1117e305aee662cc0123bbcc 100644 --- a/python/paddle/v2/fluid/nets.py +++ b/python/paddle/v2/fluid/nets.py @@ -194,7 +194,7 @@ def scaled_dot_product_attention(queries, Returns: - Variable: A 3-D Tensor computed by multi-head scaled dot product + Variable: A 3-D Tensor computed by multi-head scaled dot product \ attention. Raises: @@ -333,6 +333,7 @@ def scaled_dot_product_attention(queries, x=product, shape=[-1, product.shape[-1]], act="softmax"), shape=product.shape) if dropout_rate: - weights = layers.dropout(x, dropout_prob=dropout_rate, is_test=False) + weights = layers.dropout( + weights, dropout_prob=dropout_rate, is_test=False) ctx_multiheads = layers.matmul(weights, v) return __combine_heads(ctx_multiheads) diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 7844a4e2df1ce3989e48082f6472292560fbf1ee..f8a00e3a5fb4038a97a951a01c3a2f1a4488ae75 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -190,6 +190,8 @@ class Optimizer(object): # Create any accumulators program = loss.block.program with program_guard(program, startup_program): + global_block = framework.default_main_program().global_block() + start = len(global_block.ops) self.helper = LayerHelper(self.__class__.__name__) self._create_accumulators(loss.block, [p[0] for p in parameters_and_grads]) @@ -203,19 +205,14 @@ class Optimizer(object): param_and_grad) optimize_ops.append(optimize_op) - # Returned list of ops can include more ops in addition - # to optimization ops - return_ops = optimize_ops - # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies - finish_ops = self._finish_update(loss.block) - if finish_ops is not None: - return_ops += finish_ops + self._finish_update(loss.block) if self._global_step is not None: - return_ops.append(self._increment_global_step(loss.block)) - return return_ops + self._increment_global_step(loss.block) + end = len(global_block.ops) + return global_block.slice_ops(start, end) def minimize(self, loss, diff --git a/python/paddle/v2/fluid/tests/book/.gitignore b/python/paddle/v2/fluid/tests/book/.gitignore index f0b574b9396706a1d68393482296360362dca750..dd28d354f4160b4be68b46a7bebcdf2097d5811a 100644 --- a/python/paddle/v2/fluid/tests/book/.gitignore +++ b/python/paddle/v2/fluid/tests/book/.gitignore @@ -1 +1 @@ -recognize_digits_*.inference.model +*.inference.model diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py index 06860a2a465c6f8590336670372eb6ff43b10594..b3332b4810be3c95a5a48ebde92af89cafd94326 100644 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py @@ -15,15 +15,13 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid import contextlib +import numpy import 
unittest import math import sys -def main(use_cuda): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return - +def train(use_cuda, save_dirname): x = fluid.layers.data(name='x', shape=[13], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) @@ -51,14 +49,15 @@ def main(use_cuda): PASS_NUM = 100 for pass_id in range(PASS_NUM): - fluid.io.save_persistables(exe, "./fit_a_line.model/") - fluid.io.load_persistables(exe, "./fit_a_line.model/") for data in train_reader(): avg_loss_value, = exe.run(fluid.default_main_program(), feed=feeder.feed(data), fetch_list=[avg_cost]) print(avg_loss_value) if avg_loss_value[0] < 10.0: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, ['x'], + [y_predict], exe) return if math.isnan(float(avg_loss_value)): sys.exit("got NaN loss, training failed.") @@ -66,6 +65,43 @@ def main(use_cuda): avg_loss_value[0])) +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # The input's dimension should be 2-D and the second dim is 13 + # The input data should be >= 0 + batch_size = 10 + tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32") + assert feed_target_names[0] == 'x' + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_x}, + fetch_list=fetch_targets) + print("infer shape: ", results[0].shape) + print("infer results: ", results[0]) + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the trained model + save_dirname = "fit_a_line.inference.model" + + train(use_cuda, save_dirname) + infer(use_cuda, save_dirname) + + class TestFitALine(unittest.TestCase): def test_cpu(self): with self.program_scope_guard(): diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 1491f7a8d5496445f8300d3db1d367bb3167d2c7..f33e81186bd8ad74b4d569383bce34afb2c40c24 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -18,6 +18,7 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 import paddle.v2.fluid as fluid +from paddle.v2.fluid.initializer import init_on_cpu import contextlib import time import unittest @@ -167,7 +168,16 @@ def train(use_cuda, save_dirname=None): # TODO(qiao) # check other optimizers and check why out will be NAN - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) + global_step = fluid.layers.create_global_var( + shape=[1], value=0, dtype='float32', force_cpu=True, persistable=True) + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.learning_rate_decay.exponential_decay( + learning_rate=0.0001, + global_step=global_step, + decay_steps=100000, + decay_rate=0.5, + staircase=True), + global_step=global_step) sgd_optimizer.minimize(avg_cost) # TODO(qiao) diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py 
b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py index d8f0ad89cd89215ac83a133bd27a53c4b904363f..244c1749cd522faec26f8cf8e71f7469843f534e 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py @@ -78,7 +78,7 @@ def conv_net(img, label): return loss_net(conv_pool_2, label) -def train(nn_type, use_cuda, parallel, save_dirname): +def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename): if use_cuda and not fluid.core.is_compiled_with_cuda(): return img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') @@ -143,8 +143,10 @@ def train(nn_type, use_cuda, parallel, save_dirname): avg_loss_val = numpy.array(avg_loss_set).mean() if float(acc_val) > 0.85: # test acc > 85% if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, ["img"], - [prediction], exe) + fluid.io.save_inference_model( + save_dirname, ["img"], [prediction], + exe, + save_file_name=save_param_filename) return else: print( @@ -156,7 +158,7 @@ def train(nn_type, use_cuda, parallel, save_dirname): raise AssertionError("Loss of recognize digits is too large") -def infer(use_cuda, save_dirname=None): +def infer(use_cuda, save_dirname=None, param_filename=None): if save_dirname is None: return @@ -167,13 +169,14 @@ def infer(use_cuda, save_dirname=None): # the feed_target_names (the names of variables that will be feeded # data using feed operators), and the fetch_targets (variables that # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + [inference_program, feed_target_names, fetch_targets + ] = fluid.io.load_inference_model(save_dirname, exe, param_filename) # The input's dimension of conv should be 4-D or 5-D. # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0]. + batch_size = 1 tensor_img = numpy.random.uniform(-1.0, 1.0, - [1, 1, 28, 28]).astype("float32") + [batch_size, 1, 28, 28]).astype("float32") # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. 
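(Editor's aside, not part of the patch.) This test now exercises two persistence modes: parameters saved as separate files (the default) and parameters combined into a single file via save_file_name. A minimal sketch of both modes using the APIs changed above; the tiny network, the directory name "digits.model" and the random input are illustrative assumptions:

import numpy
import paddle.v2.fluid as fluid

# A tiny stand-in network, just so the save/load calls have something to persist.
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
prediction = fluid.layers.fc(input=img, size=10, act='softmax')

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Default mode: one file per parameter, program desc written to "__model__".
fluid.io.save_inference_model("digits.model", ["img"], [prediction], exe)

# Combined mode: a single parameter file; per the io.py change above, the
# program desc then goes to "__model_combined__".
fluid.io.save_inference_model(
    "digits.model", ["img"], [prediction], exe,
    save_file_name="__params_combined__")

# Loading must name the same combined file; omit the argument for the default mode.
[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    "digits.model", exe, "__params_combined__")

tensor_img = numpy.random.uniform(-1.0, 1.0, [1, 1, 28, 28]).astype("float32")
results = exe.run(program,
                  feed={feed_names[0]: tensor_img},
                  fetch_list=fetch_targets)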
@@ -183,36 +186,45 @@ def infer(use_cuda, save_dirname=None): print("infer results: ", results[0]) -def main(use_cuda, parallel, nn_type): +def main(use_cuda, parallel, nn_type, combine): if not use_cuda and not parallel: save_dirname = "recognize_digits_" + nn_type + ".inference.model" + save_filename = None + if combine == True: + save_filename = "__params_combined__" else: save_dirname = None + save_filename = None train( nn_type=nn_type, use_cuda=use_cuda, parallel=parallel, - save_dirname=save_dirname) - infer(use_cuda=use_cuda, save_dirname=save_dirname) + save_dirname=save_dirname, + save_param_filename=save_filename) + infer( + use_cuda=use_cuda, + save_dirname=save_dirname, + param_filename=save_filename) class TestRecognizeDigits(unittest.TestCase): pass -def inject_test_method(use_cuda, parallel, nn_type): +def inject_test_method(use_cuda, parallel, nn_type, combine): def __impl__(self): prog = fluid.Program() startup_prog = fluid.Program() scope = fluid.core.Scope() with fluid.scope_guard(scope): with fluid.program_guard(prog, startup_prog): - main(use_cuda, parallel, nn_type) + main(use_cuda, parallel, nn_type, combine) - fn = 'test_{0}_{1}_{2}'.format(nn_type, 'cuda' - if use_cuda else 'cpu', 'parallel' - if parallel else 'normal') + fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda' + if use_cuda else 'cpu', 'parallel' + if parallel else 'normal', 'combine' + if combine else 'separate') setattr(TestRecognizeDigits, fn, __impl__) @@ -221,7 +233,10 @@ def inject_all_tests(): for use_cuda in (False, True): for parallel in (False, True): for nn_type in ('mlp', 'conv'): - inject_test_method(use_cuda, parallel, nn_type) + inject_test_method(use_cuda, parallel, nn_type, True) + + # One unit-test for saving parameters as separate files + inject_test_method(False, False, 'mlp', False) inject_all_tests() diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py index 9c7ab7d6318472ac9378dd1966b75d19b5505bf5..612d51e08e4fc05b397df9d8aaaf675ba9d783af 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py @@ -16,7 +16,7 @@ import math import sys import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core +import paddle.v2.fluid as fluid import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets @@ -104,7 +104,8 @@ def get_mov_combined_features(): CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) - category_id = layers.data(name='category_id', shape=[1], dtype='int64') + category_id = layers.data( + name='category_id', shape=[1], dtype='int64', lod_level=1) mov_categories_emb = layers.embedding( input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) @@ -114,7 +115,8 @@ def get_mov_combined_features(): MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) - mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64') + mov_title_id = layers.data( + name='movie_title', shape=[1], dtype='int64', lod_level=1) mov_title_emb = layers.embedding( input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) @@ -144,23 +146,22 @@ def model(): scale_infer = layers.scale(x=inference, scale=5.0) label = layers.data(name='score', shape=[1], dtype='float32') - square_cost = layers.square_error_cost(input=scale_infer, label=label) - avg_cost = layers.mean(x=square_cost) - return 
avg_cost + return scale_infer, avg_cost + +def train(use_cuda, save_dirname): + scale_infer, avg_cost = model() + + # test program + test_program = fluid.default_main_program().clone() -def main(): - cost = model() sgd_optimizer = SGDOptimizer(learning_rate=0.2) - opts = sgd_optimizer.minimize(cost) + opts = sgd_optimizer.minimize(avg_cost) - if USE_GPU: - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = Executor(place) exe.run(framework.default_startup_program()) @@ -169,6 +170,8 @@ def main(): paddle.reader.shuffle( paddle.dataset.movielens.train(), buf_size=8192), batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.movielens.test(), batch_size=BATCH_SIZE) feeding = { 'user_id': 0, @@ -184,7 +187,7 @@ def main(): def func_feed(feeding, data): feed_tensors = {} for (key, idx) in feeding.iteritems(): - tensor = core.LoDTensor() + tensor = fluid.LoDTensor() if key != "category_id" and key != "movie_title": if key == "score": numpy_data = np.array(map(lambda x: x[idx], data)).astype( @@ -211,16 +214,117 @@ def main(): PASS_NUM = 100 for pass_id in range(PASS_NUM): - for data in train_reader(): - outs = exe.run(framework.default_main_program(), + for batch_id, data in enumerate(train_reader()): + # train a mini-batch + outs = exe.run(program=fluid.default_main_program(), feed=func_feed(feeding, data), - fetch_list=[cost]) + fetch_list=[avg_cost]) out = np.array(outs[0]) - if out[0] < 6.0: - # if avg cost less than 6.0, we think our code is good. - exit(0) + if (batch_id + 1) % 10 == 0: + avg_cost_set = [] + for test_data in test_reader(): + avg_cost_np = exe.run(program=test_program, + feed=func_feed(feeding, test_data), + fetch_list=[avg_cost]) + avg_cost_set.append(avg_cost_np[0]) + break # test only 1 segment for speeding up CI + + # get test avg_cost + test_avg_cost = np.array(avg_cost_set).mean() + if test_avg_cost < 6.0: + # if avg_cost less than 6.0, we think our code is good. + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, [ + "user_id", "gender_id", "age_id", "job_id", + "movie_id", "category_id", "movie_title" + ], [scale_infer], exe) + return + if math.isnan(float(out[0])): sys.exit("got NaN loss, training failed.") -main() +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). 
+ [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + def create_lod_tensor(data, lod=None): + tensor = fluid.LoDTensor() + if lod is None: + # Tensor, the shape is [batch_size, 1] + index = 0 + lod_0 = [index] + for l in range(len(data)): + index += 1 + lod_0.append(index) + lod = [lod_0] + tensor.set_lod(lod) + + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + tensor.set(flattened_data, place) + return tensor + + # Use the first data from paddle.dataset.movielens.test() as input + assert feed_target_names[0] == "user_id" + user_id = create_lod_tensor([[1]]) + + assert feed_target_names[1] == "gender_id" + gender_id = create_lod_tensor([[1]]) + + assert feed_target_names[2] == "age_id" + age_id = create_lod_tensor([[0]]) + + assert feed_target_names[3] == "job_id" + job_id = create_lod_tensor([[10]]) + + assert feed_target_names[4] == "movie_id" + movie_id = create_lod_tensor([[783]]) + + assert feed_target_names[5] == "category_id" + category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]]) + + assert feed_target_names[6] == "movie_title" + movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]], + [[0, 5]]) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={ + feed_target_names[0]: user_id, + feed_target_names[1]: gender_id, + feed_target_names[2]: age_id, + feed_target_names[3]: job_id, + feed_target_names[4]: movie_id, + feed_target_names[5]: category_id, + feed_target_names[6]: movie_title + }, + fetch_list=fetch_targets, + return_numpy=False) + print("inferred score: ", np.array(results[0])) + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the inference model + save_dirname = "recommender_system.inference.model" + + train(use_cuda, save_dirname) + infer(use_cuda, save_dirname) + + +if __name__ == '__main__': + main(USE_GPU) diff --git a/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py index fdc60861760163d2ebad3b050e551929321baafd..7fe43c680ca9319682c42836986308856185a464 100644 --- a/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py +++ b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py @@ -18,6 +18,10 @@ import paddle.v2.fluid as fluid import paddle.v2.fluid.core as core import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as layers +import contextlib +import math +import sys +import unittest from paddle.v2.fluid.executor import Executor dict_size = 30000 @@ -145,7 +149,7 @@ def seq_to_seq_net(): cost = fluid.layers.cross_entropy(input=prediction, label=label) avg_cost = fluid.layers.mean(x=cost) - return avg_cost + return avg_cost, prediction def to_lodtensor(data, place): @@ -163,8 +167,16 @@ def to_lodtensor(data, place): return res -def main(): - avg_cost = seq_to_seq_net() +def create_random_lodtensor(lod, place, low, high): + data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod([lod]) + return res + + +def train(use_cuda, save_dirname=None): + [avg_cost, prediction] = seq_to_seq_net() optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer.minimize(avg_cost) @@ -174,7 +186,7 @@ def 
main(): paddle.dataset.wmt14.train(dict_size), buf_size=1000), batch_size=batch_size) - place = core.CPUPlace() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = Executor(place) exe.run(framework.default_startup_program()) @@ -185,6 +197,7 @@ def main(): word_data = to_lodtensor(map(lambda x: x[0], data), place) trg_word = to_lodtensor(map(lambda x: x[1], data), place) trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) + outs = exe.run(framework.default_main_program(), feed={ 'source_sequence': word_data, @@ -192,13 +205,86 @@ def main(): 'label_sequence': trg_word_next }, fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + " avg_cost=" + str(avg_cost_val)) + if math.isnan(float(avg_cost_val[0])): + sys.exit("got NaN loss, training failed.") if batch_id > 3: - exit(0) + if save_dirname is not None: + fluid.io.save_inference_model( + save_dirname, ['source_sequence', + 'target_sequence'], [prediction], exe) + return + batch_id += 1 +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + lod = [0, 4, 10] + word_data = create_random_lodtensor(lod, place, low=0, high=1) + trg_word = create_random_lodtensor(lod, place, low=0, high=1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. 
+ assert feed_target_names[0] == 'source_sequence' + assert feed_target_names[1] == 'target_sequence' + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word_data, + feed_target_names[1]: trg_word, + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference shape: ", np_data.shape) + print("Inference results: ", np_data) + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the trained model + save_dirname = "rnn_encoder_decoder.inference.model" + + train(use_cuda, save_dirname) + infer(use_cuda, save_dirname) + + +class TestRnnEncoderDecoder(unittest.TestCase): + def test_cuda(self): + with self.scope_prog_guard(): + main(use_cuda=True) + + def test_cpu(self): + with self.scope_prog_guard(): + main(use_cuda=False) + + @contextlib.contextmanager + def scope_prog_guard(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + if __name__ == '__main__': - main() + unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py index 9c5cb667aed7456b54d32dcd650852cfdbd6cce1..6e0206d41db6265e926991fe35cd53513bd3417d 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py @@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid import paddle.v2 as paddle import contextlib import math +import numpy as np import sys @@ -43,7 +44,7 @@ def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) adam_optimizer.minimize(avg_cost) accuracy = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, accuracy + return avg_cost, accuracy, prediction def stacked_lstm_net(data, @@ -81,13 +82,18 @@ def stacked_lstm_net(data, adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) adam_optimizer.minimize(avg_cost) accuracy = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, accuracy + return avg_cost, accuracy, prediction -def main(word_dict, net_method, use_cuda): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return +def create_random_lodtensor(lod, place, low, high): + data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod([lod]) + return res + +def train(word_dict, net_method, use_cuda, save_dirname=None): BATCH_SIZE = 128 PASS_NUM = 5 dict_dim = len(word_dict) @@ -96,7 +102,7 @@ def main(word_dict, net_method, use_cuda): data = fluid.layers.data( name="words", shape=[1], dtype="int64", lod_level=1) label = fluid.layers.data(name="label", shape=[1], dtype="int64") - cost, acc_out = net_method( + cost, acc_out, prediction = net_method( data, label, input_dim=dict_dim, class_dim=class_dim) train_data = paddle.batch( @@ -116,6 +122,9 @@ def main(word_dict, net_method, use_cuda): fetch_list=[cost, acc_out]) print("cost=" + str(cost_val) + " acc=" + str(acc_val)) if cost_val < 0.4 and acc_val > 0.8: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, ["words"], + prediction, exe) return if math.isnan(float(cost_val)): sys.exit("got NaN loss, training failed.") @@ -123,6 +132,49 @@ def main(word_dict, net_method, use_cuda): 
net_method.__name__)) +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + lod = [0, 4, 10] + word_dict = paddle.dataset.imdb.word_dict() + tensor_words = create_random_lodtensor( + lod, place, low=0, high=len(word_dict) - 1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + assert feed_target_names[0] == "words" + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_words}, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) + print("Inference results: ", np_data) + + +def main(word_dict, net_method, use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the trained model + save_dirname = "understand_sentiment.inference.model" + + train(word_dict, net_method, use_cuda, save_dirname) + infer(use_cuda, save_dirname) + + class TestUnderstandSentiment(unittest.TestCase): @classmethod def setUpClass(cls): diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index f013d7f1551bdbfb2f725809e2fb4d7d686560fe..69bfbcee69a08f57e4754f1a94f85534be4baac6 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -1,6 +1,5 @@ # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -16,14 +15,67 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid import unittest import os +import numpy as np import math import sys -def main(use_cuda, is_sparse, parallel): - if use_cuda and not fluid.core.is_compiled_with_cuda(): +def create_random_lodtensor(lod, place, low, high): + data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod([lod]) + return res + + +def infer(use_cuda, save_dirname=None): + if save_dirname is None: return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). 
+ [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) - 1 + + # Setup input, by creating 4 words, and setting up lod required for + # lookup_table_op + lod = [0, 1] + first_word = create_random_lodtensor(lod, place, low=0, high=dict_size) + second_word = create_random_lodtensor(lod, place, low=0, high=dict_size) + third_word = create_random_lodtensor(lod, place, low=0, high=dict_size) + fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size) + + assert feed_target_names[0] == 'firstw' + assert feed_target_names[1] == 'secondw' + assert feed_target_names[2] == 'thirdw' + assert feed_target_names[3] == 'forthw' + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={ + feed_target_names[0]: first_word, + feed_target_names[1]: second_word, + feed_target_names[2]: third_word, + feed_target_names[3]: fourth_word + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) + print("Inference results: ", np_data) + + +def train(use_cuda, is_sparse, parallel, save_dirname): PASS_NUM = 100 EMBED_SIZE = 32 HIDDEN_SIZE = 256 @@ -67,7 +119,7 @@ def main(use_cuda, is_sparse, parallel): act='softmax') cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) avg_cost = fluid.layers.mean(x=cost) - return avg_cost + return avg_cost, predict_word word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) @@ -79,13 +131,13 @@ def main(use_cuda, is_sparse, parallel): next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') if not parallel: - avg_cost = __network__( + avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: places = fluid.layers.get_places() pd = fluid.layers.ParallelDo(places) with pd.do(): - avg_cost = __network__( + avg_cost, predict_word = __network__( map(pd.read_input, [ first_word, second_word, third_word, forth_word, next_word ])) @@ -113,6 +165,10 @@ def main(use_cuda, is_sparse, parallel): feed=feeder.feed(data), fetch_list=[avg_cost]) if avg_cost_np[0] < 5.0: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, [ + 'firstw', 'secondw', 'thirdw', 'forthw' + ], [predict_word], exe) return if math.isnan(float(avg_cost_np[0])): sys.exit("got NaN loss, training failed.") @@ -120,6 +176,14 @@ def main(use_cuda, is_sparse, parallel): raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0])) +def main(use_cuda, is_sparse, parallel): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + save_dirname = "word2vec.inference.model" + train(use_cuda, is_sparse, parallel, save_dirname) + infer(use_cuda, save_dirname) + + FULL_TEST = os.getenv('FULL_TEST', '0').lower() in ['true', '1', 't', 'y', 'yes', 'on'] SKIP_REASON = "Only run minimum number of tests in CI server, to make CI faster" @@ -142,7 +206,8 @@ def inject_test_method(use_cuda, is_sparse, parallel): with fluid.program_guard(prog, startup_prog): main(use_cuda=use_cuda, is_sparse=is_sparse, parallel=parallel) - if use_cuda and is_sparse and parallel: + # run only 2 cases: use_cuda is either True or False + if is_sparse == False and parallel == False: fn = __impl__ else: # skip the other test when on CI 
server diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index 7ad5e2c594f24999e298533b6c05ba688a935f0b..045db8390cd52689a2a803c3387c90776a44ee73 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -15,6 +15,8 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.fluid as fluid +import math +import sys # need to fix random seed and training data to compare the loss # value accurately calculated by the default and the memory optimization @@ -63,4 +65,6 @@ for pass_id in range(PASS_NUM): if avg_loss_value[0] < 10.0: exit(0) # if avg cost less than 10.0, we think our code is good. + if math.isnan(float(avg_loss_value)): + sys.exit("got NaN loss, training failed.") exit(1) diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index 26673afd83c48328c3f354e82bfa3725aa4805b5..9fbb36d3638bd537020247d6f762afd4ed5d402f 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -18,6 +18,8 @@ import sys import paddle.v2 as paddle import paddle.v2.fluid as fluid +import math +import sys # need to fix random seed and training data to compare the loss # value accurately calculated by the default and the memory optimization @@ -152,7 +154,10 @@ for pass_id in range(PASS_NUM): print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( pass_acc)) # this model is slow, so if we can train two mini batch, we think it works properly. + if i > 2: exit(0) + if math.isnan(float(loss)): + sys.exit("got NaN loss, training failed.") i += 1 exit(1) diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py index ffd53e7a78142162317a677de49c1821635a65b5..48abaa8d87563b7132c5d8962bc33283a104e67a 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py +++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -19,6 +19,8 @@ import paddle.v2.fluid.core as core import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor +import math +import sys dict_size = 30000 source_dict_dim = target_dict_dim = dict_size @@ -137,6 +139,8 @@ def main(): " avg_cost=" + str(avg_cost_val)) if batch_id > 2: exit(0) + if math.isnan(float(avg_cost_val)): + sys.exit("got NaN loss, training failed.") batch_id += 1 diff --git a/python/paddle/v2/fluid/tests/notest_csp.py b/python/paddle/v2/fluid/tests/notest_csp.py new file mode 100644 index 0000000000000000000000000000000000000000..7fe234a20b5222eb85e6bcea2fcb05c53ddd57e9 --- /dev/null +++ b/python/paddle/v2/fluid/tests/notest_csp.py @@ -0,0 +1,37 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle.v2.fluid as fluid + + +class TestCSPFramework(unittest.TestCase): + def daisy_chain(self): + n = 10000 + leftmost = fluid.make_channel(dtype=int) + right = leftmost + left = leftmost + with fluid.While(steps=n): + right = fluid.make_channel(dtype=int) + with fluid.go(): + fluid.send(left, 1 + fluid.recv(right)) + left = right + + with fluid.go(): + fluid.send(right, 1) + fluid.Print(fluid.recv(leftmost)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index 3f6d7070c2987d0557c60db84a2c679cd2cfe36b..f8475813c0cb7b8c8da8c9c7e0a4626615e609ac 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -326,7 +326,8 @@ class OpTest(unittest.TestCase): self.assertTrue( np.allclose( actual_t, expect_t, atol=atol), - "Output (" + out_name + ") has diff at " + str(place)) + "Output (" + out_name + ") has diff at " + str(place) + + str(actual_t) + str(expect_t)) if isinstance(expect, tuple): self.assertListEqual(actual.lod(), expect[1], "Output (" + out_name + diff --git a/python/paddle/v2/fluid/tests/test_cpp_reader.py b/python/paddle/v2/fluid/tests/test_cpp_reader.py index e71c3a290c9b120749a5190a246c5d76b7bf1955..970f57ed0008b0d7d99ad8b5de1cb7895239ed2c 100644 --- a/python/paddle/v2/fluid/tests/test_cpp_reader.py +++ b/python/paddle/v2/fluid/tests/test_cpp_reader.py @@ -32,31 +32,43 @@ create_random_data_generator_op = block.append_op( "min": 0.0, "max": 1.0 }) +shuffle_reader = block.create_var( + type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader") +shuffle_reader.desc.set_lod_levels([0, 0]) -out1 = block.create_var( - type=fluid.core.VarDesc.VarType.LOD_TENSOR, - name="Out1", - shape=[10, 2], - dtype="float32", - lod_level=1) -out2 = block.create_var( - type=fluid.core.VarDesc.VarType.LOD_TENSOR, - name="Out2", - shape=[10, 1], - dtype="float32", - lod_level=1) +create_shuffle_reader_op = block.append_op( + type="create_shuffle_reader", + inputs={"UnderlyingReader": random_reader}, + outputs={"Out": shuffle_reader}, + attrs={"buffer_size": 7}) + +batch_reader = block.create_var( + type=fluid.core.VarDesc.VarType.READER, name="BatchReader") +batch_reader.desc.set_lod_levels([1, 1]) + +create_batch_reader_op = block.append_op( + type="create_batch_reader", + inputs={"UnderlyingReader": shuffle_reader}, + outputs={"Out": batch_reader}, + attrs={"batch_size": 10}) + +out1 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1") +out2 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2") read_op = block.append_op( - type="read", - inputs={"Reader": random_reader}, + type="read", inputs={"Reader": batch_reader}, outputs={"Out": [out1, out2]}) place = fluid.CPUPlace() exe = fluid.Executor(place) -[res1, res2] = exe.run(prog, fetch_list=[out1, out2]) +[res1, res2] = exe.run(prog, fetch_list=[out1, out2], return_numpy=False) + +test_pass = res1.lod() == [range(0, 11)] and res1.lod() == [ + range(0, 11) +] and np.array(res1).shape == (10, 2) and np.array(res2).shape == (10, 1) 
-if len(res1) == 0 or len(res2) == 0: +if not test_pass: exit(1) exit(0) diff --git a/python/paddle/v2/fluid/tests/test_cumsum_op.py b/python/paddle/v2/fluid/tests/test_cumsum_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e45ef457306bbdd33508e560cb8d339e801bb5e4 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_cumsum_op.py @@ -0,0 +1,127 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestSumOp1(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.attrs = {'axis': 2} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=2)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp2(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.attrs = {'axis': -1, 'reverse': True} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.outputs = { + 'Out': np.flip( + np.flip( + self.inputs['X'], axis=2).cumsum(axis=2), axis=2) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp3(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.attrs = {'axis': 1} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp4(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.attrs = {'axis': 0} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp5(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.inputs = {'X': np.random.random((5, 6)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp7(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.inputs = {'X': np.random.random((6)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp8(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.attrs = {'axis': 2, "exclusive": True} + a = np.random.random((5, 6, 3)).astype("float64") + self.inputs = {'X': a} + self.outputs = { + 'Out': np.concatenate( + (np.zeros( + (5, 6, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)), + axis=2) + } + + def test_check_output(self): + self.check_output() + + def 
test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py index 68cf8673cd46677065588f652482cd0df08b3450..4460ffaf9c46966178497419a35ef4044464ac9f 100644 --- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -20,6 +20,8 @@ import paddle.v2.fluid.core as core from paddle.v2.fluid.op import Operator from paddle.v2.fluid.framework import grad_var_name +np.random.random(123) + def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): x_shape = x.shape @@ -62,9 +64,9 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1): grad_x = dx_end + d_mean + d_std - grad_y.shape = x_shape - x.shape = x_shape + grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape scale.shape = scale_shape + var.shape, mean.shape = [N, ], [N, ] return grad_x, d_scale, d_bias @@ -112,10 +114,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None): class TestLayerNormdOp(OpTest): def __assert_close(self, tensor, np_array, msg, atol=1e-4): - self.assertTrue( - np.allclose( - np.array(tensor).reshape(np_array.shape), np_array, atol=atol), - msg) + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) def __assert_grad_close(self, tensor, @@ -123,7 +122,7 @@ class TestLayerNormdOp(OpTest): name, place, max_relative_error=0.02): - a = np.array(tensor).reshape(np_array.shape) + a = np.array(tensor) b = np_array abs_a = np.abs(a) abs_a[abs_a < 1e-5] = 1 @@ -151,7 +150,7 @@ class TestLayerNormdOp(OpTest): x_shape = shape D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) scale_shape = [D] - np.random.random(123) + x_val = np.random.random_sample(x_shape).astype(np.float32) scale_val = np.random.random_sample(scale_shape).astype(np.float32) bias_val = np.random.random_sample(scale_shape).astype(np.float32) diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py index 480ee7091579ba171ca957cb4d25f0034e0534c0..dc6b84dcdc04dd185d97c3cc4b9f00305a911efb 100644 --- a/python/paddle/v2/fluid/tests/test_optimizer.py +++ b/python/paddle/v2/fluid/tests/test_optimizer.py @@ -42,9 +42,9 @@ class TestOptimizer(unittest.TestCase): type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) opts, _ = sgd_optimizer.minimize(mean_out, init_program) - self.assertEqual(len(opts), 1) - sgd_op = opts[0] - self.assertEqual(sgd_op.type, "sgd") + self.assertEqual(len(opts), 3) + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "sgd"]) def test_sgd_optimizer_with_global_step(self): init_program = framework.Program() @@ -72,11 +72,10 @@ class TestOptimizer(unittest.TestCase): sgd_optimizer = optimizer.SGDOptimizer( learning_rate=learning_rate, global_step=global_step) opts, _ = sgd_optimizer.minimize(mean_out, init_program) - self.assertEqual(len(opts), 2) - sgd_op = opts[0] - self.assertEqual(sgd_op.type, "sgd") - increment_op = opts[1] - self.assertEqual(increment_op.type, "increment") + self.assertEqual(len(opts), 4) + self.assertEqual( + [op.type for op in opts], + ["fill_constant", "elementwise_mul", "sgd", "increment"]) # Check init_program init_ops = init_program.global_block().ops @@ -121,9 +120,10 @@ class TestMomentumOptimizer(unittest.TestCase): 
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) opts = momentum_optimizer.create_optimization_pass( params_grads, mul_out, init_program) - self.assertEqual(len(opts), 1) - sgd_op = opts[0] - self.assertEqual(sgd_op.type, "momentum") + self.assertEqual(len(opts), 3) + sgd_op = opts[-1] + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "momentum"]) self.assertFalse(sgd_op.attr('use_nesterov')) # Check accumulators @@ -170,9 +170,10 @@ class TestMomentumOptimizer(unittest.TestCase): self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) opts = momentum_optimizer.create_optimization_pass( params_grads, mul_out, init_program) - self.assertEqual(len(opts), 1) - sgd_op = opts[0] - self.assertEqual(sgd_op.type, "momentum") + self.assertEqual(len(opts), 3) + sgd_op = opts[-1] + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "momentum"]) self.assertTrue(sgd_op.attr('use_nesterov')) # Check accumulators @@ -228,9 +229,9 @@ class TestAdagradOptimizer(unittest.TestCase): self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out, init_program) - self.assertEqual(len(opts), 1) - adagrad_op = opts[0] - self.assertEqual(adagrad_op.type, "adagrad") + self.assertEqual(len(opts), 3) + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "adagrad"]) # Check accumulators accumulators = adagrad_optimizer.get_accumulators() @@ -288,9 +289,10 @@ class TestAdamOptimizer(unittest.TestCase): self.assertEqual(len(adam_optimizer.get_accumulators()), 0) opts = adam_optimizer.create_optimization_pass(params_grads, mul_out, init_program) - self.assertEqual(len(opts), 3) - adam_op = opts[0] - self.assertEqual(adam_op.type, "adam") + self.assertEqual(len(opts), 5) + self.assertEqual( + [op.type for op in opts], + ["fill_constant", "elementwise_mul", "adam", "scale", "scale"]) # Check accumulators accumulators = adam_optimizer.get_accumulators() @@ -350,9 +352,10 @@ class TestAdamaxOptimizer(unittest.TestCase): self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out, init_program) - self.assertEqual(len(opts), 2) - adam_op = opts[0] - self.assertEqual(adam_op.type, "adamax") + self.assertEqual(len(opts), 4) + self.assertEqual( + [op.type for op in opts], + ["fill_constant", "elementwise_mul", "adamax", "scale"]) # Check accumulators accumulators = adamax_optimizer.get_accumulators() @@ -409,9 +412,10 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) opts = decayed_adagrad_optimizer.create_optimization_pass( params_grads, mul_out, init_program) - self.assertEqual(len(opts), 1) - decayed_adagrad_op = opts[0] - self.assertEqual(decayed_adagrad_op.type, "decayed_adagrad") + self.assertEqual(len(opts), 3) + self.assertEqual( + [op.type for op in opts], + ["fill_constant", "elementwise_mul", "decayed_adagrad"]) # Check accumulators accumulators = decayed_adagrad_optimizer.get_accumulators()
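A note on the save-then-infer pattern shared by the book tests in this patch (test_recommender_system, test_rnn_encoder_decoder, test_understand_sentiment, test_word2vec): each train() persists the pruned inference graph with fluid.io.save_inference_model, and each infer() reloads it with fluid.io.load_inference_model and runs it through an Executor. A minimal sketch of that round trip follows; the toy fc network, the feed name 'x' and the directory "toy.inference.model" are illustrative placeholders, and only the two fluid.io calls and the Executor usage mirror the diff.

    import numpy as np
    import paddle.v2.fluid as fluid

    # Hypothetical one-layer network; the real tests save recommender,
    # seq2seq and sentiment models only after training converges.
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    save_dirname = "toy.inference.model"  # placeholder directory

    # Persist the feed variable names and the fetch variables.
    fluid.io.save_inference_model(save_dirname, ['x'], [y_predict], exe)

    # Reload: returns the pruned program, the feed target names and the
    # fetch targets to hand back to Executor.run.
    [inference_program, feed_target_names,
     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)

    assert feed_target_names[0] == 'x'
    tensor_x = fluid.LoDTensor()
    tensor_x.set(np.random.rand(1, 13).astype("float32"), place)
    results = exe.run(inference_program,
                      feed={feed_target_names[0]: tensor_x},
                      fetch_list=fetch_targets,
                      return_numpy=False)
    print("inferred value: ", np.array(results[0]))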
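The create_random_lodtensor helper added to three of the tests builds a level-1 LoDTensor: lod is a list of row offsets, so lod = [0, 4, 10] describes two sequences (rows 0-3 and rows 4-9) inside a [10, 1] int64 tensor. Below is the helper as it appears in the diff, with explanatory comments and a small usage sketch; the low/high bounds are illustrative.

    import numpy as np
    import paddle.v2.fluid as fluid


    def create_random_lodtensor(lod, place, low, high):
        # lod[-1] is the total number of rows across all sequences.
        data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
        res = fluid.LoDTensor()
        res.set(data, place)   # copy the numpy buffer onto `place`
        res.set_lod([lod])     # a single LoD level holding the offsets
        return res


    place = fluid.CPUPlace()
    lod = [0, 4, 10]              # two sequences, of lengths 4 and 6
    words = create_random_lodtensor(lod, place, low=0, high=99)
    print(words.lod())            # [[0, 4, 10]]
    print(np.array(words).shape)  # (10, 1)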
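The expected outputs in the new test_cumsum_op.py spell out the reverse and exclusive variants of cumsum with numpy: reverse flips before and after accumulating (TestSumOp2), and exclusive drops the last slice along the axis and prepends zeros (TestSumOp8). The function below only restates those expressions in one place as a reading aid for the fixtures; it is not part of the operator, and it does not combine reverse with exclusive because the tests do not either.

    import numpy as np


    def reference_cumsum(x, axis=-1, reverse=False, exclusive=False):
        """Numpy reference matching the expected values in test_cumsum_op.py."""
        if exclusive:
            # Drop the last slice along `axis`, prepend zeros, then accumulate.
            zeros_shape = list(x.shape)
            zeros_shape[axis] = 1
            shifted = np.concatenate(
                (np.zeros(zeros_shape, dtype=x.dtype),
                 np.take(x, range(x.shape[axis] - 1), axis=axis)),
                axis=axis)
            return shifted.cumsum(axis=axis)
        if reverse:
            # Flip, accumulate, flip back.
            return np.flip(np.flip(x, axis=axis).cumsum(axis=axis), axis=axis)
        return x.cumsum(axis=axis)


    a = np.random.random((5, 6, 3)).astype("float64")
    print(np.allclose(reference_cumsum(a, axis=2, reverse=True),
                      np.flip(np.flip(a, axis=2).cumsum(axis=2), axis=2)))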
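The new notest_csp.py sketches Go's daisy-chain example with the planned fluid.make_channel / fluid.go / fluid.send / fluid.recv primitives: n channels are chained together, a 1 is sent into the far end, and every link forwards the value incremented by one. The same control flow in plain Python, with queue.Queue standing in for a channel and a thread per link, may make the intent clearer; this is an analogue of the pattern, not the fluid API, and it uses a far smaller n than the test's 10000 because it really spawns one thread per link.

    import threading
    try:
        import Queue as queue   # Python 2, matching the paddle.v2 era
    except ImportError:
        import queue            # Python 3


    def daisy_chain(n=100):
        leftmost = queue.Queue(maxsize=1)
        left = right = leftmost
        for _ in range(n):
            right = queue.Queue(maxsize=1)
            # Each link waits on its right-hand channel and forwards value + 1.
            threading.Thread(target=lambda l, r: l.put(1 + r.get()),
                             args=(left, right)).start()
            left = right
        right.put(1)            # seed the far end of the chain
        return leftmost.get()   # the seed incremented once per link


    print(daisy_chain())        # 101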