diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49334279f6dc88c0d35fec43daf80e3cbe65760c..3a21574b855bc6bc37fefe61de98d657e712cde7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -156,6 +156,7 @@ include(rdma)               # set rdma libraries
 include(flags)              # set paddle compile flags
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
+include(inference_lib)      # add paddle fluid inference libraries
 
 
 include_directories("${PADDLE_SOURCE_DIR}")
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index d49c8d601102cf865287c33349bff5eee6a90f6d..6a701e076c95372f903a09d35d4208ee73bd584c 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -28,9 +28,3 @@ endif()
 add_dependencies(eigen3 extern_eigen3)
 
 LIST(APPEND external_project_dependencies eigen3)
-
-IF(NOT WITH_C_API AND WITH_FLUID)
-    INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen)
-    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen)
-    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported)
-ENDIF()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 60946304541a20809276c3e665d8524baf209006..d4f252bb9f64c8db82b841fedf0817f5d8596501 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -52,7 +52,7 @@ ADD_DEPENDENCIES(gflags extern_gflags)
 
 LIST(APPEND external_project_dependencies gflags)
 
-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
   INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
   IF(ANDROID)
     INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 382fbda3b5cfeba893f03871cf65498d20804f36..0c6b3aafcb4e990b9d4549820137474e5968a7aa 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -68,7 +68,7 @@ LINK_LIBRARIES(glog gflags)
 
 LIST(APPEND external_project_dependencies glog)
 
-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
   INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
   IF(ANDROID)
     INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 365a370a9cfb708379bcff18ae6aa0725d420ae1..ff5855052dabaa0b63099cd219f3f04e22f1aa85 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -250,7 +250,7 @@ IF(NOT PROTOBUF_FOUND)
     SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
         CACHE FILEPATH "protoc library." FORCE)
 
-    IF(WITH_C_API OR WITH_FLUID)
+    IF(WITH_C_API)
         INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
         IF(ANDROID)
             INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..7d53554358497762b1cd91c39bdd23c5807af2bc
--- /dev/null
+++ b/cmake/inference_lib.cmake
@@ -0,0 +1,90 @@
+# make package for paddle fluid shared and static library
+function(copy TARGET)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DSTS DEPS)
+    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
+    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
+    if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
+        message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
+    endif()
+    math(EXPR len "${copy_lib_SRCS_len} - 1")
+    
+    add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
+    foreach(index RANGE ${len})
+        list(GET copy_lib_SRCS ${index} src)
+        list(GET copy_lib_DSTS ${index} dst)
+        add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}")
+        if(IS_DIRECTORY ${src})
+            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}")
+        else()
+            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}")
+        endif()
+    endforeach()
+endfunction()
+
+# third party
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+copy(eigen3_lib
+  SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
+  DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+copy(gflags_lib
+  SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+copy(glog_lib
+  SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+IF(NOT PROTOBUF_FOUND)
+    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+    copy(protobuf_lib
+      SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
+      DSTS ${dst_dir} ${dst_dir}/lib
+    )
+ENDIF(NOT PROTOBUF_FOUND)
+
+# paddle fluid module
+set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
+set(module "framework")
+copy(framework_lib DEPS framework_py_proto 
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+)
+
+set(module "memory")
+copy(memory_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
+)
+
+set(module "inference")
+copy(inference_lib DEPENDS paddle_fluid_shared
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+)
+
+set(module "platform")
+copy(platform_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
+)
+
+set(module "string")
+copy(string_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
+)
+
+add_custom_target(inference_lib_dist DEPENDS 
+  inference_lib framework_lib memory_lib platform_lib string_lib
+  gflags_lib glog_lib protobuf_lib eigen3_lib)
diff --git a/doc/howto/dev/build_cn.md b/doc/build_and_install/build_cn.md
similarity index 100%
rename from doc/howto/dev/build_cn.md
rename to doc/build_and_install/build_cn.md
diff --git a/doc/howto/dev/build_en.md b/doc/build_and_install/build_en.md
similarity index 100%
rename from doc/howto/dev/build_en.md
rename to doc/build_and_install/build_en.md
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/build_and_install/build_from_source_cn.rst
similarity index 100%
rename from doc/getstarted/build_and_install/build_from_source_cn.rst
rename to doc/build_and_install/build_from_source_cn.rst
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/build_and_install/build_from_source_en.rst
similarity index 100%
rename from doc/getstarted/build_and_install/build_from_source_en.rst
rename to doc/build_and_install/build_from_source_en.rst
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/build_and_install/docker_install_cn.rst
similarity index 100%
rename from doc/getstarted/build_and_install/docker_install_cn.rst
rename to doc/build_and_install/docker_install_cn.rst
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/build_and_install/docker_install_en.rst
similarity index 100%
rename from doc/getstarted/build_and_install/docker_install_en.rst
rename to doc/build_and_install/docker_install_en.rst
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/build_and_install/index_cn.rst
similarity index 94%
rename from doc/getstarted/build_and_install/index_cn.rst
rename to doc/build_and_install/index_cn.rst
index c9ba84c842b530162c92713046e64fdf82bd441b..4220ff2279333f25eb644227100308428bf72362 100644
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/build_and_install/index_cn.rst
@@ -13,7 +13,7 @@ PaddlePaddle提供pip和Docker的安装方式：
 
    pip_install_cn.rst
    docker_install_cn.rst
-   ../../howto/dev/build_cn.md
+   build_cn.md
 
 编译流程
 ++++++++
diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/build_and_install/index_en.rst
similarity index 95%
rename from doc/getstarted/build_and_install/index_en.rst
rename to doc/build_and_install/index_en.rst
index 32d66d63dd5b2a30d5de4a088dc80b680830cb84..db6b5be742be1619c52f5f7000bec013e818693d 100644
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/build_and_install/index_en.rst
@@ -13,7 +13,7 @@ You can choose either pip or Docker to complete your install:
 
    pip_install_en.rst
    docker_install_en.rst
-   ../../howto/dev/build_en.md
+   build_en.md
 
 
 Build from Source
diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/build_and_install/paddleci.png
similarity index 100%
rename from doc/getstarted/build_and_install/paddleci.png
rename to doc/build_and_install/paddleci.png
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/build_and_install/pip_install_cn.rst
similarity index 100%
rename from doc/getstarted/build_and_install/pip_install_cn.rst
rename to doc/build_and_install/pip_install_cn.rst
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/build_and_install/pip_install_en.rst
similarity index 100%
rename from doc/getstarted/build_and_install/pip_install_en.rst
rename to doc/build_and_install/pip_install_en.rst
diff --git a/doc/design/cpp_data_feeding.md b/doc/design/cpp_data_feeding.md
new file mode 100644
index 0000000000000000000000000000000000000000..40205350f99722f0b71bfa6f390fe9d01d831966
--- /dev/null
+++ b/doc/design/cpp_data_feeding.md
@@ -0,0 +1,79 @@
+# C++ Data Feeding
+
+In training with Paddle V2 API, data feeding wholly dependents on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required. 
+
+In this document we show the fundamental design of C++ data feeding process, which includes the data reading, shuffling and batching.
+
+## Reader
+
+A new concept named 'Reader' is introduced. `Reader` is a series of inherited classes which can be hold by our `Variable` and they are used to read or process file data.
+
+
+### `ReaderBase`
+
+`ReaderBase` is the abstract base class of all readers. It defines the all readers' interfaces.
+
+```cpp
+class ReaderBase {
+ public:
+  explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
+    PADDLE_ENFORCE(!shapes_.empty());
+  }
+  // Read the next batch of data. (A 'batch' can be only one instance)
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  // Show whether the next bacth exists.
+  virtual bool HasNext() const = 0;
+  
+  // Reinitialize the reader and read the file from the begin.
+  virtual void ReInit() = 0;
+  
+  // Get a certain read in data's shape.
+  DDim shape(size_t idx) const;
+  // Get shapes of all read in data.
+  std::vector<DDim> shapes() const { return shapes_; }
+  // Set shapes of read in data.
+  void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
+
+  virtual ~ReaderBase() {}
+
+ protected:
+  std::vector<DDim> shapes_;
+};
+```
+
+### `FileReader` and `DecoratedReader`
+
+These two classes are derived from the `ReaderBase` and will further be derived by respective specific readers. That is to say, in our design, there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format, and yield only one instance of data at a time. e.g. RecordIO reader, jpg reader, .... A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some process on them(shuffling, or batching), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
+
+All the readers share exactly the same interfaces defined in `ReaderBase`. So they can be decorated for more than one time: We can **shuffle** a reader's outputs and then **batch** the shuffle outputs. The interface consistency also allows related ops use readers without knowing what they are exactly.
+
+
+### `ReaderHolder`
+
+Different readers belong to different class types. It leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
+
+```cpp
+var->Get<ReaderBase>("batch_reader");
+```
+
+we have to write:
+
+```cpp
+var->Get<BatchReader>("batch_reader");
+```
+
+This requires each time getting a reader from a variable we must know the reader's type exactly. It is nearly impossible.
+
+To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which erases reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
+
+## Related Operators
+
+To create and invoke readers, some now ops are introduced:
+
+### `CreateReaderOp`
+
+Each reader has its creating op. File readers' creating ops have no input and yield the created file reader as its output. Decorated readers' creating ops take the underlying readers as inputs and then yield new decorated readers.
+
+### `ReadOp`
+
+A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector<LoDTenosr>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables.
diff --git a/doc/howto/dev/FullyConnected.jpg b/doc/dev/FullyConnected.jpg
similarity index 100%
rename from doc/howto/dev/FullyConnected.jpg
rename to doc/dev/FullyConnected.jpg
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/dev/contribute_to_paddle_cn.md
similarity index 100%
rename from doc/howto/dev/contribute_to_paddle_cn.md
rename to doc/dev/contribute_to_paddle_cn.md
diff --git a/doc/dev/contribute_to_paddle_en.md b/doc/dev/contribute_to_paddle_en.md
new file mode 120000
index 0000000000000000000000000000000000000000..f939e75f21a8badb5c40f527abd0e098fe9bc472
--- /dev/null
+++ b/doc/dev/contribute_to_paddle_en.md
@@ -0,0 +1 @@
+../../CONTRIBUTING.md
\ No newline at end of file
diff --git a/doc/dev/index_cn.rst b/doc/dev/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..487db868bb2a0a5383d56c3a723912d9fd5910b7
--- /dev/null
+++ b/doc/dev/index_cn.rst
@@ -0,0 +1,8 @@
+开发标准
+========
+
+..  toctree::
+  :maxdepth: 1
+
+  contribute_to_paddle_cn.md
+  write_docs_cn.rst
diff --git a/doc/dev/index_en.rst b/doc/dev/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5dd12d2233cff20e021b90beb94571a2817bd1ad
--- /dev/null
+++ b/doc/dev/index_en.rst
@@ -0,0 +1,9 @@
+Development
+------------
+
+..  toctree::
+  :maxdepth: 1
+
+  new_layer_en.rst
+  contribute_to_paddle_en.md
+  write_docs_en.rst
diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/dev/new_layer_cn.rst
similarity index 100%
rename from doc/howto/dev/new_layer_cn.rst
rename to doc/dev/new_layer_cn.rst
diff --git a/doc/howto/dev/new_layer_en.rst b/doc/dev/new_layer_en.rst
similarity index 100%
rename from doc/howto/dev/new_layer_en.rst
rename to doc/dev/new_layer_en.rst
diff --git a/doc/howto/dev/new_op_cn.md b/doc/dev/new_op_cn.md
similarity index 100%
rename from doc/howto/dev/new_op_cn.md
rename to doc/dev/new_op_cn.md
diff --git a/doc/howto/dev/new_op_en.md b/doc/dev/new_op_en.md
similarity index 100%
rename from doc/howto/dev/new_op_en.md
rename to doc/dev/new_op_en.md
diff --git a/doc/howto/dev/new_op_kernel_en.md b/doc/dev/new_op_kernel_en.md
similarity index 100%
rename from doc/howto/dev/new_op_kernel_en.md
rename to doc/dev/new_op_kernel_en.md
diff --git a/doc/howto/dev/use_eigen_cn.md b/doc/dev/use_eigen_cn.md
similarity index 100%
rename from doc/howto/dev/use_eigen_cn.md
rename to doc/dev/use_eigen_cn.md
diff --git a/doc/howto/dev/use_eigen_en.md b/doc/dev/use_eigen_en.md
similarity index 100%
rename from doc/howto/dev/use_eigen_en.md
rename to doc/dev/use_eigen_en.md
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/dev/write_docs_cn.rst
similarity index 98%
rename from doc/howto/dev/write_docs_cn.rst
rename to doc/dev/write_docs_cn.rst
index 1bc947c260d7adb75ee5a2bb10e6b91bc0be2d4c..f79769b810b91c6984016d95f40b89186bfb61b0 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/dev/write_docs_cn.rst
@@ -1,6 +1,6 @@
-##################
-如何贡献/修改文档
-##################
+#############
+如何贡献文档
+#############
 
 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
 也可以利用PaddlePaddle 工具来编译文档，这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下
diff --git a/doc/howto/dev/write_docs_en.rst b/doc/dev/write_docs_en.rst
similarity index 98%
rename from doc/howto/dev/write_docs_en.rst
rename to doc/dev/write_docs_en.rst
index b3ef07eb1d0012827df8e6a4f27c5fa643649492..f3408a84269aaeef19986c220454555fbbe30e23 100644
--- a/doc/howto/dev/write_docs_en.rst
+++ b/doc/dev/write_docs_en.rst
@@ -1,6 +1,6 @@
-##################
+########################
 Contribute Documentation
-##################
+########################
 
 PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
 Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst
index e695ff283e2e806377a51c559b37e8068360a4ff..608f49f5a969b3291eb43bf2acf582af74e566a1 100644
--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -4,7 +4,7 @@
 
 PaddlePaddle是源于百度的一个深度学习平台。PaddlePaddle为深度学习研究人员提供了丰富的API，可以轻松地完成神经网络配置，模型训练等任务。
 这里将介绍PaddlePaddle的基本使用概念，并且展示了如何利用PaddlePaddle来解决一个经典的线性回归问题。
-在使用该文档之前，请参考 `安装文档 <../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
+在使用该文档之前，请参考 `安装文档 <../../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
 
 
 配置网络
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index 9f6ee25987d51dcca3a37cf0f62a70a5a5a2d89a..1dc141396b95bda776aeff87ac30fad6baf37bd2 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -1,61 +1,8 @@
 新手入门
 ============
 
-.. _quick_install:
-
-快速安装
-++++++++
-
-PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
-执行下面的命令完成快速安装，版本为cpu_avx_openblas：
-
-  .. code-block:: bash
-
-     pip install paddlepaddle
-
-如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
-
-  .. code-block:: bash
-
-     pip install paddlepaddle-gpu
-
-更详细的安装和编译方法参考：
-
-..  toctree::
-  :maxdepth: 1
-
-  build_and_install/index_cn.rst
-
-.. _quick_start:
-
-快速开始
-++++++++
-
-创建一个 housing.py 并粘贴此Python代码：
-
-  .. code-block:: python
-
-     import paddle.v2 as paddle
-
-     # Initialize PaddlePaddle.
-     paddle.init(use_gpu=False, trainer_count=1)
-
-     # Configure the neural network.
-     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
-     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-
-     # Infer using provided test data.
-     probs = paddle.infer(
-         output_layer=y_predict,
-         parameters=paddle.dataset.uci_housing.model(),
-         input=[item for item in paddle.dataset.uci_housing.test()()])
-
-     for i in xrange(len(probs)):
-         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
-
-执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
-
 ..  toctree::
   :maxdepth: 1
 
+  quickstart_cn.rst
   concepts/use_concepts_cn.rst
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index 063d9d880c82550f7f5d47d3d0b1fff59865bca7..c680e1903750117073bee64cb4d4f4ccfff5ac3d 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -1,61 +1,7 @@
 GET STARTED
 ============
 
-.. _quick_install:
-
-Quick Install
-----------------------
-
-You can use pip to install PaddlePaddle with a single command, supports
-CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
-Simply run the following command to install, the version is cpu_avx_openblas:
-
-  .. code-block:: bash
-
-     pip install paddlepaddle
-
-If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
-
-  .. code-block:: bash
-
-     pip install paddlepaddle-gpu
-
-For more details about installation and build:
-
 ..  toctree::
   :maxdepth: 1
 
-  build_and_install/index_en.rst
-
-
-.. _quick_start:
-
-Quick Start
-++++++++
-
-Create a new file called housing.py, and paste this Python
-code:
-
-
-  .. code-block:: python
-
-     import paddle.v2 as paddle
-
-     # Initialize PaddlePaddle.
-     paddle.init(use_gpu=False, trainer_count=1)
-
-     # Configure the neural network.
-     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
-     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-
-     # Infer using provided test data.
-     probs = paddle.infer(
-         output_layer=y_predict,
-         parameters=paddle.dataset.uci_housing.model(),
-         input=[item for item in paddle.dataset.uci_housing.test()()])
-
-     for i in xrange(len(probs)):
-         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
-
-Run :code:`python housing.py` and voila! It should print out a list of predictions
-for the test housing data.
+  quickstart_en.rst
diff --git a/doc/getstarted/quickstart_cn.rst b/doc/getstarted/quickstart_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d511cead262dabafd095f68adb5ffc596a7fe596
--- /dev/null
+++ b/doc/getstarted/quickstart_cn.rst
@@ -0,0 +1,47 @@
+快速开始
+========
+
+快速安装
+--------
+
+PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
+执行下面的命令完成快速安装，版本为cpu_avx_openblas：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+更详细的安装和编译方法参考：:ref:`install_steps` 。
+
+快速使用
+--------
+
+创建一个 housing.py 并粘贴此Python代码：
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
diff --git a/doc/getstarted/quickstart_en.rst b/doc/getstarted/quickstart_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..70f7fe0646068aa79cd72955c6848ac0250c2300
--- /dev/null
+++ b/doc/getstarted/quickstart_en.rst
@@ -0,0 +1,51 @@
+Quick Start
+============
+
+Quick Install
+-------------
+
+You can use pip to install PaddlePaddle with a single command, supports
+CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
+Simply run the following command to install, the version is cpu_avx_openblas:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build: :ref:`install_steps` .
+
+Quick Use
+---------
+
+Create a new file called housing.py, and paste this Python
+code:
+
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/howto/usage/capi/compile_paddle_lib_cn.md b/doc/howto/capi/compile_paddle_lib_cn.md
similarity index 99%
rename from doc/howto/usage/capi/compile_paddle_lib_cn.md
rename to doc/howto/capi/compile_paddle_lib_cn.md
index ac5ecffe2ea8ddc3703a32e9a0a8ee83bbe5dd14..fd8dec8164580b9dcb716e69f3cc5357639f17d3 100644
--- a/doc/howto/usage/capi/compile_paddle_lib_cn.md
+++ b/doc/howto/capi/compile_paddle_lib_cn.md
@@ -1,4 +1,4 @@
-## 编译 PaddlePaddle 预测库
+## 安装与编译C-API预测库
 
 ### 概述
 
diff --git a/doc/howto/usage/capi/images/csr.png b/doc/howto/capi/images/csr.png
similarity index 100%
rename from doc/howto/usage/capi/images/csr.png
rename to doc/howto/capi/images/csr.png
diff --git a/doc/howto/usage/capi/images/sequence_data.png b/doc/howto/capi/images/sequence_data.png
similarity index 100%
rename from doc/howto/usage/capi/images/sequence_data.png
rename to doc/howto/capi/images/sequence_data.png
diff --git a/doc/howto/usage/capi/images/workflow_of_CAPI.png b/doc/howto/capi/images/workflow_of_CAPI.png
similarity index 100%
rename from doc/howto/usage/capi/images/workflow_of_CAPI.png
rename to doc/howto/capi/images/workflow_of_CAPI.png
diff --git a/doc/howto/usage/capi/index_cn.rst b/doc/howto/capi/index_cn.rst
similarity index 87%
rename from doc/howto/usage/capi/index_cn.rst
rename to doc/howto/capi/index_cn.rst
index fd774fbc742671c5a8009cb742f2c9d06a525199..e589a6d346a1e23a4eed9801e02727c80782ae8b 100644
--- a/doc/howto/usage/capi/index_cn.rst
+++ b/doc/howto/capi/index_cn.rst
@@ -1,4 +1,4 @@
-PaddlePaddle C-API
+C-API预测库
 ==================
 
 ..  toctree::
diff --git a/doc/howto/usage/capi/organization_of_the_inputs_cn.md b/doc/howto/capi/organization_of_the_inputs_cn.md
similarity index 100%
rename from doc/howto/usage/capi/organization_of_the_inputs_cn.md
rename to doc/howto/capi/organization_of_the_inputs_cn.md
diff --git a/doc/howto/usage/capi/workflow_of_capi_cn.md b/doc/howto/capi/workflow_of_capi_cn.md
similarity index 99%
rename from doc/howto/usage/capi/workflow_of_capi_cn.md
rename to doc/howto/capi/workflow_of_capi_cn.md
index e0a42fff12cf0f53dee18165e059150861524f74..a61d2267bfdb7c32da528735b20d7c6a531aaa1f 100644
--- a/doc/howto/usage/capi/workflow_of_capi_cn.md
+++ b/doc/howto/capi/workflow_of_capi_cn.md
@@ -1,4 +1,4 @@
-## C-API 使用流程
+## C-API使用流程
 
 这篇文档介绍 PaddlePaddle C-API 整体使用流程。
 
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/cluster/cmd_argument_cn.md
similarity index 56%
rename from doc/howto/usage/cluster/cluster_train_cn.md
rename to doc/howto/cluster/cmd_argument_cn.md
index 0f3db59607fb6b43da01f5fdb46949087517ed6c..5c575dd5b53f6e4ea025a8fbaebdb2d1a1f1c9ed 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/cluster/cmd_argument_cn.md
@@ -1,41 +1,7 @@
-# 分布式训练
-
-
-## 概述
-
-本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
-
-<img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
-
-- 数据分片（Data shard): 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
-- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
-- 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
-
-这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
-
-在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
-
-
-## 环境准备
-
-1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被成为一个“节点”。
-1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
-
-安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
-```bash
-$ paddle version
-PaddlePaddle 0.10.0, compiled with
-    with_avx: ON
-    with_gpu: OFF
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_timer: OFF
-```
+## 启动参数说明
 
-下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例，介绍使用PaddlePaddle v2 API完成分布式训练。
+下面以`doc/howto/cluster/src/word2vec`中的代码作为实例，介绍使用PaddlePaddle v2 API完成分布式训练。
 
-## 启动参数说明
 ### 启动参数服务器
 执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
 ```bash
@@ -167,22 +133,3 @@ test.txt-00002
 
 - `train_data_dir`：包含训练数据的目录，可以是从分布式存储挂载过来的，也可以是在任务启动前下载到本地的。
 - `test_data_dir`：包含测试数据集的目录。
-
-## 使用分布式计算平台或工具
-
-PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务，包括：
-- [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架，支持大规模集群生产环境的完整集群方案。
-- [OpenMPI](https://www.open-mpi.org) 成熟的高性能并行计算框架。
-- [Fabric](http://www.fabfile.org) 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
-
-对于不同的集群平台，会分别介绍集群作业的启动和停止方法。这些例子都可以在[cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2)找到。
-
-在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。
-
-## 在不同集群中运行
-
-  - [fabric集群](fabric_cn.md)
-  - [openmpi集群](openmpi_cn.md)
-  - [kubernetes单机](k8s_cn.md)
-  - [kubernetes distributed分布式](k8s_distributed_cn.md)
-  - [AWS上运行kubernetes集群训练](k8s_aws_cn.md)
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/cluster/cmd_argument_en.md
similarity index 58%
rename from doc/howto/usage/cluster/cluster_train_en.md
rename to doc/howto/cluster/cmd_argument_en.md
index f9424f8f1a29fcf001c4e7976086512b22f6e858..06fd5717564c99e3bb46835a2bd5071dff665f23 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/cluster/cmd_argument_en.md
@@ -1,40 +1,7 @@
-# Distributed Training
-
-## Introduction
-
-In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
-
-<img src="https://user-images.githubusercontent.com/13348433/31772146-41523d84-b511-11e7-8a12-a69fd136c283.png" width="500">
-
-- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
-- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
-- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
-
-PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
-
-When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
-
-## Preparations
-1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
-2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
-
-After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
-
-```bash
-$ paddle version
-PaddlePaddle 0.10.0rc, compiled with
-    with_avx: ON
-    with_gpu: OFF
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_timer: OFF
-```
-
-We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
-
 ## Command-line arguments
 
+We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
+
 ### Starting parameter server
 
 Type the below command to start a parameter server which will wait for trainers to connect:
@@ -171,21 +138,3 @@ Your workspace may looks like:
 
 - `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here.
 - `test_data_dir`: containing testing data.
-
-## Use cluster platforms or cluster management tools
-
-PaddlePaddle supports running jobs on several platforms including:
-- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google.
-- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework.
-- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster.
-
-We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
-
-These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
-
-## Use different clusters
-
-  - [fabric](fabric_en.md)
-  - [openmpi](openmpi_en.md)
-  - [kubernetes](k8s_en.md)
-  - [kubernetes on AWS](k8s_aws_en.md)
diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/cluster/fluid_cluster_train_en.md
similarity index 100%
rename from doc/howto/usage/cluster/fluid_cluster_train_en.md
rename to doc/howto/cluster/fluid_cluster_train_en.md
diff --git a/doc/howto/cluster/index_cn.rst b/doc/howto/cluster/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a60521b4a9646bdc6d9f1bf6da482acc989d8bf3
--- /dev/null
+++ b/doc/howto/cluster/index_cn.rst
@@ -0,0 +1,22 @@
+分布式训练
+==========
+
+本节将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
+
+.. image:: src/ps_cn.png
+   :width: 500
+
+- 数据分片（Data shard): 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
+- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
+- 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
+
+这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
+
+在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
+
+..  toctree::
+  :maxdepth: 1
+
+  preparations_cn.md
+  cmd_argument_cn.md
+  multi_cluster/index_cn.rst
diff --git a/doc/howto/cluster/index_en.rst b/doc/howto/cluster/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2640a09dcc904619bc97c9bd3f3d81a9dc307663
--- /dev/null
+++ b/doc/howto/cluster/index_en.rst
@@ -0,0 +1,22 @@
+Distributed Training
+====================
+
+In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
+
+.. image:: src/ps_en.png
+   :width: 500
+
+- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
+- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
+- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
+
+PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
+
+When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
+
+..  toctree::
+  :maxdepth: 1
+
+  preparations_en.md
+  cmd_argument_en.md
+  multi_cluster/index_en.rst
diff --git a/doc/howto/usage/cluster/fabric_cn.md b/doc/howto/cluster/multi_cluster/fabric_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/fabric_cn.md
rename to doc/howto/cluster/multi_cluster/fabric_cn.md
diff --git a/doc/howto/usage/cluster/fabric_en.md b/doc/howto/cluster/multi_cluster/fabric_en.md
similarity index 100%
rename from doc/howto/usage/cluster/fabric_en.md
rename to doc/howto/cluster/multi_cluster/fabric_en.md
diff --git a/doc/howto/cluster/multi_cluster/index_cn.rst b/doc/howto/cluster/multi_cluster/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ef56b6ddb38e59f20f7248de1ceb952c7627ce76
--- /dev/null
+++ b/doc/howto/cluster/multi_cluster/index_cn.rst
@@ -0,0 +1,20 @@
+在不同集群中运行
+================
+
+PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务，包括：
+- `Kubernetes <http://kubernetes.io>`_ Google开源的容器集群的调度框架，支持大规模集群生产环境的完整集群方案。
+- `OpenMPI <https://www.open-mpi.org>`_ 成熟的高性能并行计算框架。
+- `Fabric <http://www.fabfile.org>`_ 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
+
+对于不同的集群平台，会分别介绍集群作业的启动和停止方法。这些例子都可以在 `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ 找到。
+
+在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。
+
+..  toctree::
+  :maxdepth: 1
+
+  fabric_cn.md
+  openmpi_cn.md
+  k8s_cn.md
+  k8s_distributed_cn.md
+  k8s_aws_cn.md
diff --git a/doc/howto/cluster/multi_cluster/index_en.rst b/doc/howto/cluster/multi_cluster/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dac7aaef085c80851c1bbb89250faf2151de4ca6
--- /dev/null
+++ b/doc/howto/cluster/multi_cluster/index_en.rst
@@ -0,0 +1,19 @@
+Use different clusters
+======================
+
+PaddlePaddle supports running jobs on several platforms including:
+- `Kubernetes <http://kubernetes.io>`_ open-source system for automating deployment, scaling, and management of containerized applications from Google.
+- `OpenMPI <https://www.open-mpi.org>`_ Mature high performance parallel computing framework.
+- `Fabric <http://www.fabfile.org>`_ A cluster management tool. Write scripts to submit jobs or manage the cluster.
+
+We'll introduce cluster job management on these platforms. The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
+
+These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
+
+..  toctree::
+  :maxdepth: 1
+
+  fabric_en.md
+  openmpi_en.md
+  k8s_en.md
+  k8s_aws_en.md
diff --git a/doc/howto/usage/cluster/k8s_aws_cn.md b/doc/howto/cluster/multi_cluster/k8s_aws_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_aws_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_aws_cn.md
diff --git a/doc/howto/usage/cluster/k8s_aws_en.md b/doc/howto/cluster/multi_cluster/k8s_aws_en.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_aws_en.md
rename to doc/howto/cluster/multi_cluster/k8s_aws_en.md
diff --git a/doc/howto/usage/cluster/k8s_cn.md b/doc/howto/cluster/multi_cluster/k8s_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_cn.md
diff --git a/doc/howto/usage/cluster/k8s_distributed_cn.md b/doc/howto/cluster/multi_cluster/k8s_distributed_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_distributed_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_distributed_cn.md
diff --git a/doc/howto/usage/cluster/k8s_en.md b/doc/howto/cluster/multi_cluster/k8s_en.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_en.md
rename to doc/howto/cluster/multi_cluster/k8s_en.md
diff --git a/doc/howto/usage/cluster/openmpi_cn.md b/doc/howto/cluster/multi_cluster/openmpi_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/openmpi_cn.md
rename to doc/howto/cluster/multi_cluster/openmpi_cn.md
diff --git a/doc/howto/usage/cluster/openmpi_en.md b/doc/howto/cluster/multi_cluster/openmpi_en.md
similarity index 100%
rename from doc/howto/usage/cluster/openmpi_en.md
rename to doc/howto/cluster/multi_cluster/openmpi_en.md
diff --git a/doc/howto/usage/cluster/src/add_security_group.png b/doc/howto/cluster/multi_cluster/src/add_security_group.png
similarity index 100%
rename from doc/howto/usage/cluster/src/add_security_group.png
rename to doc/howto/cluster/multi_cluster/src/add_security_group.png
diff --git a/doc/howto/usage/cluster/src/create_efs.png b/doc/howto/cluster/multi_cluster/src/create_efs.png
similarity index 100%
rename from doc/howto/usage/cluster/src/create_efs.png
rename to doc/howto/cluster/multi_cluster/src/create_efs.png
diff --git a/doc/howto/usage/cluster/src/k8s-paddle-arch.png b/doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s-paddle-arch.png
rename to doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png
diff --git a/doc/howto/usage/cluster/src/k8s_data/Dockerfile b/doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/Dockerfile
rename to doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
diff --git a/doc/howto/usage/cluster/src/k8s_data/README.md b/doc/howto/cluster/multi_cluster/src/k8s_data/README.md
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/README.md
rename to doc/howto/cluster/multi_cluster/src/k8s_data/README.md
diff --git a/doc/howto/usage/cluster/src/k8s_data/get_data.sh b/doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/get_data.sh
rename to doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
diff --git a/doc/howto/usage/cluster/src/k8s_train/Dockerfile b/doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/Dockerfile
rename to doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
diff --git a/doc/howto/usage/cluster/src/k8s_train/README.md b/doc/howto/cluster/multi_cluster/src/k8s_train/README.md
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/README.md
rename to doc/howto/cluster/multi_cluster/src/k8s_train/README.md
diff --git a/doc/howto/usage/cluster/src/k8s_train/start.sh b/doc/howto/cluster/multi_cluster/src/k8s_train/start.sh
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/start.sh
rename to doc/howto/cluster/multi_cluster/src/k8s_train/start.sh
diff --git a/doc/howto/usage/cluster/src/k8s_train/start_paddle.py b/doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/start_paddle.py
rename to doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
diff --git a/doc/howto/usage/cluster/src/pserver_and_trainer.png b/doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png
similarity index 100%
rename from doc/howto/usage/cluster/src/pserver_and_trainer.png
rename to doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png
diff --git a/doc/howto/usage/cluster/src/route53_create_recordset.png b/doc/howto/cluster/multi_cluster/src/route53_create_recordset.png
similarity index 100%
rename from doc/howto/usage/cluster/src/route53_create_recordset.png
rename to doc/howto/cluster/multi_cluster/src/route53_create_recordset.png
diff --git a/doc/howto/usage/cluster/src/route53_create_zone.png b/doc/howto/cluster/multi_cluster/src/route53_create_zone.png
similarity index 100%
rename from doc/howto/usage/cluster/src/route53_create_zone.png
rename to doc/howto/cluster/multi_cluster/src/route53_create_zone.png
diff --git a/doc/howto/usage/cluster/src/worker_security_group.png b/doc/howto/cluster/multi_cluster/src/worker_security_group.png
similarity index 100%
rename from doc/howto/usage/cluster/src/worker_security_group.png
rename to doc/howto/cluster/multi_cluster/src/worker_security_group.png
diff --git a/doc/howto/cluster/preparations_cn.md b/doc/howto/cluster/preparations_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..ce40697e703503b66f6306e15ebdb0ce1329991d
--- /dev/null
+++ b/doc/howto/cluster/preparations_cn.md
@@ -0,0 +1,16 @@
+## 环境准备
+
+1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被成为一个“节点”。
+1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
+
+安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
+```bash
+$ paddle version
+PaddlePaddle 0.10.0, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
diff --git a/doc/howto/cluster/preparations_en.md b/doc/howto/cluster/preparations_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b77b293907ae0548134fc65ceed3aa0ed0b845d
--- /dev/null
+++ b/doc/howto/cluster/preparations_en.md
@@ -0,0 +1,17 @@
+## Preparations
+
+1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
+
+After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
+
+```bash
+$ paddle version
+PaddlePaddle 0.10.0rc, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
diff --git a/doc/howto/usage/cluster/src/Dockerfile b/doc/howto/cluster/src/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/Dockerfile
rename to doc/howto/cluster/src/Dockerfile
diff --git a/doc/howto/usage/cluster/src/efs_mount.png b/doc/howto/cluster/src/efs_mount.png
similarity index 100%
rename from doc/howto/usage/cluster/src/efs_mount.png
rename to doc/howto/cluster/src/efs_mount.png
diff --git a/doc/howto/usage/cluster/src/managed_policy.png b/doc/howto/cluster/src/managed_policy.png
similarity index 100%
rename from doc/howto/usage/cluster/src/managed_policy.png
rename to doc/howto/cluster/src/managed_policy.png
diff --git a/doc/howto/usage/cluster/src/trainer_cn.png b/doc/howto/cluster/src/ps_cn.png
similarity index 100%
rename from doc/howto/usage/cluster/src/trainer_cn.png
rename to doc/howto/cluster/src/ps_cn.png
diff --git a/doc/howto/usage/cluster/src/trainer.png b/doc/howto/cluster/src/ps_en.png
similarity index 100%
rename from doc/howto/usage/cluster/src/trainer.png
rename to doc/howto/cluster/src/ps_en.png
diff --git a/doc/howto/cluster/src/trainer.png b/doc/howto/cluster/src/trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/howto/cluster/src/trainer.png differ
diff --git a/doc/howto/cluster/src/trainer_cn.png b/doc/howto/cluster/src/trainer_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/howto/cluster/src/trainer_cn.png differ
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py b/doc/howto/cluster/src/word2vec/api_train_v2.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/api_train_v2.py
rename to doc/howto/cluster/src/word2vec/api_train_v2.py
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py b/doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
rename to doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
diff --git a/doc/howto/usage/cluster/src/word2vec/prepare.py b/doc/howto/cluster/src/word2vec/prepare.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/prepare.py
rename to doc/howto/cluster/src/word2vec/prepare.py
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/cmd_parameter/arguments_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/arguments_cn.md
rename to doc/howto/cmd_parameter/arguments_cn.md
diff --git a/doc/howto/usage/cmd_parameter/arguments_en.md b/doc/howto/cmd_parameter/arguments_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/arguments_en.md
rename to doc/howto/cmd_parameter/arguments_en.md
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/cmd_parameter/detail_introduction_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/detail_introduction_cn.md
rename to doc/howto/cmd_parameter/detail_introduction_cn.md
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/cmd_parameter/detail_introduction_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/detail_introduction_en.md
rename to doc/howto/cmd_parameter/detail_introduction_en.md
diff --git a/doc/howto/usage/cmd_parameter/index_cn.rst b/doc/howto/cmd_parameter/index_cn.rst
similarity index 85%
rename from doc/howto/usage/cmd_parameter/index_cn.rst
rename to doc/howto/cmd_parameter/index_cn.rst
index 4c8729821110b9aec99351fc0a83a1ba75a8a2bb..17b379f6295d66d864e2b53108012eff5895d96b 100644
--- a/doc/howto/usage/cmd_parameter/index_cn.rst
+++ b/doc/howto/cmd_parameter/index_cn.rst
@@ -1,6 +1,6 @@
 ..  _cmd_line_index:
 
-设置命令行参数
+命令行参数设置
 ===============
 
 ..  toctree::
diff --git a/doc/howto/usage/cmd_parameter/index_en.rst b/doc/howto/cmd_parameter/index_en.rst
similarity index 100%
rename from doc/howto/usage/cmd_parameter/index_en.rst
rename to doc/howto/cmd_parameter/index_en.rst
diff --git a/doc/howto/usage/cmd_parameter/use_case_cn.md b/doc/howto/cmd_parameter/use_case_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/use_case_cn.md
rename to doc/howto/cmd_parameter/use_case_cn.md
diff --git a/doc/howto/usage/cmd_parameter/use_case_en.md b/doc/howto/cmd_parameter/use_case_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/use_case_en.md
rename to doc/howto/cmd_parameter/use_case_en.md
diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
deleted file mode 120000
index c97564d93a7f0a753a23cd97d2467d595bd154ff..0000000000000000000000000000000000000000
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../CONTRIBUTING.md
\ No newline at end of file
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index e0c69f7a6a4043abe999af6c8dd2555178b68424..0c534f107b6e047035c424ed2ea59f3982799b63 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -1,37 +1,11 @@
-进阶指南
+进阶使用
 ========
 
-使用说明
---------
-
-..  toctree::
-  :maxdepth: 1
-
-  usage/cmd_parameter/index_cn.rst
-  usage/cluster/cluster_train_cn.md
-  usage/capi/index_cn.rst
-
-开发标准
---------
-
-..  toctree::
-  :maxdepth: 1
-
-  dev/contribute_to_paddle_cn.md
-  dev/write_docs_cn.rst
-
-模型配置
---------
-
-..  toctree::
-  :maxdepth: 1
-
-  deep_model/rnn/index_cn.rst
-
-性能优化
---------
-
 ..  toctree::
   :maxdepth: 1
 
+  cmd_parameter/index_cn.rst
+  cluster/index_cn.rst
+  capi/index_cn.rst
+  rnn/index_cn.rst
   optimization/gpu_profiling_cn.rst
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 6d1bf7dfc003da6de31410ee0a7959233adfaf76..ae8b86f75b5de770312fb2fdc46db490a18e5ff6 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -1,37 +1,10 @@
 HOW TO
 =======
 
-Usage
--------
-
-..  toctree::
-  :maxdepth: 1
-
-  usage/cmd_parameter/index_en.rst
-  usage/cluster/cluster_train_en.md
-
-Development
-------------
-
-..  toctree::
-  :maxdepth: 1
-
-  dev/new_layer_en.rst
-  dev/contribute_to_paddle_en.md
-  dev/write_docs_en.rst
-
-Configuration
--------------
-
-..  toctree::
-  :maxdepth: 1
-
-  deep_model/rnn/index_en.rst
-
-Optimization
--------------
-
 ..  toctree::
   :maxdepth: 1
 
+  cmd_parameter/index_en.rst
+  cluster/index_en.rst
+  rnn/index_en.rst
   optimization/gpu_profiling_en.rst
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling_en.md
similarity index 100%
rename from doc/howto/optimization/cpu_profiling.md
rename to doc/howto/optimization/cpu_profiling_en.md
diff --git a/doc/howto/optimization/gpu_profiling_cn.rst b/doc/howto/optimization/gpu_profiling_cn.rst
index e2b0b0396e0034b01ed2c5081effdd3bcabf31ae..0239eef4f118197bf92f9fc7d323be58344b0ded 100644
--- a/doc/howto/optimization/gpu_profiling_cn.rst
+++ b/doc/howto/optimization/gpu_profiling_cn.rst
@@ -1,6 +1,6 @@
-==================
-GPU性能分析与调优
-==================
+============
+GPU性能调优
+============
 
 ..  contents::
 
diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/rnn/hierarchical_layer_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
rename to doc/howto/rnn/hierarchical_layer_cn.rst
diff --git a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst b/doc/howto/rnn/hrnn_rnn_api_compare_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
rename to doc/howto/rnn/hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/rnn/index_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/index_cn.rst
rename to doc/howto/rnn/index_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/rnn/index_en.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/index_en.rst
rename to doc/howto/rnn/index_en.rst
diff --git a/doc/howto/deep_model/rnn/recurrent_group_cn.md b/doc/howto/rnn/recurrent_group_cn.md
similarity index 100%
rename from doc/howto/deep_model/rnn/recurrent_group_cn.md
rename to doc/howto/rnn/recurrent_group_cn.md
diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/rnn/rnn_config_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/rnn_config_cn.rst
rename to doc/howto/rnn/rnn_config_cn.rst
diff --git a/doc/howto/deep_model/rnn/rnn_config_en.rst b/doc/howto/rnn/rnn_config_en.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/rnn_config_en.rst
rename to doc/howto/rnn/rnn_config_en.rst
diff --git a/doc/howto/deep_model/rnn/src/bi_lstm.jpg b/doc/howto/rnn/src/bi_lstm.jpg
similarity index 100%
rename from doc/howto/deep_model/rnn/src/bi_lstm.jpg
rename to doc/howto/rnn/src/bi_lstm.jpg
diff --git a/doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png b/doc/howto/rnn/src/encoder-decoder-attention-model.png
similarity index 100%
rename from doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png
rename to doc/howto/rnn/src/encoder-decoder-attention-model.png
diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn.dot b/doc/howto/rnn/src/glossary_rnn.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/glossary_rnn.dot
rename to doc/howto/rnn/src/glossary_rnn.dot
diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot b/doc/howto/rnn/src/glossary_rnn_with_memory.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
rename to doc/howto/rnn/src/glossary_rnn_with_memory.dot
diff --git a/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot b/doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
rename to doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot
diff --git a/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot b/doc/howto/rnn/src/simple_full_recurrent.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
rename to doc/howto/rnn/src/simple_full_recurrent.dot
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index 9279bac7f4b2898c18979630a8d6dfcb2dba70e0..63a78428583477792e309a3b3d26af340caccfca 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -5,6 +5,8 @@ PaddlePaddle 文档
   :maxdepth: 1
 
   getstarted/index_cn.rst
+  build_and_install/index_cn.rst
   howto/index_cn.rst
+  dev/index_cn.rst
   api/index_cn.rst
   faq/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 64684b8b9b27e245c6b32ea28809d3bbce22fab9..5631381be087017c26b2a6a3984b3c5bdb49f12d 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -5,5 +5,7 @@ PaddlePaddle Documentation
   :maxdepth: 1
 
   getstarted/index_en.rst
+  build_and_install/index_en.rst
   howto/index_en.rst
+  dev/index_en.rst
   api/index_en.rst
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 8b71f73c36c33d882b34c833031c50cd14817e76..ef1bc07c2dbe71268c706a119056d3a9fcfc7f8c 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -20,10 +20,13 @@ endif()
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
 
+cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
+
 cc_test(variable_test SRCS variable_test.cc)
 
 cc_library(threadpool SRCS threadpool.cc DEPS enforce)
@@ -92,11 +95,4 @@ cc_test(init_test SRCS init_test.cc DEPS init)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
       
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB FRAMEWORK_HEADERS *.h)
-  install(FILES ${FRAMEWORK_HEADERS} DESTINATION include/paddle/framework)
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework)
-  install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details)
-endif()
-
 cc_test(channel_test SRCS channel_test.cc)
diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
index df9e15e22b890347a03d6816e8549c99b010bb38..a307abb4ed37880bb289a8373adf0d293382c97e 100644
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -22,6 +22,8 @@ limitations under the License. */
 using paddle::framework::Channel;
 using paddle::framework::MakeChannel;
 using paddle::framework::CloseChannel;
+using paddle::framework::details::Buffered;
+using paddle::framework::details::UnBuffered;
 
 TEST(Channel, MakeAndClose) {
   using paddle::framework::details::Buffered;
@@ -60,13 +62,54 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
   delete ch;
 }
 
-TEST(Channel, SendOnClosedChannelPanics) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  size_t i = 5;
-  EXPECT_EQ(ch->Send(&i), true);  // should not block or panic
+// This tests that a  channel must return false
+// on send and receive performed after closing the channel.
+// Receive will only return false after close when queue is empty.
+// By creating separate threads for sending and receiving, we make this
+// function able to test both buffered and unbuffered channels.
+void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
+  const size_t data = 5;
+  std::thread send_thread{[&]() {
+    size_t i = data;
+    EXPECT_EQ(ch->Send(&i), true);  // should not block
+  }};
+
+  std::thread recv_thread{[&]() {
+    size_t i;
+    EXPECT_EQ(ch->Receive(&i), true);  // should not block
+    EXPECT_EQ(i, data);
+  }};
+
+  send_thread.join();
+  recv_thread.join();
+
+  // After closing send should return false. Receive should
+  // also return false as there is no data in queue.
   CloseChannel(ch);
-  EXPECT_EQ(ch->Send(&i), false);  // should panic
+  send_thread = std::thread{[&]() {
+    size_t i = data;
+    EXPECT_EQ(ch->Send(&i), false);  // should return false
+  }};
+  recv_thread = std::thread{[&]() {
+    size_t i;
+    // should return false because channel is closed and queue is empty
+    EXPECT_EQ(ch->Receive(&i), false);
+  }};
+
+  send_thread.join();
+  recv_thread.join();
+}
+
+TEST(Channel, SendReceiveClosedBufferedChannelPanics) {
+  size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  SendReceiveWithACloseChannelShouldPanic(ch);
+  delete ch;
+}
+
+TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) {
+  auto ch = MakeChannel<size_t>(0);
+  SendReceiveWithACloseChannelShouldPanic(ch);
   delete ch;
 }
 
@@ -381,3 +424,129 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
   EXPECT_EQ(sum_receive, 28U);
   delete ch;
 }
+
+// This tests that destroying a channel unblocks
+//  any senders waiting for channel to have write space
+void ChannelDestroyUnblockSenders(Channel<int> *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  bool is_buffered_channel = false;
+  if (dynamic_cast<Buffered<int> *>(ch)) is_buffered_channel = true;
+
+  if (is_buffered_channel) {
+    // If channel is buffered, verify that atleast 4 threads are blocked
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (thread_ended[i] == false) ct++;
+    }
+    // Atleast 4 threads must be blocked
+    EXPECT_GE(ct, 4);
+  } else {
+    // Verify that all the threads are blocked
+    for (size_t i = 0; i < num_threads; i++) {
+      EXPECT_EQ(thread_ended[i], false);
+    }
+  }
+  // Explicitly destroy the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  // Count number of successfuld sends
+  int ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (send_success[i]) ct++;
+  }
+
+  if (is_buffered_channel) {
+    // Only 1 send must be successful
+    EXPECT_EQ(ct, 1);
+  } else {
+    // In unbuffered channel, no send should be successful
+    EXPECT_EQ(ct, 0);
+  }
+
+  // Join all threads
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that destroying a channel also unblocks
+//  any receivers waiting on the channel
+void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          // All reads should return false
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // delete the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
+  size_t buffer_size = 1;
+  auto ch = MakeChannel<int>(buffer_size);
+  ChannelDestroyUnblockReceivers(ch);
+}
+
+TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
+  size_t buffer_size = 1;
+  auto ch = MakeChannel<int>(buffer_size);
+  ChannelDestroyUnblockSenders(ch);
+}
+
+// This tests that destroying an unbuffered channel also unblocks
+//  unblocks any receivers waiting for senders
+TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelDestroyUnblockReceivers(ch);
+}
+
+TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelDestroyUnblockSenders(ch);
+}
diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h
index 00b63da4da7844b41168c03f55e2faa84ff44154..77eebc9924954b8adbbadceb4ede57f0a21f05aa 100644
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
@@ -42,8 +42,11 @@ class Buffered : public paddle::framework::Channel<T> {
   std::mutex mu_;
   std::condition_variable empty_cond_var_;
   std::condition_variable full_cond_var_;
+  std::condition_variable destructor_cond_var_;
   std::deque<T> channel_;
   std::atomic<bool> closed_{false};
+  std::atomic<unsigned> send_ctr{0};
+  std::atomic<unsigned> recv_ctr{0};
 
   Buffered(size_t cap) : cap_(cap), closed_(false) {
     PADDLE_ENFORCE_GT(cap, 0);
@@ -58,6 +61,7 @@ bool Buffered<T>::Send(T* item) {
   if (closed_) {
     return ret;
   }
+  send_ctr++;
   std::unique_lock<std::mutex> lock(mu_);
   full_cond_var_.wait(lock,
                       [this]() { return channel_.size() < cap_ || closed_; });
@@ -67,20 +71,30 @@ bool Buffered<T>::Send(T* item) {
     empty_cond_var_.notify_one();
     ret = true;
   }
+  send_ctr--;
+  destructor_cond_var_.notify_one();
   return ret;
 }
 
 template <typename T>
 bool Buffered<T>::Receive(T* item) {
+  bool ret = false;
+  // Once the channel has been closed and all data has been consumed,
+  // just return false. Don't even try acquiring the mutex.
+  if (closed_ && channel_.empty()) {
+    return false;
+  }
+  recv_ctr++;
   std::unique_lock<std::mutex> lock(mu_);
   empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
-  bool ret = false;
   if (!channel_.empty()) {
     *item = std::move(channel_.front());
     channel_.pop_front();
     full_cond_var_.notify_one();
     ret = true;
   }
+  recv_ctr--;
+  destructor_cond_var_.notify_one();
   return ret;
 }
 
@@ -100,6 +114,12 @@ Buffered<T>::~Buffered() {
   closed_ = true;
   channel_.clear();
   NotifyAllParticipants(&lock);
+
+  // The destructor must wait for all readers and writers to complete their task
+  // The channel has been closed, so we will not accept new readers and writers
+  lock.lock();
+  destructor_cond_var_.wait(
+      lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
 }
 
 template <typename T>
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h
index 815cebad2d8c08aa31bb566bc6c51250870383d8..92a16b4d22bbb6a8c75157444aa8474f700603fe 100644
--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
@@ -45,9 +45,11 @@ class UnBuffered : public paddle::framework::Channel<T> {
   // A transaction occurs only when both are true
   std::atomic<bool> reader_found_{false}, writer_found_{false};
   std::condition_variable cv_channel_;
-  std::condition_variable_any cv_reader_, cv_writer_;
+  std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_;
   T* item{nullptr};
   std::atomic<bool> closed_{false};
+  std::atomic<unsigned> send_ctr{0};
+  std::atomic<unsigned> recv_ctr{0};
 
   UnBuffered() : closed_(false) {}
 
@@ -62,6 +64,7 @@ bool UnBuffered<T>::Send(T* data) {
   if (closed_) {
     return ret;
   }
+  send_ctr++;
   // Prevent other writers from entering
   std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
   writer_found_ = true;
@@ -81,6 +84,8 @@ bool UnBuffered<T>::Send(T* data) {
     ret = true;
   }
   writer_found_ = false;
+  send_ctr--;
+  cv_destructor_.notify_one();
   return ret;
 }
 
@@ -88,6 +93,12 @@ bool UnBuffered<T>::Send(T* data) {
 // data that was sent by a writer is read from a reader.
 template <typename T>
 bool UnBuffered<T>::Receive(T* data) {
+  bool ret = false;
+  // If channel is closed, we don't even want any reader to enter.
+  // Unlike a buffered channel, an unbuffered channel does not allow
+  // readers to read after closing because there is no buffer to be consumed.
+  if (closed_) return ret;
+  recv_ctr++;
   // Prevent other readers from entering
   std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
   reader_found_ = true;
@@ -96,7 +107,6 @@ bool UnBuffered<T>::Receive(T* data) {
   cv_reader_.wait(cv_lock,
                   [this]() { return writer_found_ == true || closed_; });
   cv_writer_.notify_one();
-  bool ret = false;
   if (!closed_) {
     std::unique_lock<std::mutex> lock_ch{mu_ch_};
     // Reader should wait for the writer to first write its data
@@ -110,6 +120,8 @@ bool UnBuffered<T>::Receive(T* data) {
     cv_channel_.notify_one();
   }
   reader_found_ = false;
+  recv_ctr--;
+  cv_destructor_.notify_one();
   return ret;
 }
 
@@ -135,6 +147,9 @@ UnBuffered<T>::~UnBuffered() {
   item = nullptr;
   closed_ = true;
   NotifyAllParticipants(&lock);
+  lock.lock();
+  cv_destructor_.wait(lock,
+                      [this]() { return send_ctr == 0 && recv_ctr == 0; });
 }
 
 // This function notifies all the readers, writers and
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 9a232b08434d299d10bb2acdb6e96295de875d56..2a88e5a92985fab7311c1edd266cb89f7d76d867 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/reader.h"
 #include "paddle/platform/place.h"
 #include "paddle/platform/profiler.h"
 
@@ -52,11 +53,13 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
     var->GetMutable<LoDTensorArray>();
   } else if (var_type == proto::VarDesc::PLACE_LIST) {
     var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarDesc::READER) {
+    var->GetMutable<ReaderHolder>();
   } else {
     PADDLE_THROW(
         "Variable type %d is not in "
-        "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE,"
-        " PLACE_LIST]",
+        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
+        "LOD_RANK_TABLE, PLACE_LIST, READER]",
         var_type);
   }
 }
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index f65ccae6e6a4df4490d49128e871efa55800b505..d7be1a7352da56e411396614e33919bb55bc3b0f 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -116,7 +116,7 @@ message LoDTensorArrayDesc {
   optional int32 lod_level = 2 [ default = 0 ];
 }
 
-message Reader { repeated LoDTensorDesc lod_tensor = 1; }
+message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
 
 message VarDesc {
   enum VarType {
@@ -136,7 +136,7 @@ message VarDesc {
   optional LoDTensorDesc lod_tensor = 4;
   optional TensorDesc selected_rows = 5;
   optional LoDTensorArrayDesc tensor_array = 6;
-  optional Reader reader = 7;
+  optional ReaderDesc reader = 7;
 }
 
 message BlockDesc {
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index d0ab640485baf6d76ee629ea420b603f42b031b4..be2b301619639106ac7b578e5a79cf33f4379e48 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -48,12 +48,26 @@ namespace framework {
  */
 struct LoD : public std::vector<Vector<size_t>> {
   using std::vector<Vector<size_t>>::vector;
+  platform::Place place() const {
+    if (this->size() == 0) {
+      // Not Initialze Yet.
+      return platform::CPUPlace();
+    } else {
+      return this->front().place();
+    }
+  }
 
   void CopyFromCUDA() {
     for (auto it = this->begin(); it != this->end(); ++it) {
       it->CopyFromCUDA();
     }
   }
+
+  void CopyToPeer(platform::Place place) {
+    for (auto it = this->begin(); it != this->end(); ++it) {
+      it->CopyToPeer(place);
+    }
+  }
 };
 
 std::ostream& operator<<(std::ostream& os, const LoD& lod);
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu
index d4c9f00bd9c00f3cae68858ca46c5320fc117405..adea02e3b3fdcf4873de76ff91116f43ac9fe259 100644
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -28,28 +28,6 @@ __global__ void test(size_t* a, int size) {
   }
 }
 
-TEST(Vector, Normal) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::memory;
-
-  paddle::framework::InitDevices();
-
-  paddle::framework::Vector<size_t> vec({1, 2, 3});
-  size_t* ptr = vec.data();
-  for (size_t i = 0; i < vec.size(); ++i) {
-    EXPECT_EQ(vec[i], *(ptr + i));
-  }
-
-  vec.clear();
-  vec.CopyFromCUDA();
-
-  std::vector<size_t> v = {1, 2, 3};
-  for (size_t i = 0; i < v.size(); ++i) {
-    EXPECT_EQ(v[i], vec[i]);
-  }
-}
-
 TEST(LoD, data) {
   paddle::framework::InitDevices();
 
diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h
index 85caac8dcd9ede4fe997e2fd246d1421aa73c80a..5202775515d335ff81bb17e6ce21338c40041ca3 100644
--- a/paddle/framework/mixed_vector.h
+++ b/paddle/framework/mixed_vector.h
@@ -40,26 +40,35 @@ class Vector : public std::vector<T> {
   Vector() {}
   Vector(const std::vector<T> &v) : std::vector<T>(v) {}  // NOLINT
 
-  virtual ~Vector() {
-#ifdef PADDLE_WITH_CUDA
-    if (cuda_ptr_ != nullptr) {
-      memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
-    }
-#endif
-  }
+  inline platform::Place place() const { return place_; }
+
+  /*! Return a pointer to constant memory block. */
+  inline const T *data(platform::Place place) const;
 
+  /*! Return a pointer to mutable memory block. */
+  inline T *mutable_data(platform::Place place);
+
+  // TODO(dzhwinter): below interfaces should be removed
   /* Get device vector */
   T *cuda_data() {
     CopyToCUDA();
     PADDLE_ENFORCE_NOT_NULL(
         cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
-    return static_cast<T *>(cuda_ptr_);
+    return static_cast<T *>(cuda_ptr_.get());
   }
 
   /* Get host vector */
   T *data() { return std::vector<T>::data(); }
   const T *data() const { return std::vector<T>::data(); }
 
+  T *data(const platform::Place &place) {
+    if (platform::is_cpu_place(place)) {
+      return data();
+    } else {
+      return cuda_data();
+    }
+  }
+
   /* Synchronize host vector to device vector */
   void CopyToCUDA();
   /* Synchronize device vector to host vector */
@@ -68,25 +77,73 @@ class Vector : public std::vector<T> {
   void CopyToPeer(platform::Place);
 
  private:
-  void *cuda_ptr_ = nullptr;
+  std::shared_ptr<void> cuda_ptr_;
   size_t cuda_size_ = 0;  // device vector numel
   platform::CUDAPlace place_;
 };
 
 template <typename T>
-void Vector<T>::CopyToCUDA() {
+inline const T *Vector<T>::data(platform::Place place) const {
+  if (platform::is_cpu_place(place)) {
+    return std::vector<T>::data();
+  } else if (platform::is_gpu_place(place)) {
+    if (cuda_ptr_ == nullptr) {
+      return nullptr;
+    }
+    if (boost::get<platform::CUDAPlace>(place) == place_) {
+      return static_cast<const T *>(cuda_ptr_.get());
+    } else {
+      PADDLE_THROW(
+          "Unmatched place. Please use `mutable_data` copy lod to the target "
+          "Place first.");
+    }
+  } else {
+    PADDLE_THROW("Unsupport Place.");
+  }
+}
+
+template <typename T>
+inline T *Vector<T>::mutable_data(platform::Place place) {
+  if (platform::is_cpu_place(place)) {
+    return std::vector<T>::data();
+  } else if (platform::is_gpu_place(place)) {
+    if (boost::get<platform::CUDAPlace>(place) != place_) {
+      place_ = boost::get<platform::CUDAPlace>(place);
+    }
 #ifdef PADDLE_WITH_CUDA
-  if (cuda_size_ < this->size()) {
-    if (cuda_ptr_ != nullptr) {
-      memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
+    if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+      cuda_ptr_.reset(
+          memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
+          memory::PlainDeleter<void, platform::CUDAPlace>(place_));
     }
-    cuda_ptr_ =
-        memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
+    cuda_size_ = this->size();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto *ctx = pool.GetByPlace(place_);
+    memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
+                 static_cast<const void *>(this->data()),
+                 this->size() * sizeof(T), ctx->stream());
+    ctx->Wait();
+    return static_cast<T *>(cuda_ptr_.get());
+#else
+    return nullptr;
+#endif
+  } else {
+    PADDLE_THROW("Unsupport Place.");
+  }
+}
+
+template <typename T>
+void Vector<T>::CopyToCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+    cuda_ptr_.reset(
+        memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
+        memory::PlainDeleter<void, platform::CUDAPlace>(place_));
   }
   cuda_size_ = this->size();
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto *ctx = pool.GetByPlace(place_);
-  memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
+  memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
                static_cast<const void *>(this->data()),
                this->size() * sizeof(T), ctx->stream());
   ctx->Wait();
@@ -104,32 +161,32 @@ void Vector<T>::CopyFromCUDA() {
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto *ctx = pool.GetByPlace(place_);
   memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
-               static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
-               ctx->stream());
+               static_cast<const void *>(cuda_ptr_.get()),
+               this->size() * sizeof(T), ctx->stream());
   ctx->Wait();
 #endif
 }
 
 template <typename T>
-void Vector<T>::CopyToPeer(platform::Place peer_place) {
+void Vector<T>::CopyToPeer(platform::Place place) {
 #ifdef PADDLE_WITH_CUDA
-  auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
-  void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(
-      boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
-  memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
-               place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
+  if (boost::get<platform::CUDAPlace>(place) != place_) {
+    place_ = boost::get<platform::CUDAPlace>(place);
+  }
+  if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+    cuda_ptr_.reset(
+        memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
+        memory::PlainDeleter<void, platform::CUDAPlace>(place_));
+  }
+  cuda_size_ = this->size();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *ctx = pool.GetByPlace(place_);
+  memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
+               static_cast<const void *>(this->data()),
+               this->size() * sizeof(T), ctx->stream());
   ctx->Wait();
-
-  memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
-  place_ = boost::get<platform::CUDAPlace>(peer_place);
-  cuda_ptr_ = peer_cuda_ptr;
 #endif
 }
 
-template class Vector<int>;
-template class Vector<unsigned>;
-template class Vector<size_t>;
-template class Vector<int64_t>;
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/mixed_vector_test.cu b/paddle/framework/mixed_vector_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7b571788ad1ade50e05dc9a70cba35b83f8db3ea
--- /dev/null
+++ b/paddle/framework/mixed_vector_test.cu
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+#include "paddle/framework/mixed_vector.h"
+
+using namespace paddle::framework;
+using namespace paddle::platform;
+using namespace paddle::memory;
+
+template <typename T>
+__global__ void test(T* data, int size) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
+       i += blockDim.x * gridDim.x) {
+    data[i] *= 2;
+  }
+}
+
+TEST(Vector, Normal) {
+  // fill the device context pool.
+  InitDevices();
+
+  Vector<size_t> vec({1, 2, 3});
+  size_t* ptr = vec.data();
+  for (size_t i = 0; i < vec.size(); ++i) {
+    EXPECT_EQ(vec[i], *(ptr + i));
+  }
+
+  vec.clear();
+  vec.CopyFromCUDA();
+
+  std::vector<size_t> v = {1, 2, 3};
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], vec[i]);
+  }
+}
+
+TEST(Vector, MultipleCopy) {
+  InitDevices();
+  Vector<size_t> vec({1, 2, 3});
+  CUDAPlace place(0);
+  vec.mutable_data(place);
+  auto vec2 = Vector<size_t>(vec);
+  {
+    const size_t* ptr = vec2.data(CPUPlace());
+    for (size_t i = 0; i < vec2.size(); ++i) {
+      EXPECT_EQ(*(ptr + i), vec[i]);
+    }
+  }
+  test<size_t><<<3, 3>>>(vec2.mutable_data(place), vec2.size());
+  vec2.CopyFromCUDA();
+  {
+    const size_t* ptr = vec2.data(CPUPlace());
+    for (size_t i = 0; i < vec2.size(); ++i) {
+      EXPECT_EQ(*(ptr + i), vec[i] * 2);
+    }
+  }
+}
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index ad361852ec9f2bb35b24209179f96b15300ca8d0..ea4028750248ec47f5094a67f736fb217216af6d 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -72,6 +72,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   void SetDim(const std::string &name, const DDim &dim) override;
 
+  std::vector<DDim> GetRepeatedDims(const std::string &name) const override;
+
+  void SetRepeatedDims(const std::string &name,
+                       const std::vector<DDim> &dims) override;
+
   const OpDesc &op_;
   const BlockDesc &block_;
 };
@@ -457,23 +462,48 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
 DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
   PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  DDim res;
   try {
     auto shape = var->GetShape();
-    if (shape.empty()) {
-      return framework::make_ddim({0UL});
-    } else {
-      return framework::make_ddim(var->GetShape());
-    }
+    res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
   } catch (...) {
     VLOG(5) << "GetDim of variable " << name << " error";
     std::rethrow_exception(std::current_exception());
   }
+  return res;
+}
+
+std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
+    const std::string &name) const {
+  auto var = block_.FindVarRecursive(name);
+  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  std::vector<DDim> res;
+  try {
+    auto shapes = var->GetShapes();
+    for (const auto &s : shapes) {
+      res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s));
+    }
+  } catch (...) {
+    VLOG(5) << "GetRepeatedDim of variable " << name << " error.";
+    std::rethrow_exception(std::current_exception());
+  }
+  return res;
 }
 
 void CompileTimeInferShapeContext::SetDim(const std::string &name,
                                           const DDim &dim) {
-  block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
+  block_.FindVarRecursive(name)->SetShape(vectorize(dim));
 }
+
+void CompileTimeInferShapeContext::SetRepeatedDims(
+    const std::string &name, const std::vector<DDim> &dims) {
+  auto var = block_.FindVarRecursive(name);
+  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  std::vector<std::vector<int64_t>> dim_vec(dims.size());
+  std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize);
+  var->SetShapes(dim_vec);
+}
+
 bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
 
 proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 81fa8cf477423fc2a54c719c9a743729215513c3..52387aabd9d0b41b13814499fb3f0107f42401e7 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -320,8 +320,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
     if (length == 0) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs",
-                      name);
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Input %s should not have more than one inputs", name);
     auto ipt = ins[0];
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
@@ -333,8 +333,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
     if (length == 0) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs",
-                      name);
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Output %s should not have more than one inputs", name);
     auto ipt = outs[0];
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
@@ -421,8 +421,22 @@ class RuntimeInferShapeContext : public InferShapeContext {
     } else if (var->IsType<SelectedRows>()) {
       return var->Get<SelectedRows>().GetCompleteDims();
     } else {
-      PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
-                   name, var->Type().name());
+      PADDLE_THROW(
+          "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
+          "type_id is %s.",
+          name, var->Type().name());
+    }
+  }
+
+  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<ReaderHolder>()) {
+      return var->Get<ReaderHolder>().shapes();
+    } else {
+      PADDLE_THROW(
+          "Only ReaderHolder support 'GetRepeatedDims', but Variable %s's "
+          "type_id is %s.",
+          name, var->Type().name());
     }
   }
 
@@ -438,6 +452,19 @@ class RuntimeInferShapeContext : public InferShapeContext {
     }
   }
 
+  void SetRepeatedDims(const std::string& name,
+                       const std::vector<DDim>& dims) override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<ReaderHolder>()) {
+      var->GetMutable<ReaderHolder>()->set_shapes(dims);
+    } else {
+      PADDLE_THROW(
+          "Only ReaderHolder support 'SetRepeatedDims', but Variable %s's "
+          "type_id is %s.",
+          name, var->Type().name());
+    }
+  }
+
   proto::VarDesc::VarType GetVarType(const std::string& name) const override {
     auto* var = scope_.FindVar(name);
     return ToVarType(var->Type());
diff --git a/paddle/framework/reader.cc b/paddle/framework/reader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..928b661aaadb4a59294de24cc1c414795c2878d5
--- /dev/null
+++ b/paddle/framework/reader.cc
@@ -0,0 +1,122 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/reader.h"
+
+namespace paddle {
+namespace framework {
+
+DDim ReaderBase::shape(size_t idx) const {
+  PADDLE_ENFORCE_LT(
+      idx, shapes_.size(),
+      "Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx,
+      shapes_.size());
+  return shapes_[idx];
+}
+
+void ShuffleReader::ReadNext(std::vector<LoDTensor>* out) {
+  if (iteration_pos_ >= buffer_.size()) {
+    // Reload buffer with new data
+    buffer_.clear();
+    buffer_.reserve(buffer_size_);
+    for (int i = 0; i < buffer_size_; ++i) {
+      if (reader_->HasNext()) {
+        buffer_.push_back(std::vector<LoDTensor>());
+        reader_->ReadNext(&buffer_.back());
+      } else {
+        break;
+      }
+    }
+    // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
+    // optimize.
+    std::random_shuffle(buffer_.begin(), buffer_.end());
+    iteration_pos_ = 0;
+  }
+  out->clear();
+  if (!buffer_.empty()) {
+    std::swap(*out, buffer_[iteration_pos_++]);
+  }
+  // if buffer_ is empty, the 'out' will return as an empty vector.
+}
+
+void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
+  buffer_.clear();
+  buffer_.reserve(batch_size_);
+  for (int i = 0; i < batch_size_; ++i) {
+    if (reader_->HasNext()) {
+      buffer_.push_back(std::vector<LoDTensor>());
+      reader_->ReadNext(&buffer_.back());
+    } else {
+      break;
+    }
+  }
+  // Concat instances
+  out->clear();
+  if (buffer_.empty()) {
+    // if buffer_ is empty, the 'out' will return as an empty vector.
+    return;
+  }
+  int out_num = buffer_[0].size();
+  out->reserve(out_num);
+  for (int j = 0; j < out_num; ++j) {
+    // Merge shape and check date type
+    std::type_index batch_type = buffer_[0][j].type();
+    DDim batch_shape = buffer_[0][j].dims();
+    for (size_t i = 1; i < buffer_.size(); ++i) {
+      std::type_index ins_type = buffer_[i][j].type();
+      DDim ins_shape = buffer_[i][j].dims();
+      PADDLE_ENFORCE_EQ(batch_type, ins_type);
+      PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
+                        slice_ddim(ins_shape, 1, ins_shape.size()));
+      PADDLE_ENFORCE_GT(ins_shape[0], 0);
+      batch_shape[0] += ins_shape[0];
+    }
+
+    LoDTensor out_tensor;
+    out_tensor.Resize(batch_shape);
+    out_tensor.mutable_data(platform::CPUPlace(), batch_type);
+    int64_t dst_offset = 0;
+
+    // Merge lod and data
+    LoD batch_lod;
+    std::vector<size_t> top_level_lod({0});
+    for (size_t i = 0; i < buffer_.size(); ++i) {
+      DDim ins_shape = buffer_[i][j].dims();
+      LoD ins_lod = buffer_[i][j].lod();
+      if (i == 0) {
+        batch_lod = ins_lod;
+      } else {
+        PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
+        for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
+          auto& lod_level = batch_lod[level_idx];
+          for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
+            lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
+          }
+        }
+      }
+      top_level_lod.push_back(
+          top_level_lod.back() +
+          (ins_lod.empty() ? ins_shape[0] : (ins_lod[0].size() - 1)));
+
+      Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
+      Copy(buffer_[i][j], platform::CPUPlace(), &dst);
+      dst_offset += ins_shape[0];
+    }
+    batch_lod.insert(batch_lod.begin(), top_level_lod);
+    out_tensor.set_lod(batch_lod);
+    out->push_back(out_tensor);
+  }
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/reader.h b/paddle/framework/reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..534894cfbd66687fc982f1def4cb0d05d77a4583
--- /dev/null
+++ b/paddle/framework/reader.h
@@ -0,0 +1,161 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/lod_tensor_array.h"
+
+namespace paddle {
+namespace framework {
+
+class ReaderBase {
+ public:
+  explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
+    PADDLE_ENFORCE(!shapes_.empty());
+  }
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  virtual bool HasNext() const = 0;
+
+  virtual void ReInit() = 0;
+
+  DDim shape(size_t idx) const;
+  std::vector<DDim> shapes() const { return shapes_; }
+  void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
+
+  virtual ~ReaderBase() {}
+
+ protected:
+  std::vector<DDim> shapes_;
+};
+
+class FileReader : public ReaderBase {
+ public:
+  explicit FileReader(const std::vector<DDim>& shapes) : ReaderBase(shapes) {}
+};
+
+class DecoratedReader : public ReaderBase {
+ public:
+  explicit DecoratedReader(ReaderBase* reader)
+      : ReaderBase(reader->shapes()), reader_(reader) {
+    PADDLE_ENFORCE_NOT_NULL(reader_);
+  }
+
+  bool HasNext() const override { return reader_->HasNext(); }
+
+  void ReInit() override { reader_->ReInit(); }
+
+ protected:
+  ReaderBase* reader_;
+};
+
+// file readers
+
+template <typename T>
+class RandomDataGenerator : public FileReader {
+ public:
+  RandomDataGenerator(const std::vector<DDim>& shapes, float min, float max)
+      : FileReader(shapes), min_(min), max_(max) {
+    PADDLE_ENFORCE_LE(
+        min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
+    unsigned int seed = std::random_device()();
+    engine_.seed(seed);
+    dist_ = std::uniform_real_distribution<float>(min_, max_);
+  }
+
+  void ReadNext(std::vector<LoDTensor>* out) override {
+    out->clear();
+    out->reserve(shapes_.size());
+    for (const DDim& shape : shapes_) {
+      PADDLE_ENFORCE_GE(
+          shape.size(), 2,
+          "The rank of reader's output data should be 2 at least.(Now it's %d)",
+          shape.size());
+      LoDTensor out_tensor;
+      out_tensor.Resize(shape);
+      T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
+      int64_t numel = product(shape);
+      for (int64_t i = 0; i < numel; ++i) {
+        data[i] = dist_(engine_);
+      }
+      out->push_back(out_tensor);
+    }
+  }
+
+  bool HasNext() const override { return true; }
+
+  void ReInit() override { return; }
+
+ private:
+  float min_;
+  float max_;
+  std::minstd_rand engine_;
+  std::uniform_real_distribution<float> dist_;
+};
+
+// decorated readers
+
+class ShuffleReader : public DecoratedReader {
+ public:
+  ShuffleReader(ReaderBase* reader, int buffer_size)
+      : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
+    buffer_.reserve(buffer_size);
+  }
+
+  void ReadNext(std::vector<LoDTensor>* out) override;
+
+ private:
+  int buffer_size_;
+  std::vector<std::vector<LoDTensor>> buffer_;
+  size_t iteration_pos_;
+};
+
+class BatchReader : public DecoratedReader {
+ public:
+  BatchReader(ReaderBase* reader, int batch_size)
+      : DecoratedReader(reader), batch_size_(batch_size) {
+    buffer_.reserve(batch_size_);
+  }
+
+  void ReadNext(std::vector<LoDTensor>* out) override;
+
+ private:
+  int batch_size_;
+  std::vector<std::vector<LoDTensor>> buffer_;
+};
+
+// The ReaderHolder is used as readers' unified wrapper,
+// making it easier to access different type readers in Variables.
+class ReaderHolder {
+ public:
+  void Reset(ReaderBase* reader) { reader_.reset(reader); }
+
+  ReaderBase* Get() const { return reader_.get(); }
+
+  void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
+  bool HasNext() const { return reader_->HasNext(); }
+  void ReInit() { reader_->ReInit(); }
+
+  DDim shape(size_t idx) const { return reader_->shape(idx); }
+  std::vector<DDim> shapes() const { return reader_->shapes(); }
+  void set_shapes(const std::vector<DDim>& shapes) {
+    reader_->set_shapes(shapes);
+  }
+
+ private:
+  std::unique_ptr<ReaderBase> reader_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc
index a0fa467291bb42c59b65f5efeabe9c2235e15b2a..2f4d45057715d2c6f26bca74d1d691207b528207 100644
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -32,6 +32,16 @@ std::vector<DDim> InferShapeContext::GetInputsDim(
   return GetDims(arg_names);
 }
 
+std::vector<DDim> InferShapeContext::GetReaderDims(
+    const std::string &name) const {
+  const std::vector<std::string> &arg_names = Inputs(name);
+  PADDLE_ENFORCE_EQ(
+      arg_names.size(), 1UL,
+      "Reader input '%s' should hold one element, but now it holds %d", name,
+      arg_names.size());
+  return this->GetRepeatedDims(arg_names[0]);
+}
+
 DDim InferShapeContext::GetInputsElementDim(const std::string &name,
                                             int idx) const {
   const std::vector<std::string> &names = Inputs(name);
@@ -52,6 +62,16 @@ void InferShapeContext::SetOutputsDim(const std::string &name,
   SetDims(names, dims);
 }
 
+void InferShapeContext::SetReaderDims(const std::string &name,
+                                      const std::vector<DDim> &dims) {
+  const std::vector<std::string> &arg_names = Outputs(name);
+  PADDLE_ENFORCE_EQ(
+      arg_names.size(), 1UL,
+      "Reader output '%s' should hold one element, but now it holds %d", name,
+      arg_names.size());
+  return this->SetRepeatedDims(arg_names[0], dims);
+}
+
 std::vector<DDim> InferShapeContext::GetDims(
     const std::vector<std::string> &names) const {
   std::vector<DDim> ret;
@@ -61,6 +81,7 @@ std::vector<DDim> InferShapeContext::GetDims(
       [this](const std::string &name) { return this->GetDim(name); });
   return ret;
 }
+
 void InferShapeContext::SetDims(const std::vector<std::string> &names,
                                 const std::vector<DDim> &dims) {
   size_t length = names.size();
@@ -72,14 +93,17 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
     SetDim(names[i], dims[i]);
   }
 }
+
 std::vector<proto::VarDesc::VarType> InferShapeContext::GetInputsVarType(
     const std::string &name) const {
   return GetVarTypes(Inputs(name));
 }
+
 std::vector<proto::VarDesc::VarType> InferShapeContext::GetOutputsVarType(
     const std::string &name) const {
   return GetVarTypes(Outputs(name));
 }
+
 std::vector<proto::VarDesc::VarType> InferShapeContext::GetVarTypes(
     const std::vector<std::string> &names) const {
   std::vector<proto::VarDesc::VarType> retv;
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index 830f199ed1451538f12fc8dd34fb7b2bfc356a71..7bee86985239de73fca9aef1faefc04f7615f3ce 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -36,12 +36,13 @@ class InferShapeContext {
   virtual bool HasOutputs(const std::string &name) const = 0;
 
   DDim GetInputDim(const std::string &name) const;
-
   std::vector<DDim> GetInputsDim(const std::string &name) const;
+  std::vector<DDim> GetReaderDims(const std::string &name) const;
   DDim GetInputsElementDim(const std::string &name, int idx) const;
 
   void SetOutputDim(const std::string &name, const DDim &dim);
   void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
+  void SetReaderDims(const std::string &name, const std::vector<DDim> &dims);
 
   virtual AttrReader Attrs() const = 0;
   virtual const std::vector<std::string> &Inputs(
@@ -61,6 +62,9 @@ class InferShapeContext {
  protected:
   virtual DDim GetDim(const std::string &name) const = 0;
   virtual void SetDim(const std::string &name, const DDim &dim) = 0;
+  virtual std::vector<DDim> GetRepeatedDims(const std::string &name) const = 0;
+  virtual void SetRepeatedDims(const std::string &name,
+                               const std::vector<DDim> &dims) = 0;
 
   std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
   std::vector<proto::VarDesc::VarType> GetVarTypes(
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 6d83e2e41126db0b3ac6fc1c86c9ed7fc4dfb39b..11a4daf2c991fc85a65c242403a0c83d06c4c44c 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -57,10 +57,13 @@ size_t VarDesc::GetTensorDescNum() const {
 
 void VarDesc::SetShapes(
     const std::vector<std::vector<int64_t>> &multiple_dims) {
-  PADDLE_ENFORCE_EQ(multiple_dims.size(), GetTensorDescNum(),
-                    "The number of given shapes(%d) doesn't equal to the "
-                    "number of sub tensor.",
-                    multiple_dims.size(), GetTensorDescNum());
+  if (multiple_dims.size() != GetTensorDescNum()) {
+    VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size()
+            << ") doesn't match the existing tensor number("
+            << GetTensorDescNum()
+            << "). The Reader is going to be reinitialized.";
+    SetTensorDescNum(multiple_dims.size());
+  }
   std::vector<proto::TensorDesc *> tensors = mutable_tensor_descs();
   for (size_t i = 0; i < multiple_dims.size(); ++i) {
     VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims());
@@ -87,10 +90,14 @@ void VarDesc::SetDataType(proto::DataType data_type) {
 
 void VarDesc::SetDataTypes(
     const std::vector<proto::DataType> &multiple_data_type) {
-  PADDLE_ENFORCE_EQ(multiple_data_type.size(), GetTensorDescNum(),
-                    "The number of given data types(%d) doesn't equal to the "
-                    "number of sub tensor.",
-                    multiple_data_type.size(), GetTensorDescNum());
+  if (multiple_data_type.size() != GetTensorDescNum()) {
+    VLOG(3) << "WARNING: The number of given data types("
+            << multiple_data_type.size()
+            << ") doesn't match the existing tensor number("
+            << GetTensorDescNum()
+            << "). The Reader is going to be reinitialized.";
+    SetTensorDescNum(multiple_data_type.size());
+  }
   std::vector<proto::TensorDesc *> tensor_descs = mutable_tensor_descs();
   for (size_t i = 0; i < multiple_data_type.size(); ++i) {
     tensor_descs[i]->set_data_type(multiple_data_type[i]);
@@ -127,10 +134,14 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
 }
 
 void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
-  PADDLE_ENFORCE_EQ(multiple_lod_level.size(), GetTensorDescNum(),
-                    "The number of given data types(%d) doesn't equal to the "
-                    "number of sub tensor.",
-                    multiple_lod_level.size(), GetTensorDescNum());
+  if (multiple_lod_level.size() != GetTensorDescNum()) {
+    VLOG(3) << "WARNING: The number of given lod_levels("
+            << multiple_lod_level.size()
+            << ") doesn't match the existing tensor number("
+            << GetTensorDescNum()
+            << "). The Reader is going to be reinitialized.";
+    SetTensorDescNum(multiple_lod_level.size());
+  }
   switch (desc_.type()) {
     case proto::VarDesc::READER: {
       size_t i = 0;
diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h
index 5b7a08a08732a6ccbc206f6a4f0aa4788ce4a219..599d45149024ca0fb395c2a1c6deeb7d8cd5eb17 100644
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/reader.h"
 #include "paddle/framework/selected_rows.h"
 #include "paddle/framework/variable.h"
 
@@ -31,6 +32,8 @@ inline proto::VarDesc::VarType ToVarType(std::type_index type) {
     return proto::VarDesc_VarType_LOD_TENSOR_ARRAY;
   } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
     return proto::VarDesc_VarType_SELECTED_ROWS;
+  } else if (type.hash_code() == typeid(ReaderHolder).hash_code()) {
+    return proto::VarDesc_VarType_READER;
   } else {
     PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
   }
@@ -40,7 +43,7 @@ template <typename Visitor>
 inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
   switch (ToVarType(var.Type())) {
     case proto::VarDesc_VarType_LOD_TENSOR:
-      visitor(var.Get<framework::LoDTensor>());
+      visitor(var.Get<LoDTensor>());
       return;
     case proto::VarDesc_VarType_LOD_RANK_TABLE:
       visitor(var.Get<LoDRankTable>());
@@ -51,6 +54,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
     case proto::VarDesc_VarType_SELECTED_ROWS:
       visitor(var.Get<SelectedRows>());
       return;
+    case proto::VarDesc_VarType_READER:
+      visitor(var.Get<ReaderHolder>());
+      return;
     default:
       PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
   }
diff --git a/paddle/gserver/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp
index c6e07650fc4805a25baf38b9059f6c996d00cafc..2495d8b60a56713ba554156d2d9b25e4f6a567d7 100644
--- a/paddle/gserver/tests/test_CompareSparse.cpp
+++ b/paddle/gserver/tests/test_CompareSparse.cpp
@@ -212,6 +212,10 @@ TEST(compareSparse, NeuralNetwork) {
 }
 
 int main(int argc, char** argv) {
+  // FIXME(tonyyang-svail):
+  //   Turn off this test due CI failure:
+  //   https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430
+  return 0;
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
   initPython(argc, argv);
diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt
index 2289ddc139cbddfbaa5238e683b2f8e784a7291e..654a6119bdc85f43b0cae631a9dc8f0ccd758889 100644
--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
@@ -13,17 +13,11 @@ add_library(paddle_fluid_shared SHARED io.cc)
 target_circle_link_libraries(paddle_fluid_shared
   ARCHIVE_START
   ${GLOB_OP_LIB}
-  ARCHIVE_END
-  ${FLUID_CORE_MODULES})
+  ${FLUID_CORE_MODULES}
+  ARCHIVE_END)
 
 SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 
-# install library & headers
-if(NOT WITH_C_API AND WITH_FLUID)
-  install(FILES io.h DESTINATION include/paddle/inference)
-  install(TARGETS paddle_fluid_shared DESTINATION lib)
-endif()
-
 if(WITH_TESTING)
   add_subdirectory(tests/book)
 endif()
diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt
index 8f48b2f0e02b4c9a3c42aa9768855192ebf0b966..63afeb18aebdf446c01cd4fdac13d238467801e4 100644
--- a/paddle/inference/tests/book/CMakeLists.txt
+++ b/paddle/inference/tests/book/CMakeLists.txt
@@ -1,25 +1,29 @@
-set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
-cc_test(test_inference_recognize_digits_mlp
-    SRCS test_inference_recognize_digits.cc
-    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
-    ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
-cc_test(test_inference_image_classification_vgg
-    SRCS test_inference_image_classification.cc
-    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
-    ARGS --dirname=${PYTHON_TESTS_DIR}/book/image_classification_vgg.inference.model)
-cc_test(test_inference_image_classification_resnet
-    SRCS test_inference_image_classification.cc
-    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
-    ARGS --dirname=${PYTHON_TESTS_DIR}/book/image_classification_resnet.inference.model)
-cc_test(test_inference_label_semantic_roles
-    SRCS test_inference_label_semantic_roles.cc
-    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
-    ARGS --dirname=${PYTHON_TESTS_DIR}/book/label_semantic_roles.inference.model)
-set_tests_properties(test_inference_recognize_digits_mlp
-    PROPERTIES DEPENDS test_recognize_digits)
-set_tests_properties(test_inference_image_classification_vgg
-    PROPERTIES DEPENDS test_image_classification_train)
-set_tests_properties(test_inference_image_classification_resnet
-    PROPERTIES DEPENDS test_image_classification_train)
-set_tests_properties(test_inference_label_semantic_roles
-    PROPERTIES DEPENDS test_label_semantic_roles) 
+function(inference_test TARGET_NAME)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs ARGS)
+  cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
+  if(inference_test_ARGS)
+    foreach(arg ${inference_test_ARGS})
+      cc_test(test_inference_${TARGET_NAME}_${arg}
+          SRCS test_inference_${TARGET_NAME}.cc
+          DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+          ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}_${arg}.inference.model)
+      set_tests_properties(test_inference_${TARGET_NAME}_${arg}
+          PROPERTIES DEPENDS test_${TARGET_NAME})
+    endforeach()
+  else()
+    cc_test(test_inference_${TARGET_NAME}
+        SRCS test_inference_${TARGET_NAME}.cc
+        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+        ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}.inference.model)
+    set_tests_properties(test_inference_${TARGET_NAME}
+        PROPERTIES DEPENDS test_${TARGET_NAME})
+  endif()
+endfunction(inference_test)
+
+inference_test(recognize_digits ARGS mlp)
+inference_test(image_classification ARGS vgg resnet)
+inference_test(label_semantic_roles)
diff --git a/paddle/inference/tests/book/test_helper.h b/paddle/inference/tests/book/test_helper.h
index 17c3d58de6ab57c437096a25613d834d56f418c7..32db643fca2b026b674ea0b1ecd9aad5224e9e68 100644
--- a/paddle/inference/tests/book/test_helper.h
+++ b/paddle/inference/tests/book/test_helper.h
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <time.h>
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/inference/io.h"
 
diff --git a/paddle/inference/tests/book/test_inference_image_classification.cc b/paddle/inference/tests/book/test_inference_image_classification.cc
index e01f5b312a097ce3d7b20ce74e3803c79d942e51..35ff9431e9734bc3d20e1281f9d5d7f3e98f7524 100644
--- a/paddle/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/inference/tests/book/test_inference_image_classification.cc
@@ -13,51 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <time.h>
-#include <sstream>
 #include "gflags/gflags.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/inference/io.h"
+#include "test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 
-template <typename Place, typename T>
-void TestInference(const std::string& dirname,
-                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
-  // 1. Define place, executor and scope
-  auto place = Place();
-  auto executor = paddle::framework::Executor(place);
-  auto* scope = new paddle::framework::Scope();
-
-  // 2. Initialize the inference_program and load all parameters from file
-  auto inference_program = paddle::inference::Load(executor, *scope, dirname);
-
-  // 3. Get the feed_target_names and fetch_target_names
-  const std::vector<std::string>& feed_target_names =
-      inference_program->GetFeedTargetNames();
-  const std::vector<std::string>& fetch_target_names =
-      inference_program->GetFetchTargetNames();
-
-  // 4. Prepare inputs: set up maps for feed targets
-  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
-  for (size_t i = 0; i < feed_target_names.size(); ++i) {
-    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
-    feed_targets[feed_target_names[i]] = cpu_feeds[i];
-  }
-
-  // 5. Define Tensor to get the outputs: set up maps for fetch targets
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
-    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
-  }
-
-  // 6. Run the inference program
-  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
-
-  delete scope;
-}
-
 TEST(inference, image_classification) {
   if (FLAGS_dirname.empty()) {
     LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
@@ -70,12 +30,10 @@ TEST(inference, image_classification) {
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
   paddle::framework::LoDTensor input;
-  srand(time(0));
-  float* input_ptr =
-      input.mutable_data<float>({1, 3, 32, 32}, paddle::platform::CPUPlace());
-  for (int i = 0; i < 3072; ++i) {
-    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
-  }
+  // Use normilized image pixels as input data,
+  // which should be in the range [0.0, 1.0].
+  SetupTensor<float>(
+      input, {1, 3, 32, 32}, static_cast<float>(0), static_cast<float>(1));
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
@@ -98,16 +56,6 @@ TEST(inference, image_classification) {
       dirname, cpu_feeds, cpu_fetchs2);
   LOG(INFO) << output2.dims();
 
-  EXPECT_EQ(output1.dims(), output2.dims());
-  EXPECT_EQ(output1.numel(), output2.numel());
-
-  float err = 1E-3;
-  int count = 0;
-  for (int64_t i = 0; i < output1.numel(); ++i) {
-    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+  CheckError<float>(output1, output2);
 #endif
 }
diff --git a/paddle/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/inference/tests/book/test_inference_label_semantic_roles.cc
index c5646db2a77571c470e51a4ee74ad55cc0aeb9cd..1eaf4022a1f27235fdd07e77e294eaba37a14249 100644
--- a/paddle/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/inference/tests/book/test_inference_label_semantic_roles.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <time.h>
-#include <sstream>
 #include "gflags/gflags.h"
 #include "test_helper.h"
 
diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc
index 2c0cf941001c793021d4b59a3e968433bd9de98b..48f887e6bc680087af4cce74b5c5422a4eba3726 100644
--- a/paddle/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <time.h>
-#include <sstream>
 #include "gflags/gflags.h"
 #include "test_helper.h"
 
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 496098f80423854be62dc99b8601209ff6a6b182..1a61c484823b292234d4758cdc1959d7a21510e6 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -14,10 +14,3 @@ cc_library(paddle_memory
     system_allocator)
 
 cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
-
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB MEMORY_HEADERS *.h)
-  file(GLOB MEMORY_DETAIL_HEADERS detail/*.h)
-  install(FILES ${MEMORY_HEADERS} DESTINATION include/paddle/memory)
-  install(FILES ${MEMORY_DETAIL_HEADERS} DESTINATION include/paddle/memory/detail)
-endif()
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 7012b6d331d0c4631a3d120fbaf3db7c97298ac7..30ed68c6e0ea95d206658d16684800e169ededc5 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -81,5 +81,23 @@ class PODDeleter {
   Place place_;
 };
 
+/**
+ * \brief   Free memory block in one place does not meet POD
+ *
+ * \note    In some cases, custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
+class PlainDeleter {
+ public:
+  explicit PlainDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 000c2089c176adf8d845a56a1f98528734f47ea1..25bb7187d36c5f696890ef72d4cb91bce94fddf8 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -62,7 +62,7 @@ function(op_library TARGET)
     endif()
 
     # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
@@ -155,6 +155,7 @@ op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
+op_library(create_reader_op DEPS reader)
 
 # Regist multiple Kernel to pybind
 if (WITH_GPU)
@@ -185,7 +186,7 @@ list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
     op_library(${src})
 endforeach()
-file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
+file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n")
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
diff --git a/paddle/operators/create_reader_op.cc b/paddle/operators/create_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ba2a25ab4c679f638e928a9e04c20d683a93630
--- /dev/null
+++ b/paddle/operators/create_reader_op.cc
@@ -0,0 +1,205 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+
+static std::vector<framework::DDim> RestoreShapes(
+    const std::vector<int>& shape_concat, const std::vector<int>& ranks) {
+  std::vector<framework::DDim> res;
+  int offset = 0;
+  for (int len : ranks) {
+    auto start_it = shape_concat.begin() + offset;
+    auto end_it = start_it + len;
+    res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
+    offset += len;
+  }
+  return res;
+}
+
+// general infershape for file readers
+class CreateFileReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "The output file reader should not be null.");
+    const auto shape_concat =
+        ctx->Attrs().Get<std::vector<int>>("shape_concat");
+    const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
+    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+    ctx->SetReaderDims("Out", shapes);
+  }
+};
+
+// general infershape for decorated readers
+class CreateDecoratedReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
+                   "Input(UnderlyingReader) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "The output decorated reader should not be null.");
+    ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
+  }
+};
+
+// general var type inference for all readers
+class CreateReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    std::string reader_name = op_desc.Output("Out")[0];
+    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+    reader->SetType(framework::proto::VarDesc::READER);
+  }
+};
+
+template <typename T>
+class CreateRandomDataGeneratorOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
+    const auto& ranks = Attr<std::vector<int>>("ranks");
+    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
+    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
+                      int(shape_concat.size()),
+                      "The accumulate of all ranks should be equal to the "
+                      "shape concat's length.");
+    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new framework::RandomDataGenerator<T>(shapes, Attr<float>("min"),
+                                                     Attr<float>("max")));
+  }
+};
+
+class CreateRandomDataGeneratorOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddOutput("Out", "(ReaderHolder) The created random reader.");
+    AddAttr<std::vector<int>>("shape_concat",
+                              "The concat of all data's shapes.");
+    AddAttr<std::vector<int>>(
+        "ranks",
+        "The ranks of each data."
+        "e.g."
+        "shape_concat = [2,3,4,5,6]"
+        "ranks = [3,2]"
+        "It means the reader will generate two data each time,"
+        "whose shapes are [2,3,4] and [5,6] respectively.");
+    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
+    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
+    AddComment(R"DOC(
+      CreateRandomDataGenerator Operator
+
+      This Op creates a random reader. 
+      The reader generates random data instead of really reading from files.
+      Generated data follow an uniform distribution between 'min' and 'max'.
+    )DOC");
+  }
+};
+
+class CreateShuffleReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new framework::ShuffleReader(underlying_reader.Get(),
+                                            Attr<int>("buffer_size")));
+  }
+};
+
+class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddInput(
+        "UnderlyingReader",
+        "(ReaderHolder) The underlying reader for creating a shuffle reader.");
+    AddOutput("Out", "(ReaderHolder) The created shuffle reader.");
+    AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
+    AddComment(R"DOC(
+      CreateShuffleReader Operator
+
+      A shuffle reader takes another reader as its 'underlying reader'
+      and yields the underlying reader's outputs in a shuffled order. 
+    )DOC");
+  }
+};
+
+class CreateBatchReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new framework::BatchReader(underlying_reader.Get(),
+                                          Attr<int>("batch_size")));
+  }
+};
+
+class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddInput(
+        "UnderlyingReader",
+        "(ReaderHolder) The underlying reader for creating a batch reader.");
+    AddOutput("Out", "(ReaderHolder) The created batch reader.");
+    AddAttr<int>("batch_size",
+                 "How many instances the batch reader yields each time.")
+        .GreaterThan(0);
+    AddComment(R"DOC(
+      CreateBatchReader Operator
+
+      A batch reader takes another reader as its 'underlying reader', 
+      gathers the underlying reader's outputs and then yields them in batches. 
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(create_random_data_generator,
+                  ops::CreateRandomDataGeneratorOp<float>,
+                  ops::CreateFileReaderInferShape,
+                  ops::CreateRandomDataGeneratorOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CreateReaderInferVarType);
+REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp,
+                  ops::CreateDecoratedReaderInferShape,
+                  ops::CreateShuffleReaderOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CreateReaderInferVarType);
+REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp,
+                  ops::CreateDecoratedReaderInferShape,
+                  ops::CreateBatchReaderOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CreateReaderInferVarType);
diff --git a/paddle/operators/ctc_align_op.cu b/paddle/operators/ctc_align_op.cu
index 2a970cd9fa965b4126356eaa1519068f9c7a7f34..cea595d7c5d461b40198e622abf08248e7ca69e1 100644
--- a/paddle/operators/ctc_align_op.cu
+++ b/paddle/operators/ctc_align_op.cu
@@ -80,6 +80,14 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
 
     // resize output dims
     output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
+
+    if (host_out_lod0.back() == 0) {
+      output->Resize({1, 1});
+      output->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<platform::CUDADeviceContext, T> set_constant;
+      set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
+                   output, -1);
+    }
   }
 };
 
diff --git a/paddle/operators/ctc_align_op.h b/paddle/operators/ctc_align_op.h
index fed89aa1e899a2450b315f352b9695056ed13aec..54ad1d6f5cc96c884c9e0c101c44d8d629792f8f 100644
--- a/paddle/operators/ctc_align_op.h
+++ b/paddle/operators/ctc_align_op.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #include <string.h>
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
 namespace paddle {
 namespace operators {
 
@@ -65,9 +67,14 @@ class CTCAlignKernel : public framework::OpKernel<T> {
     framework::LoD output_lod;
     output_lod.push_back(output_lod0);
     output->set_lod(output_lod);
-
     // resize output dims
     output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
+    // for empty sequence
+    if (output_lod0.back() == 0) {
+      output->Resize({1, 1});
+      output_data = output->mutable_data<T>(ctx.GetPlace());
+      output_data[0] = -1;
+    }
   }
 };
 
diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc
index 072e4eb2eff1f6f3d8745ac8e16709b8e1a69725..827a62534778e48c8d4f03d2634056b7d1392ae8 100644
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -287,6 +287,9 @@ TEST_F(NCCLTester, ncclBcastOp) {
 }
 
 int main(int argc, char **argv) {
+  // FIXME(tonyyang-svail):
+  //   Due to the driver issue on our CI, disable for now
+  return 0;
   const int dev_count = p::GetCUDADeviceCount();
   if (dev_count <= 1) {
     LOG(WARNING)
diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc
index 67f9854c02fa92d0141463088915e720733306fb..89045923f9ff2f33bc112b199c493047440e15c4 100644
--- a/paddle/operators/parallel_do_op.cc
+++ b/paddle/operators/parallel_do_op.cc
@@ -76,18 +76,25 @@ inline void CopyOrShare(const framework::Variable &src,
   if (src.IsType<LoDTensor>()) {
     if (src.Get<LoDTensor>().place() == dst_place) {
       dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
+      dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
     } else {
       Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
+      framework::LoD lod(src.Get<LoDTensor>().lod());
+      lod.CopyToPeer(dst_place);
+      dst->GetMutable<LoDTensor>()->set_lod(lod);
     }
   } else if (src.IsType<SelectedRows>()) {
     auto &src_sr = src.Get<SelectedRows>();
     auto *dst_sr = dst->GetMutable<SelectedRows>();
-    dst_sr->set_rows(src_sr.rows());
     dst_sr->set_height(src_sr.height());
     if (src_sr.value().place() == dst_place) {
       dst_sr->mutable_value()->ShareDataWith(src_sr.value());
+      dst_sr->set_rows(src_sr.rows());
     } else {
       Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
+      framework::Vector<int64_t> lod(src_sr.rows());
+      lod.CopyToPeer(dst_place);
+      dst_sr->set_rows(lod);
     }
   } else {
     PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
@@ -145,6 +152,9 @@ class ParallelDoOp : public framework::OperatorBase {
         auto *sub_scope = sub_scopes[i];
         auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
         framework::Copy(src, place, dst);
+        framework::LoD lod(src.lod());
+        lod.CopyToPeer(place);
+        dst->set_lod(lod);
       }
     }
     WaitOnPlaces(places);
@@ -248,17 +258,19 @@ class ParallelDoGradOp : public framework::OperatorBase {
                       const std::vector<framework::Scope *> &sub_scopes,
                       const platform::PlaceList &places) const {
     for (auto &s : Outputs(framework::GradVarName(kParameters))) {
+      VLOG(3) << "Accumulating " << s;
+      if (s == framework::kEmptyVarName) continue;
       std::string tmp_name;
       auto *tmp = sub_scopes[0]->Var(&tmp_name);
 
       for (size_t i = 1; i < sub_scopes.size(); ++i) {
         CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
-        WaitOnPlace(places[0]);
+        WaitOnPlaces(places);
 
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
             framework::AttributeMap{});
-        VLOG(3) << sum_op->DebugStringEx(sub_scopes[0]);
+        VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
         sum_op->Run(*sub_scopes[0], places[0]);
         WaitOnPlace(places[0]);
       }
@@ -334,16 +346,9 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
 class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {
-    std::vector<std::string> input{kParameters, kInputs};
-    std::vector<std::string> output{kOutputs};
-
     PADDLE_ENFORCE(ctx->HasInputs(kParameters));
-    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
     PADDLE_ENFORCE(ctx->HasInputs(kInputs));
-
-    for (auto &s : output) {
-      PADDLE_ENFORCE(ctx->HasInputs(s));
-    }
+    PADDLE_ENFORCE(ctx->HasInputs(kOutputs));
 
     ctx->SetOutputsDim(framework::GradVarName(kParameters),
                        ctx->GetInputsDim(kParameters));
@@ -360,10 +365,14 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
       ctx->SetDims({ig_name}, {i_dims[i]});
     }
 
-    if (ctx->HasInputs(kParameters)) {
-      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
-      ctx->SetOutputsDim(framework::GradVarName(kParameters),
-                         ctx->GetInputsDim(kParameters));
+    auto p_dims = ctx->GetInputsDim(kParameters);
+    auto pg_names = ctx->Outputs(framework::GradVarName(kParameters));
+    for (size_t i = 0; i < pg_names.size(); ++i) {
+      auto &pg_name = pg_names[i];
+      if (pg_name == framework::kEmptyVarName) {
+        continue;
+      }
+      ctx->SetDims({pg_name}, {p_dims[i]});
     }
   }
 };
diff --git a/paddle/operators/read_op.cc b/paddle/operators/read_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ae454101f585cf412a306fd3198f99fbdb8324d
--- /dev/null
+++ b/paddle/operators/read_op.cc
@@ -0,0 +1,99 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+
+class ReadInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Reader"),
+                   "The ReadOp must take a reader as input.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
+                   "The ReadOp should be assigned with output.");
+    std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
+    std::vector<std::string> out_names = ctx->Outputs("Out");
+    PADDLE_ENFORCE_EQ(
+        reader_dims.size(), out_names.size(),
+        "The reader's dim number doesn't match the output number.");
+    ctx->SetOutputsDim("Out", reader_dims);
+  }
+};
+
+class ReadInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    std::string reader_name = op_desc.Input("Reader")[0];
+    std::vector<std::string> out_names = op_desc.Output("Out");
+    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+    auto dtypes = reader->GetDataTypes();
+    PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
+    for (size_t i = 0; i < dtypes.size(); ++i) {
+      framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
+      out.SetType(framework::proto::VarDesc::LOD_TENSOR);
+      out.SetDataType(dtypes[i]);
+    }
+  }
+};
+
+class ReadOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    framework::ReaderHolder* reader =
+        scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
+    if (!reader->HasNext()) {
+      reader->ReInit();
+      PADDLE_ENFORCE(
+          reader->HasNext(),
+          "Reader can not read the next data even it has been re-initialized.");
+    }
+    std::vector<std::string> out_arg_names = Outputs("Out");
+    std::vector<framework::LoDTensor> ins;
+    reader->ReadNext(&ins);
+    PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
+    for (size_t i = 0; i < ins.size(); ++i) {
+      auto* out =
+          scope.FindVar(out_arg_names[i])->GetMutable<framework::LoDTensor>();
+      out->ShareDataWith(ins[i]);
+      out->set_lod(ins[i].lod());
+    }
+  }
+};
+
+class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReadOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddInput("Reader", "(ReaderHolder) The executed reader.");
+    AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable();
+    AddComment(R"DOC(
+      Read Operator
+
+      Execute a given reader once and output data.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(read, ops::ReadOp, ops::ReadInferShape, ops::ReadOpMaker,
+                  paddle::framework::EmptyGradOpMaker, ops::ReadInferVarType);
diff --git a/paddle/operators/target_assign_op.cc b/paddle/operators/target_assign_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..615ca857ceb45d442b75fffc6662cc2bda19562d
--- /dev/null
+++ b/paddle/operators/target_assign_op.cc
@@ -0,0 +1,202 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/target_assign_op.h"
+
+namespace paddle {
+namespace operators {
+
+class TargetAssignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    // checkout inputs
+    PADDLE_ENFORCE(ctx->HasInput("EncodedGTBBox"),
+                   "Input(EncodedGTBBox) of TargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("GTScoreLabel"),
+                   "Input(GTScoreLabel) of TargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("MatchIndices"),
+                   "Input(MatchIndices) of TargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("NegIndices"),
+                   "Input(NegIndices) of TargetAssignOp should not be null");
+
+    // checkout outputs
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredBBoxLabel"),
+        "Output(PredBBoxLabel) of TargetAssignOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredBBoxWeight"),
+        "Output(PredBBoxWeight) of TargetAssignOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredScoreLabel"),
+        "Output(PredScoreLabel) of TargetAssignOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredScoreWeight"),
+        "Output(PredScoreWeight) of TargetAssignOp should not be null.");
+
+    auto blabel_dims = ctx->GetInputDim("EncodedGTBBox");
+    auto slabel_dims = ctx->GetInputDim("GTScoreLabel");
+    auto mi_dims = ctx->GetInputDim("MatchIndices");
+    auto neg_dims = ctx->GetInputDim("NegIndices");
+
+    PADDLE_ENFORCE_EQ(blabel_dims.size(), 3UL,
+                      "The rank of Input(EncodedGTBBox) must be 3.");
+    PADDLE_ENFORCE_EQ(slabel_dims.size(), 2UL,
+                      "The rank of Input(GTScoreLabel) must be 2.");
+    PADDLE_ENFORCE_EQ(mi_dims.size(), 2UL,
+                      "The rank of Input(MatchIndices) must be 2.");
+    PADDLE_ENFORCE_EQ(neg_dims.size(), 2UL,
+                      "The rank of Input(NegIndices) must be 2.");
+
+    PADDLE_ENFORCE_EQ(blabel_dims[0], slabel_dims[0],
+                      "The 1st dimension (means the total number of "
+                      "ground-truth bounding boxes) of Input(EncodedGTBBox) "
+                      "and Input(GTScoreLabel) must be the same.");
+    PADDLE_ENFORCE_EQ(blabel_dims[1], mi_dims[1],
+                      "The 2nd dimension (means the number of priod boxes) "
+                      "of Input(EncodedGTBBox) and "
+                      "Input(MatchIndices) must be the same.");
+    PADDLE_ENFORCE_EQ(blabel_dims[2], 4,
+                      "The 3rd dimension of Input(EncodedGTBBox) must be 4.");
+
+    auto n = mi_dims[0];
+    auto np = mi_dims[1];
+    ctx->SetOutputDim("PredBBoxLabel", {n, np, 4});
+    ctx->SetOutputDim("PredBBoxWeight", {n, np, 1});
+    ctx->SetOutputDim("PredScoreLabel", {n, np, 1});
+    ctx->SetOutputDim("PredScoreWeight", {n, np, 1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("EncodedGTBBox")->type()),
+        ctx.device_context());
+  }
+};
+
+class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("EncodedGTBBox",
+             "(LoDTensor), The encoded ground-truth bounding boxes with shape "
+             "[Ng, Np, 4], where Ng is the total number of ground-truth boxes "
+             "in this mini-batch, Np the number of predictions, 4 is the "
+             "number of coordinate in [xmin, ymin, xmax, ymax] layout.");
+    AddInput("GTScoreLabel",
+             "(LoDTensor, default LoDTensor<int>),  The input ground-truth "
+             "labels with shape [Ng, 1], where the Ng is the same as it in "
+             "the input of EncodedGTBBox.");
+    AddInput("MatchIndices",
+             "(Tensor, default Tensor<int>), The input matched indices "
+             "with shape [N, Np], where N is the batch size, Np is the same "
+             "as it in the input of EncodedGTBBox. If MatchIndices[i][j] "
+             "is -1, the j-th prior box is not matched to any ground-truh "
+             "box in i-th instance.");
+    AddInput("NegIndices",
+             "(LoDTensor, default LoDTensor<int>), The input negative example "
+             "indices with shape [Neg, 1], where is the total number of "
+             "negative example indices.");
+    AddAttr<int>("background_label",
+                 "(int, default 0), Label index of background class.")
+        .SetDefault(0);
+    AddOutput("PredBBoxLabel",
+              "(Tensor), The output encoded ground-truth labels "
+              "with shape [N, Np, 4], N is the batch size and Np, 4 is the "
+              "same as they in input of EncodedGTBBox. If MatchIndices[i][j] "
+              "is -1, the PredBBoxLabel[i][j][:] is the encoded ground-truth "
+              "box for background_label in i-th instance.");
+    AddOutput("PredBBoxWeight",
+              "(Tensor), The weight for PredBBoxLabel with the shape "
+              "of [N, Np, 1]");
+    AddOutput("PredScoreLabel",
+              "(Tensor, default Tensor<int>), The output score labels for "
+              "each predictions with shape [N, Np, 1]. If MatchIndices[i][j] "
+              "is -1, PredScoreLabel[i][j] = background_label.");
+    AddOutput("PredScoreWeight",
+              "(Tensor), The weight for PredScoreLabel with the shape "
+              "of [N, Np, 1]");
+    AddComment(R"DOC(
+This operator is, for given the encoded boxes between prior boxes and
+ground-truth boxes and ground-truth class labels, to assign classification
+and regression targets to each prior box as well as weights to each
+prior box. The weights is used to specify which prior box would not contribute
+to training loss.
+
+For each instance, the output `PredBBoxLabel`, `PredBBoxWeight`,
+`PredScoreLabel` and `PredScoreWeight` are assigned based on `MatchIndices`.
+Assumed that the row offset for each instance in `EncodedGTBBox` is called lod,
+this operato assigns classification/regression targets by performing the
+following steps:
+
+1. Assigning all outpts based on `MatchIndices`:
+
+If id = MatchIndices[i][j] > 0,
+
+    PredBBoxLabel[i][j] = EncodedGTBBox[lod[i] + id][j]
+    PredBBoxWeight[i][j] = 1.
+    PredScoreLabel[i][j] = GTScoreLabel[lod[i] + id]
+    PredScoreWeight[i][j] = 1.
+
+Otherwise, 
+
+    PredBBoxLabel[j][j] = [0., 0., 0., 0.]
+    PredBBoxWeight[i][j] = 0.
+    PredScoreLabel[i][j] = background_label
+    PredScoreWeight[i][j] = 0.
+
+2. Assigning PredScoreWeight based on `NegIndices`:
+
+Assumed that the row offset for each instance in `NegIndices` is caleed neg_lod,
+for i-th instance and all ids of NegIndices in this instance:
+
+    PredScoreLabel[i][id] = background_label
+    PredScoreWeight[i][id] = 1.0
+
+    )DOC");
+  }
+};
+
+template <typename T>
+struct NegTargetAssignFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices,
+                  const size_t* lod, const int num, const int num_prior_box,
+                  const int background_label, int* out_label, T* out_label_wt) {
+    for (int i = 0; i < num; ++i) {
+      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
+        int id = neg_indices[j];
+        out_label[i * num_prior_box + id] = background_label;
+        out_label_wt[i * num_prior_box + id] = static_cast<T>(1.0);
+      }
+    }
+  }
+};
+
+template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float>;
+template struct NegTargetAssignFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp,
+                             ops::TargetAssignOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    target_assign,
+    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/target_assign_op.cu b/paddle/operators/target_assign_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fc0a1000a4202adeca3e0d6fbb05e832a79dbaba
--- /dev/null
+++ b/paddle/operators/target_assign_op.cu
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/target_assign_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod,
+                                      const int num, const int num_prior_box,
+                                      const int background_label,
+                                      int* out_label, T* out_label_wt) {
+  int bidx = blockIdx.x;
+  int st = lod[bidx];
+  int ed = lod[bidx + 1];
+
+  int row_start = bidx * num_prior_box;
+  for (int i = st + threadIdx.x; i < ed; i += blockDim.x) {
+    int id = row_start + neg_indices[i];
+    out_label[id] = background_label;
+    out_label_wt[id] = 1.;
+  }
+}
+
+template <typename T>
+struct NegTargetAssignFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const int* neg_indices, const size_t* lod, const int num,
+                  const int num_prior_box, const int background_label,
+                  int* out_label, T* out_label_wt) {
+    const int block_size = 256;
+    const int grid_size = num;
+    NegTargetAssignKernel<T><<<grid_size, block_size, 0, ctx.stream()>>>(
+        neg_indices, lod, num, num_prior_box, background_label, out_label,
+        out_label_wt);
+  }
+};
+
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float>;
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    target_assign,
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/target_assign_op.h b/paddle/operators/target_assign_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..82fca5724c0bd9fbfb60a98b91944700bfab9cdf
--- /dev/null
+++ b/paddle/operators/target_assign_op.h
@@ -0,0 +1,160 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct TargetAssignFunctor {
+  const T* gt_box_;
+  const int* gt_label_;
+  const int* match_indices_;
+  const size_t* lod_;
+  const int background_label_;
+  const int64_t num_;
+  const int64_t num_prior_box_;
+
+  T* out_box_;
+  T* out_box_wt_;
+  int* out_label_;
+  T* out_label_wt_;
+
+  TargetAssignFunctor(const T* gt_box, const int* gt_label,
+                      const int* match_indices, const size_t* lod,
+                      const int background_label, const int64_t num,
+                      const int64_t np, T* out_box, T* out_box_wt,
+                      int* out_label, T* out_label_wt)
+      : gt_box_(gt_box),
+        gt_label_(gt_label),
+        match_indices_(match_indices),
+        lod_(lod),
+        background_label_(background_label),
+        num_(num),
+        num_prior_box_(np),
+        out_box_(out_box),
+        out_box_wt_(out_box_wt),
+        out_label_(out_label),
+        out_label_wt_(out_label_wt) {}
+
+  HOSTDEVICE void operator()(size_t i) const {
+    int row = i / num_prior_box_;
+    int col = i - row * num_prior_box_;
+
+    size_t row_off = lod_[row];
+    int offset = row * num_prior_box_ + col;
+
+    int id = match_indices_[offset];
+    T* obox = out_box_ + offset * 4;
+    int* olabel = out_label_ + offset;
+    T* obox_wt = out_box_wt_ + offset;
+    T* olabel_wt = out_label_wt_ + offset;
+
+    if (id > -1) {
+      const T* gtbox = gt_box_ + ((row_off + id) * num_prior_box_ + col) * 4;
+
+      obox[0] = gtbox[0];
+      obox[1] = gtbox[1];
+      obox[2] = gtbox[2];
+      obox[3] = gtbox[3];
+
+      olabel[0] = gt_label_[row_off + id];
+      obox_wt[0] = static_cast<T>(1.);
+      olabel_wt[0] = static_cast<T>(1.);
+    } else {
+      obox[0] = static_cast<T>(0.);
+      obox[1] = static_cast<T>(0.);
+      obox[2] = static_cast<T>(0.);
+      obox[3] = static_cast<T>(0.);
+
+      olabel[0] = background_label_;
+      obox_wt[0] = static_cast<T>(0.);
+      olabel_wt[0] = static_cast<T>(0.);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct NegTargetAssignFunctor {
+  void operator()(const platform::DeviceContext& ctx, const int* neg_indices,
+                  const size_t* lod, const int num, const int num_prior_box,
+                  const int background_label, int* out_label,
+                  T* out_label_wt) const;
+};
+
+template <typename DeviceContext, typename T>
+class TargetAssignKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* enc_gt_box = ctx.Input<framework::LoDTensor>("EncodedGTBBox");
+    auto* gt_label = ctx.Input<framework::LoDTensor>("GTScoreLabel");
+    auto* match_indices = ctx.Input<framework::Tensor>("MatchIndices");
+    auto* neg_indices = ctx.Input<framework::LoDTensor>("NegIndices");
+
+    auto* out_box = ctx.Output<framework::Tensor>("PredBBoxLabel");
+    auto* out_box_wt = ctx.Output<framework::Tensor>("PredBBoxWeight");
+    auto* out_label = ctx.Output<framework::Tensor>("PredScoreLabel");
+    auto* out_label_wt = ctx.Output<framework::Tensor>("PredScoreWeight");
+
+    PADDLE_ENFORCE_EQ(enc_gt_box->lod().size(), 1UL);
+    PADDLE_ENFORCE_EQ(gt_label->lod().size(), 1UL);
+    PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
+
+    int background_label = ctx.Attr<int>("background_label");
+
+    const T* box_data = enc_gt_box->data<T>();
+    const int* label_data = gt_label->data<int>();
+    const int* match_idx_data = match_indices->data<int>();
+    const int* neg_idx_data = neg_indices->data<int>();
+
+    T* obox_data = out_box->mutable_data<T>(ctx.GetPlace());
+    T* obox_wt_data = out_box_wt->mutable_data<T>(ctx.GetPlace());
+    int* olabel_data = out_label->mutable_data<int>(ctx.GetPlace());
+    T* olabel_wt_data = out_label_wt->mutable_data<T>(ctx.GetPlace());
+
+    int64_t num = match_indices->dims()[0];
+    int64_t num_prior_box = match_indices->dims()[1];
+
+    auto gt_lod = enc_gt_box->lod().back();
+    auto gt_label_lod = gt_label->lod().back();
+    auto neg_lod = neg_indices->lod().back();
+    for (size_t i = 0; i < gt_lod.size(); ++i) {
+      PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]);
+    }
+
+    size_t* gt_lod_data = gt_lod.data(ctx.GetPlace());
+    size_t* neg_lod_data = neg_lod.data(ctx.GetPlace());
+
+    TargetAssignFunctor<T> functor(box_data, label_data, match_idx_data,
+                                   gt_lod_data, background_label, num,
+                                   num_prior_box, obox_data, obox_wt_data,
+                                   olabel_data, olabel_wt_data);
+
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    platform::ForRange<DeviceContext> for_range(device_ctx,
+                                                num * num_prior_box);
+    for_range(functor);
+
+    NegTargetAssignFunctor<DeviceContext, T> neg_trg_functor;
+    neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, num, num_prior_box,
+                    background_label, olabel_data, olabel_wt_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index d68caea99719b37816391f9bddcc5cac051025b2..5ce4b3de39d93e1935c6349ae446dec11d2fa986 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -39,11 +39,3 @@ nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
 
 cc_library(profiler SRCS profiler.cc DEPS device_context)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB PLATFORM_HEADERS *.h)
-  file(GLOB PLATFORM_dynload_HEADERS dynload/*.h)
-  install(FILES ${PLATFORM_HEADERS} DESTINATION include/paddle/platform)
-  install(FILES ${PLATFORM_HEADERS} DESTINATION include/paddle/platform/dynload)
-  install(FILES details/device_ptr_cast.h DESTINATION include/paddle/platform/details)
-endif()
diff --git a/paddle/platform/assert.h b/paddle/platform/assert.h
index d813b9529ba2c8d5a3f39eadeb82d7569acd5fdd..1f5a8f6a195738ec3b0681aff8565885258a91fb 100644
--- a/paddle/platform/assert.h
+++ b/paddle/platform/assert.h
@@ -1,16 +1,16 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu
index ef6d845874745af1150e4425f8d6be416cc44ece..84f5ac28be319473d045dc554bf2cb3c0e48803a 100644
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -127,6 +127,9 @@ TEST(NCCL, all_reduce) {
 }  // namespace paddle
 
 int main(int argc, char** argv) {
+  // FIXME(tonyyang-svail):
+  //   Due to the driver issue on our CI, disable for now
+  return 0;
   dev_count = paddle::platform::GetCUDADeviceCount();
   if (dev_count <= 1) {
     LOG(WARNING)
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index 0f1953abe0864c70671c5bb32e15b495ecf993d0..0a92e10927caf00be60fdd8107600b4033cf09ea 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -217,8 +217,6 @@ void BindVarDsec(py::module &m) {
       .def("set_shapes", &VarDesc::SetShapes)
       .def("set_dtype", &VarDesc::SetDataType)
       .def("set_dtypes", &VarDesc::SetDataTypes)
-      .def("set_tensor_num", &VarDesc::SetTensorDescNum)
-      .def("tensor_num", &VarDesc::GetTensorDescNum)
       .def("shape", &VarDesc::GetShape, py::return_value_policy::reference)
       .def("shapes", &VarDesc::GetShapes, py::return_value_policy::reference)
       .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
diff --git a/paddle/string/CMakeLists.txt b/paddle/string/CMakeLists.txt
index 751776dbb5c00972c0b6893fcfb2e710f3f082d7..1fe7f42ca1c692e4d7034883022852657be8cc20 100644
--- a/paddle/string/CMakeLists.txt
+++ b/paddle/string/CMakeLists.txt
@@ -2,9 +2,3 @@ cc_library(stringpiece SRCS piece.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
 cc_test(to_string_test SRCS to_string_test.cc)
-
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB STRING_HEADERS *.h)
-  install(FILES ${STRING_HEADERS} DESTINATION include/paddle/string)
-  install(FILES tinyformat/tinyformat.h DESTINATION include/paddle/string/tinyformat)
-endif()
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index e71f3858b0a6dd6416a1e5e36fc26e2bb74f5776..71a9459d556e2b3e25b1cd4ae768a8fb8ae41273 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -38,6 +38,7 @@ __all__ = [
     'array_write',
     'create_array',
     'less_than',
+    'equal',
     'array_read',
     'shrink_memory',
     'array_length',
@@ -276,21 +277,20 @@ class ParallelDo(object):
         parent_block = self.parent_block()
 
         local_inputs = set()
-
-        for op in current_block.ops:
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    local_inputs.add(out_var_name)
-
+        params = list()
         for var in self.inputs:
             local_inputs.add(var.name)
 
-        params = list()
         for op in current_block.ops:
             for iname in op.input_names:
                 for in_var_name in op.input(iname):
                     if in_var_name not in local_inputs:
                         params.append(in_var_name)
+
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    local_inputs.add(out_var_name)
+
         params = list(set(params))
 
         return [parent_block.var(name) for name in params]
@@ -975,6 +975,36 @@ def less_than(x, y, cond=None, **ignored):
     return cond
 
 
+def equal(x, y, cond=None, **ignored):
+    """
+    **equal**
+
+    This layer returns the truth value of :math:`x == y` elementwise.
+
+    Args:
+        x(Variable): First operand of *equal*
+        y(Variable): Second operand of *equal*
+        cond(Variable|None): Optional output variable to store the result of *equal*
+
+    Returns:
+        Variable: The tensor variable storing the output of *equal*.
+
+    Examples:
+        .. code-block:: python
+
+          less = fluid.layers.equal(x=label, y=limit)
+    """
+    helper = LayerHelper("equal", **locals())
+    if cond is None:
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True
+
+    helper.append_op(
+        type='equal', inputs={'X': [x],
+                              'Y': [y]}, outputs={'Out': [cond]})
+    return cond
+
+
 def array_read(array, i):
     """This function performs the operation to read the data in as an
     LOD_TENSOR_ARRAY.
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index a79479f469a0c489edf2676bc5d07066bb480664..99168ecc228045a0206aff1b7de5fc17c1438fe2 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -92,7 +92,7 @@ def fc(input,
 
     .. math::
 
-        Out = Act({\sum_{i=0}^{N-1}W_iX_i + b})
+        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
 
     In the above equation:
 
@@ -410,12 +410,12 @@ def dynamic_lstmp(input,
     """
     **Dynamic LSTMP Layer**
 
-    LSTMP (LSTM with recurrent projection) layer has a separate projection 
-    layer after the LSTM layer, projecting the original hidden state to a 
-    lower-dimensional one, which is proposed to reduce the number of total 
-    parameters and furthermore computational complexity for the LSTM, 
-    espeacially for the case that the size of output units is relative 
-    large (https://research.google.com/pubs/archive/43905.pdf). 
+    LSTMP (LSTM with recurrent projection) layer has a separate projection
+    layer after the LSTM layer, projecting the original hidden state to a
+    lower-dimensional one, which is proposed to reduce the number of total
+    parameters and furthermore computational complexity for the LSTM,
+    espeacially for the case that the size of output units is relative
+    large (https://research.google.com/pubs/archive/43905.pdf).
 
     The formula is as follows:
 
@@ -441,27 +441,27 @@ def dynamic_lstmp(input,
           the matrix of weights from the input gate to the input).
     * :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \
           matrices for peephole connections. In our implementation, \
-          we use vectors to reprenset these diagonal weight matrices. 
+          we use vectors to reprenset these diagonal weight matrices.
     * :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \
-          bias vector). 
+          bias vector).
     * :math:`\sigma`: The activation, such as logistic sigmoid function.
     * :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \
           gate, and cell activation vectors, respectively, all of which have \
-          the same size as the cell output activation vector :math:`h`. 
+          the same size as the cell output activation vector :math:`h`.
     * :math:`h`: The hidden state.
-    * :math:`r`: The recurrent projection of the hidden state. 
+    * :math:`r`: The recurrent projection of the hidden state.
     * :math:`\\tilde{c_t}`: The candidate hidden state, whose \
           computation is based on the current input and previous hidden state.
-    * :math:`\odot`: The element-wise product of the vectors. 
+    * :math:`\odot`: The element-wise product of the vectors.
     * :math:`act_g` and :math:`act_h`: The cell input and cell output \
-          activation functions and `tanh` is usually used for them. 
+          activation functions and `tanh` is usually used for them.
     * :math:`\overline{act_h}`: The activation function for the projection \
           output, usually using `identity` or same as :math:`act_h`.
 
     Set `use_peepholes` to `False` to disable peephole connection. The formula
     is omitted here, please refer to the paper
     http://www.bioinf.jku.at/publications/older/2604.pdf for details.
-    
+
     Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
     operations on the input :math:`x_{t}` are NOT included in this operator.
     Users can choose to use fully-connected layer before LSTMP layer.
@@ -479,8 +479,8 @@ def dynamic_lstmp(input,
 
                                - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
                                                 W_{fh}, W_{oh}`}.
-                               - The shape of hidden-hidden weight is (P x 4D), 
-                                 where P is the projection size and D the hidden 
+                               - The shape of hidden-hidden weight is (P x 4D),
+                                 where P is the projection size and D the hidden
                                  size.
                                - Projection weight = {:math:`W_{rh}`}.
                                - The shape of projection weight is (D x P).
@@ -525,9 +525,9 @@ def dynamic_lstmp(input,
             hidden_dim, proj_dim = 512, 256
             fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
                                      act=None, bias_attr=None)
-            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out, 
-                                                     size=hidden_dim * 4, 
-                                                     proj_size=proj_dim, 
+            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
+                                                     size=hidden_dim * 4,
+                                                     proj_size=proj_dim,
                                                      use_peepholes=False,
                                                      is_reverse=True,
                                                      cell_activation="tanh",
@@ -2525,7 +2525,8 @@ def ctc_greedy_decoder(input, blank, name=None):
                     interval [0, num_classes + 1).
 
     Returns:
-        Variable: CTC greedy decode result.
+        Variable: CTC greedy decode result. If all the sequences in result were
+        empty, the result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1].
 
     Examples:
         .. code-block:: python
diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py
index 96b3e9a0d73cede5d6e36308a53ab8927a95a6da..13dc98075f7d32f9dda56a890b98451ef81af363 100644
--- a/python/paddle/v2/fluid/learning_rate_decay.py
+++ b/python/paddle/v2/fluid/learning_rate_decay.py
@@ -15,7 +15,10 @@
 import layers
 from framework import Variable
 
-__all__ = ['exponential_decay', 'natural_exp_decay', 'inverse_time_decay']
+__all__ = [
+    'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
+    'polynomial_decay', 'piecewise_decay'
+]
 """
 When training a model, it's often useful to decay the
 learning rate during training process, this is called
@@ -101,7 +104,7 @@ def inverse_time_decay(learning_rate,
     ```python
     if staircase:
       decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
-    else
+    else:
       decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
     ```
     Args:
@@ -123,3 +126,98 @@ def inverse_time_decay(learning_rate,
         div_res = layers.floor(x=div_res)
 
     return learning_rate / (1 + decay_rate * div_res)
+
+
+def polynomial_decay(learning_rate,
+                     global_step,
+                     decay_steps,
+                     end_learning_rate=0.0001,
+                     power=1.0,
+                     cycle=False):
+    """Applies polynomial decay to the initial learning rate.
+
+    ```python
+    if cycle:
+        decay_steps = decay_steps * ceil(global_step / decay_steps)
+    else:
+        global_step = min(global_step, decay_steps)
+    decayed_learning_rate = (learning_rate - end_learning_rate) *
+                      (1 - global_step / decay_steps) ^ power +
+                      end_learning_rate
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that record the training step.
+        decay_steps: A Python `int32` number.
+        end_learning_rate: A Python `float` number.
+        power: A Python `float` number
+        cycle: Boolean. If set true, decay the learning rate every decay_steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for inverse_time_decay.")
+
+    if cycle:
+        div_res = layers.ceil(x=(global_step / decay_steps))
+        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
+        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
+
+        with layers.Switch() as switch:
+            with switch.case(layers.equal(x=global_step, y=zero_var)):
+                layers.assign(input=one_var, output=div_res)
+        decay_steps = decay_steps * div_res
+    else:
+        decay_steps_var = layers.fill_constant(
+            shape=[1], dtype='float32', value=float(decay_steps))
+        global_step = layers.elementwise_min(x=global_step, y=decay_steps_var)
+
+    return (learning_rate - end_learning_rate) * \
+           ((1 - global_step / decay_steps) ** power) + end_learning_rate
+
+
+def piecewise_decay(global_step, boundaries, values):
+    """Applies piecewise decay to the initial learning rate.
+
+    ```python
+    boundaries = [10000, 20000]
+    values = [1.0, 0.5, 0.1]
+
+    if step < 10000:
+        learning_rate = 1.0
+    elif step >= 10000 and step < 20000:
+        learning_rate = 0.5
+    else:
+        learning_rate = 0.1
+    ```
+    """
+
+    if len(values) - len(boundaries) != 1:
+        raise ValueError("len(values) - len(boundaries) should be 1")
+
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for piecewise_decay.")
+
+    lr = layers.create_global_var(
+        shape=[1],
+        value=0.0,
+        dtype='float32',
+        persistable=True,
+        name="learning_rate")
+
+    with layers.Switch() as switch:
+        for i in range(len(boundaries)):
+            boundary_val = layers.fill_constant(
+                shape=[1], dtype='float32', value=float(boundaries[i]))
+            value_var = layers.fill_constant(
+                shape=[1], dtype='float32', value=float(values[i]))
+            with switch.case(layers.less_than(global_step, boundary_val)):
+                layers.assign(value_var, lr)
+        last_value_var = layers.fill_constant(
+            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
+        with switch.default():
+            layers.assign(last_value_var, lr)
+
+    return lr
diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py
index 2b00923f5e85e6ba8fcdedebf5bbbc29403472c6..8bb8cf7b1a5ddf44427637229bdc31ac0e151e44 100644
--- a/python/paddle/v2/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
@@ -145,7 +145,6 @@ class ControlFlowGraph(object):
             if op.type() == "while" or op.type() == "while_grad":
                 continue
             block_desc = op.block()
-            self.current_block_desc = block_desc
             is_forward = i < self._forward_num
             if self.pool:
                 defs_can_optimize = filter(
@@ -156,6 +155,9 @@ class ControlFlowGraph(object):
                     for x in defs_can_optimize
                 ]
                 for x, x_shape in out_pair:
+                    # If x is both in uses and defs, it can not be optimized!
+                    if x in self._uses[i]:
+                        continue
                     for index, cache_pair in enumerate(self.pool):
                         cache_var = cache_pair[0]
                         cache_shape = cache_pair[1]
@@ -208,17 +210,17 @@ def get_cfgs(input_program):
 
     while_sub_block_ids = []
     while_grad_sub_block_ids = []
-    while_op_output = set()
     while_block_id_pair = []
+    while_op_dict = {}
 
     for i in range(op_size):
         op = block_desc.op(i)
         if op.type() == "while":
             while_sub_block_ids.append(op.attr("sub_block").id)
-            while_op_output.update(op.output_arg_names())
+            while_op_dict[op.attr("sub_block").id] = op
         elif op.type() == "while_grad":
             while_grad_sub_block_ids.append(op.attr("sub_block").id)
-            while_op_output.update(op.output_arg_names())
+            while_op_dict[op.attr("sub_block").id] = op
 
     # Find while/while_grad block pair
     for grad_id in while_grad_sub_block_ids:
@@ -240,6 +242,10 @@ def get_cfgs(input_program):
         for i in range(while_grad_block_op_size):
             while_block_ops.append(while_grad_block.op(i))
 
+        while_op_output = set()
+        while_op_output.update(while_op_dict[parent_id].output_arg_names())
+        while_op_output.update(while_op_dict[grad_id].output_arg_names())
+
         ops_list.append((while_block_ops, while_block_op_size, while_op_output))
 
     # Process rest while block ops
@@ -250,9 +256,15 @@ def get_cfgs(input_program):
         for i in range(while_block_op_size):
             while_block_ops.append(while_block.op(i))
 
-        ops_list.append((while_block_ops, while_block_op_size))
+        while_op_output = set()
+        while_op_output.update(while_op_dict[parent_id].output_arg_names())
+
+        ops_list.append((while_block_ops, while_block_op_size, while_op_output))
 
-    cfgs = [ControlFlowGraph(input_program, i, j, k) for i, j, k in ops_list]
+    cfgs = [
+        ControlFlowGraph(input_program, ops, forward_num, skip_opt)
+        for ops, forward_num, skip_opt in ops_list
+    ]
     return cfgs
 
 
diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
index 27f34b17339db31ef3c07555db946fa76d6f1922..06860a2a465c6f8590336670372eb6ff43b10594 100644
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -16,6 +16,8 @@ import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import contextlib
 import unittest
+import math
+import sys
 
 
 def main(use_cuda):
@@ -58,6 +60,8 @@ def main(use_cuda):
             print(avg_loss_value)
             if avg_loss_value[0] < 10.0:
                 return
+            if math.isnan(float(avg_loss_value)):
+                sys.exit("got NaN loss, training failed.")
     raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
         avg_loss_value[0]))
 
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification.py
similarity index 98%
rename from python/paddle/v2/fluid/tests/book/test_image_classification_train.py
rename to python/paddle/v2/fluid/tests/book/test_image_classification.py
index 03b009ebb0714a91329a1c56ff3939beecb03435..ffbe5bdbd646a03884868df659eb9d0089f9479e 100644
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification.py
@@ -17,6 +17,8 @@ from __future__ import print_function
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import contextlib
+import math
+import sys
 import numpy
 import unittest
 
@@ -145,6 +147,8 @@ def train(net_type, use_cuda, save_dirname):
                     loss_t, acc_t = exe.run(program=test_program,
                                             feed=feeder.feed(test_data),
                                             fetch_list=[avg_cost, acc])
+                    if math.isnan(float(loss_t)):
+                        sys.exit("got NaN loss, training failed.")
                     acc_list.append(float(acc_t))
                     avg_loss_list.append(float(loss_t))
                     break  # Use 1 segment for speeding up CI
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
index fb6b1f7192d51dcd654543e4c4ae5ee0c6fe060f..d8f0ad89cd89215ac83a133bd27a53c4b904363f 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
@@ -18,6 +18,8 @@ import paddle.v2 as paddle
 import sys
 import numpy
 import unittest
+import math
+import sys
 
 
 def parse_arg():
@@ -65,6 +67,7 @@ def conv_net(img, label):
         pool_size=2,
         pool_stride=2,
         act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
     conv_pool_2 = fluid.nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
@@ -148,6 +151,8 @@ def train(nn_type, use_cuda, parallel, save_dirname):
                         'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
                         format(pass_id, batch_id + 1,
                                float(avg_loss_val), float(acc_val)))
+                    if math.isnan(float(avg_loss_val)):
+                        sys.exit("got NaN loss, training failed.")
     raise AssertionError("Loss of recognize digits is too large")
 
 
diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
index d4a694e5721415fd9c953a83d927b25b80f5fb47..9c7ab7d6318472ac9378dd1966b75d19b5505bf5 100644
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
+import sys
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid.core as core
@@ -217,6 +219,8 @@ def main():
             if out[0] < 6.0:
                 # if avg cost less than 6.0, we think our code is good.
                 exit(0)
+            if math.isnan(float(out[0])):
+                sys.exit("got NaN loss, training failed.")
 
 
 main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
index 2ba9077a26202b1c16cc480823115f7ad55c2c67..9c5cb667aed7456b54d32dcd650852cfdbd6cce1 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
@@ -16,6 +16,8 @@ import unittest
 import paddle.v2.fluid as fluid
 import paddle.v2 as paddle
 import contextlib
+import math
+import sys
 
 
 def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
@@ -115,6 +117,8 @@ def main(word_dict, net_method, use_cuda):
             print("cost=" + str(cost_val) + " acc=" + str(acc_val))
             if cost_val < 0.4 and acc_val > 0.8:
                 return
+            if math.isnan(float(cost_val)):
+                sys.exit("got NaN loss, training failed.")
     raise AssertionError("Cost is too large for {0}".format(
         net_method.__name__))
 
diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py
index 766ba9681d1bb816170e0458f540b32511c02933..f013d7f1551bdbfb2f725809e2fb4d7d686560fe 100644
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -16,6 +16,8 @@ import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import unittest
 import os
+import math
+import sys
 
 
 def main(use_cuda, is_sparse, parallel):
@@ -112,6 +114,9 @@ def main(use_cuda, is_sparse, parallel):
                                   fetch_list=[avg_cost])
             if avg_cost_np[0] < 5.0:
                 return
+            if math.isnan(float(avg_cost_np[0])):
+                sys.exit("got NaN loss, training failed.")
+
     raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))
 
 
diff --git a/python/paddle/v2/fluid/tests/test_cpp_reader.py b/python/paddle/v2/fluid/tests/test_cpp_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..e71c3a290c9b120749a5190a246c5d76b7bf1955
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_cpp_reader.py
@@ -0,0 +1,62 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import numpy as np
+
+prog = fluid.framework.Program()
+block = prog.current_block()
+
+random_reader = block.create_var(
+    type=fluid.core.VarDesc.VarType.READER, name="RandomDataGenerator")
+random_reader.desc.set_lod_levels([0, 0])
+
+create_random_data_generator_op = block.append_op(
+    type="create_random_data_generator",
+    outputs={"Out": random_reader},
+    attrs={
+        "shape_concat": [1, 2, 1, 1],
+        "ranks": [2, 2],
+        "min": 0.0,
+        "max": 1.0
+    })
+
+out1 = block.create_var(
+    type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+    name="Out1",
+    shape=[10, 2],
+    dtype="float32",
+    lod_level=1)
+out2 = block.create_var(
+    type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+    name="Out2",
+    shape=[10, 1],
+    dtype="float32",
+    lod_level=1)
+
+read_op = block.append_op(
+    type="read",
+    inputs={"Reader": random_reader},
+    outputs={"Out": [out1, out2]})
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+[res1, res2] = exe.run(prog, fetch_list=[out1, out2])
+
+if len(res1) == 0 or len(res2) == 0:
+    exit(1)
+
+exit(0)
diff --git a/python/paddle/v2/fluid/tests/test_ctc_align.py b/python/paddle/v2/fluid/tests/test_ctc_align.py
index 773c69d1ad0794d2e4edfb1f6f8140cbcd64bee6..cc815d8e9e16d36c4612009bd40414c454dc59fd 100644
--- a/python/paddle/v2/fluid/tests/test_ctc_align.py
+++ b/python/paddle/v2/fluid/tests/test_ctc_align.py
@@ -31,6 +31,8 @@ def CTCAlign(input, lod, blank, merge_repeated):
                 result.append(token)
             prev_token = token
     result = np.array(result).reshape([len(result), 1]).astype("int32")
+    if len(result) == 0:
+        result = np.array([-1])
     return result
 
 
@@ -72,5 +74,14 @@ class TestCTCAlignOpCase1(TestCTCAlignOp):
                 [19, 1]).astype("int32")
 
 
+class TestCTCAlignOpCase2(TestCTCAlignOp):
+    def config(self):
+        self.op_type = "ctc_align"
+        self.input_lod = [[0, 4]]
+        self.blank = 0
+        self.merge_repeated = True
+        self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
index dc348cf2d21693290095900f8ab63c29923b4673..1d6bab3d6c44b2b3403778d5db086e405bb30dee 100644
--- a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
+++ b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
@@ -15,6 +15,8 @@
 import unittest
 
 import math
+import copy
+
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.layers as layers
@@ -54,21 +56,37 @@ def inverse_time_decay(learning_rate,
     return learning_rate / (1 + decay_rate * temp)
 
 
-class TestLearningRateDecay(unittest.TestCase):
-    def check_decay(self, python_decay_fn, fluid_decay_fn, staircase):
-        init_lr = 1.0
-        decay_steps = 5
-        decay_rate = 0.5
+def polynomial_decay(learning_rate,
+                     global_step,
+                     decay_steps,
+                     end_learning_rate=0.0001,
+                     power=1.0,
+                     cycle=False):
+    if cycle:
+        div = math.ceil(global_step / float(decay_steps))
+        if div == 0:
+            div = 1
+        decay_steps = decay_steps * div
+    else:
+        global_step = min(global_step, decay_steps)
+    return (learning_rate - end_learning_rate) * \
+           ((1 - float(global_step) / float(decay_steps)) ** power) + end_learning_rate
+
+
+def piecewise_decay(global_step, boundaries, values):
+    assert len(boundaries) + 1 == len(values)
+    for i in range(len(boundaries)):
+        if global_step < boundaries[i]:
+            return values[i]
+    return values[len(values) - 1]
 
+
+class TestLearningRateDecay(unittest.TestCase):
+    def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
         global_step = layers.create_global_var(
             shape=[1], value=0.0, dtype='float32', persistable=True)
 
-        decayed_lr = fluid_decay_fn(
-            learning_rate=init_lr,
-            global_step=global_step,
-            decay_steps=decay_steps,
-            decay_rate=decay_rate,
-            staircase=staircase)
+        decayed_lr = fluid_decay_fn(global_step=global_step, **kwargs)
         layers.increment(global_step, 1.0)
 
         place = fluid.CPUPlace()
@@ -79,31 +97,52 @@ class TestLearningRateDecay(unittest.TestCase):
             step_val, lr_val = exe.run(fluid.default_main_program(),
                                        feed=[],
                                        fetch_list=[global_step, decayed_lr])
-            python_decayed_lr = python_decay_fn(
-                learning_rate=init_lr,
-                global_step=step,
-                decay_steps=decay_steps,
-                decay_rate=decay_rate,
-                staircase=staircase)
+            python_decayed_lr = python_decay_fn(global_step=step, **kwargs)
             self.assertAlmostEqual(python_decayed_lr, lr_val[0])
 
     def test_decay(self):
+        common_kwargs_true = {
+            "learning_rate": 1.0,
+            "decay_steps": 5,
+            "decay_rate": 0.5,
+            "staircase": True
+        }
+        common_kwargs_false = copy.deepcopy(common_kwargs_true)
+        common_kwargs_false["staircase"] = False
+
         decay_fns = [
-            (exponential_decay, lr_decay.exponential_decay, True),
-            (exponential_decay, lr_decay.exponential_decay, False),
-            (natural_exp_decay, lr_decay.natural_exp_decay, True),
-            (natural_exp_decay, lr_decay.natural_exp_decay, False),
-            (inverse_time_decay, lr_decay.inverse_time_decay, True),
-            (inverse_time_decay, lr_decay.inverse_time_decay, False),
+            (exponential_decay, lr_decay.exponential_decay, common_kwargs_true),
+            (exponential_decay, lr_decay.exponential_decay,
+             common_kwargs_false),
+            (natural_exp_decay, lr_decay.natural_exp_decay, common_kwargs_true),
+            (natural_exp_decay, lr_decay.natural_exp_decay,
+             common_kwargs_false),
+            (inverse_time_decay, lr_decay.inverse_time_decay,
+             common_kwargs_true),
+            (inverse_time_decay, lr_decay.inverse_time_decay,
+             common_kwargs_false),
+            (polynomial_decay, lr_decay.polynomial_decay, {
+                "learning_rate": 1.0,
+                "decay_steps": 5,
+                "cycle": True
+            }),
+            (polynomial_decay, lr_decay.polynomial_decay, {
+                "learning_rate": 1.0,
+                "decay_steps": 5,
+                "cycle": False
+            }),
+            (piecewise_decay, lr_decay.piecewise_decay, {
+                "boundaries": [3, 6, 9],
+                "values": [0.1, 0.2, 0.3, 0.4]
+            }),
         ]
 
-        for py_decay_fn, fluid_decay_fn, staircase in decay_fns:
-            print("decay_fn=" + str(py_decay_fn) + " staircase=" + str(
-                staircase))
+        for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
+            print("decay_fn=" + py_decay_fn.__name__ + " kwargs=" + str(kwargs))
             main_program = framework.Program()
             startup_program = framework.Program()
             with framework.program_guard(main_program, startup_program):
-                self.check_decay(py_decay_fn, fluid_decay_fn, staircase)
+                self.check_decay(py_decay_fn, fluid_decay_fn, kwargs)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
index 8f335d13db3ddb999058a58cadc57ff23ff1fbc5..c590bf1c6570a2320962f2d610619dbd88b473d1 100644
--- a/python/paddle/v2/fluid/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
@@ -120,7 +120,6 @@ class TestVarDesc(unittest.TestCase):
         block = program_desc.block(0)
         var = block.var('my_reader')
         var.set_type(core.VarDesc.VarType.READER)
-        var.set_tensor_num(3)
         src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]]
         var.set_shapes(src_shapes)
         res_shapes = var.shapes()
@@ -141,7 +140,6 @@ class TestVarDesc(unittest.TestCase):
         block = program_desc.block(0)
         var = block.var('my_reader')
         var.set_type(core.VarDesc.VarType.READER)
-        var.set_tensor_num(3)
         src_types = [
             core.DataType.INT32, core.DataType.FP64, core.DataType.FP32
         ]
@@ -154,7 +152,6 @@ class TestVarDesc(unittest.TestCase):
         block = program_desc.block(0)
         var = block.var('my_reader')
         var.set_type(core.VarDesc.VarType.READER)
-        var.set_tensor_num(3)
         src_types = [3, 1, 2]
         var.set_lod_levels(src_types)
         self.assertEqual(src_types, var.lod_levels())
diff --git a/python/paddle/v2/fluid/tests/test_target_assign_op.py b/python/paddle/v2/fluid/tests/test_target_assign_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..8a1155c6217401b1b85e3c0bdc47f438f482bcbb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_target_assign_op.py
@@ -0,0 +1,122 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import random
+from op_test import OpTest
+
+
+def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod):
+    if len(gt_lod) != len(neg_lod):
+        raise AssertionError("The input arguments are illegal.")
+
+    batch_size = len(gt_lod) - 1
+
+    match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32')
+    neg_indices = np.zeros((neg_lod[-1], 1)).astype('int32')
+
+    for n in range(batch_size):
+        gt_num = gt_lod[n + 1] - gt_lod[n]
+        ids = random.sample([i for i in range(num_prior)], gt_num)
+        match_indices[n, ids] = [i for i in range(gt_num)]
+
+        ret_ids = set([i for i in range(num_prior)]) - set(ids)
+        s = neg_lod[n]
+        e = neg_lod[n + 1]
+        l = e - s
+        neg_ids = random.sample(ret_ids, l)
+        neg_indices[s:e, :] = np.array(neg_ids).astype('int32').reshape(l, 1)
+
+    return match_indices, neg_indices
+
+
+def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
+                  neg_lod, background_label):
+    batch_size, num_prior = match_indices.shape
+
+    # init target bbox
+    trg_box = np.zeros((batch_size, num_prior, 4)).astype('float32')
+    # init weight for target bbox
+    trg_box_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
+    # init target label
+    trg_label = np.ones((batch_size, num_prior, 1)).astype('int32')
+    trg_label = trg_label * background_label
+    # init weight for target label
+    trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
+
+    for i in range(batch_size):
+        cur_indices = match_indices[i]
+        col_ids = np.where(cur_indices > -1)
+        col_val = cur_indices[col_ids]
+
+        gt_start = gt_lod[i]
+        # target bbox
+        for v, c in zip(col_val + gt_start, col_ids[0].tolist()):
+            trg_box[i][c][:] = encoded_box[v][c][:]
+
+        # weight for target bbox
+        trg_box_wt[i][col_ids] = 1.0
+
+        trg_label[i][col_ids] = gt_label[col_val + gt_start]
+
+        trg_label_wt[i][col_ids] = 1.0
+        # set target label weight to 1.0 for the negative samples
+        neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]]
+        trg_label_wt[i][neg_ids] = 1.0
+
+    return trg_box, trg_box_wt, trg_label, trg_label_wt
+
+
+class TestTargetAssginOp(OpTest):
+    def setUp(self):
+        self.op_type = "target_assign"
+
+        num_prior = 120
+        num_class = 21
+        gt_lod = [0, 5, 11, 23]
+        neg_lod = [0, 4, 7, 13]
+        batch_size = len(gt_lod) - 1
+        num_gt = gt_lod[-1]
+        background_label = 0
+
+        encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
+        gt_label = np.random.randint(
+            num_class, size=(num_gt, 1)).astype('int32')
+        match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
+                                                               gt_lod, neg_lod)
+        trg_box, trg_box_wt, trg_label, trg_label_wt = target_assign(
+            encoded_box, gt_label, match_indices, neg_indices, gt_lod, neg_lod,
+            background_label)
+
+        self.inputs = {
+            'EncodedGTBBox': (encoded_box, [gt_lod]),
+            'GTScoreLabel': (gt_label, [gt_lod]),
+            'MatchIndices': (match_indices),
+            'NegIndices': (neg_indices, [neg_lod]),
+        }
+        self.attrs = {'background_label': background_label}
+        self.outputs = {
+            'PredBBoxLabel': (trg_box),
+            'PredBBoxWeight': (trg_box_wt),
+            'PredScoreLabel': (trg_label),
+            'PredScoreWeight': (trg_label_wt),
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()